# Extracting Audio Features and Downloading Audio Files


#### Imports


In [9]:
import os
import sys
import warnings

import pandas as pd
from tqdm import tqdm

# The parent directory must be on sys.path *before* the project import below,
# otherwise `src` cannot be resolved on a fresh kernel.
sys.path.append('..')
from src.utils.audio_utils import dl_and_extract_features  # noqa: E402

# Silence all warnings for the rest of the notebook
warnings.filterwarnings('ignore')

### Feature extraction and download


In [5]:
# Input CSV with one row per track, and the directory holding the downloaded audio files
csv_path = "../data/csv_files/chartex_final.csv"
audio_dir = "../data/audio_wav"

# Load the track table; later cells add feature columns to it and filter its rows
df = pd.read_csv(csv_path)

In [7]:
def batch_extract(df, start_index, end_index, output_path="../data/audio_features.csv"):
    """Download tracks and add their extracted audio features to ``df``.

    Every row at a position in ``[start_index, end_index)`` is passed to
    ``dl_and_extract_features``. Each returned feature is written into the
    column of the same name on that row, and the whole dataframe is saved to
    ``output_path`` after each successfully processed track.

    Args:
        df: Track dataframe; modified in place.
        start_index: First row position to process (inclusive).
        end_index: Row position at which to stop (exclusive).
        output_path: CSV file the dataframe is written to after every
            processed track.
    """
    total_tracks = len(df)
    for i in range(start_index, end_index):
        print(f'Processing track {i+1} of {total_tracks}')
        track_data = df.iloc[i]
        features = dl_and_extract_features(track_data)
        if not features:
            # Report the same 1-based track number as the progress message above
            print(
                f'No features found for track {i+1} of {total_tracks}, skipping...')
            continue
        # The row was read by position, so translate the position into its index
        # label before writing; the two differ when df has a non-default index.
        row_label = df.index[i]
        for feature_name, feature_value in features.items():
            df.loc[row_label, feature_name] = feature_value
        # Save after every track (presumably so partial progress is kept)
        df.to_csv(output_path, index=False)

We want to skip songs that have already been downloaded:


In [None]:
# Files already present in the audio directory; the set gives a constant-time
# membership test for each track instead of scanning the list every time.
downloaded_songs = os.listdir('../data/audio_wav/')
already_downloaded = set(downloaded_songs)
num_of_songs = len(df.index)

# Only download/extract the tracks whose '<id>.wav' file is not on disk yet.
# The ids are copied into a list first because batch_extract modifies df.
for i, song_id in enumerate(df['id'].tolist()):
    if song_id + '.wav' not in already_downloaded:
        batch_extract(df, i, i+1)

We couldn't download most of those missing songs. We will ignore them.

Next, we will convert the audio files from MP4 format to MP3 format so that we can use them with the torchaudio library:


In [13]:
# exist_ok avoids a FileExistsError when this cell is re-run
os.makedirs("../data/audio", exist_ok=True)

In [None]:
import subprocess

# List the directory again so that files downloaded by the earlier cells are
# included, rather than relying on a listing taken before the downloads.
for song in tqdm(os.listdir("../data/audio_wav/")):
    song_path = "../data/audio_wav/" + song
    conv_song_path = "../data/audio/" + os.path.splitext(song)[0] + ".mp3"

    # ffmpeg asks for confirmation before overwriting an existing output file,
    # so skip files that were already converted to keep re-runs non-interactive.
    if os.path.exists(conv_song_path):
        continue

    # Pass the arguments as a list (no shell) so that file names containing
    # spaces or shell metacharacters are handled safely.
    ffmpeg_command = [
        "ffmpeg", "-i", song_path,
        "-vn", "-acodec", "libmp3lame", "-q:a", "4", "-ar", "22050",
        conv_song_path,
    ]

    subprocess.run(ffmpeg_command)

### Creating the dataset:


Now let's finish by creating the .csv of the dataset. A track is labelled as viral when its number of videos exceeds the threshold of $5 \times 10^5$. We then drop all features that the model cannot infer from the audio, as well as the new features we created:


In [5]:
# File names (without extension) of the songs that were converted to MP3
converted_songs = [os.path.splitext(song)[0] for song in os.listdir('../data/audio/')]

# Binary target: 1 when the track has more than 5e5 videos, else 0
df['viral'] = (df['number_of_videos'] > 5e5).astype('int32')

# Columns to remove from the dataset. The original list was missing a comma
# after 'number_of_videos', which merged it with 'time_signature' into a single
# non-existent name, so 'time_signature' was silently kept.
columns_to_drop = [
    'track_name', 'track_pop', 'artist', 'artist_pop', 'album',
    'number_of_videos', 'time_signature', 'artist_name', 'total_likes_count',
    'chroma_stft', 'rmse', 'spec_cent', 'spec_bw', 'rolloff', 'zcr', 'mfcc',
]
# errors='ignore' skips any listed column that is not present in df
df = df.drop(columns=columns_to_drop, errors='ignore')

We then delete from the dataframe all songs that we couldn't download:


In [6]:
# Keep only the tracks that have a converted MP3. The explicit copy makes df an
# independent frame, so the later per-row .loc assignments do not operate on a
# slice of the original dataframe.
df = df[df["id"].isin(converted_songs)].copy()

We notice that `duration_ms` is not correct, so we will recompute it by probing the converted MP3 files:


In [7]:
import ffmpeg


def get_duration_ffmpeg(file_path):
    """Return the duration of the first audio stream found in ``file_path``.

    The file is probed with ``ffmpeg.probe`` and the ``duration`` field of the
    first stream whose ``codec_type`` is ``'audio'`` is returned as a float
    (the caller treats it as seconds).

    Raises:
        ValueError: If the probe result contains no audio stream.
    """
    probe = ffmpeg.probe(file_path)
    audio_stream = next(
        (stream for stream in probe['streams'] if stream['codec_type'] == 'audio'), None)
    if audio_stream is None:
        # Fail with a clear message instead of a TypeError on None['duration']
        raise ValueError(f'No audio stream found in {file_path}')
    return float(audio_stream['duration'])

In [8]:
# Overwrite duration_ms for every remaining track with the duration measured
# on its converted MP3 file (probe result multiplied by 1000).
for row_label in tqdm(df.index):
    mp3_path = '../data/audio/' + df.loc[row_label, 'id'] + '.mp3'
    measured_duration = get_duration_ffmpeg(mp3_path)
    df.loc[row_label, 'duration_ms'] = measured_duration * 1000

100%|██████████| 3915/3915 [01:45<00:00, 36.97it/s]


We will then delete all songs that are shorter than 30 seconds or that last 5 minutes or longer:


In [9]:
# Keep tracks lasting at least 30 seconds and strictly less than 5 minutes
min_duration_ms = 30*1000
max_duration_ms = 5*60*1000
is_long_enough = df['duration_ms'] >= min_duration_ms
is_short_enough = df['duration_ms'] < max_duration_ms
df = df[is_short_enough & is_long_enough]

In [13]:
# Save the final dataset.
# NOTE(review): the dataframe index is written as an unnamed first column here,
# whereas audio_features.csv is saved with index=False — confirm which form the
# downstream readers of metadata.csv expect.
df.to_csv("../data/metadata.csv")