# Extracting Audio Features and Downloading Audio Files


#### Imports


In [42]:
import sys; sys.path.append("..")
import warnings; warnings.filterwarnings("ignore")
import ffmpeg
import subprocess
import os
import pandas as pd
from src.utils.audio_utils import dl_and_extract_features
from tqdm import tqdm
from src.utils.file_utils import create_dirs_if_not_exist

### Feature extraction and download


In [43]:
CSV_PATH = "../data/audio_features.csv"
WAV_PATH = "../data/audio_wav"
MP3_PATH = "../data/audio_mp3"

df = pd.read_csv(CSV_PATH)

In [44]:
def batch_extract(df, start_index, end_index):
    for i in range(start_index, end_index):
        print(f"Processing track {i+1} of {len(df)}")
        track_data = df.iloc[i]
        features = dl_and_extract_features(track_data)
        if not features:
            print(
                f"No features found for track {i+start_index} of {len(df)}, skipping..."
            )
            continue
        # add features to dataframe
        for feature_names in features.keys():
            df.loc[i, feature_names] = features[feature_names]
        # save dataframe
        df.to_csv(CSV_PATH, index=False)

We want to skip songs that have been previously downloaded


In [45]:
downloaded_songs = os.listdir(WAV_PATH)
num_of_songs = len(df.index)
print(num_of_songs)

for i in range(num_of_songs):
    curr_song = df.iloc[i]
    if not curr_song["id"] + ".wav" in downloaded_songs:
        batch_extract(df, i, i + 1)

3932
Processing track 425 of 3932
Found youtube url: https://www.youtube.com/watch?v=kNI44lzXjDY&pp=ygUgRW5nZWx3b29kIENyeXN0YWwgRG9scGhpbiBseXJpY3M%3D
Downloading track 6kx8Hf1Udk4V0Ivq6zpoyG.wav
Could not download 6kx8Hf1Udk4V0Ivq6zpoyG.wav. Error: kNI44lzXjDY is age restricted, and can't be accessed without logging in.
No features found for track 848 of 3932, skipping...
Processing track 738 of 3932
Found youtube url: https://www.youtube.com/watch?v=jfKfPfyJRdk&pp=ygUoU3MwZmkxIENoaWxsIGxvZmkgdHlwZSBiZWF0ICh6elMpIGx5cmljcw%3D%3D
Downloading track 4Ef75ndRbnsRvKEmNnLIyc.wav
Could not download 4Ef75ndRbnsRvKEmNnLIyc.wav. Error: jfKfPfyJRdk is streaming live and cannot be loaded
No features found for track 1474 of 3932, skipping...
Processing track 1240 of 3932
Found youtube url: https://www.youtube.com/watch?v=xH4PmSBYaW4&pp=ygUlTW9yZWFydCDQryDQsdGD0LTRgyDQtdCx0LDRgtGMIGx5cmljcw%3D%3D
Downloading track 0R1oBtQDX9N7ohedYIYbqT.wav
Could not download 0R1oBtQDX9N7ohedYIYbqT.wav. Error: xH4P

We couldn't download most of those missing songs. We will ignore them.

Next, we will convert the audio files from MP4 format to MP3 format in order to be able to use it in the torchaudio library:


In [48]:
create_dirs_if_not_exist(MP3_PATH)

for song in tqdm(downloaded_songs):
    song_path = os.path.join(WAV_PATH, song)
    conv_song_path = os.path.join(MP3_PATH, song[:-4] + ".mp3")
    # check if song is already converted
    if os.path.exists(conv_song_path):
        # print(f"Song {song} already converted, skipping...")
        continue
    ffmpeg_command = f"ffmpeg -hide_banner -loglevel error -i {song_path} -vn -acodec libmp3lame -q:a 4 -ar 22050 {conv_song_path}"
    subprocess.run(ffmpeg_command, shell=True)

 27%|██▋       | 1049/3924 [01:31<50:30,  1.05s/it] 

Deleting songs that could not be downloaded:

In [None]:
df = pd.read_csv(../data/audio_features.csv)
converted_songs = [song[:-4] for song in os.listdir(MP3_PATH)]
df = df[(df["id"].isin(converted_songs))]

We notice that duration_ms is not correct and thus we will fix it:


In [None]:
def get_duration_ffmpeg(file_path):
    probe = ffmpeg.probe(file_path)
    stream = next(
        (stream for stream in probe["streams"]
         if stream["codec_type"] == "audio"), None
    )
    duration = float(stream["duration"])
    return duration

In [None]:
for i in tqdm(df.index):
    df.loc[i, "duration_ms"] = (
        get_duration_ffmpeg("../data/audio/" + df.loc[i, "id"] + ".mp3") * 1000
    )

0it [00:00, ?it/s]


and we will delete all songs that are shorter than 30 seconds and longer than 5 minutes:


In [None]:
df = df[(df["duration_ms"] < 5 * 60 * 1000) & (df["duration_ms"] >= 30 * 1000)]

Saving to CSV

In [None]:
df.to_csv(CSV_PATH, index=False)