In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
# Misc
import os, sys
from glob import glob

In [3]:
import random

In [4]:
# Dataframes and such
import pandas as pd
import numpy as np

In [5]:
# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Ipython stuff
import IPython
import IPython.display as ipd

In [7]:
# Audio stuff
import librosa
import librosa.display

In [47]:
from tqdm.notebook import tqdm

In [9]:
DATA_FOLDER = "../data/gtzan"

## Sample audio files

In [10]:
# Had to remove jazz file #54 because of corruption, apparently

In [11]:
# Make a list of all the wav files in the dataset and store them in a variable
audio_files = glob(f"{DATA_FOLDER}/*/*.wav")

In [12]:
random_file = random.choice(audio_files)
print(f"Showing random file {os.path.basename(random_file)}")
ipd.Audio(random_file)

Showing random file blues.00094.wav


### MEL Spectrograms

In [13]:
hop_length = 512

n_fft = 2048
n_mels = 256

## Dataset creation

In [14]:
wav_files = {
    'path': [],
    'genre': [],
}

for af in tqdm(audio_files):
    af_arr = af.split('/')
    genre = af_arr[-2]
    fname = af_arr[-1]
    
    out_file = f"{genre}/{fname}"
    
    wav_files['path'].append(out_file)
    wav_files['genre'].append(genre)
    
df = pd.DataFrame(wav_files)

  0%|          | 0/999 [00:00<?, ?it/s]

### Do the split first

In [15]:
# 10% for test
test_size = 0.1

In [16]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=test_size, stratify=df['genre'])

df_train['subset'] = 'train'
df_test['subset'] = 'test'

df_full = pd.concat((df_train, df_test)).sample(frac=1.0) # frac is to shuffle

df_full.head(n=10)

Unnamed: 0,path,genre,subset
145,reggae/reggae.00061.wav,reggae,test
739,classical/classical.00046.wav,classical,train
974,hiphop/hiphop.00053.wav,hiphop,train
881,disco/disco.00089.wav,disco,train
982,hiphop/hiphop.00004.wav,hiphop,train
860,disco/disco.00034.wav,disco,train
953,hiphop/hiphop.00058.wav,hiphop,test
917,hiphop/hiphop.00000.wav,hiphop,train
782,classical/classical.00006.wav,classical,train
527,blues/blues.00039.wav,blues,train


In [57]:
OUT_FOLDER = "../data/gtzan_augmented_256_test"

VARIATIONS_PER_SONG = 3

In [18]:
import random

In [37]:
from librosa.effects import time_stretch

def time_stretch_random(y):
    
    # Shift between three semitones up or down
    rate = 1 + random.uniform(-0.2, 0.25)
#     rate =1.25
    
    y_stretch = time_stretch(y, rate=rate)
    
    return y_stretch

In [38]:
from librosa.effects import pitch_shift


def shift_pitch_random(y, sr):
    
    # Shift between three semitones up or down
    n_steps = random.uniform(-1.5, 1.5)
    
    y_pitch = pitch_shift(y, sr=sr, n_steps=n_steps)
    
    return y_pitch

In [56]:
def get_fixed_window(S_db_mel, width=1024):
    
    # Create a window exactly 1024 wide
    # This is needed because time stretching might make the audio shorter
    max_starting_point = S_db_mel.shape[1] - width
    start_j = random.randint(0, max_starting_point)

    return S_db_mel[:, start_j:start_j+width]

In [58]:
processed_files = {
    'path': [],
    'genre': [],
    'subset': [],
}

means = []
stds = []


for _, r in tqdm(df_full.iterrows(), total=len(df_full)):
#     print(r)

    genre = af_arr[-2]
    fname = r['path'].split('/')[-1]
    subset = r['subset']
    
    genre_folder = f"{OUT_FOLDER}/{genre}"
    
    os.makedirs(genre_folder, exist_ok=True)
    
    # Load audio file and create spectrogram
    y, sr = librosa.load(os.path.join(DATA_FOLDER, r['path']))
   
    # First save the spectrogram of the original version of the file
    S = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, n_mels=n_mels, hop_length=hop_length)
    
    S_db_mel = librosa.amplitude_to_db(S, ref=np.max)
    
    S_db_mel = get_fixed_window(S_db_mel)
        
    # (for my own) sanity check
    assert S_db_mel.shape[1] == 1024
    assert S_db_mel.shape[0] == 256
    
    if subset != 'test':
        means.append(S_db_mel.mean())
        stds.append(S_db_mel.std())
    
#     print(S_db_mel.mean())
#     print(S_db_mel.std())
#     1/0
    
    out_file = f"{genre_folder}/{fname[:-4]}-orig.npy"
    
    processed_files['path'].append(out_file)
    processed_files['genre'].append(genre)
    
    np.save(out_file, S_db_mel)
    
    # Now Create variations by adding pitch_shift and other things
    
    for i in range(1, VARIATIONS_PER_SONG+1):
        
        y_aug = y # This is just so that I can move stuff up and down without going crazy
        y_aug = shift_pitch_random(y_aug, sr)
        y_aug = time_stretch_random(y_aug)
        
        
        S = librosa.feature.melspectrogram(
            y=y_aug, sr=sr, n_fft=n_fft, n_mels=n_mels, hop_length=hop_length)

        S_db_mel = librosa.amplitude_to_db(S, ref=np.max)
        
        # Check the size
        assert S_db_mel.shape[1] >= 1024
        
        S_db_mel = get_fixed_window(S_db_mel)
        
        # (for my own) sanity check
        assert S_db_mel.shape[1] == 1024
        assert S_db_mel.shape[0] == 256
        
        out_file = f"{genre_folder}/{fname[:-4]}-aug-{i}.npy"

        processed_files['path'].append(out_file)
        processed_files['genre'].append(genre)
        processed_files['subset'].append(subset)
        

        np.save(out_file, S_db_mel)
        

df = pd.DataFrame(processed_files)
df.to_csv(f"{OUT_FOLDER}/metadata.csv", index=False)

  0%|          | 0/999 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [22]:
df.head()

Unnamed: 0,path,genre
0,metal/metal.00000.npy,metal
1,metal/metal.00032.npy,metal
2,metal/metal.00034.npy,metal
3,metal/metal.00063.npy,metal
4,metal/metal.00090.npy,metal


### Train/test split

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['genre'])

In [25]:
df_train['subset'] = 'train'
df_test['subset'] = 'test'

In [26]:
df_full = pd.concat((df_train, df_test)).sample(frac=1.0) # frac is to shuffle

In [27]:
df_full.head(n=10)

Unnamed: 0,path,genre,subset
28,metal/metal.00076.npy,metal,test
615,pop/pop.00016.npy,pop,test
152,reggae/reggae.00036.npy,reggae,test
285,jazz/jazz.00018.npy,jazz,train
714,classical/classical.00022.npy,classical,test
98,metal/metal.00041.npy,metal,train
654,pop/pop.00087.npy,pop,train
673,pop/pop.00051.npy,pop,test
306,rock/rock.00060.npy,rock,train
523,blues/blues.00065.npy,blues,train


In [28]:
df_full.to_csv(f"{OUT_FOLDER}/metadata.csv", index=False)

### Wav dataset

In [13]:
OUT_FOLDER = "../data/gtzan"

In [17]:
wav_files = {
    'path': [],
    'genre': [],
}

for af in tqdm(audio_files):
    af_arr = af.split('/')
    genre = af_arr[-2]
    fname = af_arr[-1]
    
    out_file = f"{genre}/{fname}"
    
    wav_files['path'].append(out_file)
    wav_files['genre'].append(genre)
    


  0%|          | 0/999 [00:00<?, ?it/s]

In [20]:
df = pd.DataFrame(wav_files)

In [21]:
df.head()

Unnamed: 0,path,genre
0,metal/metal.00000.wav,metal
1,metal/metal.00032.wav,metal
2,metal/metal.00034.wav,metal
3,metal/metal.00063.wav,metal
4,metal/metal.00090.wav,metal


### Train/test split

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['genre'])

In [24]:
df_train['subset'] = 'train'
df_test['subset'] = 'test'

In [25]:
df_full = pd.concat((df_train, df_test)).sample(frac=1.0) # frac is to shuffle

In [26]:
df_full.head(n=10)

Unnamed: 0,path,genre,subset
995,hiphop/hiphop.00070.wav,hiphop,train
323,rock/rock.00044.wav,rock,train
387,rock/rock.00069.wav,rock,train
90,metal/metal.00084.wav,metal,train
30,metal/metal.00071.wav,metal,test
113,reggae/reggae.00005.wav,reggae,test
787,classical/classical.00010.wav,classical,train
155,reggae/reggae.00099.wav,reggae,train
104,reggae/reggae.00084.wav,reggae,train
939,hiphop/hiphop.00085.wav,hiphop,train


In [27]:
df_full.to_csv(f"{OUT_FOLDER}/metadata_t20.csv", index=False)