In [1]:
import pandas as pd
import numpy as np
import boto3
import io


In [2]:
# Install librosa from the conda-forge channel; -y answers the confirmation prompt automatically.
!conda install -c conda-forge librosa -y

Collecting package metadata (current_repodata.json): done
Solving environment: / 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:
- 
  - conda-forge/noarch::seaborn-base==0.11.1=pyhd8ed1ab_1
  - conda-forge/noarch::nbclassic==0.2.6=pyhd8ed1ab_0
  - conda-forge/noarch::typing-extensions==3.7.4.3=0
  - conda-forge/linux-64::pluggy==0.13.1=py36h5fab9bb_4
  - conda-forge/linux-64::blaze==0.11.3=py36_0
  - conda-forge/linux-64::matplotlib==3.3.4=py36h5fab9bb_0
  - defaults/linux-64::_anaconda_depends==5.1.0=py36_2
  - conda-forge/noarch::python-language-server==0.36.2=pyhd8ed1ab_0
  - conda-forge/noarch::jupyterlab_server==2.3.0=pyhd8ed1ab_0
  - conda-forge/noarch::pyls-black==0.4.6=pyh9f0ad1d_0
  - conda-forge/linux-64::scikit-image==0.16.2=py36hb3f55d8_0
  - conda-forge/noarch::path.py==12.5.0=0
  - conda-forge/noarch::qdarkstyle==2.8.1=pyhd8ed1ab_2
  - conda-forge/noarch::ipywidgets==7.6.3=pyhd3deb0d_0
  - con

In [3]:
import librosa


In [4]:
# Root URI of the S3 bucket this notebook works against.
data_dir = 's3://musicgenredatalake/'

# Duration, in seconds, of each slice cut out of a track.
window_size = 3

# Genre names; each one is used both as a key-prefix component when listing
# the raw songs and as the label written next to the extracted features.
genres = [
    'blues',
    'classical',
    'country',
    'disco',
    'hiphop',
    'jazz',
    'metal',
    'pop',
    'reggae',
    'rock',
]

In [5]:
# Column layout of the per-window feature table: identifying fields first,
# then a mean/variance pair per descriptor, the tempo, a mean/variance pair
# for each of the 20 MFCC coefficients, and finally the genre label.
columns = (
    ['filename', 'length']
    + [f'{descriptor}_{stat}'
       for descriptor in ('chroma_stft', 'rms', 'spectral_centroid',
                          'spectral_bandwidth', 'rolloff', 'zero_crossing_rate',
                          'harmony', 'perceptr')
       for stat in ('mean', 'var')]
    + ['tempo']
    + [f'mfcc{index}_{stat}'
       for index in range(1, 21)
       for stat in ('mean', 'var')]
    + ['label']
)


In [6]:
def extract_features(window, sr=22050):
    '''Summarise one audio window as a flat list of descriptor statistics.

    Args:
        window: Audio samples for a single window, passed to librosa as ``y``.
            Assumed to be a mono 1-D float array such as a slice of the signal
            returned by ``librosa.load`` -- verify against the caller.
        sr: Sampling rate of ``window`` in Hz. Defaults to 22050, which is the
            default librosa itself assumes, so existing one-argument calls
            behave as before.

    Returns:
        A list laid out as:
        mean and variance of chroma STFT, RMS, spectral centroid, spectral
        bandwidth, spectral roll-off, zero-crossing rate, the harmonic
        component and the percussive component (the latter two from
        ``librosa.effects.hpss``); then the estimated tempo; then a
        (mean, variance) pair for every MFCC row returned by
        ``librosa.feature.mfcc`` (20 rows with librosa's default ``n_mfcc``).
    '''
    # The audio buffer is always passed as ``y=...``: recent librosa releases
    # accept it by keyword only for most of these functions, and older
    # releases accept the keyword form as well.
    chromogram = librosa.feature.chroma_stft(y=window, sr=sr)
    rms = librosa.feature.rms(y=window)
    centroid = librosa.feature.spectral_centroid(y=window, sr=sr)
    bandwidth = librosa.feature.spectral_bandwidth(y=window, sr=sr)
    roll_off = librosa.feature.spectral_rolloff(y=window, sr=sr)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y=window)
    # hpss returns (harmonic, percussive); the second one feeds the
    # "perceptr_*" columns of the output table.
    harmony, perceptual = librosa.effects.hpss(y=window)
    tempo, _ = librosa.beat.beat_track(y=window, sr=sr)
    # NOTE(review): newer librosa releases may return the tempo as a
    # 1-element array rather than a scalar -- normalise to a scalar either way.
    tempo = np.ravel(tempo)[0]
    mfcc = librosa.feature.mfcc(y=window, sr=sr)

    # Interleave per-coefficient statistics: mean_1, var_1, mean_2, var_2, ...
    mfcc_values = [
        stat
        for pair in zip(mfcc.mean(axis=1), np.var(mfcc, axis=1))
        for stat in pair
    ]

    return [
        chromogram.mean(),
        np.var(chromogram),
        rms.mean(),
        np.var(rms),
        centroid.mean(),
        np.var(centroid),
        bandwidth.mean(),
        np.var(bandwidth),
        roll_off.mean(),
        np.var(roll_off),
        zero_crossing_rate.mean(),
        np.var(zero_crossing_rate),
        harmony.mean(),
        np.var(harmony),
        perceptual.mean(),
        np.var(perceptual),
        tempo,
        *mfcc_values
    ]


In [7]:
# Name of the S3 bucket that is listed and read from below.
bucket = 'musicgenredatalake'
# S3 client; no credentials or region are passed explicitly here.
client = boto3.client('s3')
# Paginator over list_objects_v2, so listings can be consumed page by page.
paginator = client.get_paginator('list_objects_v2')

In [8]:

# One row per extracted window: [filename, length, *feature values, genre].
features = []
for genre in genres:
    page_iterator = paginator.paginate(Bucket=bucket, Prefix=f'raw/songs/{genre}/')
    for files in page_iterator:
        # A listing page carries no 'Contents' key when nothing matches the
        # prefix, so fall back to an empty list instead of raising KeyError.
        for file_content in files.get('Contents', []):
            key = file_content['Key']
            obj = client.get_object(Bucket=bucket, Key=key)
            data = io.BytesIO(obj['Body'].read())
            try:
                signal, sample_rate = librosa.load(data)

                # Get rid of silence at the beginning and end
                signal, _trimmed_interval = librosa.effects.trim(signal)
                n_points = window_size * sample_rate
                # Only whole windows are produced; a trailing remainder
                # shorter than n_points is dropped.
                for i in range(len(signal) // n_points):
                    print(genre, key, i, end='\r')
                    window = signal[i * n_points:(i + 1) * n_points]
                    values = extract_features(window)
                    features.append(
                        [key.replace('.wav', f'.{i}.wav'), len(window), *values, genre]
                    )
            except Exception as ex:
                # Best effort: report the file that could not be decoded or
                # processed and carry on with the next one.
                print(ex)
                print(key)


    

Error opening <_io.BytesIO object at 0x7f19f987f990>: Format not recognised.
raw/songs/jazz/jazz.00054.wav
3000 secondsgs/rock/rock.00099.wav 9.wav 9


In [9]:
# Attach the column labels at construction time. Unlike assigning `.columns`
# afterwards, this also yields an empty but correctly labelled frame when no
# windows were extracted, instead of failing with a length-mismatch error.
df = pd.DataFrame(features, columns=columns)

In [10]:
# Display the assembled feature table (one row per extracted window).
df

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,raw/songs/blues/blues.00000.0.wav,66150,0.335434,0.091088,0.130405,0.003521,1773.285877,168244.728456,1972.723622,117298.851602,...,39.725567,-3.241225,36.486431,0.721986,38.096764,-5.043307,33.608330,-0.237658,43.827778,blues
1,raw/songs/blues/blues.00000.1.wav,66150,0.343020,0.086142,0.112699,0.001450,1816.195860,90703.325338,2009.201574,65548.531299,...,65.312981,-6.100084,40.738815,0.219240,50.587795,-2.875456,96.975441,5.800725,60.006115,blues
2,raw/songs/blues/blues.00000.2.wav,66150,0.346838,0.092210,0.132002,0.004620,1788.642783,111322.536900,2085.045996,74755.260774,...,67.582199,-1.801321,28.132212,2.308316,48.102489,-1.931482,53.116806,2.522431,33.136238,blues
3,raw/songs/blues/blues.00000.3.wav,66150,0.363671,0.086856,0.132562,0.002447,1654.902168,112316.264418,1959.202709,83672.222471,...,47.059677,-3.859234,27.984928,1.255384,35.093281,-3.619866,51.420628,3.639336,31.930040,blues
4,raw/songs/blues/blues.00000.4.wav,66150,0.335927,0.088291,0.143289,0.001701,1630.737016,79648.228139,1948.459295,60221.594973,...,30.307617,0.629568,44.883640,1.709780,51.706696,-3.402105,26.686306,0.546948,29.212286,blues
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9975,raw/songs/rock/rock.00099.5.wav,66150,0.349242,0.080523,0.050019,0.000097,1499.033450,164296.529369,1718.741487,85910.800675,...,42.411774,-9.072774,38.522137,-4.239303,31.064833,-5.625191,48.812855,1.794132,38.993576,rock
9976,raw/songs/rock/rock.00099.6.wav,66150,0.372667,0.082638,0.057897,0.000088,1847.993568,281007.291156,1906.381942,99815.431761,...,32.480297,-12.389524,65.873482,-3.085007,54.283779,-11.963290,63.390671,0.404298,18.759731,rock
9977,raw/songs/rock/rock.00099.7.wav,66150,0.347207,0.088840,0.052402,0.000701,1346.166433,662977.658874,1562.051939,139002.058162,...,78.050781,-2.524270,21.777952,4.799636,25.962271,1.797535,48.307678,-0.320112,41.750011,rock
9978,raw/songs/rock/rock.00099.8.wav,66150,0.387354,0.084762,0.066430,0.000320,2084.876439,203148.324687,2019.066228,22200.073698,...,28.323929,-5.370701,17.280422,6.469846,21.370140,2.359090,24.827066,0.688447,12.747363,rock


In [14]:
def mahalanobis(x=None, data=None, cov=None):
    '''Squared Mahalanobis distance of every row of ``x`` from the centre of ``data``.

    Args:
        x: Observations to score, shape (n_samples, n_features). A single
            1-D observation is treated as one row.
        data: Reference sample whose per-feature mean defines the centre,
            shape (m_samples, n_features). Defaults to ``x``.
        cov: Covariance matrix, shape (n_features, n_features). When omitted
            it is estimated from ``data``.

    Returns:
        1-D array of length n_samples holding ``(x - mu) @ inv(cov) @ (x - mu).T``
        for each row, i.e. the squared distance (no square root is taken).

    Raises:
        numpy.linalg.LinAlgError: if ``cov`` is singular.
    '''
    x = np.atleast_2d(np.asarray(x, dtype=float))
    data = x if data is None else np.atleast_2d(np.asarray(data, dtype=float))
    if cov is None:
        cov = np.cov(data, rowvar=False)

    # Centre on the per-feature mean. A bare np.mean(data) would collapse the
    # whole matrix to one scalar and shift every feature by the same amount.
    x_mu = x - data.mean(axis=0)
    inv_cov = np.linalg.inv(np.atleast_2d(cov))

    # Row-wise quadratic form; avoids building the full n x n product only
    # to read its diagonal.
    mahalanobis_distances = np.sum((x_mu @ inv_cov) * x_mu, axis=1)

    return mahalanobis_distances

In [16]:
from sklearn.preprocessing import MinMaxScaler

df['label'] = df['label'].astype('category')
df['length'] = pd.to_numeric(df['length'])

# Feature matrix (identifier, window length and label removed) and the
# integer category codes of the labels.
df_x = df.drop(['filename', 'label', 'length'], axis=1)
x = np.array(df_x)
y = df['label'].cat.codes.values

# Rescale every feature with a min-max scaler.
scaler = MinMaxScaler()
x_norm = scaler.fit_transform(x)
# NOTE(review): with the column order used when df was built (filename,
# length, features..., label) this slice selects the same columns as df_x,
# so the refit should reproduce x_norm -- confirm before simplifying.
data = df.iloc[:, 2:-1]
data_norm = scaler.fit_transform(data)

# Covariance of the scaled features plus 10 on the diagonal. The identity is
# sized from the data rather than a hard-coded 57, so the cell keeps working
# if the set of feature columns changes.
cov_h_norm = np.cov(x_norm.T)
cov_i_norm = cov_h_norm + 10 * np.eye(x_norm.shape[1])
mahalanobis_dis_i_norm = mahalanobis(x=x_norm, data=data_norm, cov=cov_i_norm)

# Upper fence for outliers: third quartile plus three interquartile ranges.
p25, p75 = np.percentile(mahalanobis_dis_i_norm, [25, 75])
threshold = p75 + 3 * (p75 - p25)

# Keep only the rows below the fence; the mask is computed once and shared.
inlier_mask = mahalanobis_dis_i_norm < threshold
x_fn = x_norm[inlier_mask]
y_fn = y[inlier_mask]
cols = df_x.columns
df_X_fn = pd.DataFrame(x_fn)

In [25]:
# Names for the exported table: a mean/variance pair per descriptor, the
# tempo, a mean/variance pair for each of the 20 MFCC coefficients, and the
# label as the last column.
cols = (
    [f'{descriptor}_{stat}'
     for descriptor in ('chroma_stft', 'rms', 'spectral_centroid',
                        'spectral_bandwidth', 'rolloff', 'zero_crossing_rate',
                        'harmony', 'perceptr')
     for stat in ('mean', 'var')]
    + ['tempo']
    + [f'mfcc{index}_{stat}'
       for index in range(1, 21)
       for stat in ('mean', 'var')]
    + ['label']
)

In [26]:
# Build the export frame from a copy of the filtered features so this cell
# can be re-run safely: df_X_fn itself is left untouched, whereas appending a
# '57' column and renaming in place fails on a second execution.
feature_names, label_name = list(cols[:-1]), cols[-1]
df_fn = df_X_fn.iloc[:, :len(feature_names)].copy()
df_fn.columns = feature_names
# The label column holds the integer codes in y_fn, not the genre names.
df_fn[label_name] = y_fn

In [28]:
# Write the filtered feature table to the bucket under trusted/dataset/.
# The window length in the file name is taken from window_size so the name
# cannot drift from the data it describes (features_3_sec.csv for window_size = 3).
df_fn.to_csv(f's3://musicgenredatalake/trusted/dataset/features_{window_size}_sec.csv', index=False)
