In [1]:
import glob
import time
import IPython.display as ipd
import librosa
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import librosa.display
import sklearn

# Feature extraction

### Load data

In [2]:
data=pd.read_csv("data/FMA/Features/small_tracks2.csv")
data.sample(frac=0.001)

Unnamed: 0,filename,genre,genre id
2564,data/FMA/Wav/099096.wav,Hip-Hop,3
468,data/FMA/Wav/145457.wav,Instrumental,4
965,data/FMA/Wav/071248.wav,Hip-Hop,3
1701,data/FMA/Wav/067357.wav,Pop,6
2883,data/FMA/Wav/098569.wav,Hip-Hop,3
3840,data/FMA/Wav/004779.wav,Rock,7
3576,data/FMA/Wav/140788.wav,Folk,2
1899,data/FMA/Wav/149099.wav,Pop,6


In [11]:
data.shape

(7997, 3)

In [3]:
def dataframe_representation():
    dataframe={"filename":[],"genre":[],"genre id":[]}
    for folder in glob.glob("data/GTZAN/*"):
        for file in glob.glob(folder+"/*"):
            dataframe["genre"].append(folder.split("/")[-1])
            dataframe["genre id"].append(genres[folder.split('/')[-1]])
            dataframe["filename"].append(file)
            
    dataframe=pd.DataFrame(dataframe)
    return dataframe
genre_names=["blues","classical","country","disco","pop","metal","jazz","rock","reggae","hiphop"]
genres={name:value for name,value in zip(genre_names,[i for i in range(len(genre_names))])}
data=dataframe_representation()
data.head()

Unnamed: 0,filename,genre,genre id
0,data/GTZAN/disco/disco.00075.wav,disco,3
1,data/GTZAN/disco/disco.00092.wav,disco,3
2,data/GTZAN/disco/disco.00012.wav,disco,3
3,data/GTZAN/disco/disco.00021.wav,disco,3
4,data/GTZAN/disco/disco.00014.wav,disco,3


### Low-level features
Low-level features pertain to content-based features. The chosen content-based features along with their representations are listed below.
- Spectral centroid (mean)
- Spectral rolloff (std)
- Chroma (mean)
- Zero crossing rate (mean)
- MFCCs (20 each represented by mean)

In [12]:
def low_level_features():
    
    n=len(data)
    
    features_dataframe={"spectral_centroid":np.zeros(n),"spectral_rolloff":np.zeros(n),"chroma":np.zeros(n),
                        "zero_crossing_rate":np.zeros(n),"rms_mean":np.zeros(n),
                       "rms_std":np.zeros(n)}
    
    for index,row in data.iterrows(): 
        x,sr=librosa.load(row["filename"])
        
        #extract spectral centroid
        spectral_centroid=librosa.feature.spectral_centroid(x, sr=sr)
        features_dataframe["spectral_centroid"][index]=np.std(spectral_centroid)
        
        #extract spectral rolloff
        spectral_rolloff = librosa.feature.spectral_rolloff(x, sr=sr)
        features_dataframe["spectral_rolloff"][index]=np.std(spectral_rolloff)
        
        
        #extract chroma
        chroma=librosa.feature.chroma_stft(x, sr=sr)
        features_dataframe["chroma"][index]=np.mean(chroma)
        
        #extract zero crossing rate
        zcr=librosa.feature.zero_crossing_rate(x)
        features_dataframe["zero_crossing_rate"][index]=(np.mean(zcr))
        
        #extract root mean square energy
        rms=librosa.feature.rmse(x)
        features_dataframe["rms_mean"][index]=np.mean(rms)
        features_dataframe["rms_std"][index]=np.std(rms)
        
#         #extract tempo
#         onset_env = librosa.onset.onset_strength(x, sr=sr)
#         tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)
#         features_dataframe["tempo"][index]=tempo[0]
        
        #extract mfccs (n mfccs)
        mfccs=librosa.feature.mfcc(x, sr=sr)
        mfccs=np.mean(mfccs,axis=1)
        
        for i in range(len(mfccs)):
            key="mfcc"+str(i+1)
            if key in features_dataframe.keys():
                features_dataframe[key][index]=(mfccs[i])
            else:
                features_dataframe[key]=np.zeros(n)
                features_dataframe[key][index]=(mfccs[i])
                
        #extract spectral contrast
        spectral_contrast = librosa.feature.spectral_contrast(x, sr=sr)
        spectral_contrast=np.mean(spectral_contrast,axis=1)
        
        for i in range(len(spectral_contrast)):
            key="sc"+str(i+1)
            if key in features_dataframe.keys():
                features_dataframe[key][index]=(spectral_contrast[i])
            else:
                features_dataframe[key]=np.zeros(n)
                features_dataframe[key][index]=(spectral_contrast[i])
                
    features_dataframe["label"]=data["genre id"].values
    features_dataframe=pd.DataFrame(features_dataframe)
    sklearn.utils.shuffle(features_dataframe)
    
    return features_dataframe

In [91]:
song,sr=librosa.load("data/GTZAN/disco/disco.00075.wav")
spectral_contrast = librosa.feature.spectral_contrast(song, sr=sr)
spectral_contrast=np.mean(spectral_contrast,axis=1)
print(spectral_contrast)
spectral_contrast.shape

[ 23.5310785   13.95174217  17.26574072  17.02327043  17.31074785
  16.38186677  42.83155136]


(7,)

In [None]:
start=time.time()
x=low_level_features()
x.to_csv("long.csv",index=False)
end=time.time()
print("Feature extraction duration :",end-start)
x.head()



In [327]:
X=x.loc[:,(x.columns!="label") & (x.columns!="tempo")].values
Y=x.loc[:,(x.columns=="label") & (x.columns!="tempo")].values

In [328]:
X=(X-np.mean(X,axis=0))/(np.std(X,axis=0)+1e-8)

## Splitting

In [329]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X, Y, test_size=0.1, random_state=42)

## kNN classification

In [330]:
from sklearn.neighbors import KNeighborsClassifier

In [332]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, Y_train.ravel())
y_tr=neigh.predict(X_train)
y_p=(neigh.predict(X_test))
print("Train",np.count_nonzero(y_tr==Y_train.flatten())/len(y_tr))
print("Test",np.count_nonzero(y_p==Y_test.flatten())/len(y_p))

Train 0.8411111111111111
Test 0.7


# Naive bayes

In [103]:
from sklearn.naive_bayes import GaussianNB

In [334]:
gnb = GaussianNB()
y_tr = gnb.fit(X_train, Y_train.ravel()).predict(X_train)
y_pred = gnb.fit(X_train, Y_train.ravel()).predict(X_test)
print("Train",np.count_nonzero(y_tr==Y_train.flatten())/len(y_tr))
print("Test",np.count_nonzero(y_pred==Y_test.flatten())/len(y_pred))

Train 0.5477777777777778
Test 0.52


# Logistic regression OVA

In [335]:
from sklearn.linear_model import LogisticRegression

In [337]:
clf = LogisticRegression(random_state=0,solver="sag",max_iter=500).fit(X_train, Y_train.ravel())
print("Train",clf.score(X_train, Y_train))
print("Test",clf.score(X_test,Y_test.ravel()))

Train 0.754444444444
Test 0.72


# MLP

In [338]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(activation="identity",max_iter=1500,solver="adam",hidden_layer_sizes=(16)).fit(X_train,Y_train.ravel(),)
a=clf.score(X_train, Y_train.ravel())
b=clf.score(X_test, Y_test.ravel())
print(a)
print(b)

0.784444444444
0.7


# SVM

In [339]:
 from sklearn import svm

In [345]:
lin_clf = svm.SVC()
lin_clf.fit(X_train, Y_train.ravel())
y_tr = lin_clf.predict(X_train)
y_pred = lin_clf.predict(X_test)
print("Train",np.count_nonzero(y_tr==Y_train.flatten())/len(y_tr))
print("Test",np.count_nonzero(y_pred==Y_test.flatten())/len(y_pred))

Train 0.8822222222222222
Test 0.73


### High-level features

A high-level feature would be a spectrogram obtained from the short time fourier transform of a given audio file. We may choose to compute the mel-spectrogram instead of the spectrogram. A mel-spectrogram is obtained by scaling a spectrogram to the mel-scale (pitch measure) this means that equally distances in pitch sound equally distant to the human ear. A visualization of the spectrogram and mel-spectrogram are shown below.

In [None]:
audio_path='data/GTZAN/blues/blues.00002.wav'
x,sr=librosa.load(audio_path)
X = librosa.stft(x)
Xdb = librosa.amplitude_to_db(abs(X))

chroma=librosa.feature.chroma_stft(x, sr=sr)
print(chroma.shape)

plt.figure()
plt.title("Spectrogram")
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')

plt.figure()
plt.title("Mel-spectrogram")
y, sr = librosa.load(audio_path)
mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=1024)
mel_spect = librosa.power_to_db(X, ref=np.max)
librosa.display.specshow(mel_spect, y_axis='mel', fmax=8000, x_axis='time')

plt.show()

In [None]:
def high_level_features(mode):
    features_dataframe={}
    if mode=="spectrogram":
        audio_paths=data["path"].values
        for audio in audio_paths:
            x,sr=librosa.load(audio)
            X = librosa.stft(x)
            
            Xdb = librosa.amplitude_to_db(abs(X))
            X_resized=image_resized = resize(Xdb,(512,512),anti_aliasing=True)
            break
        plt.figure()
        plt.imshow(Xdb)
        #librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
        plt.figure()
        #librosa.display.specshow(X_resized, sr=sr, x_axis='time', y_axis='hz')
        plt.imshow(X_resized)
    pass

high_level_features("spectrogram")

In [None]:
ipd.Audio(audio_path)