Extraction of soundnet feature vectors for ESC-50

The ESC-50 dataset can be found [here](https://github.com/karoldvl/ESC-50). You need to clone the repo. 

In [1]:
#--mesure time
import time 
#soundnet in pytorch
import extract_features as ex
import pandas as pd 
import librosa
from tqdm import tqdm
import numpy as np

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh')

Extracting all feature vectors for ESC-10

In [3]:
import librosa

def extract_vectors_resampl(filepath):
    audio,sr = librosa.load(path=filepath,sr=22050)
    features = ex.extract_pytorch_feature_nooutput(audio,'sound8.pth')
   
    #print([x.shape for x in features])
    
    ##extract vector
    vectors = []
    for idlayer in range(len(features)):
        vectors.append(ex.extract_vector(features,idlayer)) #features vector 
    return vectors, len(audio)/sr

In [4]:
vec,siz= extract_vectors_resampl('/home/nfarrugi/git/ESC-50/audio/4-195497-A-11.wav')

In [5]:
len(vec)

7

Fetching the file list for ESC10

In [6]:
import pandas as pd

Df = pd.read_csv('/home/nfarrugi/git/ESC-50/meta/esc50.csv')

This cell will extract all feature vectors of SoundNet from conv1 to conv7 on the ESC-10 dataset (subset of ESC-50). 
No data augmentation is performed. 

In [7]:
import os 
from tqdm import tqdm
pathesc = '/home/nfarrugi/git/ESC-50/audio/'
listfiles = os.listdir(pathesc)

all_fv_esc10 = dict()

#for curfile in tqdm(listfiles[:10]):

for curfile in tqdm(Df[Df['esc10']]['filename']):
    
    curid = curfile[:-4]
        
    vec,siz= extract_vectors_resampl(os.path.join(pathesc,curfile))
    
    all_fv_esc10[curid] = vec
    

100%|██████████| 400/400 [02:00<00:00,  3.36it/s]


NB : problem with saving and loading, not sure why, maybe because all_fv_esc10 is a dictionnary. TO BE MODIFIED

In [10]:
#np.savez_compressed('/home/nfarrugi/esc10-allfv.npz',all_fv_esc10 = all_fv_esc10)

In [6]:
#all_fv_esc10 = np.load('/home/nfarrugi/esc10-allfv.npz')['all_fv_esc10']

Let's build the X and y for classification. We will use the predefined folds

In [8]:
Df_esc10 = Df[Df['esc10']]

y = Df_esc10['category']

group = Df_esc10['fold']

As mentionned in the original soundnet paper, feature vectors are averaged across time windows. Here with 5 second long files it seems we get four feature vectors, so we average across the four.

In [9]:
layer = 6 # 6 for conv7 layer

X = []

for curstim in Df_esc10['filename']:
    curid = curstim[:-4]
    curfv = all_fv_esc10[curid][layer].mean(axis=0)
    #curfv = all_fv_esc10[curid][layer]
    X.append(curfv)
X = np.stack(X)

In [11]:
X= X.reshape(400,-1)

In [12]:
X.shape

(400, 1024)

In [13]:
from sklearn.model_selection import LeaveOneGroupOut,cross_validate

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC,LinearSVC


from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

est = make_pipeline(StandardScaler(),MLPClassifier(max_iter=5000))

scores = cross_validate(estimator=est,X=X,y=y,groups=group,cv = LeaveOneGroupOut(),return_train_score=False)

print(scores['test_score'])

[0.7375 0.65   0.6625 0.7875 0.725 ]


Can adding an encoding model that estimates fMRI from conv5 help training on ESC-10 ? 

In [16]:
from joblib import dump, load

estim = load('fmri_video/code/encoding_conv7.joblib')

Estimate brain activity using ESC-10 feature vectors

In [29]:
brain_predictions = estim.predict(X)

In [30]:
brain_predictions.shape

(400, 44690)

In [14]:
from sklearn.feature_selection import SelectKBest

X_brain = np.hstack([X,brain_predictions])

est = make_pipeline(StandardScaler(),MLPClassifier(max_iter=1000,verbose=0))

scores = cross_validate(estimator=est,X=X_brain,y=y,groups=group,cv = LeaveOneGroupOut(),return_train_score=False)

print(scores['test_score'])


NameError: name 'brain_predictions' is not defined

Same tests on ESC-50

The following cell will extract all features vectors of SoundNet from conv1 to conv7 on the whole ESC-50, without data augmentation

In [128]:
pathesc = '/home/nfarrugi/git/ESC-50/audio/'
listfiles = os.listdir(pathesc)

all_fv_esc50 = dict()

for curfile in (tqdm(listfiles)):
    
    curid = curfile[:-4]
        
    vec,siz= extract_vectors_resampl(os.path.join(pathesc,curfile))
    
    all_fv_esc50[curid] = vec

  6%|▌         | 112/2000 [00:32<09:21,  3.37it/s]

KeyboardInterrupt: 

there seems to be a problem when saving and reloading, maybe because it's a dictionnary.. To BE SOLVED

In [11]:
#np.savez_compressed('/home/nfarrugi/esc50-allfv.npz',all_fv_esc50 = all_fv_esc50)

In [129]:
#all_fv_esc50 = np.load('/home/nfarrugi/esc50-allfv.npz')['all_fv_esc50']

As mentionned in the original soundnet paper, feature vectors are averaged across time windows. Here with 5 second long files it seems we get four feature vectors, so we average across the four.

In [130]:
y = Df['category']

group = Df['fold']

layer = 4 # 6 for conv7 layer

X = []

for curstim in Df['filename']:
    curid = curstim[:-4]
    curfv = all_fv_esc50[curid][layer].mean(axis=0)    
    X.append(curfv)
X = np.stack(X)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [119]:

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


est = make_pipeline(StandardScaler(),LinearSVC(max_iter=5000,C=0.01))


scores = cross_validate(estimator=est,X=X,y=y,groups=group,cv = LeaveOneGroupOut(),return_train_score=False)

print(scores['test_score'])

[0.4225 0.42   0.455  0.5    0.475 ]
