In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
from tqdm import tqdm

In [2]:
path = './summer_2022_project/'
audio_path = path + 'audio/'

df = pd.read_csv(path+'development.csv', dtype={'filename':'string', 'emotion':'string'}) # set string as type
eval_df = pd.read_csv(path+'evaluation.csv', dtype={'filename':'string'}) # set string as type



In [3]:
eval_df.head()

Unnamed: 0,filename
0,9597.wav
1,9598.wav
2,9599.wav
3,9600.wav
4,9601.wav


In [4]:
df.head()

Unnamed: 0,emotion,filename
0,Disgusted,0.wav
1,Fearful,1.wav
2,Neutral,2.wav
3,Happy,3.wav
4,Angry,4.wav


In [5]:
labels = df.emotion.unique()
labels_dict = {v:k for k,v in enumerate(labels)} #needed to transform categorical labels

df['encoded_emotion'] = df['emotion'].map(labels_dict)
df.head()

Unnamed: 0,emotion,filename,encoded_emotion
0,Disgusted,0.wav,0
1,Fearful,1.wav,1
2,Neutral,2.wav,2
3,Happy,3.wav,3
4,Angry,4.wav,4


In [6]:
from scipy.io.wavfile import read
from scipy import signal

d = dict()
for filename in df.filename:
    sample_rate, samples = read(audio_path+f'/{filename}')
    frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)
    d[(int)(filename.split('.')[0])] = {'samples':samples, 'sample_rate':sample_rate, 'time_length':len(samples)/sample_rate, 'frequencies':frequencies, 'spectrogram':spectrogram}  

temp_df = pd.DataFrame.from_dict(d).T    
temp_df['emotion'] = df.emotion
temp_df['encoded_emotion'] = df.encoded_emotion
temp_df['max_sample'] = temp_df['samples'].map(lambda x: max(x))
temp_df = temp_df.astype({'time_length':float})
temp_df.head()

Unnamed: 0,samples,sample_rate,time_length,frequencies,spectrogram,emotion,encoded_emotion,max_sample
0,"[-46, -103, -88, -91, -70, -55, -39, -15, 20, ...",8000,2.169,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[3.3801324, 12.052663, 0.16704863, 0.397491, ...",Disgusted,0,6632
1,"[117, 301, 296, 302, 275, 264, 244, 213, 161, ...",8000,2.6695,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.47520238, 13.986137, 2.8514621, 17.818419,...",Fearful,1,2606
2,"[-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -...",8000,3.837375,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.0001733794, 3.5454577e-05, 9.601911e-06, 7...",Neutral,2,3447
3,"[-1, -1, -1, -1, 0, -1, -1, -1, -1, -1, 0, 0, ...",8000,3.403625,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.0003778362, 0.00011231503, 6.29442e-05, 0....",Happy,3,3375
4,"[77, 208, 212, 207, 193, 212, 213, 205, 210, 1...",8000,2.86975,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[4.822137, 6.9771757, 0.14937395, 1.0288435, ...",Angry,4,32767


In [76]:
def divide_matrix(ary:np.array, num_columns:int, num_rows:int):    
    
    if (num_columns > ary.shape[1] or num_rows > ary.shape[0]):
        return 'Error, the requested number of columns or rows exceed dimensions!'
    
    first = np.array_split(ary, num_rows, axis=0) # num rows
    second = []
    for array in first:
        second.append(np.array_split(array, num_columns, axis=1)) # num columns
    flattened_list = flatten(second)
    return flattened_list

def flatten(l:list):
    flattened_list = []
    for item in l:
        for i in item:
            flattened_list.append(i)
    return flattened_list
    
def compute_feature(list_of_arrays:list, prefix=None) -> dict:
    
    d = dict()
    
    if prefix == None:
      for id in range(len(list_of_arrays)):
          d[f'{id}_mean_feature'] = np.mean(list_of_arrays[id])
          d[f'{id}_std_feature'] = np.std(list_of_arrays[id])
    else:
      for id in range(len(list_of_arrays)):
          d[f'{id}_{prefix}_mean_feature'] = np.mean(list_of_arrays[id])
          d[f'{id}_{prefix}_std_feature'] = np.std(list_of_arrays[id])

    return d

def create_dict_dataset(path:str, is_eval:bool=False) -> dict:
    """
    function used to read the dataset. It creates a dictionary in
    the following form:
    id: {label, frequencies, times, spectrogram}.
    If is_eval == True, no label.

    Args:
        path (str): path used to read the dataset
        is_eval (bool, optional): boolean flag to set to True
        if the dataset you want to read is the evaluation one.
        Defaults to False.

    Returns:
        dict: dataset in dictionary form
    """
    all_samples = os.listdir(path)
    d = dict()
    for sample in all_samples:
        if is_eval:
            id = int(sample.split('.')[0])
        else:
            id = int(sample.split('_')[0])
        sample_rate, samples = read(path + sample)
        frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)
        if is_eval:
            d_temp = {'frequencies': frequencies,
                      'times': times,
                      'spectrogram': spectrogram} 
        else:
            d_temp = {'label': int(sample.split('_')[1][:-4]), # till -4 since it has the .wav extension
                    'frequencies': frequencies,
                      'times': times,
                      'spectrogram': spectrogram} 
        d[id] = d_temp 
    return d

In [8]:
shrunk_df = temp_df[['time_length', 'spectrogram', 'max_sample', 'encoded_emotion']]


In [9]:
from imblearn.over_sampling import BorderlineSMOTE


dictionary = dict()
for row in tqdm(shrunk_df.index):
    z = compute_feature(divide_matrix(shrunk_df.loc[row].spectrogram, num_rows=12, num_columns=12))
    z['time_length'] = shrunk_df.loc[row]['time_length']
    z['max_sample'] = shrunk_df.loc[row]['max_sample']
    z['encoded_emotion'] = shrunk_df.loc[row].encoded_emotion
    dictionary[row] = z
    
new_df = pd.DataFrame.from_dict(dictionary).T

np.random.seed(0) #to make experiments reproducible


smote = BorderlineSMOTE(random_state=42)

X, y = new_df[new_df.columns[:-1]], new_df[new_df.columns[-1]]
X_res, y_res = smote.fit_resample(X,y)
y_res = y_res.astype(int)

100%|██████████| 9597/9597 [01:11<00:00, 133.88it/s]


In [10]:
y_res

0        0
1        1
2        2
3        3
4        4
        ..
11370    6
11371    6
11372    6
11373    6
11374    6
Name: encoded_emotion, Length: 11375, dtype: int64

In [11]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_features = 'log2', n_estimators=300).fit(X_res, y_res)


# eval

In [12]:
from scipy.io.wavfile import read
from scipy import signal

d = dict()
for filename in eval_df.filename:
    sample_rate, samples = read(audio_path+f'/{filename}')
    frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)
    d[(int)(filename.split('.')[0])] = {'samples':samples, 'sample_rate':sample_rate, 'time_length':len(samples)/sample_rate, 'frequencies':frequencies, 'spectrogram':spectrogram}  

new_eval_df = pd.DataFrame.from_dict(d).T    
new_eval_df['max_sample'] = new_eval_df['samples'].map(lambda x: max(x))
new_eval_df = new_eval_df.astype({'time_length':float})

Unnamed: 0,samples,sample_rate,time_length,frequencies,spectrogram,max_sample
9597,"[6, 20, 17, 18, 31, 39, 30, 19, 36, 53, 45, 34...",8000,2.2825,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.024010094, 0.4403729, 0.7793636, 1.5887632...",3849
9598,"[-106, -232, -300, -415, -417, -342, -314, -43...",8000,2.936375,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[13.073401, 2.9347198, 3.5555453, 2.4982939, ...",9525
9599,"[-2, -4, -3, -4, -2, -2, -1, -1, -1, 0, 1, 1, ...",8000,2.675875,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.5504999, 1.1110619, 0.5763831, 0.004715546...",2217
9600,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8000,3.637125,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.79...",6896
9601,"[68, 175, 156, 144, 142, 138, 137, 145, 160, 1...",8000,2.3025,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.21942894, 18.0716, 2.2244778, 0.19599216, ...",2090


In [30]:
eval_shrunk_df = new_eval_df[['time_length', 'spectrogram', 'max_sample']]


In [31]:
dictionary = dict()
for row in tqdm(eval_shrunk_df.index):
    z = compute_feature(divide_matrix(eval_shrunk_df.loc[row].spectrogram, num_rows=12, num_columns=12))
    z['time_length'] = eval_shrunk_df.loc[row]['time_length']
    z['max_sample'] = eval_shrunk_df.loc[row]['max_sample']
    dictionary[row] = z
    
eval_features_df = pd.DataFrame.from_dict(dictionary).T


100%|██████████| 3201/3201 [00:23<00:00, 136.30it/s]


In [50]:
preds = rf.predict(eval_features_df)

In [56]:
preds

array([5, 3, 5, ..., 2, 2, 4])

In [33]:
{v:k for k,v in labels_dict.items()}

{0: 'Disgusted',
 1: 'Fearful',
 2: 'Neutral',
 3: 'Happy',
 4: 'Angry',
 5: 'Sad',
 6: 'Suprised'}

In [59]:
lista_finale = []
preds_ser = pd.Series(preds)
for id, label in zip(eval_shrunk_df.index, preds_ser.map({v:k for k,v in labels_dict.items()})):
    lista_finale.append((f'{id}.wav', label))

In [62]:
df_finale = pd.DataFrame(lista_finale, columns=['Id', 'Predicted'])
df_finale

Unnamed: 0,Id,Predicted
0,9597.wav,Sad
1,9598.wav,Happy
2,9599.wav,Sad
3,9600.wav,Suprised
4,9601.wav,Sad
...,...,...
3196,12793.wav,Suprised
3197,12794.wav,Happy
3198,12795.wav,Neutral
3199,12796.wav,Neutral


In [67]:
df_finale.to_csv(path_or_buf=path+'/results.csv', header=True, index=False)

In [42]:
preds = pd.DataFrame(preds, index=eval_shrunk_df.index)

#preds.map({v:k for k,v in labels_dict.items()})


0            Sad
1          Happy
2            Sad
3       Suprised
4            Sad
          ...   
3196    Suprised
3197       Happy
3198     Neutral
3199     Neutral
3200       Angry
Length: 3201, dtype: object

# risultato 0.619

# prova con train test e split


In [None]:
'''from imblearn.over_sampling import BorderlineSMOTE
from sklearn.model_selection import train_test_split


dictionary = dict()
for row in tqdm(shrunk_df.index):
    # z = compute_feature(divide_matrix(shrunk_df.loc[row].spectrogram, num_rows=n, num_columns=n*ratio)) # with ratio
    z = compute_feature(divide_matrix(shrunk_df.loc[row].spectrogram, num_rows=12, num_columns=12))
    z['time_length'] = shrunk_df.loc[row]['time_length']
    z['max_sample'] = shrunk_df.loc[row]['max_sample']
    z['encoded_emotion'] = shrunk_df.loc[row].encoded_emotion
    dictionary[row] = z
    
new_df = pd.DataFrame.from_dict(dictionary).T

np.random.seed(0) #to make experiments reproducible


smote = BorderlineSMOTE(random_state=42)

X, y = new_df[new_df.columns[:-1]], new_df[new_df.columns[-1]]
X_res, y_res = smote.fit_resample(X,y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=.2, random_state=42)

y_train = y_train.astype(int)
y_test = y_test.astype(int)



from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_features = 'log2', n_estimators=300).fit(X_train, y_train)


from sklearn.metrics import f1_score


preds_test = rf.predict(X_test)
score = f1_score(y_test, preds_test, average='macro')


preds = rf.predict(eval_features_df)

lista_finale = []
preds_ser = pd.Series(preds)
for id, label in zip(eval_shrunk_df.index, preds_ser.map({v:k for k,v in labels_dict.items()})):
    lista_finale.append((f'{id}.wav', label))
    

df_finale = pd.DataFrame(lista_finale, columns=['Id', 'Predicted'])
df_finale.to_csv(path_or_buf=path+'/train_test_split_results.csv', header=True, index=False)'''

100%|██████████| 9597/9597 [01:14<00:00, 128.07it/s]


con train test e split risulta avere performance inferiori. Trasformo tutto in testo

# prova con mfccs features concatenate e divisi in blocchi

In [79]:
import librosa

d = dict()
for filename in tqdm(df.filename):
    d1 = dict()
    y, sr = librosa.load(audio_path+filename, sr=None)
    d1['duration'] = y.shape[0]/sr
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    delta_mfcc = librosa.feature.delta(mfccs, order=1)
    delta2_mfcc = librosa.feature.delta(mfccs, order=2)
    d1['mfccs'] = np.concatenate([mfccs, delta_mfcc, delta2_mfcc])
    
    d1['zcr'] = librosa.feature.zero_crossing_rate(y)    
    d[(int)(filename.split('.')[0])] = d1

temp_df = pd.DataFrame.from_dict(d).T    

temp_df['encoded_emotion'] = df.encoded_emotion
#temp_df['max_sample'] = temp_df['samples'].map(lambda x: max(x))
temp_df = temp_df.astype({'duration':float})
temp_df.head()

100%|██████████| 9597/9597 [01:38<00:00, 97.80it/s] 


Unnamed: 0,duration,mfccs,zcr,encoded_emotion
0,2.169,"[[-493.01044, -460.0894, -458.284, -457.54904,...","[[0.02392578125, 0.0380859375, 0.05126953125, ...",0
1,2.6695,"[[-463.8445, -448.02594, -447.8502, -450.42685...","[[0.02734375, 0.041015625, 0.05126953125, 0.05...",1
2,3.837375,"[[-766.99445, -767.2107, -767.1353, -765.2035,...","[[0.0107421875, 0.0234375, 0.029296875, 0.0263...",2
3,3.403625,"[[-798.0724, -794.187, -791.83185, -790.31226,...","[[0.0146484375, 0.02978515625, 0.0537109375, 0...",3
4,2.86975,"[[-435.62903, -414.33167, -411.86377, -410.227...","[[0.02880859375, 0.04443359375, 0.0595703125, ...",4


In [80]:
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score

from imblearn.over_sampling import SMOTE

np.random.seed(0) #to make experiments reproducible

n = 2

dictionary = dict()
features = ['mfccs']

prova = dict()
for row in temp_df.index:
    for f in features:
        prova[row] = compute_feature(flatten(divide_matrix(temp_df[f].loc[row], num_rows=n, num_columns=n)), prefix=f)
    

n_df = pd.DataFrame.from_dict(prova).T
n_df['duration'] = temp_df.duration
n_df['min_zcr'] = temp_df.zcr.map(lambda x: min(flatten(list(x))))
n_df['max_zcr'] = temp_df.zcr.map(lambda x: max(flatten(list(x))))
n_df['encoded_emotion'] = temp_df.encoded_emotion
n_df.head()

np.random.seed(0) #to make experiments reproducible


smote = SMOTE(random_state=42)

X, y = n_df[n_df.columns[:-1]], n_df[n_df.columns[-1]]
X_res, y_res = smote.fit_resample(X,y)



y_res = y_res.astype(int)


print('Fitting RF...')
rf = RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=300).fit(X_res, y_res) # plain rf

Fitting RF...


In [82]:
# eval
import librosa

d = dict()
for filename in tqdm(eval_df.filename):
    d1 = dict()
    y, sr = librosa.load(audio_path+filename, sr=None)
    d1['duration'] = y.shape[0]/sr
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    delta_mfcc = librosa.feature.delta(mfccs, order=1)
    delta2_mfcc = librosa.feature.delta(mfccs, order=2)
    d1['mfccs'] = np.concatenate([mfccs, delta_mfcc, delta2_mfcc])
    
    d1['zcr'] = librosa.feature.zero_crossing_rate(y)    
    d[(int)(filename.split('.')[0])] = d1

eval_temp_df = pd.DataFrame.from_dict(d).T    

#temp_df['encoded_emotion'] = df.encoded_emotion
#temp_df['max_sample'] = temp_df['samples'].map(lambda x: max(x))
eval_temp_df = eval_temp_df.astype({'duration':float})
eval_temp_df.head()

prova = dict()
for row in eval_temp_df.index:
    for f in features:
        prova[row] = compute_feature(flatten(divide_matrix(eval_temp_df[f].loc[row], num_rows=n, num_columns=n)), prefix=f)
    

n_df = pd.DataFrame.from_dict(prova).T
n_df['duration'] = eval_temp_df.duration
n_df['min_zcr'] = eval_temp_df.zcr.map(lambda x: min(flatten(list(x))))
n_df['max_zcr'] = eval_temp_df.zcr.map(lambda x: max(flatten(list(x))))
n_df.head()

100%|██████████| 3201/3201 [00:29<00:00, 108.00it/s]


Unnamed: 0,0_mfccs_mean_feature,0_mfccs_std_feature,1_mfccs_mean_feature,1_mfccs_std_feature,2_mfccs_mean_feature,2_mfccs_std_feature,3_mfccs_mean_feature,3_mfccs_std_feature,4_mfccs_mean_feature,4_mfccs_std_feature,...,74_mfccs_std_feature,75_mfccs_mean_feature,75_mfccs_std_feature,76_mfccs_mean_feature,76_mfccs_std_feature,77_mfccs_mean_feature,77_mfccs_std_feature,duration,min_zcr,max_zcr
9597,-320.519836,74.853836,103.438644,56.498116,4.665295,35.114296,12.317742,23.253265,-33.349213,19.099802,...,1.653221,-0.206009,1.223777,0.04824,0.674263,0.589253,0.888443,2.2825,0.034668,0.276367
9598,-308.033569,100.41404,99.541992,20.507776,17.022968,25.972168,40.653038,11.565622,-24.281771,22.499645,...,0.772197,-0.056933,0.392606,-0.105714,0.939864,-0.003015,0.571525,2.936375,0.041992,0.197266
9599,-491.930298,70.499458,76.218567,37.889694,-2.75162,18.351789,33.211555,23.14465,-29.880909,24.282358,...,1.15795,0.685957,1.248281,0.141231,1.491758,0.180289,0.892869,2.675875,0.001953,0.142578
9600,-449.358185,221.991562,24.551893,25.689325,-28.414059,39.085045,3.468338,15.615293,-29.747904,20.348951,...,1.915192,-0.040379,2.258775,0.185653,2.498885,0.048344,2.056435,3.637125,0.0,0.448242
9601,-400.826111,47.842373,143.278778,9.556885,20.32435,16.925394,32.500134,12.000834,-8.241503,8.901287,...,0.271295,0.025242,1.173827,0.032443,0.642198,0.019437,1.053577,2.3025,0.033203,0.143066


In [84]:
print('Predicting using RF...')
preds = rf.predict(n_df)

lista_finale = []
preds_ser = pd.Series(preds)
for id, label in zip(n_df.index, preds_ser.map({v:k for k,v in labels_dict.items()})):
    lista_finale.append((f'{id}.wav', label))
df_finale = pd.DataFrame(lista_finale, columns=['Id', 'Predicted'])
df_finale.to_csv(path_or_buf=path+'/mfccs_features_results.csv', header=True, index=False)

Predicting using RF...


risultato: 0.649

# prova con gli mfccs mediati sull'asse del tempo

In [1]:
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
from tqdm import tqdm

path = './summer_2022_project/'
audio_path = path + 'audio/'

df = pd.read_csv(path+'development.csv', dtype={'filename':'string', 'emotion':'string'}) # set string as type
eval_df = pd.read_csv(path+'evaluation.csv', dtype={'filename':'string'}) # set string as type

labels = df.emotion.unique()
labels_dict = {v:k for k,v in enumerate(labels)} #needed to transform categorical labels

df['encoded_emotion'] = df['emotion'].map(labels_dict)
df.head()

d = dict()
for filename in tqdm(df.filename):
    d1 = dict()
    signal, sr = librosa.load(audio_path + filename, res_type='kaiser_fast')
    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=40)
    mfccs_mean= np.mean(mfccs, axis=-1) #mean along time axis. 1 value for each coefficient, the mean one
    mfccs_std = np.std(mfccs, axis=-1) #std along time axis. 1 value for each coefficient, the std one
    # delta_mfccs_mean = np.mean(librosa.feature.delta(mfccs), axis=-1)
    # delta_mfccs_std = np.std(librosa.feature.delta(mfccs), axis=-1)
    # delta2_mfccs_mean = np.mean(librosa.feature.delta(mfccs, order=2), axis=-1)
    # delta2_mfccs_std = np.std(librosa.feature.delta(mfccs, order=2), axis=-1)
    
    
    
    d1 = {f'feature_{k}':v for k,v in enumerate(
        np.concatenate(
            [
             mfccs_mean, mfccs_std,
             # delta_mfccs_mean, delta_mfccs_std,
             # delta2_mfccs_mean, delta2_mfccs_std
             ]
            )
        )
          } # extract mfccs
    
    d1['duration'] = signal.shape[0]/sr
    d1['zcr_mean'] = np.mean(librosa.feature.zero_crossing_rate(y=signal), axis=-1)[0]
    d1['zcr_std'] = np.std(librosa.feature.zero_crossing_rate(y=signal), axis=-1)[0]
    d[(int)(filename.split('.')[0])] = d1  
    
temp_df = pd.DataFrame.from_dict(d).T    

temp_df['encoded_emotion'] = df.encoded_emotion
#temp_df['max_sample'] = temp_df['samples'].map(lambda x: max(x))
temp_df = temp_df.astype({'duration':float})
temp_df.head()

  1%|▏         | 137/9597 [00:11<13:29, 11.68it/s]


KeyboardInterrupt: 

In [86]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE

np.random.seed(0) #to make experiments reproducible


smote = SMOTE(random_state=42)

X, y = temp_df[temp_df.columns[:-1]], temp_df[temp_df.columns[-1]]
X_res, y_res = smote.fit_resample(X,y)

print('Fitting RF...')
rf = RandomForestClassifier(n_estimators=300).fit(X_res, y_res)


Fitting RF...


In [87]:
# eval

d = dict()
for filename in tqdm(eval_df.filename):
    d1 = dict()
    signal, sr = librosa.load(audio_path + filename, res_type='kaiser_fast')
    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=40)
    mfccs_mean= np.mean(mfccs, axis=-1) #mean along time axis. 1 value for each coefficient, the mean one
    mfccs_std = np.std(mfccs, axis=-1) #std along time axis. 1 value for each coefficient, the std one
    
    d1 = {f'feature_{k}':v for k,v in enumerate(
        np.concatenate(
            [
             mfccs_mean, mfccs_std,
             # delta_mfccs_mean, delta_mfccs_std,
             # delta2_mfccs_mean, delta2_mfccs_std
             ]
            )
        )
          } # extract mfccs
    
    d1['duration'] = signal.shape[0]/sr
    d1['zcr_mean'] = np.mean(librosa.feature.zero_crossing_rate(y=signal), axis=-1)[0]
    d1['zcr_std'] = np.std(librosa.feature.zero_crossing_rate(y=signal), axis=-1)[0]
    d[(int)(filename.split('.')[0])] = d1  
    
temp_df = pd.DataFrame.from_dict(d).T    

temp_df = temp_df.astype({'duration':float})
temp_df.head()

100%|██████████| 3201/3201 [02:01<00:00, 26.26it/s]


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,feature_79,duration,zcr_std
9597,-474.49057,159.717789,4.103325,14.497741,27.404995,-12.609978,-7.191964,3.460248,-13.124554,-5.937153,...,15.142671,11.507565,7.562842,8.036141,9.551684,12.981726,14.16363,12.458191,2.28254,0.031684
9598,-440.675568,172.425888,5.024957,18.420645,41.157837,-4.22086,-9.091953,5.894474,-5.287843,-1.727015,...,3.642219,3.595762,4.088684,4.057915,3.956775,4.02279,3.53197,3.921232,2.936417,0.019737
9599,-605.522278,126.822273,7.875051,12.791359,37.479416,7.83647,-10.693128,-5.920121,-7.73813,-5.083761,...,18.729128,12.578185,11.052258,12.608357,14.08679,15.128099,17.860332,18.306728,2.675918,0.016635
9600,-566.757141,102.762611,-30.938261,-9.103293,13.912674,-17.346018,-10.659954,2.589712,-9.852112,-4.575284,...,5.239999,4.753079,3.36419,3.632443,3.571674,3.976341,3.9273,3.291255,3.637143,0.049536
9601,-529.296509,225.323792,4.93076,14.476372,38.15152,-8.969285,-0.393866,12.759765,-12.909903,-3.926751,...,5.205548,6.265561,6.211026,5.268485,4.901586,3.795907,4.440202,4.161255,2.30254,0.011468


In [88]:
print('Predicting using RF...')
preds = rf.predict(temp_df)

lista_finale = []
preds_ser = pd.Series(preds)
for id, label in zip(temp_df.index, preds_ser.map({v:k for k,v in labels_dict.items()})):
    lista_finale.append((f'{id}.wav', label))
df_finale = pd.DataFrame(lista_finale, columns=['Id', 'Predicted'])
df_finale.to_csv(path_or_buf=path+'/results/mfccs_features_results.csv', header=True, index=False)

Predicting using RF...


SCORE: 0.680

USE best PCA (n=55) and best SVM(C=3) with:
- mfccs mean and std
- delta mean
- delta 2 mean
- chroma_stft mean
- duration
- zcr mean
- rms

In [21]:
import librosa
from scipy.io import wavfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
from tqdm import tqdm

from utils import *

path = './summer_2022_project/'
audio_path = path + 'audio/'

df = pd.read_csv(path+'development.csv', dtype={'filename':'string', 'emotion':'string'}) # set string as type
eval_df = pd.read_csv(path+'evaluation.csv', dtype={'filename':'string'}) # set string as type

labels = df.emotion.unique()
labels_dict = {v:k for k,v in enumerate(labels)} #needed to transform categorical labels

df['encoded_emotion'] = df['emotion'].map(labels_dict)
df.head()

d = dict()
# all the files have been sampled using the same sample rate. Hence we extract it
sr, _ = wavfile.read(audio_path + df.filename.iloc[0])
for filename in tqdm(df.filename):
    d1 = dict()
    features = []
    signal, _ = librosa.load(audio_path+filename, sr=None, res_type='kaiser_fast')
    try:
        mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=40)
        mfccs_mean = np.mean(mfccs, axis=-1) #mean along time axis. 1 value for each coefficient, the mean one
        mfccs_std = np.std(mfccs, axis=-1) #std along time axis. 1 value for each coefficient, the std one
        res = np.hstack((mfccs_mean, mfccs_std))
        delta_mfccs_mean = np.mean(librosa.feature.delta(mfccs), axis=-1)
        res = np.hstack((res, delta_mfccs_mean))
        delta2_mfccs_mean = np.mean(librosa.feature.delta(mfccs, order=2), axis=-1)
        res = np.hstack((res, delta2_mfccs_mean))
        #mel_mean = np.mean(librosa.feature.melspectrogram(signal, sr), axis=-1)
        #mel_std = np.std(librosa.feature.melspectrogram(signal, sr), axis=-1)
        stft = np.abs(librosa.stft(signal))
        chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr), axis=-1)
        res = np.hstack((res, chroma_stft))
        
        
        
        d1 = {f'feature_{k}':v for k,v in enumerate(res)} # extract mfccs
        
        d1['duration'] = signal.shape[0]/sr
        d1['zcr_mean'] = np.mean(librosa.feature.zero_crossing_rate(y=signal), axis=-1)[0]
        #d1['zcr_std'] = np.std(librosa.feature.zero_crossing_rate(y=signal), axis=-1)[0]
        d1['rms'] = np.mean(librosa.feature.rms(y=signal), axis=-1)[0]
    except:
        print(filename)
        continue
    d[(int)(filename.split('.')[0])] = d1
    
temp_df = pd.DataFrame.from_dict(d).T    

temp_df['encoded_emotion'] = df.encoded_emotion
#temp_df['max_sample'] = temp_df['samples'].map(lambda x: max(x))
temp_df = temp_df.astype({'duration':float})
temp_df.head()

  return f(*args, **kwargs)
100%|██████████| 9597/9597 [03:26<00:00, 46.46it/s]


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_166,feature_167,feature_168,feature_169,feature_170,feature_171,duration,zcr_mean,rms,encoded_emotion
0,-317.810974,130.704483,-12.395517,30.511158,-20.439678,3.617439,-10.423512,7.929957,-14.021802,-2.930775,...,0.689658,0.661096,0.632921,0.7749,0.737488,0.563047,2.169,0.101017,0.022118,0
1,-396.103882,144.796555,18.632975,41.45097,-8.002732,10.079015,-11.788765,6.078516,-8.80768,-0.6461,...,0.428917,0.520044,0.722071,0.826758,0.766087,0.470141,2.6695,0.067057,0.011704,1
2,-556.759644,61.434036,-2.091408,13.791684,-8.940442,-2.057548,-14.456757,4.195601,-8.435582,5.987458,...,0.622826,0.710356,0.70244,0.702173,0.701752,0.685739,3.837375,0.081445,0.006343,2
3,-570.33844,53.714615,-5.728349,19.4942,-12.612115,1.432346,-10.988893,8.573183,-2.57633,6.206134,...,0.632984,0.664319,0.652212,0.693982,0.707066,0.807382,3.403625,0.094916,0.005601,3
4,-167.229965,97.665352,-14.925374,16.654722,-36.905621,4.324324,-21.103331,-0.462943,-10.539716,-1.4369,...,0.564764,0.571357,0.526096,0.596822,0.739495,0.727407,2.86975,0.123644,0.160394,4


In [37]:
# PCA tuning

from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE


np.random.seed(0) #to make experiments reproducible


smote = SMOTE(random_state=42)

X, y = temp_df[temp_df.columns[:-1]], temp_df[temp_df.columns[-1]]
X_res, y_res = smote.fit_resample(X,y)

# ----- PCA ----- #
from sklearn.decomposition import PCA

pca = PCA(n_components=55, random_state=42)
pca_X_res = pca.fit_transform(X_res)


minmax = MinMaxScaler()
svm_X_res = minmax.fit_transform(pca_X_res)
svm_X_train, svm_X_test, svm_y_train, svm_y_test = train_test_split(svm_X_res, y_res, test_size=.2, random_state=42)


print('Fitting SVC...')
svm = SVC(C=3, random_state=42).fit(svm_X_res, y_res)
# print('Predicting using SVC...')
# svm_preds = svm.predict(svm_X_test)
# svm_score = f1_score(svm_y_test, svm_preds, average='macro')
# print(svm, svm_score)


Fitting SVC...
Predicting using SVC...
SVC(C=3, random_state=42) 0.9003010321672338


In [23]:
# eval

d = dict()
for filename in tqdm(eval_df.filename):
    d1 = dict()
    features = []
    signal, _ = librosa.load(audio_path+filename, sr=None, res_type='kaiser_fast')
    try:
        mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=40)
        mfccs_mean = np.mean(mfccs, axis=-1) #mean along time axis. 1 value for each coefficient, the mean one
        mfccs_std = np.std(mfccs, axis=-1) #std along time axis. 1 value for each coefficient, the std one
        res = np.hstack((mfccs_mean, mfccs_std))
        delta_mfccs_mean = np.mean(librosa.feature.delta(mfccs), axis=-1)
        res = np.hstack((res, delta_mfccs_mean))
        delta2_mfccs_mean = np.mean(librosa.feature.delta(mfccs, order=2), axis=-1)
        res = np.hstack((res, delta2_mfccs_mean))
        #mel_mean = np.mean(librosa.feature.melspectrogram(signal, sr), axis=-1)
        #mel_std = np.std(librosa.feature.melspectrogram(signal, sr), axis=-1)
        stft = np.abs(librosa.stft(signal))
        chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr), axis=-1)
        res = np.hstack((res, chroma_stft))
        
        
        
        d1 = {f'feature_{k}':v for k,v in enumerate(res)} # extract mfccs
        
        d1['duration'] = signal.shape[0]/sr
        d1['zcr_mean'] = np.mean(librosa.feature.zero_crossing_rate(y=signal), axis=-1)[0]
        #d1['zcr_std'] = np.std(librosa.feature.zero_crossing_rate(y=signal), axis=-1)[0]
        d1['rms'] = np.mean(librosa.feature.rms(y=signal), axis=-1)[0]
    except:
        print(filename)
        continue
    d[(int)(filename.split('.')[0])] = d1
    
eval_temp_df = pd.DataFrame.from_dict(d).T    

#temp_df['encoded_emotion'] = df.encoded_emotion
#temp_df['max_sample'] = temp_df['samples'].map(lambda x: max(x))
eval_temp_df = eval_temp_df.astype({'duration':float})
eval_temp_df.head()
indexes = eval_temp_df.index

100%|██████████| 3201/3201 [01:05<00:00, 48.87it/s]


In [38]:
test_df = pca.transform(eval_temp_df) # apply PCA
test_df = minmax.transform(test_df) # Standardize
print('Predicting using SVM with PCA...')
preds = svm.predict(test_df)

lista_finale = []
preds_ser = pd.Series(preds)
for id, label in zip(eval_temp_df.index, preds_ser.map({v:k for k,v in labels_dict.items()})):
    lista_finale.append((f'{id}.wav', label))
df_finale = pd.DataFrame(lista_finale, columns=['Id', 'Predicted'])
df_finale.to_csv(path_or_buf=path+'/results/svm_pca.csv', header=True, index=False)

Predicting using SVM with PCA...


In [39]:
preds

array([5, 3, 5, ..., 5, 5, 3])

results on leaderboard: 0.711