In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
from tqdm import tqdm

In [2]:
# Root folder of the project data. The trailing slash is significant because
# every path below is assembled by plain string formatting.
path = './summer_2022_project/'
audio_path = f'{path}audio/'

# Development listing: file name and emotion label, both read as pandas 'string'.
df = pd.read_csv(
    f'{path}development.csv',
    dtype=dict(filename='string', emotion='string'),
)
# Evaluation listing: file names only, read as pandas 'string'.
eval_df = pd.read_csv(
    f'{path}evaluation.csv',
    dtype=dict(filename='string'),
)



In [3]:
# Show the first rows of the evaluation listing.
eval_df.head()

Unnamed: 0,filename
0,9597.wav
1,9598.wav
2,9599.wav
3,9600.wav
4,9601.wav


In [4]:
# Show the first rows of the development listing.
df.head()

Unnamed: 0,emotion,filename
0,Disgusted,0.wav
1,Fearful,1.wav
2,Neutral,2.wav
3,Happy,3.wav
4,Angry,4.wav


In [5]:
# Give every distinct emotion an integer code, numbered in order of first
# appearance in the development listing.
labels = df['emotion'].unique()
labels_dict = dict(zip(labels, range(len(labels))))  # emotion name -> integer code

# Store the numeric target next to the textual label.
df['encoded_emotion'] = df['emotion'].map(labels_dict)
df.head()

Unnamed: 0,emotion,filename,encoded_emotion
0,Disgusted,0.wav,0
1,Fearful,1.wav,1
2,Neutral,2.wav,2
3,Happy,3.wav,3
4,Angry,4.wav,4


In [6]:
from scipy.io.wavfile import read
from scipy import signal

# Load every development clip and keep its waveform, sampling rate, duration
# (in seconds) and spectrogram, keyed by the numeric part of the file name.
d = dict()
for filename in df.filename:
    # os.path.join avoids the doubled separator that audio_path + '/<name>' produced.
    sample_rate, samples = read(os.path.join(audio_path, filename))
    frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)
    d[int(filename.split('.')[0])] = {
        'samples': samples,
        'sample_rate': sample_rate,
        'time_length': len(samples) / sample_rate,
        'frequencies': frequencies,
        'spectrogram': spectrogram,
    }

# The outer keys (clip ids) become the row index after the transpose.
temp_df = pd.DataFrame.from_dict(d).T
# These assignments align on index labels, i.e. clip id N receives the label
# stored in row N of df.
temp_df['emotion'] = df.emotion
temp_df['encoded_emotion'] = df.encoded_emotion
# np.max reduces the array in compiled code; the builtin max() walked through
# every sample in Python.
temp_df['max_sample'] = temp_df['samples'].map(lambda x: np.max(x))
temp_df = temp_df.astype({'time_length':float})
temp_df.head()

Unnamed: 0,samples,sample_rate,time_length,frequencies,spectrogram,emotion,encoded_emotion,max_sample
0,"[-46, -103, -88, -91, -70, -55, -39, -15, 20, ...",8000,2.169,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[3.3801324, 12.052663, 0.16704863, 0.397491, ...",Disgusted,0,6632
1,"[117, 301, 296, 302, 275, 264, 244, 213, 161, ...",8000,2.6695,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.47520238, 13.986137, 2.8514621, 17.818419,...",Fearful,1,2606
2,"[-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -...",8000,3.837375,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.0001733794, 3.5454577e-05, 9.601911e-06, 7...",Neutral,2,3447
3,"[-1, -1, -1, -1, 0, -1, -1, -1, -1, -1, 0, 0, ...",8000,3.403625,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.0003778362, 0.00011231503, 6.29442e-05, 0....",Happy,3,3375
4,"[77, 208, 212, 207, 193, 212, 213, 205, 210, 1...",8000,2.86975,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[4.822137, 6.9771757, 0.14937395, 1.0288435, ...",Angry,4,32767


In [7]:
def divide_matrix(ary:np.ndarray, num_columns:int, num_rows:int):
    """
    Split a 2-D array into a grid of num_rows x num_columns sub-arrays.

    The array is first cut into `num_rows` bands along axis 0 and every band
    is then cut into `num_columns` pieces along axis 1, both with
    np.array_split.

    Args:
        ary (np.ndarray): 2-D array to split
        num_columns (int): number of pieces along axis 1
        num_rows (int): number of pieces along axis 0

    Returns:
        list: the num_rows * num_columns sub-arrays, band by band
        (all pieces of the first band, then all pieces of the second, ...)

    Raises:
        ValueError: if more rows or columns are requested than `ary` has.
        An error string used to be returned instead, which callers expecting
        a list of arrays could not handle.
    """
    if (num_columns > ary.shape[1] or num_rows > ary.shape[0]):
        raise ValueError('Error, the requested number of columns or rows exceed dimensions!')

    # Flatten the grid while building it, so the result is a flat list.
    return [
        block
        for band in np.array_split(ary, num_rows, axis=0)
        for block in np.array_split(band, num_columns, axis=1)
    ]

def flatten(l:list):
    """
    Remove one level of nesting from `l`.

    Args:
        l (list): iterable whose items are themselves iterable

    Returns:
        list: the elements of every item of `l`, in order
    """
    return [element for group in l for element in group]
    
def compute_feature(list_of_arrays:list) -> dict:
    """
    Summarise every array of the list with its mean and standard deviation.

    Args:
        list_of_arrays (list): arrays to summarise

    Returns:
        dict: for the array at position i, the keys '<i>_mean_feature' and
        '<i>_std_feature' hold np.mean and np.std of that array
    """
    features = {}
    for position, block in enumerate(list_of_arrays):
        features[f'{position}_mean_feature'] = np.mean(block)
        features[f'{position}_std_feature'] = np.std(block)
    return features

def create_dict_dataset(path:str, is_eval:bool=False) -> dict:
    """
    function used to read the dataset. It creates a dictionary in
    the following form:
    id: {label, frequencies, times, spectrogram}.
    If is_eval == True, no label.

    File names are parsed as '<id>_<label>.wav' for the labelled dataset and
    as '<id>.wav' for the evaluation one. Directory entries that do not end
    in '.wav' are skipped, and files are visited in sorted name order so the
    key order of the result does not depend on the file system.

    Args:
        path (str): directory containing the .wav files, with or without a
        trailing separator
        is_eval (bool, optional): boolean flag to set to True
        if the dataset you want to read is the evaluation one.
        Defaults to False.

    Returns:
        dict: dataset in dictionary form
    """
    d = dict()
    for sample in sorted(os.listdir(path)):
        # Anything that is not a .wav file cannot be parsed or read below.
        if not sample.lower().endswith('.wav'):
            continue

        # os.path.join works whether or not `path` ends with a separator.
        sample_rate, samples = read(os.path.join(path, sample))
        frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)

        if is_eval:
            sample_id = int(sample.split('.')[0])
            d_temp = {}
        else:
            sample_id = int(sample.split('_')[0])
            # [:-4] strips the '.wav' extension from the label part
            d_temp = {'label': int(sample.split('_')[1][:-4])}

        d_temp['frequencies'] = frequencies
        d_temp['times'] = times
        d_temp['spectrogram'] = spectrogram
        d[sample_id] = d_temp
    return d

In [8]:
# Keep only the columns the feature-extraction loop reads: duration,
# spectrogram, peak sample and the encoded label.
shrunk_df = temp_df[['time_length', 'spectrogram', 'max_sample', 'encoded_emotion']]


In [9]:
from imblearn.over_sampling import BorderlineSMOTE


# Turn every clip into a fixed-length feature row: mean and std of each cell
# of a 12x12 grid over its spectrogram, plus duration, peak sample and the
# encoded label.
dictionary = dict()
for row in tqdm(shrunk_df.index):
    clip = shrunk_df.loc[row]  # one row lookup per clip instead of four
    z = compute_feature(divide_matrix(clip.spectrogram, num_rows=12, num_columns=12))
    z['time_length'] = clip['time_length']
    z['max_sample'] = clip['max_sample']
    z['encoded_emotion'] = clip.encoded_emotion
    dictionary[row] = z

# Outer keys (clip ids) become the row index after the transpose.
new_df = pd.DataFrame.from_dict(dictionary).T

np.random.seed(0) #to make experiments reproducible


smote = BorderlineSMOTE(random_state=42)

# Pick the target by name rather than as "the last column", so adding a
# feature after it cannot silently change what is predicted.
X, y = new_df.drop(columns='encoded_emotion'), new_df['encoded_emotion']
X_res, y_res = smote.fit_resample(X,y)
y_res = y_res.astype(int)

100%|██████████| 9597/9597 [01:11<00:00, 133.88it/s]


In [10]:
# Display the labels returned by smote.fit_resample, after the cast to int.
y_res

0        0
1        1
2        2
3        3
4        4
        ..
11370    6
11371    6
11372    6
11373    6
11374    6
Name: encoded_emotion, Length: 11375, dtype: int64

In [11]:
from sklearn.ensemble import RandomForestClassifier
# Fit the forest on the resampled development features.
# random_state is fixed here so that re-running this cell alone yields the same
# model; without it the result depended on the global NumPy seed set in another
# cell and on the order in which cells were executed.
rf = RandomForestClassifier(max_features = 'log2', n_estimators=300, random_state=42).fit(X_res, y_res)


# Evaluation set: loading, feature extraction and prediction

In [12]:
from scipy.io.wavfile import read
from scipy import signal

# Load every evaluation clip and keep its waveform, sampling rate, duration
# (in seconds) and spectrogram, keyed by the numeric part of the file name.
d = dict()
for filename in eval_df.filename:
    # os.path.join avoids the doubled separator that audio_path + '/<name>' produced.
    sample_rate, samples = read(os.path.join(audio_path, filename))
    frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)
    d[int(filename.split('.')[0])] = {
        'samples': samples,
        'sample_rate': sample_rate,
        'time_length': len(samples) / sample_rate,
        'frequencies': frequencies,
        'spectrogram': spectrogram,
    }

# The outer keys (clip ids) become the row index after the transpose.
new_eval_df = pd.DataFrame.from_dict(d).T
# np.max reduces the array in compiled code; the builtin max() walked through
# every sample in Python.
new_eval_df['max_sample'] = new_eval_df['samples'].map(lambda x: np.max(x))
new_eval_df = new_eval_df.astype({'time_length':float})
new_eval_df.head()

Unnamed: 0,samples,sample_rate,time_length,frequencies,spectrogram,max_sample
9597,"[6, 20, 17, 18, 31, 39, 30, 19, 36, 53, 45, 34...",8000,2.2825,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.024010094, 0.4403729, 0.7793636, 1.5887632...",3849
9598,"[-106, -232, -300, -415, -417, -342, -314, -43...",8000,2.936375,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[13.073401, 2.9347198, 3.5555453, 2.4982939, ...",9525
9599,"[-2, -4, -3, -4, -2, -2, -1, -1, -1, 0, 1, 1, ...",8000,2.675875,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.5504999, 1.1110619, 0.5763831, 0.004715546...",2217
9600,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8000,3.637125,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.79...",6896
9601,"[68, 175, 156, 144, 142, 138, 137, 145, 160, 1...",8000,2.3025,"[0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5...","[[0.21942894, 18.0716, 2.2244778, 0.19599216, ...",2090


In [30]:
# Keep only the columns the feature-extraction loop reads: duration,
# spectrogram and peak sample (the evaluation set has no label).
eval_shrunk_df = new_eval_df[['time_length', 'spectrogram', 'max_sample']]


In [31]:
# Build one feature row per evaluation clip: mean and std of each cell of a
# 12x12 grid over its spectrogram, followed by duration and peak sample.
dictionary = dict()
for row in tqdm(eval_shrunk_df.index):
    clip = eval_shrunk_df.loc[row]  # one row lookup per clip instead of three
    z = compute_feature(divide_matrix(clip.spectrogram, num_rows=12, num_columns=12))
    z['time_length'] = clip['time_length']
    z['max_sample'] = clip['max_sample']
    dictionary[row] = z

# Outer keys (clip ids) become the row index after the transpose.
eval_features_df = pd.DataFrame.from_dict(dictionary).T


100%|██████████| 3201/3201 [00:23<00:00, 136.30it/s]


In [50]:
# Predict an integer emotion code for every row of the evaluation features.
preds = rf.predict(eval_features_df)

In [56]:
# Display the raw predictions.
preds

array([5, 3, 5, ..., 2, 2, 4])

In [33]:
# Inverse of labels_dict: integer code -> emotion name.
{v:k for k,v in labels_dict.items()}

{0: 'Disgusted',
 1: 'Fearful',
 2: 'Neutral',
 3: 'Happy',
 4: 'Angry',
 5: 'Sad',
 6: 'Suprised'}

In [59]:
# Pair every evaluation clip id with its predicted emotion name.
# A comprehension is used so that no loop variable called `id` is left at
# notebook scope, where it would shadow the builtin id() for all later cells.
preds_ser = pd.Series(preds)
lista_finale = [
    (f'{clip_id}.wav', label)
    for clip_id, label in zip(
        eval_shrunk_df.index,
        preds_ser.map({v:k for k,v in labels_dict.items()}),  # code -> emotion name
    )
]

In [62]:
# Tabulate the (file name, predicted emotion) pairs under the columns
# 'Id' and 'Predicted', then display the table.
df_finale = pd.DataFrame(lista_finale, columns=['Id', 'Predicted'])
df_finale

Unnamed: 0,Id,Predicted
0,9597.wav,Sad
1,9598.wav,Happy
2,9599.wav,Sad
3,9600.wav,Suprised
4,9601.wav,Sad
...,...,...
3196,12793.wav,Suprised
3197,12794.wav,Happy
3198,12795.wav,Neutral
3199,12796.wav,Neutral


In [67]:
# Write the predictions to results.csv under `path`, with a header row and
# without the DataFrame index.
df_finale.to_csv(path_or_buf=path+'/results.csv', header=True, index=False)

In [42]:
# Index the integer predictions by evaluation clip id.
# Bound to a new name: rebinding `preds` itself turned the ndarray into a
# DataFrame, so re-running the cells that call pd.Series(preds) afterwards
# no longer operated on the raw predictions.
preds_df = pd.DataFrame(preds, index=eval_shrunk_df.index)

#preds.map({v:k for k,v in labels_dict.items()})


0            Sad
1          Happy
2            Sad
3       Suprised
4            Sad
          ...   
3196    Suprised
3197       Happy
3198     Neutral
3199     Neutral
3200       Angry
Length: 3201, dtype: object

# Result: 0.619

# Experiment with a train/test split


In [None]:
# Disabled experiment: the whole cell is a string literal, so none of it runs.
# It repeats the feature extraction and SMOTE resampling, holds out 20% of the
# resampled data, computes a macro F1 on that hold-out and writes the
# evaluation predictions to train_test_split_results.csv.
# NOTE(review): fit_resample is called before train_test_split, so synthetic
# samples generated from rows that end up in the test split can be part of the
# training split; the hold-out F1 is therefore likely optimistic.
# NOTE(review): `score` is computed but never displayed.
'''from imblearn.over_sampling import BorderlineSMOTE
from sklearn.model_selection import train_test_split


dictionary = dict()
for row in tqdm(shrunk_df.index):
    # z = compute_feature(divide_matrix(shrunk_df.loc[row].spectrogram, num_rows=n, num_columns=n*ratio)) # with ratio
    z = compute_feature(divide_matrix(shrunk_df.loc[row].spectrogram, num_rows=12, num_columns=12))
    z['time_length'] = shrunk_df.loc[row]['time_length']
    z['max_sample'] = shrunk_df.loc[row]['max_sample']
    z['encoded_emotion'] = shrunk_df.loc[row].encoded_emotion
    dictionary[row] = z
    
new_df = pd.DataFrame.from_dict(dictionary).T

np.random.seed(0) #to make experiments reproducible


smote = BorderlineSMOTE(random_state=42)

X, y = new_df[new_df.columns[:-1]], new_df[new_df.columns[-1]]
X_res, y_res = smote.fit_resample(X,y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=.2, random_state=42)

y_train = y_train.astype(int)
y_test = y_test.astype(int)



from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_features = 'log2', n_estimators=300).fit(X_train, y_train)


from sklearn.metrics import f1_score


preds_test = rf.predict(X_test)
score = f1_score(y_test, preds_test, average='macro')


preds = rf.predict(eval_features_df)

lista_finale = []
preds_ser = pd.Series(preds)
for id, label in zip(eval_shrunk_df.index, preds_ser.map({v:k for k,v in labels_dict.items()})):
    lista_finale.append((f'{id}.wav', label))
    

df_finale = pd.DataFrame(lista_finale, columns=['Id', 'Predicted'])
df_finale.to_csv(path_or_buf=path+'/train_test_split_results.csv', header=True, index=False)'''

100%|██████████| 9597/9597 [01:14<00:00, 128.07it/s]


With the train/test split the performance turns out to be lower. I am turning the whole cell into text.