In [1]:
import pandas as pd
import numpy as np
import noisereduce as nr
import librosa, librosa.display
import matplotlib.pyplot as plt     #needed for librosa.display
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

  "class": algorithms.Blowfish,


In [2]:
def preprocessor_final(csv_path,evaluation = False):
    """Load a metadata CSV and keep only the columns used downstream.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file to read. It must contain an "Id" column, which becomes
        the index of the returned frame.
    evaluation : bool, default False
        When False the file is expected to carry "action" and "object"
        label columns; they are merged into a single "Intent" column and
        then removed. When True no label handling is performed.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by "Id" with the speaker metadata columns removed and
        the "gender" column one-hot encoded (prefix "Gender", first
        category dropped).
    """
    speaker_columns = [
        "speakerId",
        "Self-reported fluency level ",
        "First Language spoken",
        "Current language used for work/school",
        "ageRange",
    ]

    frame = pd.read_csv(csv_path).set_index("Id")

    if evaluation:
        unwanted = speaker_columns
    else:
        # Normalise the labels: object "none" becomes "language" and
        # action "change language" becomes "change".
        frame["object"] = frame["object"].replace(["none"], "language")
        frame["action"] = frame["action"].replace(["change language"], "change")
        # The target label is the action immediately followed by the object.
        frame["Intent"] = frame["action"].astype(str) + frame["object"]
        unwanted = speaker_columns + ["object", "action"]

    frame = frame.drop(columns=unwanted)

    return pd.get_dummies(
        frame,
        prefix="Gender",
        prefix_sep='_',
        columns=["gender"],
        drop_first=True,
    )

In [3]:
def retrieve_mel_spec(signal, n_mels, sample_rate):
    """Return the mel spectrogram of `signal` converted with `librosa.power_to_db`.

    The spectrogram is computed with `n_mels` bands and `fmax=8000`; the
    dB conversion uses the spectrogram's own maximum as reference
    (`ref=np.max`).
    """
    return librosa.power_to_db(
        librosa.feature.melspectrogram(
            y=signal,
            sr=sample_rate,
            n_mels=n_mels,
            fmax=8000,
        ),
        ref=np.max,
    )

def plot_mel_spectrogram(mel_spectrogram, sample_rate):
    """Show `mel_spectrogram` with a time x-axis, a mel y-axis and a colorbar."""
    display_options = {"x_axis": "time", "y_axis": "mel", "sr": sample_rate}
    librosa.display.specshow(mel_spectrogram, **display_options)
    plt.colorbar(format="%+2.f")
    plt.show()

def retrieve_mfcc(signal, n_mfcc, sample_rate):
    """Return the `librosa.feature.mfcc` matrix of `signal` with `n_mfcc` coefficients."""
    coefficients = librosa.feature.mfcc(
        y=signal,
        n_mfcc=n_mfcc,
        sr=sample_rate,
    )
    return coefficients

def plot_mfcc(mfcc, sample_rate):
    """Show an MFCC matrix with a time x-axis and a colorbar."""
    display_options = {"sr": sample_rate, "x_axis": 'time'}
    librosa.display.specshow(mfcc, **display_options)
    plt.colorbar(format="%+2.f")
    plt.show()

def split_matrix(matrix, splits_on_rows, splits_on_columns):
    """Cut a 2-D array into a flat list of rectangular blocks.

    The array is first divided into `splits_on_columns` vertical strips
    (along axis 1); every strip is then divided into `splits_on_rows`
    pieces along axis 0. Blocks are returned strip by strip, top to
    bottom inside each strip. `np.array_split` is used, so the sizes do
    not need to divide evenly.
    """
    vertical_strips = np.array_split(matrix, splits_on_columns, axis=1)
    return [
        block
        for strip in vertical_strips
        for block in np.array_split(strip, splits_on_rows, axis=0)
    ]

In [4]:
def audio_features_extractor_final(textual_df, sample_rate=16000,evaluation = False):
    """Extract MFCC, zero-crossing-rate and RMS features for every clip in `textual_df`.

    Each file listed in the "path" column is loaded at `sample_rate`,
    passed through `noisereduce.reduce_noise`, and trimmed with
    `librosa.effects.trim` (top_db=30) before the features are computed.

    Parameters
    ----------
    textual_df : pandas.DataFrame
        Must contain a "path" column. **Mutated in place** when
        `evaluation` is False: rows whose trimmed clip is shorter than
        0.4 s or longer than 2.7 s are removed, so that the remaining rows
        line up one-to-one with the returned feature lists.
    sample_rate : int, default 16000
        Sampling rate passed to `librosa.load`.
    evaluation : bool, default False
        When True no row is removed; clips with fewer than 6145 samples
        after trimming are instead extended to 6145 samples with
        `librosa.util.fix_length`.

    Returns
    -------
    tuple
        `(mfcc_13, zcr, rms, min_shape_1)`: three lists with one array per
        kept clip, plus the smallest MFCC frame count (`shape[1]`) seen,
        starting from an upper bound of 10000.
    """
    # Clip-length limits, in seconds, applied to the development data only.
    min_duration_s, max_duration_s = 0.4, 2.7
    # Minimum length, in samples, enforced on evaluation clips.
    # NOTE(review): the origin of 6145 is not documented in this notebook.
    min_eval_samples = 6145

    min_shape_1 = 10000
    mfcc_13 = []
    zcr = []
    rms = []
    rejected_labels = []

    # Pair every path with its real index label. The previous code dropped
    # rows by loop position, which is only correct while the index is exactly
    # 0..N-1 and breaks when the function is re-run on an already filtered frame.
    labelled_paths = list(textual_df["path"].items())

    for k, (row_label, path) in enumerate(labelled_paths):
        if not k % 100:
            print(k)

        signal, sr = librosa.load(path, sr=sample_rate)
        signal = nr.reduce_noise(signal, sr=sr)
        signal = librosa.effects.trim(y=signal, top_db=30, hop_length=64, frame_length=128)[0]
        duration = librosa.get_duration(y=signal, sr=sr)

        if not evaluation and (duration < min_duration_s or duration > max_duration_s):
            rejected_labels.append(row_label)
            continue
        elif evaluation and len(signal) < min_eval_samples:
            signal = librosa.util.fix_length(data=signal, size=min_eval_samples)

        mfcc = retrieve_mfcc(signal=signal, n_mfcc=13, sample_rate=sr)
        mfcc_13.append(mfcc)
        zcr.append(librosa.feature.zero_crossing_rate(y=signal))
        rms.append(librosa.feature.rms(y=signal))
        min_shape_1 = min(min_shape_1, mfcc.shape[1])

    # Callers rely on the in-place removal to keep the frame and features aligned.
    if rejected_labels:
        textual_df.drop(index=rejected_labels, inplace=True)

    return mfcc_13, zcr, rms, min_shape_1

In [5]:
#utils for grid_searcher

def grid_search_internal(textual_df, mfcc_13,zcr,rms, n_splits, param_grid_rfc, param_grid_svc, best_params_rfc, accuracies_rfc, best_params_svc, accuracies_svc):
    """Grid-search a random forest and a scaled SVC for every candidate split count.

    For each `n` in `n_splits` the audio features are rebuilt with an
    n x n block layout, joined with `textual_df`, and split 80/20
    (stratified on "Intent", random_state=42). Both model families are
    tuned with 3-fold `GridSearchCV` on the training part.

    Parameters
    ----------
    textual_df : pandas.DataFrame
        Metadata frame indexed by "Id" with "path" and "Intent" columns,
        row-aligned with the feature lists.
    mfcc_13, zcr, rms : list
        Per-clip feature arrays, forwarded to `compute_features_df`.
    n_splits : sequence of int
        Block counts to try.
    param_grid_rfc, param_grid_svc : dict
        Parameter grids for the two searches.
    best_params_rfc, best_params_svc : dict
        Running records; each must already hold an "accuracy" entry.
        Updated in place whenever a higher cross-validated accuracy is found.
    accuracies_rfc, accuracies_svc : list
        Receive one held-out accuracy per value of `n`, appended in place.

    Returns
    -------
    tuple
        `(best_params_rfc, accuracies_rfc, best_params_svc, accuracies_svc)`.
    """
    total = len(n_splits)
    for position, n in enumerate(n_splits, start=1):
        print(f" Iter: {position}/{total}")
        mfcc_df = compute_features_df(n, n, mfcc_13,zcr,rms, "mfcc")
        final_df = pd.concat([textual_df.reset_index().drop(columns = "Id"), mfcc_df], axis=1)
        X = final_df.loc[:, final_df.columns != 'Intent'].drop(columns = "path")
        y = final_df["Intent"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

        #grid search - RFC
        _search_and_record(
            "RFC", RandomForestClassifier(), param_grid_rfc, n,
            X_train, y_train, X_test, y_test, best_params_rfc, accuracies_rfc,
        )

        #grid search - SVC
        _search_and_record(
            "SVC", Pipeline(steps=[("scaler", StandardScaler()),  ("svc", SVC())]), param_grid_svc, n,
            X_train, y_train, X_test, y_test, best_params_svc, accuracies_svc,
        )

    return best_params_rfc, accuracies_rfc, best_params_svc, accuracies_svc


def _search_and_record(label, estimator, param_grid, n, X_train, y_train, X_test, y_test, best_params, accuracies):
    """Run one grid search, update `best_params` on improvement, and append the held-out accuracy."""
    search = GridSearchCV(estimator = estimator, param_grid=param_grid, scoring="accuracy", cv=3, verbose=2)
    search.fit(X_train, y_train)

    # Model selection uses the cross-validated score, not the held-out one.
    if search.best_score_ > best_params["accuracy"]:
        print(f" Found a new BEST model for {label}, acc= ", search.best_score_)
        best_params["accuracy"] = search.best_score_
        best_params["n"] = n
        best_params["model_params"] = search.best_params_
        best_params["best_estimator"] = search.best_estimator_
        best_params["X_test"] = X_test
        best_params["y_test"] = y_test
        for key, value in best_params.items():
            # Skip the bulky entries when echoing the record.
            if key not in ["best_estimator", "X_test", "y_test"]:
                print(key, " : ", value)

    y_test_pred = search.best_estimator_.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_test_pred))


def compute_features_df(n_row, n_col, array_list,zcr,rms, style):
    """Summarise per-clip feature matrices into one fixed-width row per clip.

    Every matrix in `array_list` is cut into `n_row * n_col` blocks with
    `split_matrix`, and the mean, standard deviation, maximum and minimum
    of each block are recorded. The matching `zcr[i]` and `rms[i]` arrays
    are cut into `n_row` chunks along axis 1 and the mean and standard
    deviation of each chunk are recorded.

    Parameters
    ----------
    n_row, n_col : int
        Number of splits along axis 0 and axis 1 of each matrix.
    array_list : sequence of 2-D arrays
        One matrix per clip.
    zcr, rms : sequence of 2-D arrays
        Indexed in step with `array_list`.
    style : str
        Prefix for the block-statistic column names (e.g. "mfcc").

    Returns
    -------
    pandas.DataFrame
        One float64 row per clip, indexed 0..len(array_list)-1. Columns are
        `{style}_mean_*`, `{style}_std_*`, `{style}_max_*`, `{style}_min_*`,
        `_zcr_mean_*`, `_rms_mean_*`, `_zcr_std_*`, `_rms_std_*`, in that order.
    """
    n_blocks = n_row * n_col
    columns = (
        [f"{style}_{stat}_{i}" for stat in ("mean", "std", "max", "min") for i in range(n_blocks)]
        + [f"_zcr_mean_{i}" for i in range(n_row)]
        + [f"_rms_mean_{i}" for i in range(n_row)]
        + [f"_zcr_std_{i}" for i in range(n_row)]
        + [f"_rms_std_{i}" for i in range(n_row)]
    )

    # Collect plain rows and build the frame once; appending with
    # `.loc[len(df)]` re-allocates the frame on every insertion.
    rows = []
    for i, mfcc in enumerate(array_list):
        blocks = split_matrix(mfcc, splits_on_rows=n_row, splits_on_columns=n_col)
        # Split each auxiliary feature once and reuse the chunks for both statistics.
        zcr_chunks = np.array_split(zcr[i], n_row, axis=1)
        rms_chunks = np.array_split(rms[i], n_row, axis=1)

        rows.append(
            [np.mean(block, axis=None) for block in blocks]
            + [np.std(block, axis=None) for block in blocks]
            + [np.max(block, axis=None) for block in blocks]
            + [np.min(block, axis=None) for block in blocks]
            + [np.mean(chunk, axis=None) for chunk in zcr_chunks]
            + [np.mean(chunk, axis=None) for chunk in rms_chunks]
            + [np.std(chunk, axis=None) for chunk in zcr_chunks]
            + [np.std(chunk, axis=None) for chunk in rms_chunks]
        )

    return pd.DataFrame(rows, columns=columns, dtype=np.float64)

In [6]:
def grid_searcher_big(textual_df,mfcc_13,zcr,rms, min_shape_1, param_grid_rfc, param_grid_svc):
    """Choose the candidate split counts and run the RFC/SVC grid searches on them.

    Up to five distinct integers are taken, evenly spaced between 1 and
    `min(min_shape_1, 13)`, and handed to `grid_search_internal` together
    with empty accuracy lists and best-model records seeded with an
    accuracy of -1.1.

    Returns
    -------
    tuple
        `(best_params_rfc, accuracies_rfc, best_params_svc, accuracies_svc, n_splits)`
        where `n_splits` is the sorted list of split counts that were tried.
    """
    # Evenly spaced candidates, rounded to integers and de-duplicated.
    upper_bound = min(min_shape_1, 13)
    n_splits = sorted({round(value) for value in np.linspace(1, upper_bound, 5)})

    # Seed below any reachable accuracy so the first search always becomes the best.
    rfc_record = {"accuracy" : -1.1}
    svc_record = {"accuracy" : -1.1}

    rfc_record, rfc_scores, svc_record, svc_scores = grid_search_internal(
        textual_df, mfcc_13, zcr, rms, n_splits,
        param_grid_rfc, param_grid_svc,
        rfc_record, list(), svc_record, list(),
    )

    return rfc_record, rfc_scores, svc_record, svc_scores, n_splits

In [7]:
# Load the development metadata: indexed by "Id", with the merged "Intent" label and a one-hot gender column.
textual_df = preprocessor_final("development.csv")

In [8]:
# Extract per-clip MFCC / ZCR / RMS features. In this (non-evaluation) mode the call also
# removes from textual_df, in place, the rows whose trimmed clip is shorter than 0.4 s or longer than 2.7 s.
mfcc_13,zcr,rms, min_shape_1 = audio_features_extractor_final(textual_df, sample_rate=16000)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800


In [9]:
# Grid for the scaler + SVC pipeline; the "svc__" prefix addresses the "svc" step.
param_grid_svc = {
    "svc__C": [1, 5, 10, 50, 500, 1000],
    "svc__kernel": ["rbf", "sigmoid"],
    "svc__gamma": ["scale", "auto"],
}

# Grid for the random forest; single-valued entries are fixed settings rather than tuned ones.
param_grid_rfc = {
    "n_estimators": [1000],
    "criterion": ["gini", "entropy"],
    "max_features": ["sqrt", "log2"],
    "random_state": [42],
    "n_jobs": [-1],
}

# Run both searches for every candidate split count.
(
    best_params_rfc,
    accuracies_rfc,
    best_params_svc,
    accuracies_svc,
    n_splits,
) = grid_searcher_big(textual_df, mfcc_13, zcr, rms, min_shape_1, param_grid_rfc, param_grid_svc)

 Iter: 1/5
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END criterion=gini, max_features=sqrt, n_estimators=1000, n_jobs=-1, random_state=42; total time=   4.3s
[CV] END criterion=gini, max_features=sqrt, n_estimators=1000, n_jobs=-1, random_state=42; total time=   1.5s
[CV] END criterion=gini, max_features=sqrt, n_estimators=1000, n_jobs=-1, random_state=42; total time=   1.5s
[CV] END criterion=gini, max_features=log2, n_estimators=1000, n_jobs=-1, random_state=42; total time=   1.6s
[CV] END criterion=gini, max_features=log2, n_estimators=1000, n_jobs=-1, random_state=42; total time=   1.6s
[CV] END criterion=gini, max_features=log2, n_estimators=1000, n_jobs=-1, random_state=42; total time=   1.6s
[CV] END criterion=entropy, max_features=sqrt, n_estimators=1000, n_jobs=-1, random_state=42; total time=   2.8s
[CV] END criterion=entropy, max_features=sqrt, n_estimators=1000, n_jobs=-1, random_state=42; total time=   2.8s
[CV] END criterion=entropy, max_features=sq

In [None]:
# Retrieve the held-out split and the fitted pipeline that the grid search stored for the best SVC run.
X_test = best_params_svc["X_test"]
y_test = best_params_svc["y_test"]
best_model = best_params_svc["best_estimator"]
# Quick sanity check of the test matrix (row count, column count, dtypes).
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1914 entries, 8400 to 9307
Columns: 729 entries, Gender_male to _rms_std_12
dtypes: float64(728), uint8(1)
memory usage: 10.6 MB


In [None]:
# Score the selected SVC pipeline on the held-out split that was stored with it.
y_test_pred = best_model.predict(X_test)
acc = accuracy_score(y_test,y_test_pred)

print("Accuracy on the test set: ",acc)

Accuracy on the test set:  0.8254963427377221


In [None]:
#Train the best model on the whole development set

# Rebuild the features with the split count that won the SVC grid search.
audio_df = compute_features_df(best_params_svc['n'], best_params_svc['n'], mfcc_13, zcr, rms, "mfcc")
final_df = pd.concat([textual_df.reset_index().drop(columns = "Id"), audio_df], axis=1)
X = final_df.loc[:, final_df.columns != 'Intent'].drop(columns = "path")
y = final_df["Intent"]

# Take the hyper-parameters from the grid search result instead of hard-coding them,
# so the final model cannot drift from the configuration that was actually selected.
# "model_params" holds GridSearchCV.best_params_, whose "svc__*" keys address the "svc" step.
svc_pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("svc", SVC())])
svc_pipeline.set_params(**best_params_svc["model_params"])
model = svc_pipeline.fit(X,y)


In [None]:
# Load the evaluation metadata (no labels to merge in evaluation mode) and preview it.
evaluation_textual_df = preprocessor_final("evaluation.csv", evaluation=True)
print(evaluation_textual_df.head())
# In evaluation mode no row is dropped; clips shorter than 6145 samples after trimming are lengthened instead.
mfcc_13_ev, zcrs_ev, rmss_ev, min_shape_1_ev = audio_features_extractor_final(evaluation_textual_df, sample_rate=16000, evaluation=True)

                                                 path  Gender_male
Id                                                                
0   dsl_data/audio/speakers/NgQEvO2x7Vh3xy2xz/f53c...            1
1   dsl_data/audio/speakers/k5bqyxx2lzIbrlg9/1d5f8...            1
2   dsl_data/audio/speakers/7B4XmNppyrCK977p/1c0d5...            1
3   dsl_data/audio/speakers/k5bqyxx2lzIbrlg9/275c3...            1
4   dsl_data/audio/speakers/V4ZbwLm9G5irobWn/b7c7a...            0
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400


In [None]:
# Build the evaluation feature matrix with the same split count used for the final SVC.
audio_df_ev = compute_features_df(best_params_svc['n'], best_params_svc['n'], mfcc_13_ev, zcrs_ev, rmss_ev, "mfcc")
final_df_ev = pd.concat([evaluation_textual_df.reset_index().drop(columns = ["Id","path"]), audio_df_ev], axis=1)
# DataFrame.info() prints its report itself and returns None; wrapping it in print() added a stray "None" line.
final_df_ev.info()
final_df_ev.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1455 entries, 0 to 1454
Columns: 729 entries, Gender_male to _rms_std_12
dtypes: float64(728), uint8(1)
memory usage: 8.1 MB
None


Unnamed: 0,Gender_male,mfcc_mean_0,mfcc_mean_1,mfcc_mean_2,mfcc_mean_3,mfcc_mean_4,mfcc_mean_5,mfcc_mean_6,mfcc_mean_7,mfcc_mean_8,...,_rms_std_3,_rms_std_4,_rms_std_5,_rms_std_6,_rms_std_7,_rms_std_8,_rms_std_9,_rms_std_10,_rms_std_11,_rms_std_12
0,1,-309.411316,181.127197,-32.28904,24.868547,-1.302502,-23.01293,-9.459589,-27.581188,-0.165432,...,0.001928,0.004145,0.002772,0.00289,0.0081,0.001045,0.00173,0.00579,0.00766,0.0
1,1,-304.587646,64.497292,10.538591,64.674644,-0.274724,-2.021576,-75.419861,28.708878,-53.073238,...,0.015598,0.004443,0.001938,0.013705,0.00208,0.003992,0.001603,0.000837,0.00399,0.006406
2,1,-364.165802,94.238747,-97.383171,75.004097,-12.590721,-11.355885,-21.763548,-16.19313,-0.754175,...,0.001497,0.000772,0.001618,0.00124,0.001432,0.005191,0.000304,0.004246,0.002458,0.003703
3,1,-283.609741,113.461044,15.626864,7.198956,-46.202415,-25.762638,-52.309162,10.094173,-35.111469,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,-299.321625,16.878889,53.243801,159.048584,-63.731991,-39.186981,-33.44997,-11.231477,-30.810892,...,0.000215,0.009017,0.027145,0.012064,0.007741,0.014166,0.008472,4.8e-05,0.004477,0.009136


In [None]:
# Predict the intent of every evaluation clip with the pipeline fitted on the development set.
y_ev = model.predict(final_df_ev)

In [None]:
# Label each prediction with the Id read from the evaluation file rather than a fresh 0..N-1
# counter, so the submission stays correct even if the Ids are not contiguous or not ordered.
# No evaluation row is dropped during feature extraction, so the lengths match.
prediction_df = pd.DataFrame(data=y_ev, columns=["Predicted"], index=evaluation_textual_df.index)
prediction_df.index.name = "Id"
print(prediction_df.head())
# to_csv returns None when a path is given; the name my_csv is kept only for backward compatibility.
my_csv = prediction_df.to_csv("submission.csv")

           Predicted
Id                  
0     increasevolume
1       decreaseheat
2       increaseheat
3   deactivatelights
4     decreasevolume
