In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn import model_selection
from sklearn import metrics
from statistics import *

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier

In [3]:
df = pd.read_csv('../data/features_3_sec.csv')
df['label'] = df['label'].astype('category')
df.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,blues
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues


# Modelos a usar

En el top 4 (de la exploración inicial) se ve recurrentemente random forest, gradient boosting, adicionalmente en los mejores casos entran knn y svm; por lo cual, se van a probar estos cuatro modelos, con las dos versiones de los datos que tienen mejores resultados, modificando los parámetros de distancias y observar si se presentan mejoras en los resultados de precisión.

### Distancias para KNN

###### Parametro -> p (int)

p: int, predeterminado=2

Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

###### metric str o invocable, predeterminado = 'minkowski'

La métrica de distancia que se utilizará para el árbol. La métrica predeterminada es minkowski. Si la métrica está “calculada previamente”, se supone que X es una matriz de distancia y debe ser cuadrada durante el ajuste.

###### Vamos a usar Minkowski con p=1 y p=2, y Mahalanobis

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html

Se prueba la distancia chebyshev, pero disminuye la precisión; por lo tanto no la incluiremos

### Distancias para SVM

No se encuentra parametro de métricas de distancias en la documentación

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

### Distancias para Random Forest

No se encuentra parametro de métricas de distancias en la documentación

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

### Distancias para Gradient Boosting

No se encuentra parametro de métricas de distancias en la documentación

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [30]:
models = []
models.append(('knn_mk_manh', KNeighborsClassifier(p = 1 , metric = 'minkowski' )))
models.append(('knn_mk_eu', KNeighborsClassifier(p = 2 , metric = 'minkowski' )))
models.append(('knn_mahalanobis', KNeighborsClassifier(metric = 'mahalanobis', metric_params={'VI': cov_i_norm})))

models.append(('ran_forest_d3', RandomForestClassifier(random_state=1, max_depth=3)))
models.append(('ran_forest_d5', RandomForestClassifier(random_state=1, max_depth=5)))
models.append(('ran_forest_d7', RandomForestClassifier(random_state=1, max_depth=7)))

models.append(('gradient_l1_d3', GradientBoostingClassifier(learning_rate=0.1, max_depth=3, random_state=0)))
models.append(('gradient_l1_d5', GradientBoostingClassifier(learning_rate=0.1, max_depth=5, random_state=0)))
models.append(('gradient_l1_d7', GradientBoostingClassifier(learning_rate=0.1, max_depth=7, random_state=0)))
models.append(('gradient_l2_d3', GradientBoostingClassifier(learning_rate=0.2, max_depth=3, random_state=0)))
models.append(('gradient_l2_d5', GradientBoostingClassifier(learning_rate=0.2, max_depth=5, random_state=0)))
models.append(('gradient_l2_d7', GradientBoostingClassifier(learning_rate=0.2, max_depth=7, random_state=0)))
models.append(('gradient_l3_d3', GradientBoostingClassifier(learning_rate=0.3, max_depth=3, random_state=0)))
models.append(('gradient_l3_d5', GradientBoostingClassifier(learning_rate=0.3, max_depth=5, random_state=0)))
models.append(('gradient_l3_d7', GradientBoostingClassifier(learning_rate=0.3, max_depth=7, random_state=0)))

models.append(('svm_k_rbf', SVC(kernel='rbf')))
models.append(('svm_k_pol',SVC(kernel='poly')))
models.append(('svm_k_sig',SVC(kernel='sigmoid')))
models.append(('svm_k_lin',SVC(kernel='linear')))

# 1. Datos sin outliers

In [31]:
x = df.loc[:, 'chroma_stft_mean':'mfcc20_var']
y  = df['label'].cat.codes.values

In [32]:
scaler = MinMaxScaler()
x_norm = scaler.fit_transform(x)

In [33]:
data = df.iloc[:,2:-1]
data_norm = scaler.fit_transform(data)

In [34]:
def mahalanobis(x=None, data=None, cov=None):

    x_mu = x - np.mean(data)
    #if not cov:
        #cov = np.cov(x.T)
    inv_cov = np.linalg.inv(cov)
    mahalanobis_distances = np.diag(np.dot((x_mu @ inv_cov), x_mu.T))
   
    return mahalanobis_distances

In [35]:
cov_h_norm = np.cov(x_norm.T)

#Mejorar la matriz de covarianzas

cov_i_norm = cov_h_norm + 10*np.eye(57, 57) ## ¿Cómo determinamos lamda?
cond_i_norm = np.linalg.cond(cov_i_norm)
det_i_norm = np.linalg.det(cov_i_norm)

In [36]:
mahalanobis_dis_i_norm = mahalanobis(x=x_norm, data=data_norm, cov=cov_i_norm)

x_ot = x_norm[mahalanobis_dis_i_norm < 0.4]
y_ot = y[mahalanobis_dis_i_norm < 0.4]

In [37]:
cols = df.loc[:, 'chroma_stft_mean':'mfcc20_var'].columns
df_3 = pd.DataFrame(x_ot,columns=cols)
df_3['label'] = y_ot
df_3['label'] = df_3['label'].astype('category')

### Partición train-test

In [38]:
df_results_ot1 = pd.DataFrame(columns=['Modelo','Score_train','Score_test'])

fila_nueva = []
 
x_train, x_test, y_train, y_test = train_test_split(x_ot, y_ot, test_size = 0.2, random_state = 0)

for name, model in models:
    modelo = model.fit(x_train,y_train)
    
    #scores train y test
    score_train = model.score(x_train, y_train)
    score_test = model.score(x_test, y_test)
       
         
    fila_nueva = {'Modelo' : name,'Score_train': score_train,'Score_test': score_test}
    df_results_ot1 = df_results_ot1.append(fila_nueva, ignore_index = True)
    
df_results_ot1 = df_results_ot1.sort_values(['Score_test'], ascending = False)

In [39]:
df_results_ot1

Unnamed: 0,Modelo,Score_train,Score_test
0,knn_mk_manh,0.957592,0.911156
14,gradient_l3_d7,0.99899,0.90106
2,knn_mahalanobis,0.946232,0.891469
11,gradient_l2_d7,0.99899,0.891469
1,knn_mk_eu,0.946485,0.890964
10,gradient_l2_d5,0.99899,0.879354
8,gradient_l1_d7,0.99899,0.879354
13,gradient_l3_d5,0.99899,0.878344
7,gradient_l1_d5,0.99899,0.864715
12,gradient_l3_d3,0.99899,0.855124


### Partición con kfold, cross-validation

In [55]:
df_results_ot2 = pd.DataFrame(columns=['Modelo','Score_train','Score_test']) #,'Accuracy_validation'

fila_nueva = []

#kfold = KFold(n_splits=10, random_state=0, shuffle=True)
   

for name, model in models:
    
    # Cross-validation
    #scoring = 'accuracy'
    kfold = model_selection.KFold(n_splits=10, random_state=0, shuffle=True)
    #cv_results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    #score_v = mean(cv_results)
    
    for train_index, test_index in kfold.split(df_3):
        x_train = df_3.iloc[train_index].loc[:, 'chroma_stft_mean':'mfcc20_var']
        x_test = df_3.iloc[test_index].loc[:,'chroma_stft_mean':'mfcc20_var']
        y_train = df_3.iloc[train_index].loc[:,'label'].cat.codes.values
        y_test = df_3.loc[test_index]['label'].cat.codes.values
    
    
    modelo = model.fit(x_train,y_train)
    
    #scores train y test
    score_train = model.score(x_train, y_train)
    score_test = model.score(x_test, y_test)
    
         
    fila_nueva = {'Modelo' : name,'Score_train': score_train,'Score_test': score_test} #,'Accuracy_validation':score_v
    df_results_ot2 = df_results_ot2.append(fila_nueva, ignore_index = True)
    
df_results_ot2 = df_results_ot2.sort_values(['Score_test'], ascending = (False))

In [56]:
df_results_ot2

Unnamed: 0,Modelo,Score_train,Score_test
0,knn_mk_manh,0.958492,0.928283
2,knn_mahalanobis,0.949405,0.924242
1,knn_mk_eu,0.949742,0.923232
14,gradient_l3_d7,0.999103,0.911111
11,gradient_l2_d7,0.999103,0.90101
8,gradient_l1_d7,0.999103,0.891919
10,gradient_l2_d5,0.999103,0.889899
13,gradient_l3_d5,0.999103,0.887879
7,gradient_l1_d5,0.999103,0.867677
12,gradient_l3_d3,0.998878,0.866667


# 2. Datos sin outliers ni columnas con dependencia lineal

In [45]:
df_4 = df_3.drop(['spectral_centroid_mean','spectral_bandwidth_mean','rolloff_mean'],axis=1)
df_4['label'] = df_4['label'].astype('category')

### Partición train-test split

In [46]:
x_4 = df_4.loc[:, 'chroma_stft_mean':'mfcc20_var']
y_4 = df_4['label'].cat.codes.values

In [49]:
df_results_ot_dl1 = pd.DataFrame(columns=['Modelo','Score_train','Score_test'])

fila_nueva = []
 
x_train, x_test, y_train, y_test = train_test_split(x_4, y_4, test_size = 0.2, random_state = 0)

for name, model in models:
    modelo = model.fit(x_train,y_train)
    
    #scores train y test
    score_train = model.score(x_train, y_train)
    score_test = model.score(x_test, y_test)
       
         
    fila_nueva = {'Modelo' : name,'Score_train': score_train,'Score_test': score_test}
    df_results_ot_dl1 = df_results_ot_dl1.append(fila_nueva, ignore_index = True)
    
df_results_ot_dl1 = df_results_ot_dl1.sort_values(['Score_test'], ascending = False)

In [50]:
df_results_ot_dl1

Unnamed: 0,Modelo,Score_train,Score_test
0,knn_mk_manh,0.959359,0.917718
14,gradient_l3_d7,0.99899,0.890459
1,knn_mk_eu,0.945728,0.888945
11,gradient_l2_d7,0.99899,0.880363
13,gradient_l3_d5,0.99899,0.878849
10,gradient_l2_d5,0.99899,0.878849
8,gradient_l1_d7,0.99899,0.877335
7,gradient_l1_d5,0.99899,0.860676
12,gradient_l3_d3,0.99899,0.848561
9,gradient_l2_d3,0.994573,0.847047


### Partición kfold, cross-validation

In [57]:
df_results_ot_dl2 = pd.DataFrame(columns=['Modelo','Score_train','Score_test']) #,'Accuracy_validation'

fila_nueva = []

#kfold = KFold(n_splits=10, random_state=0, shuffle=True)
   

for name, model in models:
    
    # Cross-validation
    #scoring = 'accuracy'
    kfold = model_selection.KFold(n_splits=10, random_state=0, shuffle=True)
    #cv_results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    #score_v = mean(cv_results)
    
    for train_index, test_index in kfold.split(df_4):
        x_train = df_4.iloc[train_index].loc[:, 'chroma_stft_mean':'mfcc20_var']
        x_test = df_4.iloc[test_index].loc[:,'chroma_stft_mean':'mfcc20_var']
        y_train = df_4.iloc[train_index].loc[:,'label'].cat.codes.values
        y_test = df_4.loc[test_index]['label'].cat.codes.values
    
    
    modelo = model.fit(x_train,y_train)
    
    #scores train y test
    score_train = model.score(x_train, y_train)
    score_test = model.score(x_test, y_test)
    
         
    fila_nueva = {'Modelo' : name,'Score_train': score_train,'Score_test': score_test} #,'Accuracy_validation':score_v
    df_results_ot_dl2 = df_results_ot_dl2.append(fila_nueva, ignore_index = True)
    
df_results_ot_dl2 = df_results_ot_dl2.sort_values(['Score_test'], ascending = (False))

In [58]:
df_results_ot_dl2

Unnamed: 0,Modelo,Score_train,Score_test
0,knn_mk_manh,0.959278,0.937374
1,knn_mk_eu,0.948059,0.914141
14,gradient_l3_d7,0.999103,0.90404
13,gradient_l3_d5,0.999103,0.891919
11,gradient_l2_d7,0.999103,0.891919
8,gradient_l1_d7,0.999103,0.883838
10,gradient_l2_d5,0.999103,0.881818
7,gradient_l1_d5,0.999103,0.871717
12,gradient_l3_d3,0.998878,0.863636
9,gradient_l2_d3,0.993493,0.844444
