## Creación de datasets de escenarios

In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
data_path = '../data/features_3_sec.csv'
df = pd.read_csv(data_path)
df['label'] = df['label'].astype('category')
df['length'] = pd.to_numeric(df['length'])

In [5]:
# 1. DF original
df_x = df.drop(['filename','label','length'],axis=1)
X = np.array(df_x) 
y  = df['label'].cat.codes.values


In [6]:
# 2. DF columnas filtradas

df_x_cf = df_x.drop(['spectral_centroid_mean','spectral_bandwidth_mean','rolloff_mean'],axis=1)
X_cf = np.array(df_x_cf) 
y_cf = y

In [8]:
def mahalanobis(x=None, data=None, cov=None):

    x_mu = x - np.mean(data)
    #if not cov:
        #cov = np.cov(x.T)
    inv_cov = np.linalg.inv(cov)
    mahalanobis_distances = np.diag(np.dot((x_mu @ inv_cov), x_mu.T))
   
    return mahalanobis_distances

In [10]:
# 3. DF filtrado de outliers

scaler = MinMaxScaler()
x_norm = scaler.fit_transform(X)
data = df.iloc[:,2:-1]
data_norm = scaler.fit_transform(data)
cov_h_norm = np.cov(x_norm.T)
cov_i_norm = cov_h_norm + 10*np.eye(57, 57) ## ¿Cómo determinamos lamda?
cond_i_norm = np.linalg.cond(cov_i_norm)
det_i_norm = np.linalg.det(cov_i_norm)
mahalanobis_dis_i_norm = mahalanobis(x=x_norm, data=data_norm, cov=cov_i_norm)
mahalanobis_dis_i_norm
x_fn = x_norm[mahalanobis_dis_i_norm < 0.4]
y_fn = y[mahalanobis_dis_i_norm < 0.4]


In [12]:
# 4. filtrado de outliers + filtrado columnas

cols = df_x.columns
df_x_fn = pd.DataFrame(x_fn,columns=cols)
df_x_fn_cf = df_x_fn.drop(['spectral_centroid_mean','spectral_bandwidth_mean','rolloff_mean'],axis=1)
X_fn_cf = np.array(df_x_fn_cf)
y_fn_cf = y_fn

In [14]:
# train test split de escenarios
x_train, x_test, y_train, y_test = train_test_split(X,y)
x_train_cf, x_test_cf, y_train_cf, y_test_cf = train_test_split(X_cf,y_cf)
x_train_fn, x_test_fn, y_train_fn, y_test_fn = train_test_split(x_fn,y_fn)
x_train_fn_cf, x_test_fn_cf, y_train_fn_cf, y_test_fn_cf = train_test_split(X_fn_cf,y_fn_cf)

## Exploración de modelos

* CART
* Naive Bayes
* KNN
* Stochastic Gradient Descent
* Random Forest
* SVM
* Logistic Regression

### CART

In [16]:
from sklearn.tree import DecisionTreeClassifier
# Original
tree = DecisionTreeClassifier().fit(x_train,y_train)
tree_train = tree.score(x_train,y_train)
tree_test = tree.score(x_test,y_test)
print('Score de entrenamiento: ')
print(tree_train)
print('Score de Testeo: ')
print(tree_test)




Score de entrenamiento: 
0.9993326214628937
Score de Testeo: 
0.6505204163330665


In [17]:
#Columnas filtradas
tree = DecisionTreeClassifier().fit(x_train_cf,y_train_cf)
tree_train_cf = tree.score(x_train_cf,y_train_cf)
tree_test_cf = tree.score(x_test_cf,y_test_cf)
print('Score de entrenamiento: ')
print(tree_train_cf)
print('Score de Testeo: ')
print(tree_test_cf)

Score de entrenamiento: 
0.9991991457554725
Score de Testeo: 
0.6585268214571657


In [18]:
#filtrado normalizado
tree = DecisionTreeClassifier().fit(x_train_fn,y_train_fn)
tree_train_fn = tree.score(x_train_fn,y_train_fn)
tree_test_fn = tree.score(x_test_fn,y_test_fn)
print('Score de entrenamiento: ')
print(tree_train_fn)
print('Score de Testeo: ')
print(tree_test_fn)

Score de entrenamiento: 
0.999057619816909
Score de Testeo: 
0.6591276252019386


In [19]:
#filtrado normalizado + columnas filtradas
tree = DecisionTreeClassifier().fit(x_train_fn_cf,y_train_fn_cf)
tree_train_fn_cf = tree.score(x_train_fn_cf,y_train_fn_cf)
tree_test_fn_cf = tree.score(x_test_fn_cf,y_test_fn_cf)
print('Score de entrenamiento: ')
print(tree_train_fn_cf)
print('Score de Testeo: ')
print(tree_test_fn_cf)

Score de entrenamiento: 
0.9991922455573505
Score de Testeo: 
0.6425686591276252


### Naive Bayes

In [20]:
from sklearn.naive_bayes import GaussianNB
# Original
gnb = GaussianNB().fit(x_train,y_train)
gnb_train = gnb.score(x_train,y_train)
gnb_test = gnb.score(x_test,y_test)
print('Score de entrenamiento: ')
print(gnb_train)
print('Score de Testeo: ')
print(gnb_test)


Score de entrenamiento: 
0.4223171382808329
Score de Testeo: 
0.42954363490792635


In [21]:
# columnas filtradas
gnb = GaussianNB().fit(x_train_cf,y_train_cf)
gnb_train_cf = gnb.score(x_train_cf,y_train_cf)
gnb_test_cf = gnb.score(x_test_cf,y_test_cf)
print('Score de entrenamiento: ')
print(gnb_train_cf)
print('Score de Testeo: ')
print(gnb_test_cf)

Score de entrenamiento: 
0.38587827015483184
Score de Testeo: 
0.3710968775020016


In [22]:
# filtrado normalizado
gnb = GaussianNB().fit(x_train_fn,y_train_fn)
gnb_train_fn = gnb.score(x_train_fn,y_train_fn)
gnb_test_fn = gnb.score(x_test_fn,y_test_fn)
print('Score de entrenamiento: ')
print(gnb_train_fn)
print('Score de Testeo: ')
print(gnb_test_fn)

Score de entrenamiento: 
0.5238287560581584
Score de Testeo: 
0.5266558966074314


In [23]:
# filtrado normalizado + columnas filtradas
gnb = GaussianNB().fit(x_train_fn_cf,y_train_fn_cf)
gnb_train_fn_cf = gnb.score(x_train_fn_cf,y_train_fn_cf)
gnb_test_fn_cf = gnb.score(x_test_fn_cf,y_test_fn_cf)
print('Score de entrenamiento: ')
print(gnb_train_fn_cf)
print('Score de Testeo: ')
print(gnb_test_fn_cf)

Score de entrenamiento: 
0.531502423263328
Score de Testeo: 
0.5359450726978998


### KNN

In [42]:
from sklearn.neighbors import KNeighborsClassifier
# Original
knn = KNeighborsClassifier(100).fit(x_train,y_train) # definir k
knn_train = knn.score(x_train,y_train)
knn_test = knn.score(x_test,y_test)
print('Score de entrenamiento: ')
print(knn_train)
print('Score de Testeo: ')
print(knn_test)

Score de entrenamiento: 
0.32100907634810466
Score de Testeo: 
0.30064051240992795


In [43]:
# columnas filtradas
knn = KNeighborsClassifier(100).fit(x_train_cf,y_train_cf) # definir k
knn_train_cf = knn.score(x_train_cf,y_train_cf)
knn_test_cf = knn.score(x_test_cf,y_test_cf)
print('Score de entrenamiento: ')
print(knn_train_cf)
print('Score de Testeo: ')
print(knn_test_cf)

Score de entrenamiento: 
0.32020822210357713
Score de Testeo: 
0.30384307445956765


In [48]:
# filtrado normalizado
knn = KNeighborsClassifier().fit(x_train_fn,y_train_fn) # definir k
knn_train_fn = knn.score(x_train_fn,y_train_fn)
knn_test_fn = knn.score(x_test_fn,y_test_fn)
print('Score de entrenamiento: ')
print(knn_train_fn)
print('Score de Testeo: ')
print(knn_test_fn)

Score de entrenamiento: 
0.9446688206785138
Score de Testeo: 
0.9115508885298869


In [57]:
# filtrado normalizado + columnas filtradas
knn = KNeighborsClassifier().fit(x_train_fn_cf,y_train_fn_cf) # definir k
knn_train_fn_cf = knn.score(x_train_fn_cf,y_train_fn_cf)
knn_test_fn_cf = knn.score(x_test_fn_cf,y_test_fn_cf)
print('Score de entrenamiento: ')
print(knn_train_fn_cf)
print('Score de Testeo: ')
print(knn_test_fn_cf)

Score de entrenamiento: 
0.9437264404954228
Score de Testeo: 
0.8852988691437803


### Stochastic Gradient Descent

In [58]:
from sklearn.linear_model import SGDClassifier
# Original
SGD = SGDClassifier().fit(x_train,y_train) # Muchos hiperparamétros por definir..
SGD_train = SGD.score(x_train,y_train)
SGD_test = SGD.score(x_test,y_test)
print('Score de entrenamiento: ')
print(SGD_train)
print('Score de Testeo: ')
print(SGD_test)

Score de entrenamiento: 
0.1528296849973305
Score de Testeo: 
0.15812650120096078


In [60]:
# columnas filtradas
SGD = SGDClassifier().fit(x_train_cf,y_train_cf) # Muchos hiperparamétros por definir..
SGD_train_cf = SGD.score(x_train_cf,y_train_cf)
SGD_test_cf = SGD.score(x_test_cf,y_test_cf)
print('Score de entrenamiento: ')
print(SGD_train_cf)
print('Score de Testeo: ')
print(SGD_test_cf)

Score de entrenamiento: 
0.19927923117992524
Score de Testeo: 
0.19855884707766214


In [61]:
# filtrado normalizado
SGD = SGDClassifier().fit(x_train_fn,y_train_fn) # Muchos hiperparamétros por definir..
SGD_train_fn = SGD.score(x_train_fn,y_train_fn)
SGD_test_fn = SGD.score(x_test_fn,y_test_fn)
print('Score de entrenamiento: ')
print(SGD_train_fn)
print('Score de Testeo: ')
print(SGD_test_fn)

Score de entrenamiento: 
0.6762250942380184
Score de Testeo: 
0.6554927302100162


In [62]:
# filtrado normalizado + columnas filtradas
SGD = SGDClassifier().fit(x_train_fn_cf,y_train_fn_cf) # Muchos hiperparamétros por definir..
SGD_train_fn_cf = SGD.score(x_train_fn_cf,y_train_fn_cf)
SGD_test_fn_cf = SGD.score(x_test_fn_cf,y_test_fn_cf)
print('Score de entrenamiento: ')
print(SGD_train_fn_cf)
print('Score de Testeo: ')
print(SGD_test_fn_cf)

Score de entrenamiento: 
0.6755519655358104
Score de Testeo: 
0.6647819063004846


### Random Forest

In [63]:
from sklearn.ensemble import RandomForestClassifier
# Original
rfc = RandomForestClassifier().fit(x_train,y_train) 
rfc_train = rfc.score(x_train,y_train)
rfc_test = rfc.score(x_test,y_test)
print('Score de entrenamiento: ')
print(rfc_train)
print('Score de Testeo: ')
print(rfc_test)

Score de entrenamiento: 
0.9993326214628937
Score de Testeo: 
0.8702962369895917


In [64]:
# columnas filtradas
rfc = RandomForestClassifier().fit(x_train_cf,y_train_cf) 
rfc_train_cf = rfc.score(x_train_cf,y_train_cf)
rfc_test_cf = rfc.score(x_test_cf,y_test_cf)
print('Score de entrenamiento: ')
print(rfc_train_cf)
print('Score de Testeo: ')
print(rfc_test_cf)

Score de entrenamiento: 
0.9991991457554725
Score de Testeo: 
0.8586869495596477


In [65]:
# filtrado normalizado
rfc = RandomForestClassifier().fit(x_train_fn,y_train_fn) 
rfc_train_fn = rfc.score(x_train_fn,y_train_fn)
rfc_test_fn = rfc.score(x_test_fn,y_test_fn)
print('Score de entrenamiento: ')
print(rfc_train_fn)
print('Score de Testeo: ')
print(rfc_test_fn)

Score de entrenamiento: 
0.999057619816909
Score de Testeo: 
0.8695476575121163


In [67]:
# filtrado normalizado + columnas filtradas
rfc = RandomForestClassifier().fit(x_train_fn_cf,y_train_fn_cf) 
rfc_train_fn_cf = rfc.score(x_train_fn_cf,y_train_fn_cf)
rfc_test_fn_cf = rfc.score(x_test_fn_cf,y_test_fn_cf)
print('Score de entrenamiento: ')
print(rfc_train_fn_cf)
print('Score de Testeo: ')
print(rfc_test_fn_cf)

Score de entrenamiento: 
0.9991922455573505
Score de Testeo: 
0.8655088852988692


### Support Vector Machines

In [68]:
from sklearn import svm
# Original
svmc = svm.SVC().fit(x_train,y_train) 
svmc_train = svmc.score(x_train,y_train)
svmc_test = svmc.score(x_test,y_test)
print('Score de entrenamiento: ')
print(svmc_train)
print('Score de Testeo: ')
print(svmc_test)

Score de entrenamiento: 
0.28750667378537104
Score de Testeo: 
0.289031224979984


In [69]:
# columnas filtradas
svmc = svm.SVC().fit(x_train_cf,y_train_cf) 
svmc_train_cf = svmc.score(x_train_cf,y_train_cf)
svmc_test_cf = svmc.score(x_test_cf,y_test_cf)
print('Score de entrenamiento: ')
print(svmc_train_cf)
print('Score de Testeo: ')
print(svmc_test_cf)

Score de entrenamiento: 
0.2920448478376935
Score de Testeo: 
0.2838270616493195


In [70]:
# filtrado normalizado
svmc = svm.SVC().fit(x_train_fn,y_train_fn) 
svmc_train_fn = svmc.score(x_train_fn,y_train_fn)
svmc_test_fn = svmc.score(x_test_fn,y_test_fn)
print('Score de entrenamiento: ')
print(svmc_train_fn)
print('Score de Testeo: ')
print(svmc_test_fn)

Score de entrenamiento: 
0.7700592353257943
Score de Testeo: 
0.7681744749596123


In [71]:
# filtrado normalizado + columnas filtradas
svmc = svm.SVC().fit(x_train_fn_cf,y_train_fn_cf) 
svmc_train_fn_cf = svmc.score(x_train_fn_cf,y_train_fn_cf)
svmc_test_fn_cf = svmc.score(x_test_fn_cf,y_test_fn_cf)
print('Score de entrenamiento: ')
print(svmc_train_fn_cf)
print('Score de Testeo: ')
print(svmc_test_fn_cf)

Score de entrenamiento: 
0.7724824986537426
Score de Testeo: 
0.7576736672051696


### Logistic Regression


In [72]:
from sklearn.linear_model import LogisticRegression
# Original
lr = LogisticRegression(random_state=0).fit(x_train,y_train) 
lr_train = lr.score(x_train,y_train)
lr_test = lr.score(x_test,y_test)
print('Score de entrenamiento: ')
print(lr_train)
print('Score de Testeo: ')
print(lr_test)

Score de entrenamiento: 
0.21489588894821143
Score de Testeo: 
0.22417934347477983


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [73]:
# columnas filtradas
lr = LogisticRegression(random_state=0).fit(x_train_cf,y_train_cf) 
lr_train_cf = lr.score(x_train_cf,y_train_cf)
lr_test_cf = lr.score(x_test_cf,y_test_cf)
print('Score de entrenamiento: ')
print(lr_train_cf)
print('Score de Testeo: ')
print(lr_test_cf)

Score de entrenamiento: 
0.2207688200747464
Score de Testeo: 
0.21297037630104082


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [74]:
# filtrado normalizado
lr = LogisticRegression(random_state=0).fit(x_train_fn,y_train_fn) 
lr_train_fn = lr.score(x_train_fn,y_train_fn)
lr_test_fn = lr.score(x_test_fn,y_test_fn)
print('Score de entrenamiento: ')
print(lr_train_fn)
print('Score de Testeo: ')
print(lr_test_fn)

Score de entrenamiento: 
0.6980344641895531
Score de Testeo: 
0.6906300484652665


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [75]:
# filtrado normalizado + columnas filtradas
lr = LogisticRegression(random_state=0).fit(x_train_fn_cf,y_train_fn_cf) 
lr_train_fn_cf = lr.score(x_train_fn_cf,y_train_fn_cf)
lr_test_fn_cf = lr.score(x_test_fn_cf,y_test_fn_cf)
print('Score de entrenamiento: ')
print(lr_train_fn_cf)
print('Score de Testeo: ')
print(lr_test_fn_cf)

Score de entrenamiento: 
0.6945341949380721
Score de Testeo: 
0.6821486268174475


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
