In [None]:
import pandas as pd
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Load the Spotify attributes dataset and discard the columns that are not
# used afterwards (the CSV's index column and the two free-text columns).
attributes_spotify = pd.read_csv(
    "https://raw.githubusercontent.com/emmanueliarussi/DataScienceCapstone/master/3_MidtermProjects/ProjectBOM/data/attributes_spotify.csv"
).drop(columns=["Unnamed: 0", "song_title", "artist"])

# Models to compare. `names` and `models` are parallel lists: keep the same
# entries enabled, in the same order, in both of them.
names = [
    "KNN",
    # "SVM",
    # "Random Forest",
    # "Perceptrones",
]

models = [
    KNeighborsClassifier(n_neighbors=5),
    # SVC(kernel="linear", C=0.025),
    # RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    # MLPClassifier(max_iter=100),
]

# Results table: one column with the model names plus one (initially empty)
# list per evaluation metric, filled in as each model is evaluated.
data = {
    "Modelo": names,
    **{
        metric: []
        for metric in (
            "TPR",
            "FNR",
            "TNR",
            "FPR",
            "F1",
            "Precisión",
            "Exactitud",
            "Sesgo Ratio",
        )
    },
}

# Proportions for the train / validation / test partitions
train_ratio = 0.60
validation_ratio = 0.20
test_ratio = 0.20

# Separate the predictors from the label before splitting. The label column
# must not be part of the feature matrix, otherwise every model can read the
# answer directly from its inputs and the reported metrics are meaningless.
features = attributes_spotify.drop(columns="target")
labels = attributes_spotify["target"]

# First split: training partition vs. a holdout containing everything else
x_train, x_holdout, y_train, y_holdout = train_test_split(
    features, labels, test_size=1 - train_ratio)

# Second split: divide the holdout into validation and test partitions,
# rescaling the test fraction so it is relative to the holdout size
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout,
    test_size=test_ratio / (test_ratio + validation_ratio))

# Train and evaluate each model
for name, model in zip(names, models):
    # Fit on the training partition; the test partition is used only for
    # scoring, so the metrics below describe held-out performance.
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)

    # Evaluation measures: per-class recall gives the true-positive and
    # true-negative rates, and the false rates are their complements.
    tpr = recall_score(y_test, predicted, pos_label=1)
    tnr = recall_score(y_test, predicted, pos_label=0)
    fpr = 1 - tnr
    fnr = 1 - tpr

    # NOTE(review): tpr + fnr and tnr + fpr both equal 1 by construction, so
    # the first term is always 1, and the second term divides by zero when
    # tnr + fnr == 0 -- confirm this is the intended formula.
    sesgo_ratio = (tpr+fnr)/(tnr+fpr) - (tpr+fpr)/(tnr+fnr)

    # Collect the results for display
    data["TPR"].append(tpr)
    data["TNR"].append(tnr)
    data["FPR"].append(fpr)
    data["FNR"].append(fnr)
    data["F1"].append(f1_score(y_test, predicted))
    data["Precisión"].append(precision_score(
        y_test, predicted, zero_division=1))
    data["Exactitud"].append(accuracy_score(y_test, predicted))
    data["Sesgo Ratio"].append(sesgo_ratio)

print(pd.DataFrame(data))


          Modelo       TPR       FPR        F1  Precisión  Exactitud
0            KNN  0.504808  0.586735  0.532995   0.564516   0.544554
1            SVM  0.764423  0.316327  0.634731   0.542662   0.547030
2  Random Forest  0.971154  0.974490  0.973494   0.975845   0.972772
3   Perceptrones  1.000000  0.000000  0.679739   0.514851   0.514851
