In [1]:
# That's an impressive list of imports.
import numpy as np
import pandas as pd
import random

import scipy.io as sio

import sklearn
from sklearn import feature_selection, datasets, model_selection, preprocessing, decomposition, metrics
from sklearn.model_selection import validation_curve, learning_curve, cross_validate, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

import sys

from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, KFold


def getypredttl(y_pred_arr):
    """Concatenate the per-fold prediction sequences into one flat list.

    Parameters
    ----------
    y_pred_arr : sequence of iterables
        One iterable of predictions per cross-validation fold, in fold order.
        Any number of folds is accepted (including zero).

    Returns
    -------
    list
        All predictions, fold after fold, in the order they were given.
    """
    y_pred_ttl = []
    # Iterate over whatever folds are present instead of indexing 0..4, so the
    # helper keeps working if the number of CV splits changes.
    for fold_predictions in y_pred_arr:
        y_pred_ttl.extend(fold_predictions)

    return y_pred_ttl


def obtenerpesos(y_train):
    """Build per-sample weights that counter class imbalance.

    Every sample labelled 1 receives the number of samples labelled 0 as its
    weight; every other sample receives the number of samples labelled 1.
    The rarer class therefore gets the larger weight.

    Parameters
    ----------
    y_train : array-like
        One-dimensional collection of labels; counts are taken for the
        values 0 and 1.

    Returns
    -------
    list of int
        One weight per element of ``y_train``, in the same order. An empty
        input yields an empty list.
    """
    # Convert once so lists and pandas objects are handled positionally,
    # independent of any index they may carry.
    labels = np.asarray(y_train)
    no = int(np.count_nonzero(labels == 0))
    yes = int(np.count_nonzero(labels == 1))

    # Cross-assign the counts: positives are weighted by the number of
    # negatives and vice versa.
    return [no if label == 1 else yes for label in labels]



def calculatemetrics(parameter):
    """Return the mean and the population standard deviation, as percentages.

    Both values are multiplied by 100 and rounded to two decimals. The
    deviation divides by ``len(parameter)`` (no Bessel correction).
    """
    centre = np.mean(parameter)
    squared_offsets = np.power(parameter - centre, 2)
    spread = np.sqrt(np.sum(squared_offsets / len(parameter)))
    return round(centre * 100, 2), round(spread * 100, 2)


def printOutAlgorithm(v_specificity, v_recall, v_f1score, v_precision, v_accuracy, v_accuracy_test, v_accuracy_train, auc):
    """Print a summary (mean +- deviation, in percent) of the collected metrics.

    Each argument is a numeric array with one value per run; every one is
    summarised with ``calculatemetrics``. Two outputs are produced: a
    human-readable listing and a single row formatted for a LaTeX table
    (accuracy, specificity, sensitivity, F1 and AUC).

    Parameters
    ----------
    v_specificity, v_recall, v_f1score, v_accuracy : array-like
        Per-run metric values.
    v_precision : array-like
        Accepted for interface compatibility; it is not part of the printed
        report.
    v_accuracy_test, v_accuracy_train : array-like
        Per-run accuracies reported as "test final" and "train final".
    auc : array-like
        Per-run AUC values. NOTE: inside this function the name hides the
        ``auc`` function imported from ``sklearn.metrics``.

    Returns
    -------
    None
    """
    mean_specificity, deviation_specificity = calculatemetrics(v_specificity)
    mean_recall, deviation_recall = calculatemetrics(v_recall)
    mean_f1, deviation_f1 = calculatemetrics(v_f1score)
    mean_accuracy, deviation_accuracy = calculatemetrics(v_accuracy)
    mean_auc, deviation_auc = calculatemetrics(auc)
    # Same mean/deviation formula as the other metrics, so reuse the helper
    # instead of repeating the arithmetic inline.
    mean_test, deviation_test = calculatemetrics(v_accuracy_test)
    mean_train, deviation_train = calculatemetrics(v_accuracy_train)

    print("\n \nAccuracy en test final: ", mean_test, "+-", deviation_test)
    print("Accuracy en train final: ", mean_train, "+-", deviation_train)

    print("Accuracy: ", mean_accuracy, "+-", deviation_accuracy)
    print("Specificity: ", mean_specificity, "+-", deviation_specificity)
    print("Sensitivity: ", mean_recall,  "+-", deviation_recall)
    print("AUC: ", mean_auc, "+-", deviation_auc)

    print("\n")
    # Raw strings keep the backslash of the LaTeX macro without relying on an
    # invalid escape sequence; the emitted text is unchanged.
    print(' & ', mean_accuracy, r' $\pm$ ', deviation_accuracy, ' & ', mean_specificity, r'$\pm$', deviation_specificity, ' & ', mean_recall,  r' $\pm$ ', deviation_recall, ' & ',  mean_f1, r' $\pm$ ', deviation_f1, ' & ', mean_auc, r' $\pm$ ', deviation_auc)


def calculateconfusionmatrix(y_pred, y_train, y_test, v_specificity, v_recall, v_f1score, v_precision, v_accuracy, v_accuracy_test, v_accuracy_train, indice, y_pred_Xtrain):
    """Compute binary classification metrics for one run and append them.

    Parameters
    ----------
    y_pred : array-like
        Hard (0/1) predictions for the test set.
    y_train, y_test : array-like
        True labels of the train and test sets.
    v_specificity, v_recall, v_f1score, v_precision, v_accuracy,
    v_accuracy_test, v_accuracy_train : list
        Accumulators; each one is appended to in place and also returned.
    indice : int
        Kept for interface compatibility. The F1 score is computed from the
        recall and precision of this very call, so the value is not needed.
    y_pred_Xtrain : array-like
        Hard predictions for the training set (used for train accuracy).

    Returns
    -------
    tuple of list
        ``(v_specificity, v_recall, v_f1score, v_precision, v_accuracy,
        v_accuracy_test, v_accuracy_train)`` after appending.

    Notes
    -----
    Ratios whose denominator is zero (e.g. precision when nothing is predicted
    positive) are not guarded here; with numpy integer counts they evaluate to
    nan and numpy emits a RuntimeWarning.
    """
    accuracy_test = sklearn.metrics.accuracy_score(y_test, y_pred)

    # Fix the label set so the matrix is always 2x2; without it a fold that
    # contains a single class yields a 1x1 matrix and the unpacking fails.
    # Assumes labels are encoded as 0/1 - confirm against the data.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    # Use this call's own recall/precision rather than looking them up by
    # position in the accumulators.
    f1score = (2 * recall * precision) / (recall + precision)

    v_specificity.append(specificity)
    v_precision.append(precision)
    v_recall.append(recall)
    v_f1score.append(f1score)
    v_accuracy.append((tp + tn) / (tp + fn + fp + tn))

    v_accuracy_test.append(accuracy_test)

    accuracy_train = sklearn.metrics.accuracy_score(y_train, y_pred_Xtrain)
    v_accuracy_train.append(accuracy_train)

    return v_specificity, v_recall, v_f1score, v_precision, v_accuracy, v_accuracy_test, v_accuracy_train




## App1. WxTxF --> Wx(T*F): flatten the last two axes of each tensor into a single feature axis

In [6]:
# App1 experiment: L1-regularised logistic regression on tensors whose last two
# axes are flattened into one feature axis. C is chosen by 5-fold CV on the
# training split (mean ROC AUC); the selected model is then refit on the whole
# training split and evaluated on the test split with a 0.5 threshold.
folders = ["s1"]
# One column per folder. Rows j*8 .. j*8+7 receive the eight metrics of the
# j-th evaluation mode; only mode 0 ('umbral: 0.5') is filled in this cell.
matrix_all_values = np.zeros((16, len(folders)))

C = [1e-4, 1e-2, 1, 3, 5, 7, 9]
idx_exp = 1
# Fixed seed: every candidate C is scored on the same folds and a re-run
# reproduces the same selection.
SEED = 42

for i in range(len(folders)):

    split_dir = "../Data/splits/App" + str(idx_exp) + "/" + folders[i]

    X_train = np.load(split_dir + "/X_train_tensor.npy")
    print(X_train.shape)
    y_train = np.load(split_dir + "/y_train_tensor.npy")
    # Collapse axes 1 and 2 into a single feature axis and replace NaNs by 0.
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1]*X_train.shape[2])
    X_train = np.nan_to_num(X_train, nan=0)

    X_test = np.load(split_dir + "/X_test_tensor.npy")
    y_test = np.load(split_dir + "/y_test_tensor.npy")
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1]*X_test.shape[2])
    X_test = np.nan_to_num(X_test, nan=0)

    # NOTE(review): labels are used exactly as loaded; they are presumably
    # already binary 0/1 - confirm against the files.
    y_train_arr = np.array(y_train)

    bestHyperparameters = {'C': 0}
    bestMetricDev = 0

    for c_value in C:

        clf = LogisticRegression(solver='liblinear', C=c_value, penalty='l1',
                                 max_iter=1000, random_state=SEED)

        fold_aucs = []
        y_pred_arr = []

        kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
        for train_index, val_index in kf.split(X_train):

            X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
            y_train_cv, y_val_cv = y_train_arr[train_index], y_train_arr[val_index]

            pesos = obtenerpesos(y_train_cv)
            clf = clf.fit(np.array(X_train_cv), np.array(y_train_cv), sample_weight=pesos)
            y_val_proba = clf.predict_proba(X_val_cv)[:, 1]

            # Local names deliberately differ from `auc` / `roc_auc_score` so
            # the imported sklearn functions are not hidden after this cell.
            fold_auc = sklearn.metrics.roc_auc_score(y_val_cv, y_val_proba)
            fold_aucs.append(fold_auc)

            y_pred_arr.append(y_val_proba)

        y_pred_ttl = getypredttl(y_pred_arr)

        mean_fold_auc = np.mean(fold_aucs)
        if mean_fold_auc > bestMetricDev:
            print("\tCambio the best roc auc score ", bestMetricDev, " por: ", mean_fold_auc)
            bestMetricDev = mean_fold_auc
            bestHyperparameters['C'] = c_value
            bestHyperparameters['y_pred_val'] = y_pred_ttl

    print("Best roc auc score: ", bestMetricDev)
    print("C: ", bestHyperparameters["C"])

    # Refit on the full training split with the selected C.
    clf = LogisticRegression(solver='liblinear', C=bestHyperparameters['C'], penalty='l1',
                             max_iter=1000, random_state=SEED)
    pesos = obtenerpesos(y_train)
    clf = clf.fit(np.array(X_train), np.array(y_train), sample_weight=pesos)

    y_pred_test = clf.predict_proba(X_test)[:, 1]
    y_pred_Xtrain = clf.predict(X_train)

    selecCalculateMetrics_aux = ['umbral: 0.5']

    for j in range(len(selecCalculateMetrics_aux)):

        v_accuracy_test = []
        v_accuracy_train = []
        v_specificity = []
        v_sensitivity = []
        v_precision = []
        v_recall = []
        v_f1score = []
        v_accuracy = []
        auc_score = []

        if selecCalculateMetrics_aux[j] == 'umbral: 0.5':
            # AUC uses the probabilities; the remaining metrics use the
            # predictions binarised at 0.5.
            auc_score.append(sklearn.metrics.roc_auc_score(y_test, y_pred_test))
            y_pred = (y_pred_test > 0.5).astype('int')

        v_specificity, v_recall, v_f1score, v_precision, v_accuracy, v_accuracy_test, v_accuracy_train = calculateconfusionmatrix(
            y_pred, y_train, y_test, v_specificity, v_recall,
            v_f1score, v_precision, v_accuracy, v_accuracy_test,
            v_accuracy_train, 0, y_pred_Xtrain)

        matrix_all_values[j*8:j*8 + 8, i] = v_specificity[0], v_recall[0], v_f1score[0], \
            v_precision[0], v_accuracy[0], v_accuracy_test[0], v_accuracy_train[0], auc_score[0]

print()
print("====> Threshold 0.5")
print()
printOutAlgorithm(matrix_all_values[0,:], matrix_all_values[1,:], matrix_all_values[2,:], matrix_all_values[3,:], matrix_all_values[4,:], matrix_all_values[5,:], matrix_all_values[6,:], matrix_all_values[7,:])

(18446, 6, 15)
	Cambio the best roc auc score  0  por:  0.7247084689129595


KeyboardInterrupt: 

In [11]:
# Inspect the App2 test labels. They are loaded into their own name so the
# `y_test` used by the App1 experiment is not overwritten, and the folder is
# taken as the last entry of `folders` instead of a loop index left over from
# another cell.
y_test_app2 = np.load("../Data/splits/App" +str(2)+ "/"  + folders[-1] + "/y_test_tensor.npy")
y_test_app2

array([[1.41585000e+06, 1.41585000e+06, 1.41585000e+06, ...,
        1.41585000e+06, 1.41585000e+06, 1.41585000e+06],
       [1.41585100e+06, 1.41585100e+06, 1.41585100e+06, ...,
        1.41585100e+06, 1.41585100e+06, 1.41585100e+06],
       [1.41585200e+06, 1.41585200e+06, 1.41585200e+06, ...,
        1.41585200e+06, 1.41585200e+06, 1.41585200e+06],
       ...,
       [3.24741725e+08, 3.24741725e+08, 3.24741725e+08, ...,
        3.24741725e+08, 3.24741725e+08, 3.24741725e+08],
       [3.24741726e+08, 3.24741726e+08, 3.24741726e+08, ...,
        3.24741726e+08, 3.24741726e+08, 3.24741726e+08],
       [3.24741727e+08, 3.24741727e+08, 3.24741727e+08, ...,
        3.24741727e+08, 3.24741727e+08, 3.24741727e+08]])