In [8]:
# That's an impressive list of imports.
import numpy as np
import pandas as pd
import random

import scipy.io as sio

import sklearn
from sklearn import feature_selection, datasets, model_selection, preprocessing, decomposition, metrics
from sklearn.model_selection import validation_curve, learning_curve, cross_validate, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

import sys

from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, KFold


def getypredttl(y_pred_arr):
    y_pred_ttl = list(y_pred_arr[0])
    y_pred_ttl.extend(y_pred_arr[1])
    y_pred_ttl.extend(y_pred_arr[2])
    y_pred_ttl.extend(y_pred_arr[3])
    y_pred_ttl.extend(y_pred_arr[4])
    
    return y_pred_ttl


# def Find_Optimal_Cutoff(target, predicted):
#     """ Find the optimal probability cutoff point for a classification model related to event rate
#     Parameters
#     ----------
#     target : Matrix with dependent or target data, where rows are observations

#     predicted : Matrix with predicted data, where rows are observations

#     Returns
#     -------     
#     list type, with optimal cutoff value

#     """
#     fpr, tpr, threshold = roc_curve(target, predicted)
#     i = np.arange(len(tpr)) 
#     roc = pd.DataFrame({'tf' : pd.Series(tpr-(1-fpr), index=i), 'threshold' : pd.Series(threshold, index=i)})
# #     roc_t = roc.loc[(roc.tf-0).abs().argsort()[:1]]
#     roc_t = roc.loc[(roc.tf-0).abs().idxmin()]

# #     return list(roc_t['threshold']) 
#     return roc_t['threshold']


def calculatemetrics(parameter):
    mean = round(np.mean(parameter)*100, 2)
    deviation = round(np.sqrt(np.sum(np.power(parameter - np.mean(parameter), 2) / len(parameter)))*100, 2)
    return mean, deviation


def printOutAlgorithm(v_specificity, v_recall, v_f1score, v_precision, v_accuracy, v_accuracy_test, v_accuracy_train, auc):
    mean_specificity, deviation_specificity = calculatemetrics(v_specificity)
    mean_recall, deviation_recall = calculatemetrics(v_recall)
    #print("v_f1score: ", v_f1score)
    mean_f1, deviation_f1 = calculatemetrics(v_f1score)
    mean_precision, deviation_precision = calculatemetrics(v_precision)
    mean_accuracy, deviation_accuracy = calculatemetrics(v_accuracy)
    mean_auc, deviation_auc = calculatemetrics(auc)

    deviation_test = round(np.sqrt(np.sum(np.power(v_accuracy_test - np.mean(v_accuracy_test), 2) / len(v_accuracy_test)))*100, 2)
    deviation_train = round(np.sqrt(np.sum(np.power(v_accuracy_train - np.mean(v_accuracy_train), 2) / len(v_accuracy_train)))*100, 2)

    print("\n \nAccuracy en test final: ", round(np.mean(v_accuracy_test)*100,2), "+-", deviation_test)
    print("Accuracy en train final: ", round(np.mean(v_accuracy_train)*100,2), "+-", deviation_train)
    
    print("Accuracy: ", mean_accuracy, "+-", deviation_accuracy)
    print("Precision: ", mean_precision, "+-", deviation_precision)
    print("Specificity: ", mean_specificity, "+-", deviation_specificity)
    print("Sensitivity: ", mean_recall,  "+-", deviation_recall)
    print("F1-Score: ", mean_f1, "+-", deviation_f1)
    print("AUC: ", mean_auc, "+-", deviation_auc)

    print("\n")    
    print(' & ', mean_accuracy, ' $\pm$ ', deviation_accuracy, ' & ', mean_specificity, '$\pm$', deviation_specificity, ' & ', mean_recall,  ' $\pm$ ', deviation_recall, ' & ',  mean_f1, ' $\pm$ ', deviation_f1, ' & ', mean_auc, ' $\pm$ ', deviation_auc)


def calculateconfusionmatrix(y_pred, y_train, y_test, v_specificity, v_recall, v_f1score, v_precision, v_accuracy, v_accuracy_test, v_accuracy_train, indice, y_pred_Xtrain):
    
    accuracy_test = sklearn.metrics.accuracy_score(y_test, y_pred)
    
    #Construcción de métricas
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()    
    #print("Confusion matrix: ")
    #print("tn:" + str(tn) + " fp:" + str(fp) + " fn:" +  str(fn) + " tp:" + str(tp))
    v_specificity.append(tn / (tn + fp))
    v_precision.append(tp / (tp + fp))
    v_recall.append(tp / (tp + fn))
    v_f1score.append((2 * v_recall[indice] * v_precision[indice]) / (v_recall[indice] + v_precision[indice]))
    v_accuracy.append((tp + tn) / (tp + fn + fp + tn))

    v_accuracy_test.append(accuracy_test)
    
    accuracy_train = sklearn.metrics.accuracy_score(y_train, y_pred_Xtrain)
    v_accuracy_train.append(accuracy_train)
    
    return v_specificity, v_recall, v_f1score, v_precision, v_accuracy, v_accuracy_test, v_accuracy_train


folders = ["s1", "s2", "s3"] 
matrix_all_values = np.zeros((16, len(folders)))

C = [.0000001, .000001, .00001, .0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, .75, 1, 3, 5, 8, 10 ,12, 15]

for i in range(len(folders)):

    X_train = np.load("../0_Data/splits/" + folders[i] + "/X_train_tensor.npy")
    y_train = np.load("../0_Data/splits/" + folders[i] + "/y_train_tensor.npy")
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_train = np.nan_to_num(X_train, nan=0)
    y_train = y_train.reshape(-1)

    X_test = np.load("../0_Data/splits/" + folders[i] + "/X_test_tensor.npy")
    y_test = np.load("../0_Data/splits/" + folders[i] + "/y_test_tensor.npy")
    X_test = X_test.reshape(X_test.shape[0], -1)
    X_test = np.nan_to_num(X_test, nan=0)
    y_test = y_test.reshape(-1)

    bestHyperparameters = {'C': 0}

    bestMetricDev = 0
    for c in range(len(C)):

        clf = LogisticRegression(solver='liblinear', C=C[c], penalty='l1', n_jobs=24)

        roc_auc_score = []
        threshold_1 = []
        threshold_2 = 0
        y_pred_arr = []
        
        kf = KFold(n_splits=5, shuffle=False)
        kf.get_n_splits(X_train)
        for train_index, val_index in kf.split(X_train):

            X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]

            y_train_cv, y_val_cv = y_train[train_index], y_train[val_index]

            clf = clf.fit(np.array(X_train_cv), np.array(y_train_cv))
            y_pred = clf.predict_proba(X_val_cv)[:,1]

#             threshold_1.append(Find_Optimal_Cutoff(y_val_cv, y_pred))
            auc = sklearn.metrics.roc_auc_score(y_val_cv, y_pred)
            roc_auc_score.append(auc)

            y_pred_arr.append(y_pred)

        # Obtengo el threshold para: modelo configurado con hiperparametros con datos de train
        y_pred_ttl = getypredttl(y_pred_arr)
#         threshold_2 = Find_Optimal_Cutoff(y_pre_train, y_pred_ttl)

        if np.mean(roc_auc_score) > bestMetricDev:
            print("\tCambio the best roc auc score ", bestMetricDev, " por: ", np.mean(roc_auc_score))
            bestMetricDev = np.mean(roc_auc_score)
            bestHyperparameters['C'] = C[c]
#             bestHyperparameters['threshold_1'] = np.mean(threshold_1)
#             bestHyperparameters['threshold_2'] = threshold_2
            bestHyperparameters['y_pred_val'] = y_pred_ttl


    
    print("Best roc auc score: ", bestMetricDev)
    print("C: ", bestHyperparameters["C"])
#     print("threshold train dataset: ", bestHyperparameters["threshold_2"])

    clf = LogisticRegression(solver='liblinear', C=bestHyperparameters['C'],  penalty='l1', n_jobs=24)
    clf = clf.fit(np.array(X_train), np.array(y_train))
    
    y_pred_test = clf.predict_proba(X_test)[:,1]
    y_pred_Xtrain = clf.predict(X_pre_train)

    selecCalculateMetrics_aux = ['umbral: 0.5']
#                                  'umbral: threshold_2 (datos train)']

    for j in range(len(selecCalculateMetrics_aux)):

        v_accuracy_test = []
        v_accuracy_train = []
        v_specificity = []
        v_sensitivity = []
        v_precision = []
        v_recall = []
        v_f1score = []
        v_accuracy = []
        auc_score = []

        if selecCalculateMetrics_aux[j] == 'umbral: 0.5':
            auc_score.append(sklearn.metrics.roc_auc_score(y_test, y_pred_test))
            y_pred = (y_pred_test > 0.5).astype('int')
#         elif selecCalculateMetrics_aux[j] == 'umbral: threshold_2 (datos train)':
#             auc_score.append(sklearn.metrics.roc_auc_score(y_test, y_pred_test))
#             y_pred = (y_pred_test > bestHyperparameters["threshold_2"]).astype('int')


        v_specificity, v_recall, v_f1score, v_precision, v_accuracy, v_accuracy_test, v_accuracy_train = calculateconfusionmatrix(y_pred,\
                                           y_pre_train, y_test, v_specificity, v_recall, \
                                           v_f1score, v_precision, v_accuracy, v_accuracy_test, \
                                           v_accuracy_train, 0, y_pred_Xtrain)

        matrix_all_values[j*8:j*8 + 8, i] = v_specificity[0], v_recall[0], v_f1score[0], \
            v_precision[0], v_accuracy[0], v_accuracy_test[0], v_accuracy_train[0], auc_score[0]


# print("====> Threshold data train")
# print()
# printOutAlgorithm(matrix_all_values[8,:], matrix_all_values[9,:], matrix_all_values[10,:], \
#                         matrix_all_values[11,:], matrix_all_values[12,:], matrix_all_values[13,:], \
#                         matrix_all_values[14,:], matrix_all_values[15,:])

print()
print("====> Threshold 0.5")
print()
printOutAlgorithm(matrix_all_values[0,:], matrix_all_values[1,:], matrix_all_values[2,:], matrix_all_values[3,:], matrix_all_values[4,:], matrix_all_values[5,:], matrix_all_values[6,:], matrix_all_values[7,:])



ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [12]:
np.sum(y_val_cv == 0)

2192

In [5]:
y_pred.shape

(2192,)