In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy.misc import comb
from sklearn.model_selection import train_test_split , KFold, StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
import pdb
from sklearn.metrics import roc_auc_score , make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import NuestroArbol as ourTree
import random
from scipy.stats import randint as sp_randint
from time import time
import collections

%matplotlib inline

# Importo datos 

In [2]:
# X_competencia.csv is loaded as-is.
X_comp = pd.read_csv('X_competencia.csv')
# X.csv and y.csv each carry an 'index' column that is dropped right after loading.
X = pd.read_csv('X.csv').drop(['index'], axis=1)
y = pd.read_csv('y.csv').drop(['index'], axis=1)



In [3]:
# Split the data into development (87%) and holdout (13%), stratified on the label.
# NOTE(review): `X_holout` looks like a typo for `X_holdout`; the name is kept
# because code outside this view may reference it.
X_desarrollo, X_holout, y_desarrollo, y_holdout = train_test_split(
    X,
    y['output'],
    test_size=0.13,
    random_state=0,
    stratify=y['output'],
)
# The folds are created once here because several cells below reuse them.
# TODO: evaluate whether always reusing the same folds is a good idea.
kfold = StratifiedKFold(n_splits=5)
kfold.get_n_splits(X_desarrollo, y_desarrollo)


5

### K-fold CV 

In [4]:
def primerTablaEjercicio2(treeClasifier, max_depth=3, criterion="gini"):
    """Build the per-fold accuracy / ROC AUC table for one tree classifier.

    Reads the module-level ``kfold``, ``X_desarrollo`` and ``y_desarrollo``.
    For every fold a fresh classifier is fitted on the training part and
    evaluated on both the training and the validation part.

    Parameters
    ----------
    treeClasifier : callable
        Called as ``treeClasifier(max_depth=..., criterion=...)``; the result
        must provide ``fit``, ``score`` and ``predict``.
    max_depth : optional
        Forwarded to ``treeClasifier``. Defaults to 3.
    criterion : optional
        Forwarded to ``treeClasifier``. Defaults to "gini".

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns 'Partición', 'Accuracy (training)',
        'Accuracy (validación)', 'ROC AUC (training)' and
        'ROC AUC (validación)'.
    """
    accuracy_train = []
    accuracy_validation = []
    ROC_AUC_train = []
    ROC_AUC_validation = []

    # Each iteration yields the row positions of one training/validation split.
    for train, test in kfold.split(X_desarrollo, y_desarrollo):
        X_train, X_val = X_desarrollo.iloc[train], X_desarrollo.iloc[test]
        y_train, y_val = y_desarrollo.iloc[train], y_desarrollo.iloc[test]

        # A new classifier is instantiated for every fold.
        tree = treeClasifier(max_depth=max_depth, criterion=criterion)
        tree.fit(X_train, y_train.astype(int))

        accuracy_train.append(tree.score(X=X_train, y=y_train))
        accuracy_validation.append(tree.score(X=X_val, y=y_val))
        # NOTE(review): ROC AUC is computed from hard `predict` labels, not
        # from scores/probabilities - confirm this is what the exercise wants.
        ROC_AUC_train.append(roc_auc_score(y_train, tree.predict(X_train)))
        ROC_AUC_validation.append(roc_auc_score(y_val, tree.predict(X_val)))

    # Number the partitions from the folds actually produced, so the table
    # still builds when `kfold` is configured with a number of splits other than 5.
    return pd.DataFrame({'Partición': np.arange(1, len(accuracy_train) + 1),
                         'Accuracy (training)': accuracy_train,
                         'Accuracy (validación)': accuracy_validation,
                         'ROC AUC (training)': ROC_AUC_train,
                         'ROC AUC (validación)': ROC_AUC_validation})






# Tabla de precisión

In [5]:
# Render the per-fold table built by primerTablaEjercicio2 for DecisionTreeClassifier.
display(primerTablaEjercicio2(DecisionTreeClassifier))  

Unnamed: 0,Partición,Accuracy (training),Accuracy (validación),ROC AUC (training),ROC AUC (validación)
0,1,0.783862,0.659091,0.78598,0.660417
1,2,0.813218,0.747126,0.801587,0.739894
2,3,0.83046,0.597701,0.820954,0.588564
3,4,0.841954,0.62069,0.838025,0.617287
4,5,0.82808,0.639535,0.823049,0.630933


# Árboles: combinaciones

In [6]:
def accuracyForTrainingAndValidation(depth,criteria,treeClasifier):
    """Mean training / validation accuracy of one configuration over the shared folds.

    Reads the module-level ``kfold``, ``X_desarrollo`` and ``y_desarrollo``.
    A fresh ``treeClasifier(max_depth=depth, criterion=criteria)`` is fitted
    on every fold.

    Returns a dict with the keys "training" and "validation", each holding
    the mean of the per-fold ``score`` values.
    """
    fold_scores = []

    for fit_rows, held_out_rows in kfold.split(X_desarrollo, y_desarrollo):
        features_fit = X_desarrollo.iloc[fit_rows]
        features_held_out = X_desarrollo.iloc[held_out_rows]
        labels_fit = y_desarrollo.iloc[fit_rows]
        labels_held_out = y_desarrollo.iloc[held_out_rows]

        # A new model per fold, so nothing carries over between folds.
        model = treeClasifier(max_depth=depth, criterion=criteria)
        model.fit(features_fit, labels_fit.astype(int))

        fit_accuracy = model.score(X=features_fit, y=labels_fit)
        held_out_accuracy = model.score(X=features_held_out, y=labels_held_out)
        fold_scores.append((fit_accuracy, held_out_accuracy))

    return {
        "training": np.mean([fit_acc for fit_acc, _ in fold_scores]),
        "validation": np.mean([val_acc for _, val_acc in fold_scores]),
    }

        

In [7]:
def segundaTablaEjercicio2(treeClasifier):
    """Build the accuracy table for every (max depth, split criterion) combination.

    For depths 3, 5 and None (shown as 'Infinito') and criteria 'gini' and
    'entropy', calls ``accuracyForTrainingAndValidation`` once per
    combination and records its "training" and "validation" values.

    Parameters
    ----------
    treeClasifier
        Forwarded unchanged to ``accuracyForTrainingAndValidation``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Altura Máxima', 'Criterio de evaluación de corte',
        'Accuracy (training)' and 'Accuracy (validación)', sorted by
        criterion in descending order.
    """
    trainingResults = []
    validationResults = []
    evaluatedDepths = []
    evaluatedCriterias = []
    depthsDictionary = {3: '3', 5: '5', None: 'Infinito'}
    depths = [3, 5, None]
    criterias = ['gini', 'entropy']

    for depth in depths:
        for criteria in criterias:
            # Run the cross-validation once and read both numbers from the
            # same result: running it twice doubled the cost and could pair
            # training and validation figures coming from different fits.
            accuracies = accuracyForTrainingAndValidation(depth, criteria, treeClasifier)
            trainingResults.append(accuracies["training"])
            validationResults.append(accuracies["validation"])
            evaluatedDepths.append(depthsDictionary[depth])
            evaluatedCriterias.append(criteria)

    return pd.DataFrame({'Altura Máxima': evaluatedDepths,
                         'Criterio de evaluación de corte': evaluatedCriterias,
                         'Accuracy (training)': trainingResults,
                         'Accuracy (validación)': validationResults}).sort_values(by=['Criterio de evaluación de corte'], ascending=False)

# Tabla con combinaciones

In [8]:
# Render the depth/criterion accuracy table built by segundaTablaEjercicio2 for DecisionTreeClassifier.
display(segundaTablaEjercicio2(DecisionTreeClassifier))  

Unnamed: 0,Altura Máxima,Criterio de evaluación de corte,Accuracy (training),Accuracy (validación)
0,3,gini,0.819515,0.65053
2,5,gini,0.928729,0.680391
4,Infinito,gini,1.0,0.652564
1,3,entropy,0.791947,0.682955
3,5,entropy,0.917243,0.687657
5,Infinito,entropy,1.0,0.673784


# Ejercicio Extra: Resultados para nuestro clasificador

In [9]:
#display(primerTablaEjercicio2(ourTree.MiClasificadorArbol))  

In [10]:
#display(segundaTablaEjercicio2(ourTree.MiClasificadorArbol))
# TODO: this table took 10 minutes to build; look into how to improve performance.

In [11]:
# Scorer object handed to the searches below. It gets its own name so the
# imported `roc_auc_score` metric is not overwritten: rebinding that name made
# later direct calls `roc_auc_score(y_true, y_pred)` hit the scorer object, and
# re-running this cell wrapped the scorer in itself.
# NOTE(review): make_scorer with default options scores the output of
# `predict` (hard labels) - confirm that is the intended ROC AUC definition.
roc_auc_scorer = make_scorer(roc_auc_score)
# Number of parameter settings sampled by each randomized search.
n_iter_search = 50


def performGridSearch(clasiffier, param_grid):
    """Run an exhaustive grid search on the development data.

    Uses the module-level ``kfold`` as the CV splitter, ``roc_auc_scorer`` as
    the scoring function and fits on ``X_desarrollo`` / ``y_desarrollo``.

    Parameters
    ----------
    clasiffier
        Estimator passed to ``GridSearchCV``.
    param_grid
        Parameter grid passed to ``GridSearchCV``.

    Returns
    -------
    dict
        "bestScore", "bestParams", "executionTime" (seconds elapsed,
        measured with ``time()``) and "allScores" (the search's
        ``cv_results_``).
    """
    start = time()
    gridSearch = GridSearchCV(clasiffier, param_grid, cv=kfold, scoring=roc_auc_scorer, return_train_score=False)
    gridSearch.fit(X_desarrollo, y_desarrollo)
    return {"bestScore": gridSearch.best_score_,
            "bestParams": gridSearch.best_params_,
            "executionTime": time() - start,
            "allScores": gridSearch.cv_results_}


def performRandomSearch(classifier, param_dist, random_state=0):
    """Run a randomized search of ``n_iter_search`` settings on the development data.

    Uses the module-level ``kfold`` as the CV splitter, ``roc_auc_scorer`` as
    the scoring function and fits on ``X_desarrollo`` / ``y_desarrollo``.

    Parameters
    ----------
    classifier
        Estimator passed to ``RandomizedSearchCV``.
    param_dist
        Parameter distributions passed to ``RandomizedSearchCV``.
    random_state : optional
        Seed forwarded to ``RandomizedSearchCV`` so the sampled settings are
        the same on every run. Pass ``None`` for unseeded sampling.

    Returns
    -------
    dict
        "bestScore", "bestParams" and "executionTime" (seconds elapsed,
        measured with ``time()``).
    """
    start = time()
    randomSearch = RandomizedSearchCV(classifier, param_distributions=param_dist, n_iter=n_iter_search,
                                      cv=kfold, scoring=roc_auc_scorer, refit=True,
                                      random_state=random_state)
    randomSearch.fit(X_desarrollo, y_desarrollo)
    return {"bestScore": randomSearch.best_score_,
            "bestParams": randomSearch.best_params_,
            "executionTime": time() - start}


def displayBestParamsTable(classifier, paramsGridSearch, paramsRandomSearch):
    """Run both searches for ``classifier`` and display their results.

    Displays, in order: a summary of best score and execution time for the
    grid and the randomized search, the best parameters of each search
    (sorted by parameter name), and the grid-search score table ordered by
    ``rank_test_score``.
    """
    grid_outcome = performGridSearch(classifier, paramsGridSearch)
    random_outcome = performRandomSearch(classifier, paramsRandomSearch)

    summary = pd.DataFrame({
        ' ': ["Mejor performance (Roc Auc)", "Tiempo de ejecución (Segundos)"],
        'Grid Search': [grid_outcome["bestScore"], grid_outcome["executionTime"]],
        'Random Search': [random_outcome["bestScore"], random_outcome["executionTime"]],
    })
    display(summary.set_index(' '))

    # Sort both parameter sets by name first, then show one table per search.
    best_params_tables = [
        (title, collections.OrderedDict(sorted(outcome["bestParams"].items())))
        for title, outcome in (
            ('Mejores parámetros Grid Search', grid_outcome),
            ('Mejores parámetros Random Search', random_outcome),
        )
    ]
    for title, ordered_params in best_params_tables:
        params_frame = pd.DataFrame({
            title: list(ordered_params.keys()),
            '': list(ordered_params.values()),
        })
        display(params_frame.set_index(title))

    trimmed_scores = renameAndDeleteColumnsInAllScoresTable(grid_outcome["allScores"])
    ranking = pd.DataFrame(trimmed_scores).sort_values(by=['rank_test_score'])
    display(ranking.set_index("rank_test_score"))
    
def renameAndDeleteColumnsInAllScoresTable(allScores):
    """Return a trimmed copy of a search-results mapping, ready for display.

    Timing columns, 'params', 'std_test_score' and every per-split
    ``split<N>_test_score`` column are left out (for any number of splits,
    not only 0-4). Keys starting with ``param_`` are renamed without that
    prefix and placed after the remaining keys.

    Parameters
    ----------
    allScores : dict
        Mapping of column name to values. It is not modified.

    Returns
    -------
    dict
        New dict holding the kept columns followed by the renamed parameter
        columns.
    """
    droppedColumns = {
        'mean_fit_time', 'std_fit_time',
        'mean_score_time', 'std_score_time',
        'params', 'std_test_score',
    }
    splitPrefix, splitSuffix = 'split', '_test_score'
    paramPrefix = 'param_'

    def isPerSplitTestScore(key):
        # Matches split0_test_score, split1_test_score, ... for any fold count.
        return (key.startswith(splitPrefix)
                and key.endswith(splitSuffix)
                and key[len(splitPrefix):-len(splitSuffix)].isdigit())

    keptColumns = {}
    paramColumns = {}
    # Build a new mapping instead of popping from the caller's dict, so the
    # search results passed in stay intact.
    for key, values in allScores.items():
        if key in droppedColumns or isPerSplitTestScore(key):
            continue
        if key.startswith(paramPrefix):
            paramColumns[key[len(paramPrefix):]] = values
        else:
            keptColumns[key] = values

    # Parameter columns go last, after the score/rank columns.
    keptColumns.update(paramColumns)
    return keptColumns
    

# LDA

In [12]:
# Grid search: shrinkage values are listed only for the 'lsqr' and 'eigen'
# solvers; the 'svd' entry varies store_covariance instead.
lda_param_grid = [
    {
        'solver': ['lsqr', 'eigen'],
        'shrinkage': [None, 'auto', 0, 0.25, 0.5, 0.75, 1],
    },
    {
        'solver': ['svd'],
        'store_covariance': [True, False],
    },
]

# Randomized search: shrinkage candidates from 0 up to (not including) 1 in steps of 0.01.
lda_param_random = {
    'solver': ['lsqr', 'eigen'],
    'shrinkage': np.arange(0, 1, 0.01),
}

displayBestParamsTable(LinearDiscriminantAnalysis(), lda_param_grid, lda_param_random)

Unnamed: 0,Grid Search,Random Search
,,
Mejor performance (Roc Auc),0.752495,0.763814
Tiempo de ejecución (Segundos),1.213713,3.046431


Mejores parámetros Grid Search,Unnamed: 1
shrinkage,0.75
solver,eigen


Mejores parámetros Random Search,Unnamed: 1
shrinkage,0.56
solver,lsqr


Unnamed: 0_level_0,mean_test_score,shrinkage,solver,store_covariance
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.752495,0.75,eigen,
2,0.75173,0.75,lsqr,
3,0.751551,0.5,lsqr,
4,0.749448,0.5,eigen,
5,0.737282,auto,lsqr,
6,0.733751,auto,eigen,
7,0.719456,0.25,eigen,
8,0.717701,0.25,lsqr,
9,0.715291,1,lsqr,
9,0.715291,1,eigen,


# Árbol de Decisión

In [13]:
# Grid search: five fixed depths crossed with both criteria and both splitters.
tree_param_grid = [
    {
        'max_depth': [3, 5, 10, 15, 20],
        'criterion': ('gini', 'entropy'),
        'splitter': ['random', 'best'],
    },
]

# Randomized search: every integer depth from 3 to 20.
tree_param_random = {
    'max_depth': np.arange(3, 21),
    'criterion': ('gini', 'entropy'),
    'splitter': ['random', 'best'],
}

displayBestParamsTable(DecisionTreeClassifier(), tree_param_grid, tree_param_random)



Unnamed: 0,Grid Search,Random Search
,,
Mejor performance (Roc Auc),0.693668,0.705423
Tiempo de ejecución (Segundos),2.676413,7.103217


Mejores parámetros Grid Search,Unnamed: 1
criterion,entropy
max_depth,10
splitter,best


Mejores parámetros Random Search,Unnamed: 1
criterion,entropy
max_depth,15
splitter,best


Unnamed: 0_level_0,mean_test_score,criterion,max_depth,splitter
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.693668,entropy,10,best
2,0.691968,entropy,5,best
3,0.687704,entropy,15,best
4,0.683025,entropy,20,best
5,0.679503,entropy,3,best
6,0.679189,gini,10,best
7,0.674395,entropy,3,random
8,0.655935,entropy,20,random
9,0.655799,gini,20,best
10,0.649555,gini,3,best


# KNN

In [14]:
# Grid search: four neighbourhood sizes, p in {1, 2}, both weighting schemes.
KNN_param_grid = {
    'n_neighbors': [5, 30, 50, 100],
    'p': np.arange(1, 3),
    "weights": ['uniform', 'distance'],
}
# Randomized search: every neighbourhood size from 5 to 100.
KNN_param_random = {
    'n_neighbors': np.arange(5, 101),
    'p': np.arange(1, 3),
    "weights": ['uniform', 'distance'],
}
displayBestParamsTable(KNeighborsClassifier(), KNN_param_grid, KNN_param_random)

Unnamed: 0,Grid Search,Random Search
,,
Mejor performance (Roc Auc),0.750511,0.744304
Tiempo de ejecución (Segundos),1.423518,17.590641


Mejores parámetros Grid Search,Unnamed: 1
n_neighbors,30
p,2
weights,distance


Mejores parámetros Random Search,Unnamed: 1
n_neighbors,70
p,2
weights,distance


Unnamed: 0_level_0,mean_test_score,n_neighbors,p,weights
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.750511,30,2,distance
2,0.746821,50,1,uniform
3,0.743369,50,1,distance
4,0.741395,30,2,uniform
5,0.735808,100,2,distance
6,0.734644,50,2,uniform
7,0.734321,100,1,uniform
8,0.732907,100,2,uniform
9,0.732086,30,1,uniform
10,0.731192,50,2,distance


# SVM

In [15]:
# Grid search: gamma is listed only for the 'rbf', 'poly' and 'sigmoid'
# kernels; the 'linear' kernel entry varies C alone.
SVM_param_grid = [
    {
        'kernel': ['rbf', 'poly', 'sigmoid'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000],
    },
    {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000],
    },
]
# Randomized search: gamma from 1e-4 up to (not including) 1e-1 in steps of
# 1e-4, integer C from 1 to 1000. The 'linear' kernel is not sampled here.
SVM_param_random = {
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'gamma': np.arange(1e-4, 1e-1, 0.0001),
    'C': np.arange(1, 1001),
}
displayBestParamsTable(SVC(), SVM_param_grid, SVM_param_random)

Unnamed: 0,Grid Search,Random Search
,,
Mejor performance (Roc Auc),0.752268,0.739491
Tiempo de ejecución (Segundos),5.610753,16.020302


Mejores parámetros Grid Search,Unnamed: 1
C,10
gamma,0.0001
kernel,rbf


Mejores parámetros Random Search,Unnamed: 1
C,699
gamma,0.0233
kernel,sigmoid


Unnamed: 0_level_0,mean_test_score,C,gamma,kernel
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.752268,10,0.0001,rbf
2,0.743453,1,0.001,sigmoid
2,0.743453,10,0.0001,sigmoid
4,0.741276,1,0.001,rbf
5,0.71951,1,0.0001,rbf
6,0.71763,1000,0.001,poly
7,0.716586,10,0.001,rbf
8,0.708959,100,0.0001,sigmoid
9,0.704386,1000,0.001,rbf
9,0.704386,100,0.001,rbf
