In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy.misc import comb
from sklearn.model_selection import train_test_split , KFold, StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
import pdb
from sklearn.metrics import roc_auc_score , make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import NuestroArbol as ourTree
import random
from scipy.stats import randint as sp_randint
from time import time

%matplotlib inline

# Importo datos 

In [2]:
X_comp = pd.read_csv('X_competencia.csv')
X = pd.read_csv('X.csv')
y = pd.read_csv('y.csv')
X.drop(['index'],inplace=True, axis=1)
y.drop(['index'], inplace=True,axis=1)



In [3]:
#partimos los datos en desarrollo(87%) y holdout(13%). 
X_desarrollo , X_holout ,y_desarrollo, y_holdout = train_test_split(X, y['output'],
                                                                    test_size=0.13,random_state=0,stratify=y['output'])
#pase los kfold aca arriba ya que se usan en varios lugares.
#evaluar si esta bueno que siempre se usen los mismos folds
kfold = StratifiedKFold(n_splits=5)
kfold.get_n_splits(X_desarrollo,y_desarrollo)


5

### K-fold CV 

In [4]:
def primerTablaEjercicio2(treeClasifier):
    
    accuracy_train=[]
    accuracy_valildation=[]
    ROC_AUC_train=[]
    ROC_AUC_validation=[]

    #este for itera sobre los k folds en cada loop tego un set de datos y otro de validacion
    for train, test  in kfold.split(X_desarrollo,y_desarrollo):
        #print("TRAIN:", train_index,'\n', "TEST:", test_index,'\n' )
        X_train, X_val = X_desarrollo.iloc[train], X_desarrollo.iloc[test]
        y_train, y_val = y_desarrollo.iloc[train], y_desarrollo.iloc[test]
        #intancio el arbol que voy a entrenar en cada fold
        tree = treeClasifier(max_depth=3, criterion="gini")

        tree.fit(X_train, y_train.astype(int))
        accuracy_train.append(tree.score(X=X_train, y=y_train))
        accuracy_valildation.append(tree.score(X=X_val, y=y_val))
        ROC_AUC_train.append(roc_auc_score(y_train,tree.predict(X_train)))
        ROC_AUC_validation.append(roc_auc_score(y_val,tree.predict(X_val)))
        
    return pd.DataFrame({ 'Partición' : np.arange(1,6),'Accuracy (training)' :accuracy_train,
                          'Accuracy (validación)' : accuracy_valildation,
                          'ROC AUC (training)' : ROC_AUC_train,
                          'ROC AUC (validación)' : ROC_AUC_validation})






# Tabla de precision

In [5]:
display(primerTablaEjercicio2(DecisionTreeClassifier))  

Unnamed: 0,Partición,Accuracy (training),Accuracy (validación),ROC AUC (training),ROC AUC (validación)
0,1,0.783862,0.659091,0.78598,0.660417
1,2,0.813218,0.758621,0.801587,0.752394
2,3,0.83046,0.597701,0.820954,0.588564
3,4,0.844828,0.597701,0.84117,0.592287
4,5,0.82808,0.639535,0.823049,0.628751


# Arboles combinaciones

In [6]:
def accuracyForTrainingAndValidation(depth,criteria,treeClasifier):
    acc_train=[]
    acc_val=[]


    #este for itera sobre los k folds en cada loop tego un set de training y otro de validacion
    for train, test  in kfold.split(X_desarrollo,y_desarrollo):
        #print("TRAIN:", train_index,'\n', "TEST:", test_index,'\n' )
        X_train, X_val = X_desarrollo.iloc[train], X_desarrollo.iloc[test]
        y_train, y_val = y_desarrollo.iloc[train], y_desarrollo.iloc[test]
        #intancio el arbol que voy a entrenar en cada fold
        tree = treeClasifier(max_depth=depth, criterion=criteria)

        tree.fit(X_train, y_train.astype(int))
        acc_train.append(tree.score(X=X_train, y=y_train))
        acc_val.append(tree.score(X=X_val, y=y_val))
        
    return {"training":np.mean(acc_train), "validation": np.mean(acc_val)}

        

In [7]:
def segundaTablaEjercicio2(treeClasifier):

    trainingResults=[]
    validationResults=[]
    evaluatedDepths=[]
    evaluatedCriterias=[]
    depthsDictionary={3:'3',5:'5',None:'Infinito'}
    depths = [3,5,None]    
    criterias =['gini','entropy']

    for depth in depths:
        for criteria in criterias:
            trainingResults.append(accuracyForTrainingAndValidation(depth,criteria,treeClasifier)["training"])
            validationResults.append(accuracyForTrainingAndValidation(depth,criteria,treeClasifier)["validation"])
            evaluatedDepths.append(depthsDictionary[depth])
            evaluatedCriterias.append(criteria)
    
    return pd.DataFrame({ 'Altura Máxima' : evaluatedDepths,
                          'Criterio de evaluación de corte' : evaluatedCriterias,
                          'Accuracy (training)' : trainingResults,
                          'Accuracy (validación)' : validationResults}).sort_values(by=['Criterio de evaluación de corte'],ascending=False)

# Tabla con combinaciones

In [8]:
display(segundaTablaEjercicio2(DecisionTreeClassifier))  

Unnamed: 0,Altura Máxima,Criterio de evaluación de corte,Accuracy (training),Accuracy (validación)
0,3,gini,0.819515,0.652829
2,5,gini,0.928729,0.664192
4,Infinito,gini,1.0,0.654891
1,3,entropy,0.791947,0.682955
3,5,entropy,0.917243,0.692281
5,Infinito,entropy,1.0,0.685251


# Ejercicio Extra: Resultados para nuestro clasificador

In [9]:
#display(primerTablaEjercicio2(ourTree.MiClasificadorArbol))  

In [10]:
#display(segundaTablaEjercicio2(ourTree.MiClasificadorArbol))
#esta tabla tardo 10 minutos en crearse, ver como mejorar performance

In [11]:
roc_auc_score = make_scorer(roc_auc_score)
def performGridSearch(clasiffier, param_grid):
    start = time()
    gridSearch = GridSearchCV(clasiffier, param_grid, cv=kfold, scoring=roc_auc_score)
    gridSearch.fit(X_desarrollo, y_desarrollo)
    return gridSearch.best_score_, gridSearch.best_params_, time() - start 


n_iter_search = 20
def performRandomSearch(classifier, param_dist):
    start = time()
    randomSearch = RandomizedSearchCV(classifier, param_distributions=param_dist, n_iter=n_iter_search, cv=kfold, scoring=roc_auc_score, refit=True)
    randomSearch.fit(X, y.values.ravel())
    return randomSearch.best_score_, randomSearch.best_params_, time() - start

# Grid Search para LDA

In [12]:
lda_param_grid = [{'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto', 0, 0.25, 0.5, 0.75, 1]},
                   {'solver': ['svd'], 'store_covariance' : [True, False]}]
print(performGridSearch(LinearDiscriminantAnalysis(), lda_param_grid))

(0.7524946855541131, {'shrinkage': 0.75, 'solver': 'eigen'}, 1.1164896488189697)


# Random Search para LDA

In [13]:
lda_param_grid = {'solver': ['lsqr', 'eigen'], 'shrinkage': np.arange(0, 1, 0.01)}
print(performRandomSearch(LinearDiscriminantAnalysis(), lda_param_grid))

(0.757334050651442, {'solver': 'lsqr', 'shrinkage': 0.59}, 1.2806713581085205)


# Grid Search para Arbol de Decisión

In [14]:
tree_param_grid = [{'max_depth': np.arange(3, 10),'criterion': ('gini','entropy'), 'splitter' : ['random', 'best'] }]
print(performGridSearch(DecisionTreeClassifier(), tree_param_grid))

(0.7011388904565721, {'criterion': 'gini', 'max_depth': 7, 'splitter': 'random'}, 3.765662670135498)


# Grid Search para KNN

In [15]:
KNN_param_grid = {'n_neighbors': np.arange(3, 10),'p': np.arange(1,3)}
print(performGridSearch(KNeighborsClassifier(), KNN_param_grid))

(0.7296823403628246, {'n_neighbors': 8, 'p': 2}, 4.7829625606536865)


# Grid Search para SVM

In [16]:
SVM_param_grid = [{'kernel': ['rbf', 'poly', 'sigmoid'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
print(performGridSearch(SVC(), SVM_param_grid))

(0.7522676693568109, {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}, 9.596637964248657)


# FALTA AGREGAR RANDOM SEARCH PARA Arboles, KNN y SVM