In [1]:
import pdb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
from scipy.misc import comb
from sklearn import metrics as sk_metrics
from sklearn.metrics import roc_auc_score , make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split , KFold, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

import NuestroArbol as ourTree


%matplotlib inline

# Importo datos 

In [2]:
# Competition features are loaded as-is.
X_comp = pd.read_csv('X_competencia.csv')
# Labelled features and targets: the 'index' column is removed right after
# reading, producing new frames instead of mutating in place.
X = pd.read_csv('X.csv').drop(['index'], axis=1)
y = pd.read_csv('y.csv').drop(['index'], axis=1)



In [3]:
# Split the data into development (87%) and holdout (13%), stratified on the target.
X_desarrollo, X_holout, y_desarrollo, y_holdout = train_test_split(
    X,
    y['output'],
    test_size=0.13,
    random_state=0,
    stratify=y['output'],
)
# The k-fold splitter lives up here because several later cells reuse it.
# TODO: evaluate whether always reusing the same folds is desirable.
kfold = StratifiedKFold(n_splits=5)
kfold.get_n_splits(X_desarrollo, y_desarrollo)


5

### K-fold CV 

In [4]:
def primerTablaEjercicio2(treeClasifier, max_depth=3, criterion="gini"):
    """Build the per-fold accuracy / ROC AUC table for a tree classifier.

    For every fold produced by the notebook-level ``kfold`` splitter over
    ``X_desarrollo`` / ``y_desarrollo``, a fresh classifier is trained on the
    training part and evaluated on both the training and validation parts.

    Parameters
    ----------
    treeClasifier : callable
        Classifier factory called as ``treeClasifier(max_depth=..., criterion=...)``.
        The returned object must provide ``fit``, ``score`` and ``predict``.
    max_depth : int or None, default 3
        Forwarded to the classifier constructor.
    criterion : str, default "gini"
        Forwarded to the classifier constructor.

    Returns
    -------
    pandas.DataFrame
        One row per fold with columns 'Partición', 'Accuracy (training)',
        'Accuracy (validación)', 'ROC AUC (training)' and 'ROC AUC (validación)'.
    """
    accuracy_train = []
    accuracy_validation = []
    roc_auc_train = []
    roc_auc_validation = []

    # Each iteration yields positional indices for one training/validation fold.
    for train, test in kfold.split(X_desarrollo, y_desarrollo):
        X_train, X_val = X_desarrollo.iloc[train], X_desarrollo.iloc[test]
        y_train, y_val = y_desarrollo.iloc[train], y_desarrollo.iloc[test]

        # A new classifier is instantiated and trained for every fold.
        tree = treeClasifier(max_depth=max_depth, criterion=criterion)
        tree.fit(X_train, y_train.astype(int))

        accuracy_train.append(tree.score(X=X_train, y=y_train))
        accuracy_validation.append(tree.score(X=X_val, y=y_val))

        # The metric is reached through the `sk_metrics` module alias because a
        # later cell rebinds the bare name `roc_auc_score` to a scorer object,
        # which cannot be called as metric(y_true, y_pred).
        # NOTE(review): ROC AUC is computed from hard `predict` labels rather
        # than scores/probabilities — confirm this is intended.
        roc_auc_train.append(sk_metrics.roc_auc_score(y_train, tree.predict(X_train)))
        roc_auc_validation.append(sk_metrics.roc_auc_score(y_val, tree.predict(X_val)))

    # Number the partitions from the folds actually produced instead of
    # assuming there are always five of them.
    return pd.DataFrame({'Partición': np.arange(1, len(accuracy_train) + 1),
                         'Accuracy (training)': accuracy_train,
                         'Accuracy (validación)': accuracy_validation,
                         'ROC AUC (training)': roc_auc_train,
                         'ROC AUC (validación)': roc_auc_validation})






# Tabla de accuracy y ROC AUC por partición

In [5]:
# Render the per-fold metrics table for sklearn's DecisionTreeClassifier.
display(primerTablaEjercicio2(DecisionTreeClassifier))  

Unnamed: 0,Partición,Accuracy (training),Accuracy (validación),ROC AUC (training),ROC AUC( validación)
0,1,0.783862,0.659091,0.78598,0.660417
1,2,0.813218,0.758621,0.801587,0.752394
2,3,0.83046,0.597701,0.820954,0.588564
3,4,0.844828,0.597701,0.84117,0.592287
4,5,0.82808,0.639535,0.823049,0.628751


# Árboles: combinaciones de altura máxima y criterio de corte

In [6]:
def accuracyForTrainingAndValidation(depth,criteria,treeClasifier):
    """Return the mean training and validation accuracy across the k folds.

    A classifier built as ``treeClasifier(max_depth=depth, criterion=criteria)``
    is fitted once per fold of the notebook-level ``kfold`` splitter over
    ``X_desarrollo`` / ``y_desarrollo`` and scored on both parts of the fold.

    Returns a dict with keys "training" and "validation" holding the averages.
    """
    train_scores = []
    val_scores = []

    # kfold.split yields positional indices for the fitting and held-out rows.
    for fit_idx, held_idx in kfold.split(X_desarrollo, y_desarrollo):
        X_fit = X_desarrollo.iloc[fit_idx]
        y_fit = y_desarrollo.iloc[fit_idx]
        X_held = X_desarrollo.iloc[held_idx]
        y_held = y_desarrollo.iloc[held_idx]

        # A fresh model per fold, with labels cast to int for fitting.
        model = treeClasifier(max_depth=depth, criterion=criteria)
        model.fit(X_fit, y_fit.astype(int))

        train_scores.append(model.score(X=X_fit, y=y_fit))
        val_scores.append(model.score(X=X_held, y=y_held))

    averaged = {
        "training": np.mean(train_scores),
        "validation": np.mean(val_scores),
    }
    return averaged

        

In [7]:
def segundaTablaEjercicio2(treeClasifier):
    """Build the accuracy table for every (max depth, split criterion) pair.

    Depths 3, 5 and unlimited (``None``, shown as 'Infinito') are combined with
    the 'gini' and 'entropy' criteria. Each combination is cross-validated
    exactly once through ``accuracyForTrainingAndValidation``, so the training
    and validation figures of a row come from the same fitted models and the
    work is not duplicated.

    Parameters
    ----------
    treeClasifier : callable
        Classifier factory forwarded to ``accuracyForTrainingAndValidation``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Altura Máxima', 'Criterio de evaluación de corte',
        'Accuracy (training)' and 'Accuracy (validación)', sorted by criterion
        in descending order.
    """
    trainingResults = []
    validationResults = []
    evaluatedDepths = []
    evaluatedCriterias = []
    depthsDictionary = {3: '3', 5: '5', None: 'Infinito'}
    depths = [3, 5, None]
    criterias = ['gini', 'entropy']

    for depth in depths:
        for criteria in criterias:
            # One cross-validation run per combination; both averages are read
            # from the same result.
            scores = accuracyForTrainingAndValidation(depth, criteria, treeClasifier)
            trainingResults.append(scores["training"])
            validationResults.append(scores["validation"])
            evaluatedDepths.append(depthsDictionary[depth])
            evaluatedCriterias.append(criteria)

    table = pd.DataFrame({'Altura Máxima': evaluatedDepths,
                          'Criterio de evaluación de corte': evaluatedCriterias,
                          'Accuracy (training)': trainingResults,
                          'Accuracy (validación)': validationResults})
    # A stable sort keeps the depth order (3, 5, Infinito) within each criterion.
    return table.sort_values(by=['Criterio de evaluación de corte'],
                             ascending=False, kind='mergesort')

# Tabla con combinaciones

In [8]:
# Render the depth/criterion accuracy table for sklearn's DecisionTreeClassifier.
display(segundaTablaEjercicio2(DecisionTreeClassifier))  

Unnamed: 0,Altura Máxima,Criterio de evaluación de corte,Accuracy (training),Accuracy (validación)
0,3,gini,0.819515,0.652829
2,5,gini,0.928154,0.664272
4,Infinito,gini,1.0,0.636683
1,3,entropy,0.791947,0.682955
3,5,entropy,0.917243,0.696879
5,Infinito,entropy,1.0,0.689822


# Ejercicio Extra: Resultados para nuestro clasificador

In [9]:
# Same per-fold table, this time for MiClasificadorArbol from the NuestroArbol module.
display(primerTablaEjercicio2(ourTree.MiClasificadorArbol))  

Unnamed: 0,Partición,Accuracy (training),Accuracy (validación),ROC AUC (training),ROC AUC( validación)
0,1,0.763689,0.659091,0.817342,0.694271
1,2,0.784483,0.666667,0.829856,0.704255
2,3,0.798851,0.655172,0.84508,0.672606
3,4,0.761494,0.666667,0.817244,0.685372
4,5,0.77937,0.744186,0.82004,0.810147


In [12]:
display(segundaTablaEjercicio2(ourTree.MiClasificadorArbol))
# This table took 10 minutes to build; look into how to improve performance.

Unnamed: 0,Altura Máxima,Criterio de evaluación de corte,Accuracy (training),Accuracy (validación)
0,3,gini,0.777577,0.678357
2,5,gini,0.883911,0.632031
4,Infinito,gini,1.0,0.590756
1,3,entropy,0.766674,0.676058
3,5,entropy,0.863816,0.662264
5,Infinito,entropy,1.0,0.655392


# Grid Search

### Ensayo de Grid Search para el árbol que usamos antes

In [10]:
# Build the scorer from the module-qualified metric so this cell can be re-run
# safely: wrapping the bare name `roc_auc_score` would, on a second run, wrap
# the scorer created by the first run instead of the metric function.
roc_auc_scorer = make_scorer(sk_metrics.roc_auc_score)
# A later cell passes `scoring=roc_auc_score`, so that name stays bound to the
# scorer. From here on the bare name no longer refers to the metric function.
roc_auc_score = roc_auc_scorer

# Candidate depths 3..9 combined with both split criteria.
param_grid = {'max_depth': np.arange(3, 10),'criterion': ('gini','entropy')}

# Exhaustive search over the grid, reusing the notebook-level folds.
arbol_grid = GridSearchCV(DecisionTreeClassifier(), param_grid,cv=kfold, scoring=roc_auc_scorer)
arbol_grid.fit(X_desarrollo, y_desarrollo)
print(arbol_grid.best_score_ , arbol_grid.best_params_)

0.6937245800176834 {'criterion': 'entropy', 'max_depth': 6}


### Ensayo de Grid Search para KNN

In [11]:
# Candidate settings for k-nearest neighbours: 3..9 neighbours and p in {1, 2}.
vecinos_param_grid = {
    'n_neighbors': np.arange(3, 10),
    'p': np.arange(1, 3),
}

# `roc_auc_score` is expected to be the scorer object bound by the earlier
# grid-search cell, not the raw metric function.
vecinos_grid = GridSearchCV(
    KNeighborsClassifier(),
    vecinos_param_grid,
    cv=kfold,
    scoring=roc_auc_score,
)
vecinos_grid.fit(X_desarrollo, y_desarrollo)
print(vecinos_grid.best_score_, vecinos_grid.best_params_)

0.7296823403628246 {'n_neighbors': 8, 'p': 2}
