# Análisis de conjuntos de variables

In [1]:
%matplotlib notebook

In [2]:
import os
import json
import numpy as np
import pandas as pd
import datetime as dt

In [3]:
### Switch the working directory to the "base directory", taken here as the
### parent of the current working directory.
### NOTE(review): this cell is not idempotent -- it starts from os.getcwd(), so
### re-running it without restarting the kernel moves one more level up.
current_dir = os.getcwd()
base_path = os.path.dirname(current_dir)

os.chdir(base_path)

In [4]:
import scripts.funciones as funciones



In [5]:
from sklearn import metrics
from sklearn.base import clone
from xgboost import XGBClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from imblearn.under_sampling import TomekLinks
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import ExtraTreesClassifier

# Carga y preparación de datos

In [6]:
%%time

# Version tag for this run.
version = 'verFinal'

# Training split: pull the 'Accidente' target out of the frame, then discard
# the 'TW' and 'BARRIO' columns so they are not used as features.
X_train_under = pd.read_csv('data/train.csv')
Y_train_under = X_train_under.pop('Accidente')
X_train_under = X_train_under.drop(columns=['TW', 'BARRIO'])

# Validation split: same target extraction and column removal.
X_val = pd.read_csv('data/validation.csv')
Y_val = X_val.pop('Accidente')
X_val = X_val.drop(columns=['TW', 'BARRIO'])

Wall time: 456 ms


In [8]:
### Standardize the data set: the scaler is fitted on the training features
### only and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train_under)

X_train_under_z = pd.DataFrame(
    data=scaler.transform(X_train_under),
    columns=X_train_under.columns,
)
X_val_z = pd.DataFrame(
    data=scaler.transform(X_val),
    columns=X_val.columns,
)

# Entrenamiento de modelos eliminando conjuntos de variables

In [9]:
# Base learners of the voting ensemble. Hyperparameters are written out
# explicitly so the configuration does not drift with library defaults.

# Multi-layer perceptron with two hidden layers (29 and 53 units).
Classifier1 = MLPClassifier(
    activation='identity', alpha=0.05,
    batch_size='auto', beta_1=0.9, beta_2=0.999,
    early_stopping=False, epsilon=1e-08,
    hidden_layer_sizes=(29, 53),
    learning_rate='constant',
    learning_rate_init=0.001, max_fun=15000,
    max_iter=200, momentum=0.9, n_iter_no_change=10,
    nesterovs_momentum=True, power_t=0.5,
    random_state=42, shuffle=True, solver='adam',
    tol=0.0001, validation_fraction=0.1,
    verbose=False, warm_start=False,
)

# Extra-trees ensemble.
# `min_impurity_split` is deliberately NOT passed: the keyword was removed in
# scikit-learn 1.0 and passing it raises TypeError there. The value used
# before was None (its default), so omitting it does not change the model on
# older versions.
Classifier2 = ExtraTreesClassifier(
    bootstrap=True, ccp_alpha=0.0,
    class_weight=None, criterion='entropy',
    max_depth=20, max_features='log2',
    max_leaf_nodes=None, max_samples=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1, min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=500, n_jobs=None,
    oob_score=False, random_state=42,
    verbose=0, warm_start=False,
)

# L1-penalized logistic regression.
Classifier3 = LogisticRegression(
    C=0.6453715401646702, class_weight=None,
    dual=False, fit_intercept=True,
    intercept_scaling=1, l1_ratio=None,
    max_iter=100, multi_class='auto',
    n_jobs=None, penalty='l1', random_state=42,
    solver='liblinear', tol=0.0001, verbose=0,
    warm_start=False,
)

# Equal-weight vote over the three base learners.
# NOTE(review): the previous explicit `refit=True` is left implicit. It was the
# default in the mlxtend release this was written against, and newer releases
# appear to expose `fit_base_estimators` instead -- confirm against the
# installed version.
# NOTE(review): this variable shares its name with sklearn.ensemble's
# VotingClassifier class; the name is kept so existing references still work.
VotingClassifier = EnsembleVoteClassifier(
    clfs=[Classifier1, Classifier2, Classifier3],
    weights=[1, 1, 1],
)

# Estimators evaluated by the metric cells.
clasificadores = [VotingClassifier]

In [10]:
# Load the relevant-variables file stored under models/<version>/.
file_name = 'vars_relevantes_final.json'
# Build the path from the `version` value set in the data-loading cell rather
# than repeating the 'verFinal' literal, so the two cannot drift apart.
path = os.path.join(base_path, 'models', version, file_name)
# Explicit encoding so the result does not depend on the platform default.
with open(path, 'r', encoding='utf-8') as f:
    info_vars = json.load(f)

# Feature list stored under the 'voto' entry of the file.
vars_voto = info_vars['voto']['features']

In [11]:
def obtencion_metricas(clasificadores,variables,X,Y,X_val,Y_val):
    """Fit a clone of each classifier and print the mean validation metrics.

    Every estimator in ``clasificadores`` is cloned, fitted on
    ``X[variables]`` / ``Y`` and evaluated on ``X_val[variables]`` /
    ``Y_val``. Six scores are collected per estimator and their means across
    all estimators are printed.

    Parameters
    ----------
    clasificadores : iterable
        Estimators to evaluate; each is cloned before fitting, so the objects
        passed in are not fitted by this function.
    variables : list
        Column labels used to select the features from ``X`` and ``X_val``.
    X, Y
        Training features and training target.
    X_val, Y_val
        Validation features and validation target.

    Returns
    -------
    None
        Results are only printed.
    """
    # One score list per metric, keyed by the label used in the printed
    # summary; insertion order is the print order.
    puntajes = {
        'ROC': [],
        'PR': [],
        'Precision': [],
        'Recall': [],
        'F Score': [],
        'Balanced Accuracy': [],
    }

    for estimador in clasificadores:
        modelo = clone(estimador)
        modelo.fit(X[variables], Y)

        ### Validation metrics
        X_val_sel = X_val[variables]
        # Second column of the predict_proba output feeds the ranking metrics.
        prob_col1 = modelo.predict_proba(X_val_sel)[:, 1]
        etiquetas = modelo.predict(X_val_sel)

        puntajes['ROC'].append(metrics.roc_auc_score(Y_val, prob_col1))
        puntajes['PR'].append(funciones.precision_recall_auc_score(Y_val, prob_col1))
        puntajes['Precision'].append(metrics.precision_score(Y_val, etiquetas))
        puntajes['Recall'].append(metrics.recall_score(Y_val, etiquetas))
        puntajes['F Score'].append(metrics.f1_score(Y_val, etiquetas))
        puntajes['Balanced Accuracy'].append(metrics.balanced_accuracy_score(Y_val, etiquetas))

    for nombre, valores in puntajes.items():
        print(f'Mean {nombre}: {np.mean(valores)}')

    return None

### Vars elegidas

In [12]:
# Reference run: metrics using every feature listed in vars_voto.
obtencion_metricas(clasificadores,vars_voto,X_train_under_z,Y_train_under,X_val_z,Y_val)

Mean ROC: 0.7844021022705707
Mean PR: 0.060799060893915374
Mean Precision: 0.06569724866617847
Mean Recall: 0.45244956772334294
Mean F Score: 0.11473463049237234
Mean Balanced Accuracy: 0.666029051685836


### Eliminando variables relacionadas al barrio

In [13]:
# Keep only the selected features whose name does not contain 'poblado'.
vars_sinBarrio = [nombre for nombre in vars_voto if 'poblado' not in nombre]

obtencion_metricas(
    clasificadores, vars_sinBarrio,
    X_train_under_z, Y_train_under,
    X_val_z, Y_val,
)

Mean ROC: 0.7739596459429462
Mean PR: 0.057630758829553916
Mean Precision: 0.06656184486373165
Mean Recall: 0.3659942363112392
Mean F Score: 0.11263858093126385
Mean Balanced Accuracy: 0.6349807262599023


### Eliminando variables relacionadas al barrio y señales de accidentes

In [14]:
# Keep only the selected features whose name contains neither 'poblado' nor
# 'cumAcc'.
vars_sinBarrio_acc = [
    nombre for nombre in vars_voto
    if not ('poblado' in nombre or 'cumAcc' in nombre)
]

obtencion_metricas(
    clasificadores, vars_sinBarrio_acc,
    X_train_under_z, Y_train_under,
    X_val_z, Y_val,
)

Mean ROC: 0.7323888571998968
Mean PR: 0.0416509520816257
Mean Precision: 0.04997139042532901
Mean Recall: 0.18876080691642652
Mean F Score: 0.07902277182928669
Mean Balanced Accuracy: 0.5608080216456687


### Eliminando variables climáticas

In [15]:
# Feature names excluded in this run (weather-related group).
clima = [
    'precipIntensity',
    'precipProbability',
    'uvIndex',
    'visibility',
    'icon_clear-day',
    'icon_cloudy',
    'icon_fog',
    'cloudCover_mean',
    'precipIntensity_mean',
    'visibility_mean',
    'windSpeed_mean',
    'cloudCover_mean_forward',
    'dewPoint_mean_forward',
    'precipIntensity_mean_forward',
    'temperature_mean_forward',
]

# Keep every selected feature that is not listed in `clima`.
vars_sinClima = [nombre for nombre in vars_voto if nombre not in clima]

obtencion_metricas(
    clasificadores, vars_sinClima,
    X_train_under_z, Y_train_under,
    X_val_z, Y_val,
)

Mean ROC: 0.7791964423519826
Mean PR: 0.05837387017684498
Mean Precision: 0.06272401433691756
Mean Recall: 0.42867435158501443
Mean F Score: 0.10943535037704616
Mean Balanced Accuracy: 0.6544110471646545


### Eliminando señal de accidentes

In [16]:
# Keep only the selected features whose name does not contain 'cumAcc'.
vars_sinAccidente = [nombre for nombre in vars_voto if 'cumAcc' not in nombre]

obtencion_metricas(
    clasificadores, vars_sinAccidente,
    X_train_under_z, Y_train_under,
    X_val_z, Y_val,
)

Mean ROC: 0.7850353695380856
Mean PR: 0.06072219233763472
Mean Precision: 0.06737149926892363
Mean Recall: 0.4315561959654179
Mean F Score: 0.11654830236404319
Mean Balanced Accuracy: 0.659889282485897


### Eliminando las variables temporales

In [17]:
# Feature names excluded in this run (hour, weekday, holiday, month and year
# indicators).
tiempo = [
    'hora_0', 'hora_1', 'hora_2', 'hora_3',
    'hora_4', 'hora_5', 'hora_7', 'hora_11',
    'hora_13', 'hora_15', 'hora_16', 'hora_17',
    'hora_18', 'hora_19', 'hora_20', 'hora_22',
    'hora_23',
    'dia_sem_4', 'dia_sem_5', 'dia_sem_6',
    'festivo',
    'Mes_Abril', 'Mes_Agosto', 'Mes_Enero', 'Mes_Febrero',
    'Mes_Julio', 'Mes_Mayo', 'Mes_Septiembre',
    'Year_2017', 'Year_2018', 'Year_2019',
]

# Keep every selected feature that is not listed in `tiempo`.
vars_sinTiempo = [nombre for nombre in vars_voto if nombre not in tiempo]

obtencion_metricas(
    clasificadores, vars_sinTiempo,
    X_train_under_z, Y_train_under,
    X_val_z, Y_val,
)

Mean ROC: 0.7538172890539677
Mean PR: 0.051218777882587024
Mean Precision: 0.05999268381904646
Mean Recall: 0.35446685878962536
Mean F Score: 0.10261758264678278
Mean Balanced Accuracy: 0.6252740856098484
