# Analisis de conjuntos de variables

In [1]:
%matplotlib notebook

In [2]:
import os
import json
import numpy as np
import pandas as pd
import datetime as dt

In [3]:
### Realizamos el cambio de directoroi de trabajo al "Directorio Base" que se
current_dir = os.getcwd()
base_path = os.path.dirname(current_dir)

os.chdir(base_path)

In [4]:
import scripts.funciones as funciones



In [5]:
from sklearn import metrics
from sklearn.base import clone
from xgboost import XGBClassifier
from imblearn.under_sampling import TomekLinks
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

# Carga y preparación de datos

In [6]:
d_ini = dt.datetime(2017,6,1)
d_fin = dt.datetime(2019,8,1) 

In [7]:
%%time

version = 'verFinal'

### params
freq1 = '1D'
freq2 = '3D'
freq3 = '7D'
freq4 = '14D'
freq5 = '30D'
freq6 = '60D'

### Realizamos la lectura de la informacion climatica en el rango de fechas
### especificado, incluye la etiqueta de si ocurre o no un accidente. 
### Posteriormente, en la organizacion de la informacion climatica, lo
### que se hace es agregar las variables con la informacion distribucional
### de las ultimas 5 horas de la info climatica
data = funciones.read_clima_accidentes(d_ini, d_fin, poblado = True)
data_org = funciones.organizar_data_infoClima(data)


### agregamos la informacion relacionada a la cantidad de accidentes ocurridas
### en las ultimas X horas
### Agregar senales
senales = [freq1, freq2, freq3, freq4, freq5, freq6]
d_ini_acc = d_ini - dt.timedelta(days = int(freq6.replace('D', '')))  ### freq mayor
raw_accidentes = funciones.read_accidentes(d_ini_acc, d_fin)
for fresen in senales:
    data_org = funciones.obtener_accidentes_acumulados(data_org, 
                                                        raw_accidentes, 
                                                        freq = fresen)


### Convertimos la bariable de Barrios en variable dummy para ser incluida
### en el modelo
data_org['poblado'] = data_org['BARRIO']
data_org= pd.get_dummies(data_org, columns=['poblado'])

### Relizamos la particion del conjunto de datos en las variables
### explicativas (X) y la variable respuesta (Y)
X = data_org.drop(columns = ['TW','BARRIO','Accidente','summary'])
Y = data_org['Accidente'] 

Wall time: 38.4 s


In [8]:
### Dividimos el conjunto de datos en entrenamiento y validacion
X_train, X_val, Y_train, Y_val = train_test_split(X, 
                                                  Y,
                                                  stratify = Y,
                                                  test_size = 0.2,
                                                  random_state = 42)

In [9]:
%%time
### Realizamos resampling combinando Tomek Links y Random Undersampling

### Tomek Link
tomeklinks = TomekLinks()
X_tom, y_tom = tomeklinks.fit_sample(X_train, Y_train)

### Random Undersampling
rus = RandomUnderSampler(sampling_strategy = 30/70,random_state = 42)
X_train_under, Y_train_under = rus.fit_sample(X_train, Y_train)

Wall time: 13min 41s


In [10]:
### Estandarizacion del conjunto de datos
scaler = StandardScaler()
scaler.fit(X_train_under)

X_train_under_z = pd.DataFrame(scaler.transform(X_train_under), columns = X_train_under.columns)
X_val_z = pd.DataFrame(scaler.transform(X_val), columns = X_val.columns)

# Entrenamiento de modelos eliminando conjuntos de variables

In [11]:
Classifier1 = XGBClassifier(n_estimators = 300, 
                           max_depth = 2,
                           random_state = 42)

Classifier2 = RandomForestClassifier(bootstrap=False,  
                             criterion='entropy',
                             max_features='auto',
                             n_estimators=500, 
                             max_depth=10,
                             random_state=42,
                             warm_start=False)

Classifier3 = LogisticRegression()

clasificadores = [Classifier1, Classifier2, Classifier3]

In [12]:
file_name = 'vars_relevantes_final.json'
path = os.path.join(base_path, f'models/verFinal/{file_name}')
with open(path, 'r') as f:
    info_vars = json.load(f)

vars_voto = info_vars['voto']['features']

In [13]:
def obtencion_metricas(clasificadores,variables,X,Y,X_val,Y_val):
    
    ROC = []
    PR = []
    precision = []
    recall = []
    fscore = []
    bAccuracY = []

    for clf in clasificadores:
        clf_mod = clone(clf)
        clf_mod.fit(X[variables],Y)

        ### Metricas en validation

        preds_val = clf_mod.predict_proba(X_val[variables])
        labels_val = clf_mod.predict(X_val[variables])

        ROC_mod = metrics.roc_auc_score(Y_val,preds_val[:,1])
        PR_mod = funciones.precision_recall_auc_score(Y_val,preds_val[:,1])
        precision_mod = metrics.precision_score(Y_val,labels_val)
        recall_mod = metrics.recall_score(Y_val,labels_val)
        f1_mod = metrics.f1_score(Y_val,labels_val)
        bAccuracY_mod = metrics.balanced_accuracy_score(Y_val,labels_val)

        ROC.append(ROC_mod)
        PR.append(PR_mod)
        precision.append(precision_mod) 
        recall.append(recall_mod)
        fscore.append(f1_mod)
        bAccuracY.append(bAccuracY_mod)


    print(f'Mean ROC: {np.mean(ROC)}')
    print(f'Mean PR: {np.mean(PR)}')
    print(f'Mean Precision: {np.mean(precision)}')
    print(f'Mean Recall: {np.mean(recall)}')
    print(f'Mean F Score: {np.mean(fscore)}')
    print(f'Mean Balanced Accuracy: {np.mean(bAccuracY)}')
    
    
    return None

### Vars elegidas

In [35]:
obtencion_metricas(clasificadores,vars_voto,X_train_under_z,Y_train_under,X_val_z,Y_val)

Mean ROC: 0.7775714076139798
Mean PR: 0.0594891099789445
Mean Precision: 0.0681817573505847
Mean Recall: 0.4121037463976946
Mean F Score: 0.11652741769906107
Mean Balanced Accuracy: 0.6528119125609653


### Eliminando variables relacionadas al barrio

In [14]:
vars_sinBarrio = []
for col in vars_voto:
    if not 'poblado' in col:
        vars_sinBarrio.append(col)

obtencion_metricas(clasificadores,vars_sinBarrio,X_train_under_z,Y_train_under,X_val_z,Y_val)

Mean ROC: 0.7716137940052525
Mean PR: 0.0585062674301878
Mean Precision: 0.06674881935952924
Mean Recall: 0.39024975984630167
Mean F Score: 0.11386706078988806
Mean Balanced Accuracy: 0.6439226727687714


### Eliminando variables relacionadas al barrio y senales de accidentes

In [34]:
vars_sinBarrio_acc = []
for col in vars_voto:
    if (not 'poblado' in col) and (not 'cumAcc' in col):
        vars_sinBarrio_acc.append(col)
        
obtencion_metricas(clasificadores,vars_sinBarrio_acc,X_train_under_z,Y_train_under,X_val_z,Y_val)

Mean ROC: 0.7302450136843225
Mean PR: 0.04287146648881309
Mean Precision: 0.056832906156912465
Mean Recall: 0.1611431316042267
Mean F Score: 0.07816517068718949
Mean Balanced Accuracy: 0.5542402859491371


### Eliminando variables climaticas

In [29]:
clima = ['precipIntensity',
         'precipProbability',
         'uvIndex',
         'visibility',
          'icon_clear-day',
         'icon_cloudy',
         'icon_fog',
          'cloudCover_mean',
         'precipIntensity_mean',
         'visibility_mean',
         'windSpeed_mean',
         'cloudCover_mean_forward',
         'dewPoint_mean_forward',
         'precipIntensity_mean_forward',
         'temperature_mean_forward']

vars_sinClima = []
for col in vars_voto:
    if not col in clima:
        vars_sinClima.append(col)

obtencion_metricas(clasificadores,vars_sinClima,X_train_under_z,Y_train_under,X_val_z,Y_val)

Mean ROC: 0.7717489097987461
Mean PR: 0.05682754960893314
Mean Precision: 0.06632320864506924
Mean Recall: 0.3463016330451489
Mean F Score: 0.10844431553703021
Mean Balanced Accuracy: 0.6262128388187879


### Eliminando senal de accidentes

In [23]:
vars_sinAccidente = []
for col in vars_voto:
    if not 'cumAcc' in col:
        vars_sinAccidente.append(col)

obtencion_metricas(clasificadores,vars_sinAccidente,X_train_under_z,Y_train_under,X_val_z,Y_val)

Mean ROC: 0.7774831827017391
Mean PR: 0.059158515615715314
Mean Precision: 0.0719675410973866
Mean Recall: 0.3378962536023054
Mean F Score: 0.11491955806049829
Mean Balanced Accuracy: 0.6267327045795296


### Eliminando las temporales

In [28]:
tiempo = ['hora_0', 'hora_1',
         'hora_2', 'hora_3',
         'hora_4', 'hora_5',
         'hora_7', 'hora_11',
         'hora_13', 'hora_15',
         'hora_16', 'hora_17',
         'hora_18', 'hora_19',
         'hora_20', 'hora_22',
         'hora_23',  'dia_sem_4',
         'dia_sem_5', 'dia_sem_6',
         'festivo', 'Mes_Abril',
         'Mes_Agosto', 'Mes_Enero',
         'Mes_Febrero', 'Mes_Julio',
         'Mes_Mayo', 'Mes_Septiembre',
         'Year_2017', 'Year_2018', 'Year_2019']

vars_sinTiempo = []
for col in vars_voto:
    if not col in tiempo:
        vars_sinTiempo.append(col)

obtencion_metricas(clasificadores,vars_sinTiempo,X_train_under_z,Y_train_under,X_val_z,Y_val)

Mean ROC: 0.752735956423054
Mean PR: 0.050657436197890926
Mean Precision: 0.05953855935532439
Mean Recall: 0.3539865513928914
Mean F Score: 0.10175267556421098
Mean Balanced Accuracy: 0.6244273239285204
