In [1]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.svm import SVC

In [3]:
from pytictoc import TicToc
# Stopwatch instance reused by the tic()/toc() pairs that wrap the fitting cells.
t = TicToc()

### Cargar data

In [4]:
# Load the modelling DataFrame from disk. The context manager guarantees the file
# handle is closed even when unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('data_rf.pickle', 'rb') as infile:
    data_rf = pickle.load(infile)

In [5]:
# Preview the loaded frame (head and tail rows, as rendered by the notebook).
data_rf

Unnamed: 0,delegacion_inicio,incidente_c4,tipo_entrada,dow_creacion,hora_creacion,mes_creacion,geopoint,label
1164271,alvaro obregon,lesionado_atropellado,llamada_del_066,1,23,12,"-99.25598,19.33929",0
1164423,alvaro obregon,accidente_choque_sin_lesionados,llamada_del_066,1,18,12,"-99.2009,19.37831",1
372612,alvaro obregon,accidente_choque_con_lesionados,llamada_del_066,2,21,1,"-99.19104,19.36336",0
372622,alvaro obregon,accidente_choque_sin_lesionados,llamada_del_066,2,21,1,"-99.19145,19.36348",0
372628,alvaro obregon,accidente_choque_sin_lesionados,llamada_del_066,2,6,1,"-99.20009,19.38161",0
...,...,...,...,...,...,...,...,...
1380319,xochimilco,accidente_choque_sin_lesionados,llamada_del_911,5,6,10,"-99.12949,19.28373",0
1382406,xochimilco,accidente_choque_sin_lesionados,llamada_del_911,5,21,10,"-99.10249,19.29447",0
1382445,xochimilco,accidente_choque_con_lesionados,llamada_del_911,5,20,10,"-99.13296,19.21005",1
1382469,xochimilco,accidente_choque_con_lesionados,llamada_del_911,5,7,10,"-99.12949,19.28373",0


## Feature engineering

### Incidentes

In [6]:
# Seven most frequent incident types, most common first.
# The labels are read directly from the value_counts() index, which avoids relying on
# the column name that reset_index() assigns (it differs between pandas versions).
incidentes_top = data_rf['incidente_c4'].value_counts().head(7).index.values
incidentes_top

array(['accidente_choque_sin_lesionados',
       'accidente_choque_con_lesionados', 'lesionado_atropellado',
       'accidente_motociclista', 'accidente_volcadura',
       'lesionado_accidente_automovilistico',
       'accidente_persona_atrapada_desbarrancada'], dtype=object)

In [7]:
# Rows whose raw incident type belongs to the most frequent categories.
data_rf.loc[
    data_rf['incidente_c4'].isin(incidentes_top)
]

Unnamed: 0,delegacion_inicio,incidente_c4,tipo_entrada,dow_creacion,hora_creacion,mes_creacion,geopoint,label
1164271,alvaro obregon,lesionado_atropellado,llamada_del_066,1,23,12,"-99.25598,19.33929",0
1164423,alvaro obregon,accidente_choque_sin_lesionados,llamada_del_066,1,18,12,"-99.2009,19.37831",1
372612,alvaro obregon,accidente_choque_con_lesionados,llamada_del_066,2,21,1,"-99.19104,19.36336",0
372622,alvaro obregon,accidente_choque_sin_lesionados,llamada_del_066,2,21,1,"-99.19145,19.36348",0
372628,alvaro obregon,accidente_choque_sin_lesionados,llamada_del_066,2,6,1,"-99.20009,19.38161",0
...,...,...,...,...,...,...,...,...
1380319,xochimilco,accidente_choque_sin_lesionados,llamada_del_911,5,6,10,"-99.12949,19.28373",0
1382406,xochimilco,accidente_choque_sin_lesionados,llamada_del_911,5,21,10,"-99.10249,19.29447",0
1382445,xochimilco,accidente_choque_con_lesionados,llamada_del_911,5,20,10,"-99.13296,19.21005",1
1382469,xochimilco,accidente_choque_con_lesionados,llamada_del_911,5,7,10,"-99.12949,19.28373",0


In [8]:
# Keep the incident type when it is one of the top categories; everything else is
# collapsed into a single "otros" bucket.
mask_incidente_top = data_rf['incidente_c4'].isin(incidentes_top)
data_rf['incidente'] = np.where(mask_incidente_top, data_rf['incidente_c4'], "otros")

# Size of each resulting category.
data_rf['incidente'].value_counts()

accidente_choque_sin_lesionados             760243
accidente_choque_con_lesionados             314883
lesionado_atropellado                       193436
accidente_motociclista                       55594
accidente_volcadura                          31754
otros                                        14969
lesionado_accidente_automovilistico           6053
accidente_persona_atrapada_desbarrancada      5602
Name: incidente, dtype: int64

### Geopoints

In [9]:
# Geopoint strings that occur at least 300 times in the data.
# Filtering the value_counts() Series and reading its index avoids relying on the
# column name that reset_index() assigns (it differs between pandas versions).
conteo_geopoint = data_rf['geopoint'].value_counts()
geopoints_top = conteo_geopoint[conteo_geopoint >= 300].index.values

#geopoints_top

In [10]:
# Label each row by whether its geopoint is one of the frequently reported ones.
mask_geopoint_top = data_rf['geopoint'].isin(geopoints_top)
data_rf['tipo_geopoint'] = np.where(mask_geopoint_top, "frecuente", "aislado")

# Size of each group.
data_rf['tipo_geopoint'].value_counts()

aislado      1246332
frecuente     136202
Name: tipo_geopoint, dtype: int64

### Tipo_entrada

In [11]:
# Four most frequent report channels, most common first.
# The labels are read directly from the value_counts() index, which avoids relying on
# the column name that reset_index() assigns (it differs between pandas versions).
tipo_entrada_top = data_rf['tipo_entrada'].value_counts().head(4).index.values
tipo_entrada_top

array(['llamada_del_911', 'llamada_del_066', 'boton_de_auxilio', 'radio'],
      dtype=object)

In [12]:
# Keep the report channel when it is one of the top four; everything else becomes "otros".
mask_entrada_top = data_rf['tipo_entrada'].isin(tipo_entrada_top)
data_rf['tipo_entrada_mod'] = np.where(mask_entrada_top, data_rf['tipo_entrada'], "otros")

# Size of each resulting category.
data_rf['tipo_entrada_mod'].value_counts()

llamada_del_911     738879
llamada_del_066     463374
boton_de_auxilio     78609
radio                77698
otros                23974
Name: tipo_entrada_mod, dtype: int64

### Delegacion inicio

In [13]:
# Replace spaces with underscores in the delegacion_inicio values. The pattern is a
# plain space, so a literal (non-regex) replacement gives the same result.
nombres_delegacion = data_rf['delegacion_inicio']
data_rf['delegacion_inicio'] = nombres_delegacion.str.replace(" ", "_", regex=False)

### Conservar variables necesarias

In [14]:
# Drop the raw columns that the engineered features above were derived from.
# NOTE: this rebinds data_rf, so the cell cannot be re-run without reloading the data.
columnas_crudas = ['geopoint', 'incidente_c4', 'tipo_entrada']
data_rf = data_rf.drop(columnas_crudas, axis=1)

data_rf

Unnamed: 0,delegacion_inicio,dow_creacion,hora_creacion,mes_creacion,label,incidente,tipo_geopoint,tipo_entrada_mod
1164271,alvaro_obregon,1,23,12,0,lesionado_atropellado,aislado,llamada_del_066
1164423,alvaro_obregon,1,18,12,1,accidente_choque_sin_lesionados,aislado,llamada_del_066
372612,alvaro_obregon,2,21,1,0,accidente_choque_con_lesionados,frecuente,llamada_del_066
372622,alvaro_obregon,2,21,1,0,accidente_choque_sin_lesionados,aislado,llamada_del_066
372628,alvaro_obregon,2,6,1,0,accidente_choque_sin_lesionados,aislado,llamada_del_066
...,...,...,...,...,...,...,...,...
1380319,xochimilco,5,6,10,0,accidente_choque_sin_lesionados,aislado,llamada_del_911
1382406,xochimilco,5,21,10,0,accidente_choque_sin_lesionados,frecuente,llamada_del_911
1382445,xochimilco,5,20,10,1,accidente_choque_con_lesionados,aislado,llamada_del_911
1382469,xochimilco,5,7,10,0,accidente_choque_con_lesionados,aislado,llamada_del_911


### Encoding

In [15]:
# One-hot encode the categorical columns; the remaining columns pass through unchanged.
columnas_categoricas = [
    'delegacion_inicio',
    'incidente',
    'tipo_geopoint',
    'tipo_entrada_mod',
]
data_encoded = pd.get_dummies(data_rf, columns=columnas_categoricas)
data_encoded

Unnamed: 0,dow_creacion,hora_creacion,mes_creacion,label,delegacion_inicio_alvaro_obregon,delegacion_inicio_azcapotzalco,delegacion_inicio_benito_juarez,delegacion_inicio_coyoacan,delegacion_inicio_cuajimalpa,delegacion_inicio_cuauhtemoc,...,incidente_lesionado_accidente_automovilistico,incidente_lesionado_atropellado,incidente_otros,tipo_geopoint_aislado,tipo_geopoint_frecuente,tipo_entrada_mod_boton_de_auxilio,tipo_entrada_mod_llamada_del_066,tipo_entrada_mod_llamada_del_911,tipo_entrada_mod_otros,tipo_entrada_mod_radio
1164271,1,23,12,0,1,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0,0
1164423,1,18,12,1,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
372612,2,21,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
372622,2,21,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
372628,2,6,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1380319,5,6,10,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1382406,5,21,10,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1382445,5,20,10,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1382469,5,7,10,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


### Separar variable de respuesta del resto

In [16]:
# Feature matrix: every encoded column except the response.
x_data = data_encoded.drop('label', axis=1)

In [17]:
# Response vector.
y_data = data_encoded.loc[:, 'label']

### Dividir data en train test

In [18]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=42,
)

In [19]:
# Class proportions in the training split.
y_train.value_counts(normalize = True)

0    0.795943
1    0.204057
Name: label, dtype: float64

In [20]:
# Class proportions in the test split, for comparison with the training split.
y_test.value_counts(normalize = True)

0    0.796017
1    0.203983
Name: label, dtype: float64

### Definir RF

In [21]:
# Base estimator referenced by the grid-search cell (that search is currently commented out).
rfc=RandomForestClassifier(random_state=42)

In [22]:
# Hyper-parameter grid for the random-forest search: a single forest size and split
# criterion, five max_features values and two depth limits.
param_grid = dict(
    n_estimators=[200],
    max_features=list(range(5, 10)),
    max_depth=[20, 25],
    criterion=['gini'],
)

In [23]:
# NOTE(review): the saved output of this cell lists a different grid than the param_grid
# definition above, presumably because it was recorded before the grid was last edited.
# Re-run to refresh it.
param_grid

{'n_estimators': [100, 200],
 'max_features': [4, 5, 6, 7, 8],
 'criterion': ['gini']}

In [24]:

#CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, scoring = 'precision', n_jobs= 3)

In [25]:
# NOTE(review): the grid-search fit is commented out, so this tic()/toc() pair times nothing.
t.tic()
#CV_rfc.fit(x_train, y_train)
t.toc()

Elapsed time is 0.000046 seconds.


In [26]:
#CV_rfc.best_params_

In [28]:
# Final forest with hand-picked hyper-parameters (values taken from within param_grid).
rfc1 = RandomForestClassifier(
    n_estimators=200,
    criterion='gini',
    max_depth=20,
    max_features=8,
    random_state=42,
)

In [29]:
# Fit the final forest on the training split and report the elapsed wall-clock time.
t.tic()
rfc1.fit(x_train, y_train)
t.toc()

Elapsed time is 243.332920 seconds.


In [30]:
# Predicted labels for the held-out test split.
pred=rfc1.predict(x_test)

In [31]:
# Number of test rows assigned to each predicted class.
pd.DataFrame(pred).value_counts()

0    411613
1      3148
dtype: int64

In [51]:
# Raw array of test-set predictions.
pred 

array([0, 0, 0, ..., 0, 0, 0])

In [32]:
# Accuracy on the hold-out test split. `pred` is computed from x_test and no
# cross-validation is run in this notebook, so the message is labelled as test data.
print("Accuracy for Random Forest on test data: ", accuracy_score(y_test, pred))

Accuracy for Random Forest on CV data:  0.7948071298892615


In [59]:
# Predictions on the training split, used to compare train and test metrics.
pred_train=rfc1.predict(x_train)

In [61]:
# Accuracy on the training split.
accuracy_score(y_train,pred_train)

0.805647605378534

In [60]:
# Precision on the training split.
precision_score(y_train,pred_train)

0.6790179932906374

In [33]:
# Precision on the hold-out test split. `pred` is computed from x_test and no
# cross-validation is run in this notebook, so the message is labelled as test data.
print("Precision for Random Forest on test data: ", precision_score(y_test, pred))

Precision for Random Forest on CV data:  0.4202668360864041


In [49]:
# Pair each training column with the importance reported by the fitted forest,
# then list the features from most to least important.
d_imp = dict(feature=x_train.columns, importance=rfc1.feature_importances_)

tabla_importancias = pd.DataFrame(d_imp)
tabla_importancias.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
1,hora_creacion,0.383408
2,mes_creacion,0.275765
0,dow_creacion,0.160852
20,incidente_accidente_choque_sin_lesionados,0.038413
31,tipo_entrada_mod_boton_de_auxilio,0.018666
35,tipo_entrada_mod_radio,0.01687
19,incidente_accidente_choque_con_lesionados,0.011252
27,incidente_lesionado_atropellado,0.008493
33,tipo_entrada_mod_llamada_del_911,0.007933
32,tipo_entrada_mod_llamada_del_066,0.007618
