# Distribución de los accidentes

In [1]:
# Select matplotlib's interactive "notebook" backend for the figures below.
%matplotlib notebook

In [2]:
import os
import json
import pandas as pd
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Switch the working directory to the project's base directory (the parent of
# the directory the kernel starts in) so that the `scripts` package can be
# imported and project-relative paths resolve.
#
# The switch is guarded to keep the cell idempotent: if the current directory
# already contains `scripts`, we are already at the base directory (e.g. the
# cell is being re-run) and must not climb another level.
current_dir = os.getcwd()
if os.path.isdir(os.path.join(current_dir, 'scripts')):
    base_path = current_dir
else:
    base_path = os.path.dirname(current_dir)

os.chdir(base_path)

In [4]:
import scripts.funciones as funciones



In [44]:
# Start and end datetimes handed to the data loader for the exploratory section.
d_ini = dt.datetime(year=2018, month=9, day=1)
d_fin = dt.datetime(year=2020, month=1, day=1)

In [45]:
# Load the weather + accident records between d_ini and d_fin through the
# project helper. poblado=True presumably restricts the rows to the
# El Poblado barrios — TODO confirm against scripts.funciones.
data = funciones.read_clima_accidentes(d_ini, d_fin, poblado = True)

In [46]:
# Partition the records by weather icon: rows flagged 'rain' versus all others.
rain_mask = data['icon'] == 'rain'
data_rain = data[rain_mask].reset_index(drop=True)
data_Norain = data[~rain_mask].reset_index(drop=True)

In [47]:
# Number of rows with icon == 'rain' and Accidente == 1, divided by the sum of
# the Accidente column (i.e. the share of accidents recorded under rain,
# assuming Accidente is a 0/1 flag — TODO confirm).
((data['icon']=='rain')&(data['Accidente']==1)).sum()/data['Accidente'].sum()

0.41194837635303916

In [48]:
# Accident rate among the rain rows: sum of Accidente over the number of rain rows.
data_rain['Accidente'].sum()/len(data_rain)

0.017394285111581833

In [49]:
# Accident rate among the non-rain rows: sum of Accidente over the number of non-rain rows.
data_Norain['Accidente'].sum()/len(data_Norain)

0.019705223802515295

In [50]:
# Violin plot of precipitation intensity grouped by the Accidente value,
# drawn on the axes of the figure named 'Violin'.
violin_fig = plt.figure('Violin')
ax = sns.violinplot(x='Accidente', y='precipIntensity', data=data, ax=violin_fig.gca())

<IPython.core.display.Javascript object>

In [24]:
# Display the column labels of the loaded frame.
data.columns

Index(['TW', 'BARRIO', 'summary', 'icon', 'precipIntensity',
       'precipProbability', 'temperature', 'apparentTemperature', 'dewPoint',
       'humidity', 'windSpeed', 'cloudCover', 'uvIndex', 'visibility',
       'Accidente'],
      dtype='object')

In [51]:
# 50-bin histogram of the strictly positive precipitation intensities among
# the rain rows, drawn on the figure named 'Histograma Precip'.
plt.figure('Histograma Precip')
rain_intensity = data_rain['precipIntensity']
rain_intensity[rain_intensity > 0].hist(bins=50)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1538a6a4148>

In [52]:
# precipIntensity threshold above which a record is labelled a downpour ("aguacero").
aguacero = 2.5

# Rows whose precipIntensity is NaN satisfy neither comparison and therefore
# end up in neither subset.
precip_intensity = data['precipIntensity']
data_aguacero = data.loc[precip_intensity > aguacero].reset_index(drop=True)
data_Noaguacero = data.loc[precip_intensity <= aguacero].reset_index(drop=True)

In [53]:
# Accident rate among the downpour rows: sum of Accidente over the number of such rows.
data_aguacero['Accidente'].sum()/len(data_aguacero)

0.019556974655757335

In [54]:
# Accident rate among the rows at or below the downpour threshold.
data_Noaguacero['Accidente'].sum()/len(data_Noaguacero)

0.018608810082162197

In [55]:
# Keep only the records of the 'alejandria' barrio, with a fresh 0..n-1 index.
is_alejandria = data['BARRIO'].eq('alejandria')
data_ale = data.loc[is_alejandria].reset_index(drop=True)

In [56]:
# Line plot of windSpeed against the TW column for the 'alejandria' subset.
data_ale.plot(x = 'TW', y = 'windSpeed')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1538c197bc8>

# Entrenar modelo para Nico

In [75]:
# joblib used to be re-exported as sklearn.externals.joblib; that alias was
# deprecated in scikit-learn 0.21 and removed in 0.23. Prefer the standalone
# package and fall back to the alias only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from scripts.clase_model.modelo import Modelo

In [76]:
# Start and end datetimes of the training window. These rebind the d_ini/d_fin
# names used by the exploratory section above.
d_ini = dt.datetime(year=2019, month=1, day=1)
d_fin = dt.datetime(year=2019, month=8, day=1)

In [77]:
# Identifier of this model version; it also names the artifact folder and file
# written by the training cell.
version = 'ver006p2'
# Timestamp handed to the Modelo constructor.
now_date = dt.datetime.now()

# Settings forwarded to mod.train() in the training cell.
# Number of cross-validation folds (the training log reports "Fitting 3 folds").
cv = 3
# Class-balancing strategy; 'rus' is presumably random under-sampling — TODO confirm in Modelo.
balance = 'rus'
# Name of the scoring metric used for model selection.
score = 'roc_auc'
# Presumably the target class proportion after under-sampling — TODO confirm exact meaning.
prop_deseada_under = 0.4
# Presumably the number of parallel workers (the log shows n_jobs=1) — TODO confirm.
n_proc = 1

# Free-text description of the run (kept in Spanish; it is runtime data passed
# to Modelo). The f-string is evaluated here, so it captures the current values
# of d_ini, d_fin, balance, score and prop_deseada_under.
descripcion = f""" Entrena modelo para realizar la prediccion de accidentes
                   en los barrios del Poblado. considera solo variables
                   de hora, dia semana y barrios relevantes. Entrena en las
                   fechas {d_ini}-{d_fin}. {balance}-{score}-{prop_deseada_under}."""

# Project wrapper object used below for training and prediction.
mod = Modelo(now_date, version, base_path, descripcion)

In [78]:
# Candidate estimators for mod.train(), keyed by name. Each entry stores the
# estimator under 'mod' and, under 'par', what is presumably its
# hyper-parameter search grid (empty here) — TODO confirm in Modelo.
logistic_l1 = LogisticRegression(penalty='l1', solver='saga', random_state=42)
models = {
    'logistic': {
        'mod': logistic_l1,
        'par': {},
    },
}
mod.models = models

In [79]:
# Load the 'ver005' model bundle through the project helper and display
# element 1 of the stored pipeline's steps (the ('modelo', ...) step shown in
# the recorded output).
nico = funciones.carga_model(base_path,'models/ver005','ver005')
nico['model'].steps[1]

('modelo',
 Pipeline(memory=None,
          steps=[('scaler',
                  StandardScaler(copy=True, with_mean=True, with_std=True)),
                 ('logistic',
                  LogisticRegression(C=1.0, class_weight=None, dual=False,
                                     fit_intercept=True, intercept_scaling=1,
                                     l1_ratio=None, max_iter=100,
                                     multi_class='auto', n_jobs=None,
                                     penalty='l1', random_state=42,
                                     solver='saga', tol=0.0001, verbose=0,
                                     warm_start=False))],
          verbose=False))

In [80]:
# Reload the records for the training window and run the project's
# feature-organising step on them.
data = funciones.read_clima_accidentes(d_ini, d_fin, poblado=True)
data_org = funciones.organizar_data_infoClima(data)

# One-hot encode the barrio under a 'poblado_' prefix while keeping the
# original 'BARRIO' column in place.
data_org['poblado'] = data_org['BARRIO']
data_org = pd.get_dummies(data_org, columns=['poblado'])

X = data_org.drop(columns=['TW', 'BARRIO', 'Accidente', 'summary'])

# Case without climate variables: keep only the hour-of-day, day-of-week and
# selected barrio indicator columns, in this exact order.
hour_cols = [f'hora_{hour}' for hour in range(24)]
weekday_cols = [f'dia_sem_{day}' for day in range(7)]
barrio_cols = [
    'poblado_alejandria',
    'poblado_altosdelpoblado',
    'poblado_astorga',
    'poblado_castropol',
    'poblado_elcastillo',
    'poblado_eldiamanteno2',
    'poblado_laaguacatala',
    'poblado_lalinde',
    'poblado_losbalsosno1',
    'poblado_losnaranjos',
    'poblado_manila',
    'poblado_sanlucas',
    'poblado_santamariadelosangeles',
    'poblado_villacarlota',
]
vars_ele = hour_cols + weekday_cols + barrio_cols

X = X[vars_ele]
Y = data_org['Accidente']

In [81]:
# Train the candidate models. The call returns what are presumably the
# held-out features/labels, the per-model results and the key of the selected
# model — TODO confirm against Modelo.train.
# Note: `models` is rebound here from the candidate dict to the training results.
X_test, Y_test, models, selected = mod.train(X,
                                             Y,
                                             cv=cv,
                                             score=score,
                                             n_proc=n_proc,
                                             balance=balance,
                                             prop_deseada_under=prop_deseada_under)

# Folder for this version's artifacts, built once with os.path.join so the
# path does not mix '/' with the platform separator.
model_dir = os.path.join(base_path, 'models', version)

# Predict on the test data with the selected model and attach the true labels.
model_sel = models[selected]['bestModel']
preds_ff = mod.predict(X_test, model_sel)
# NOTE(review): this assignment aligns on index; if mod.predict returns a
# frame with a fresh index, labels may be misaligned — verify.
preds_ff['Accidente'] = Y_test

# Produce the ROC-AUC curve and violin plots used to analyse the model's
# behaviour and performance, plus the precision/recall graph.
funciones.graphs_evaluation(model_dir, selected, preds_ff, save=True)
funciones.precision_recall_graph(model_dir, selected, preds_ff, save=True)

# Decision thresholds at which a confusion matrix is produced.
bound = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]

for b in bound:
    funciones.matrix_confusion(model_dir, selected, preds_ff, b, save=True)

# Persist the Modelo object together with the selected estimator as one pipeline.
mod_pipe = Pipeline([('procesador', mod),
                     ('modelo', model_sel)])

path_best_mod = os.path.join(model_dir, f"{version}.sav")

# NOTE(review): the recorded output of this cell ends with
# "NameError: name 'logger' is not defined" although no `logger` is referenced
# here; it is presumably raised inside a project helper — find the source
# before trusting the saved artifact.
joblib.dump(mod_pipe, path_best_mod)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=   0.2s
[CV]  ................................................................
[CV] ................................................. , total=   0.2s

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s



[CV]  ................................................................
[CV] ................................................. , total=   0.5s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.8s finished


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pl_data['Predicted'] = (pl_data['Predicted'] > bound).astype(int)


<IPython.core.display.Javascript object>

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pl_data['Predicted'] = (pl_data['Predicted'] > bound).astype(int)


<IPython.core.display.Javascript object>

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pl_data['Predicted'] = (pl_data['Predicted'] > bound).astype(int)


<IPython.core.display.Javascript object>

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pl_data['Predicted'] = (pl_data['Predicted'] > bound).astype(int)


<IPython.core.display.Javascript object>

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pl_data['Predicted'] = (pl_data['Predicted'] > bound).astype(int)


<IPython.core.display.Javascript object>

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pl_data['Predicted'] = (pl_data['Predicted'] > bound).astype(int)


<IPython.core.display.Javascript object>

NameError: name 'logger' is not defined

In [82]:
# Repeats the dump that ends the training cell; presumably re-run by hand
# because the recorded run of that cell stopped with a NameError. It relies on
# mod_pipe and path_best_mod already being present in the kernel.
joblib.dump(mod_pipe, path_best_mod)

['C:\\Users\\pasal\\Google Drive\\EAFIT\\Ciencia de Datos y Analitica\\02 Segundo Semestre\\Proyecto Integrador\\Proyecto-Integrador-2/models/ver006p2\\ver006p2.sav']