# <center> Crear el modelo de AI </center>

Con los datos generados por el script de Consolidados, entrenar un modelo de AI y anotar sus predicciones.

In [54]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

from catboost import CatBoostRegressor


In [55]:
# Load the consolidated training set; the path is relative to the notebook's
# working directory.
training_csv_path = '../Consolidados/training_set.csv'
data = pd.read_csv(training_csv_path)

In [56]:
# Sections where most pharmacies are long-established (per the original note,
# 'farm_antiguas' counts pharmacies that are 5 years old or more).
# Step 1: keep only sections with at least one pharmacy, which also avoids a
# division by zero in the ratio below.
has_pharmacies = data['farm_total'] > 0
sections_with_pharmacies = data.loc[has_pharmacies]

# Step 2: keep sections where at least 60% of the pharmacies are old ones.
old_pharmacy_share = sections_with_pharmacies['farm_antiguas'].div(
    sections_with_pharmacies['farm_total']
)
stable_sections = sections_with_pharmacies.loc[old_pharmacy_share.ge(0.6)]

# Displayed as the cell output: (rows, columns) of the filtered frame.
stable_sections.shape

(12514, 81)

In [57]:
# Build the training and validation sets.

# Columns removed before modelling: the geometry and section id, plus the
# pharmacy breakdown columns that sit alongside the 'farm_total' target.
drop_cols = [
    'geometry',
    'id_ent_secc',
    'farm_con_super',
    'farm_sin_super',
    'farm_1_anio',
    'farm_3_anios',
    'farm_5_anios',
    'farm_antiguas',
]
trainig_set = stable_sections.drop(columns=drop_cols)

# Target is the total number of pharmacies; everything else is a feature.
Y = trainig_set['farm_total']
X = trainig_set.drop(columns='farm_total')

# 80/20 split with a fixed seed, stratified by the 'ENTIDAD_x' column so both
# subsets keep the same proportion of each of its values.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=X['ENTIDAD_x'],
)

In [61]:
# Train the model and check it against the validation set.
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 10,
    'loss_function': 'RMSE',
    'verbose': True,
    'use_best_model': True,
    'random_seed': 42,
}
model = CatBoostRegressor(**catboost_params)

# The validation split doubles as the eval set used for early stopping.
model.fit(
    x_train,
    y_train,
    eval_set=(x_val, y_val),
    early_stopping_rounds=50,
)
y_pred = model.predict(x_val)

# RMSE on the validation set, displayed as the cell output.
validation_rmse = sqrt(mean_squared_error(y_val, y_pred))
validation_rmse


0:	learn: 2.2084509	test: 2.3025649	best: 2.3025649 (0)	total: 234ms	remaining: 3m 53s
1:	learn: 2.1832566	test: 2.2875957	best: 2.2875957 (1)	total: 433ms	remaining: 3m 35s
2:	learn: 2.1619959	test: 2.2734064	best: 2.2734064 (2)	total: 658ms	remaining: 3m 38s
3:	learn: 2.1410190	test: 2.2591511	best: 2.2591511 (3)	total: 867ms	remaining: 3m 35s
4:	learn: 2.1255350	test: 2.2516488	best: 2.2516488 (4)	total: 1.06s	remaining: 3m 32s
5:	learn: 2.1106508	test: 2.2449289	best: 2.2449289 (5)	total: 1.23s	remaining: 3m 24s
6:	learn: 2.0935611	test: 2.2364508	best: 2.2364508 (6)	total: 1.37s	remaining: 3m 14s
7:	learn: 2.0761739	test: 2.2283084	best: 2.2283084 (7)	total: 1.52s	remaining: 3m 8s
8:	learn: 2.0604995	test: 2.2207019	best: 2.2207019 (8)	total: 1.67s	remaining: 3m 3s
9:	learn: 2.0467230	test: 2.2145843	best: 2.2145843 (9)	total: 1.81s	remaining: 2m 59s
10:	learn: 2.0352508	test: 2.2107743	best: 2.2107743 (10)	total: 1.95s	remaining: 2m 55s
11:	learn: 2.0239392	test: 2.2061395	best: 

2.1307459320198854

In [62]:
# Use the trained model to predict over every section, not only the stable ones.

# Same feature columns as in training: drop the excluded columns and the target.
x_test = data.drop(columns=drop_cols + ['farm_total'])

y_pred = model.predict(x_test)

# Attach the prediction and its gap to the observed total to a copy of the
# data, then rank sections from the largest positive gap (predicted > actual)
# downwards.
pred_data = (
    data
    .assign(
        farm_total_pred=y_pred,
        diferencia=lambda frame: frame['farm_total_pred'] - frame['farm_total'],
    )
    .sort_values('diferencia', ascending=False)
)



In [63]:
# Show every column when rendering DataFrames (no column truncation).
pd.options.display.max_columns = None

# Display the first 200 rows of the ranked predictions.
pred_data.iloc[:200]

Unnamed: 0,ENTIDAD_x,SECCION_x,geometry,area,farm_total,farm_con_super,farm_sin_super,farm_1_anio,farm_3_anios,farm_5_anios,farm_antiguas,id_ent_secc,ENTIDAD_y,SECCION_y,POBTOT,P_0A2,P_5YMAS,P_60YMAS,REL_H_M,PROM_HNV,PRESOE15_F,PRESOE15_M,P3YM_HLI,P3YM_HLI_F,P3YM_HLI_M,P3HLINHE,P3HLINHE_F,P3HLINHE_M,P3HLI_HE,P3HLI_HE_F,P3HLI_HE_M,P5_HLI,P5_HLI_NHE,P5_HLI_HE,POB_AFRO,POB_AFRO_F,POB_AFRO_M,PCON_DISC,PCDISC_MOT,PCDISC_VIS,PCDISC_LEN,PCDISC_AUD,PCDISC_M_A,PCDISC_MEN,PSIND_LIM,P15YM_AN,P15SEC_IN,P18YM_PB_M,GRAPROES,GRAPROES_F,GRAPROES_M,PEA,PEA_F,PEA_M,PE_INAC,PE_INAC_F,PE_INAC_M,POCUPADA,POCUPADA_F,POCUPADA_M,PDESOCUP,PDESOCUP_F,PDESOCUP_M,PSINDER,PDER_SS,PDER_IMSS,PDER_ISTE,PDER_ISTEE,PAFIL_PDOM,PDER_SEGP,PDER_IMSSB,PAFIL_IPRI,PAFIL_OTRA,POBHOG,TVIVHAB,VPH_EXCSA,VPH_DSADMA,VPH_PC,VPH_INTER,VPH_SINCIN,VPH_SINTIC,farm_total_pred,diferencia
18584,15,5912,"POLYGON ((-99.026864269 19.781195583999946, -9...",0.001225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15-5912,15,5912,39780,1646,36647,2246,95.24,1.83,2212,2206,587,306,281,1,0,1,574,299,275,584,1,574,693,349,344,2040,695,1123,250,345,234,320,31847,325,1188,6004,10.03,9.89,10.17,18938,7692,11246,11686,8198,3488,18467,7559,10908,471,133,338,13363,26369,17750,2581,238,427,4762,167,179,464,39780,12156,12118,12096,4085,6456,4876,49,11.250923,11.250923
10730,14,2049,POLYGON ((-103.26149443370863 20.5810087217306...,0.000328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14-2049,14,2049,26814,1404,24165,1923,103.18,2.26,256,234,185,91,94,0,0,0,182,90,92,182,0,179,292,147,145,1164,490,528,152,224,153,243,22438,714,968,2433,8.36,8.42,8.31,13298,5117,8181,6934,4913,2021,13063,5039,8024,235,78,157,8628,18016,13469,189,29,111,2593,41,1536,198,26784,6776,6643,6579,1684,2134,4075,50,9.240985,9.240985
60623,8,1615,POLYGON ((-106.46210634901391 31.7378212271192...,0.000020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8-1615,8,1615,200,4,191,44,102.02,1.60,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,7,8,0,3,2,3,165,1,1,56,12.48,12.38,12.56,99,37,62,74,48,26,97,37,60,2,0,2,43,157,107,11,0,0,19,0,12,8,200,70,69,69,47,54,13,0,7.766991,7.766991
22173,16,760,POLYGON ((-101.56027840392815 20.3221349899413...,0.000828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16-760,16,760,2549,121,2339,453,94.28,2.64,40,41,2,1,1,0,0,0,2,1,1,2,0,2,3,1,2,299,156,141,25,61,33,53,1721,158,104,245,7.60,7.37,7.86,1189,460,729,849,592,257,1173,457,716,16,3,13,1287,1261,143,154,8,0,952,5,1,5,2547,692,653,652,153,258,388,5,7.757077,7.757077
19629,15,906,POLYGON ((-99.24977195820289 19.66885146738725...,0.000893,2.0,0.0,2.0,0.0,1.0,0.0,1.0,15-906,15,906,19433,1013,17670,1155,100.53,1.96,325,309,794,403,391,5,5,0,786,397,389,792,5,784,655,313,342,698,265,365,99,106,89,107,15834,385,549,2184,8.91,8.81,9.00,9319,3499,5820,5623,4018,1605,9118,3444,5674,201,55,146,8798,10614,6552,326,121,119,2831,53,163,521,19433,4857,4374,3923,1268,1685,2809,46,9.645153,7.645153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22284,16,86,POLYGON ((-102.35340617293006 19.0934063339890...,0.000012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16-86,16,86,886,32,826,162,86.92,2.14,7,5,1,0,1,0,0,0,1,0,1,1,0,1,1,0,1,61,38,18,6,16,9,7,710,24,37,161,10.00,10.00,9.99,542,271,271,198,124,74,530,268,262,12,3,9,362,524,262,145,0,2,104,6,12,2,886,275,275,275,111,159,100,4,4.705121,4.705121
10667,14,1996,POLYGON ((-105.19511224605509 20.7051578060879...,0.000607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14-1996,14,1996,11811,619,10776,996,109.97,2.12,331,409,156,71,85,2,2,0,152,67,85,156,2,152,198,92,106,493,227,225,67,93,81,92,10030,260,374,1507,9.03,9.18,8.89,6057,2612,3445,2918,1780,1138,5990,2596,3394,67,16,51,3446,8353,6544,233,5,40,1469,53,51,38,11215,3021,2982,2964,999,1624,1223,36,4.702951,4.702951
56369,5,1517,POLYGON ((-100.74064776719072 28.6950222184451...,0.010529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5-1517,5,1517,153,7,142,35,128.36,3.65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,23,6,17,17,10,8,4,2,9,2,108,11,0,7,6.10,6.48,5.83,65,14,51,53,37,16,65,14,51,0,0,0,44,109,55,0,0,0,54,0,0,0,153,46,37,37,1,1,44,1,4.702560,4.702560
23069,17,674,POLYGON ((-99.1037889060427 18.991744485128486...,0.000014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17-674,17,674,1280,44,1203,245,84.44,1.80,23,22,16,5,11,0,0,0,16,5,11,16,0,16,8,5,3,107,39,41,14,38,17,18,935,13,25,293,11.60,11.62,11.58,747,360,387,342,230,112,740,357,383,7,3,4,524,756,301,217,0,2,223,0,18,2,1280,366,362,361,164,250,108,5,4.701871,4.701871


In [137]:
# Build the list of proposed sections: the ids ("Entidad-Sección", taken from
# 'id_ent_secc') of the first 200 rows of pred_data.
# NOTE(review): this relies on pred_data still being sorted by 'diferencia'
# in descending order, as done in the prediction cell; confirm the cells are
# run in order.
n_propuestas = 200

# head() replaces the row-by-row iloc loop: it yields the same ids in the same
# order, but does not raise IndexError when there are fewer rows than
# n_propuestas (it simply returns all available rows).
lista_farmacias = pred_data['id_ent_secc'].head(n_propuestas).tolist()

# Persist the proposal as a single-column CSV without the index.
df_lista_farmacias = pd.DataFrame({'Entidad-Sección': lista_farmacias})
df_lista_farmacias.to_csv('../Consolidados/lista_propuesta.csv', index=False)