In [74]:
# Mount Google Drive in the Colab runtime at /content/drive so the CSV under
# MyDrive can be read with ordinary file paths.
from  google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np

In [76]:
# Read the CSV file (stored under the mounted Drive path) into a DataFrame.
archivo_csv = '/content/drive/MyDrive/ENAHO DATA/df_result_3_tomodel_noh_enaho.csv'
df = pd.read_csv(archivo_csv)

In [77]:
# Add MES_NUM: the 1-based month number looked up from the upper-case Spanish
# month name in MES. Names that are not in the list get None from dict.get.
meses = [
    'ENERO', 'FEBRERO', 'MARZO', 'ABRIL', 'MAYO', 'JUNIO',
    'JULIO', 'AGOSTO', 'SETIEMBRE', 'OCTUBRE', 'NOVIEMBRE', 'DICIEMBRE',
]
diccionario = dict(zip(meses, range(1, len(meses) + 1)))
df["MES_NUM"] = df["MES"].apply(lambda nombre_mes: diccionario.get(nombre_mes))

In [78]:
# Order rows by vaccine, province, year and month number so that rows inside
# each group follow calendar order.
df = df.sort_values(by=['VACUNA', 'PROVINCIA', 'ANIO', 'MES_NUM'])

In [79]:
# Replace the spaces in the staff-count column name with underscores.
df = df.rename(columns={"CANTIDAD DE PERSONAL": "CANTIDAD_DE_PERSONAL"})

In [80]:
# Replace each of these columns with log(x + 1).
# NOTE: this overwrites the columns in place, so running the cell twice
# applies the transform twice.
for columna in ("CANTIDAD_DE_PERSONAL", "CANTIDAD", "PRESUPUESTO", "POBLACION"):
    df[columna] = np.log(df[columna] + 1)

In [81]:
# Shift, per (PROVINCIA, VACUNA) group, every column that is NOT listed below
# (CANTIDAD, plus any other unlisted column) down by two rows. The listed
# columns are parked in the index so they stay attached to their own row.
# The first two rows of each group end up with NaN in the shifted columns.
# Presumably two rows == two months, given the earlier sort -- TODO confirm.
index_cols = [
    'VACUNA', 'ANIO', 'MES', 'PROVINCIA', 'RANGO_EDAD',
    'CANTIDAD_DE_PERSONAL', 'POBLACION', 'PRESUPUESTO',
    'UBIGEO', 'ALFABETISMO', 'AGUA_POTABLE', 'DEMORA_CITA_horas',
    'DEMORA_TRASLADO_horas', 'MES_NUM',
]
df_list = []
for item, df_group in df.groupby(['PROVINCIA', 'VACUNA']):
    df_group_shifted = df_group.set_index(index_cols).shift(2).reset_index()
    df_list.append(df_group_shifted)
# Each group restarts its row index at 0, so index labels repeat across groups.
df_shifted_total = pd.concat(df_list)

In [82]:
# Keep every row whose PROVINCIA is not 'PURUS'.
df = df_shifted_total.loc[df_shifted_total['PROVINCIA'].ne('PURUS')]

In [83]:
# Drop the rows where CANTIDAD is missing (e.g. the leading rows of each group
# left empty by the shift).
df = df.dropna(subset=['CANTIDAD'])

In [84]:
# Predictors (X): selected columns of df, with non-numeric columns expanded
# into dummy (one-hot) columns by get_dummies.
X = pd.get_dummies(
    df[[
        'ANIO', 'MES', 'PROVINCIA', 'VACUNA', 'CANTIDAD_DE_PERSONAL',
        'POBLACION', 'PRESUPUESTO', 'ALFABETISMO', 'AGUA_POTABLE',
        'DEMORA_CITA_horas', 'DEMORA_TRASLADO_horas',
    ]]
)
# Target (y): CANTIDAD, which is on the log(x + 1) scale at this point.
y = df['CANTIDAD']

# Random 80/20 split into training and test sets (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create and train the XGBoost regressor with default hyper-parameters.
model = XGBRegressor()
model.fit(X_train, y_train)

# Predict on the test set.
y_pred = model.predict(X_test)

# Mean squared error and R2 on the test set (still on the log scale).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R2:", r2)

Mean Squared Error: 0.1476845983098973
R2: 0.8565282390581328


In [85]:
# Test-set metrics on the original scale of CANTIDAD.
#
# Targets and predictions live on the log(x + 1) scale, so they are mapped back
# with expm1 (the exact inverse) once and reused for every metric. np.expm1 is
# preferred over np.exp(x) - 1 because the latter loses precision through
# cancellation when x is close to zero.
y_test_orig = np.expm1(y_test)
y_pred_orig = np.expm1(y_pred)

mse_eval_2 = mean_squared_error(y_test_orig, y_pred_orig)
mape_2 = mean_absolute_percentage_error(y_test_orig, y_pred_orig)
r2_2 = r2_score(y_test_orig, y_pred_orig)
# "Accuracy" is defined here as 1 - MAPE; reuse the MAPE computed above
# instead of evaluating it a second time.
acc_2 = 1 - mape_2

print(f"Error cuadrático medio en el conjunto de evaluación: {mse_eval_2}")
print(f"mape en el conjunto de evaluación: {mape_2}")
print(f"r2 en el conjunto de evaluación: {r2_2}")
print("Accuracy:", acc_2)

Error cuadrático medio en el conjunto de evaluación: 4704.685142124926
mape en el conjunto de evaluación: 0.2796390158978847
r2 en el conjunto de evaluación: 0.8227528422173034
Accuracy: 0.7203609841021152


In [86]:
# In-sample predictions: run the fitted model on the training features.
y_predict_train = model.predict(X_train)

In [87]:
# Per-row training error table inputs, on the original scale of CANTIDAD.
#
# Both series are mapped back from the log(x + 1) scale once with expm1 and
# reused. np.expm1 avoids the cancellation error of np.exp(x) - 1 for values
# near zero.
y_train_orig = np.expm1(y_train)
y_predict_train_orig = np.expm1(y_predict_train)

dic = {
    'Y_train': y_train_orig,
    'Y_predict_train': y_predict_train_orig,
    # Absolute percentage error per row. Rows whose actual value is 0 divide
    # by zero and come out as inf (or NaN if the prediction is exactly 0 too).
    'row_mape': np.abs(y_train_orig - y_predict_train_orig) / y_train_orig,
}

In [90]:
# Build the per-row error table and order it from the worst row_mape to the best.
Error_df = pd.DataFrame(dic)
Error_df = Error_df.sort_values(by='row_mape', ascending=False)

In [92]:
# Display the per-row error table (sorted by row_mape, largest first).
Error_df

Unnamed: 0,Y_train,Y_predict_train,row_mape
2,0.0,0.033437,inf
29,1.0,1.486471,0.486471
17,1.0,1.362246,0.362246
9,24.0,30.237354,0.259890
23,12.0,15.023428,0.251952
...,...,...,...
15,91.0,91.023209,0.000255
32,71.0,71.016930,0.000238
34,460.0,460.079559,0.000173
26,259.0,258.962860,0.000143


In [93]:
# Keep only the rows whose row_mape is not +inf (inf comes from rows whose
# actual value is 0), then display the result.
Error_df2 = Error_df.loc[Error_df['row_mape'].ne(np.inf)]
Error_df2

Unnamed: 0,Y_train,Y_predict_train,row_mape
29,1.0,1.486471,0.486471
17,1.0,1.362246,0.362246
9,24.0,30.237354,0.259890
23,12.0,15.023428,0.251952
42,26.0,32.523697,0.250911
...,...,...,...
15,91.0,91.023209,0.000255
32,71.0,71.016930,0.000238
34,460.0,460.079559,0.000173
26,259.0,258.962860,0.000143


In [94]:
# Mean per-row absolute percentage error on the training set, computed over
# the rows kept in Error_df2 (infinite values already filtered out).
Error_df2['row_mape'].mean()

0.049851213745956614

In [89]:
# Smallest training target value (log scale). y_train is the CANTIDAD column,
# so its minimum can be read directly; 0.0 here means at least one training
# row has an original-scale value of 0.
y_train.min()

0.0

In [95]:
# Feature importances reported by the fitted model, one value per column of X.
importances = model.feature_importances_

# Pair each feature name with its importance and rank from most to least important.
importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

In [96]:
# Display the feature-importance table (sorted by Importance, descending).
importance_df

Unnamed: 0,Feature,Importance
1,CANTIDAD_DE_PERSONAL,0.243777
34,VACUNA_ROTA,0.05727
4,ALFABETISMO,0.056854
31,VACUNA_INFLUENZA,0.043627
30,VACUNA_HVB,0.040483
18,MES_OCTUBRE,0.037001
25,VACUNA_APO2,0.03669
12,MES_FEBRERO,0.036524
3,PRESUPUESTO,0.034086
32,VACUNA_NEUMO,0.032941
