In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import KFold

In [42]:
# Load the modelling dataset from CSV and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an
# index that was written to the CSV by an earlier export -- confirm whether it
# should be dropped or read as the index.
archivo_csv = 'df_result_3_tomodel_noh_enaho.csv'
df = pd.read_csv(filepath_or_buffer=archivo_csv)
df.head(n=5)

Unnamed: 0,VACUNA,ANIO,MES,PROVINCIA,RANGO_EDAD,CANTIDAD DE PERSONAL,POBLACION,PRESUPUESTO,CANTIDAD,UBIGEO,ALFABETISMO,AGUA_POTABLE,DEMORA_CITA_horas,DEMORA_TRASLADO_horas
0,AMA,2019,ABRIL,CORONEL PORTILLO,1_ANIO,932,5690,998497,207,250100,0.233,0.527,1.254,0.251
1,AMA,2019,ABRIL,PADRE ABAD,1_ANIO,54,6037,998497,31,250300,0.056,0.667,0.5,0.375
2,AMA,2019,AGOSTO,CORONEL PORTILLO,1_ANIO,884,5690,3650033,75,250100,0.161,0.292,6.27,0.216
3,AMA,2019,AGOSTO,PADRE ABAD,1_ANIO,49,6037,3650033,13,250300,0.219,0.667,0.31,0.119
4,AMA,2019,DICIEMBRE,CORONEL PORTILLO,1_ANIO,887,5690,1953340,131,250100,0.186,0.537,38.927,0.23


In [43]:
# Map each Spanish month name to its calendar number (ENERO -> 1, ..., DICIEMBRE -> 12).
meses = [
    'ENERO', 'FEBRERO', 'MARZO', 'ABRIL', 'MAYO', 'JUNIO',
    'JULIO', 'AGOSTO', 'SETIEMBRE', 'OCTUBRE', 'NOVIEMBRE', 'DICIEMBRE',
]
diccionario = dict(zip(meses, range(1, len(meses) + 1)))

# Month names that are not in the mapping come back as None from dict.get.
df["MES_NUM"] = df["MES"].apply(diccionario.get)

# Order the rows by vaccine, province, year and month number, then give the
# staff-count column a name without spaces.
df = (
    df.sort_values(by=['VACUNA', 'PROVINCIA', 'ANIO', 'MES_NUM'])
      .rename(columns={"CANTIDAD DE PERSONAL": "CANTIDAD_DE_PERSONAL"})
)

In [44]:
# Rescale the count-like columns with a min-max scaler (not a standardisation).
# The fitted `scaler` is kept at notebook level because later cells call
# `scaler.inverse_transform` to bring predictions back to the original units.
# NOTE(review): the scaler is fitted on the whole dataset, before any
# train/validation/test split, and re-running this cell would rescale the
# already-scaled `df`; run it exactly once after loading the data.
scaler = preprocessing.MinMaxScaler()

# Keep "CANTIDAD" (the target) first: later cells read column 0 of the
# inverse-transformed array to recover the target.
cols_sc = ["CANTIDAD", "CANTIDAD_DE_PERSONAL", "PRESUPUESTO", "POBLACION"]

# Replace the columns outright rather than writing through `df.loc[:, cols]`.
# These columns appear as integers in the raw preview, and writing floats into
# them in place relies on an implicit upcast that recent pandas versions
# deprecate ("setting an item of incompatible dtype").
df[cols_sc] = scaler.fit_transform(df[cols_sc].to_numpy())

# Stand-alone copy of just the scaled columns.
sc = df[cols_sc].copy()
df.head()

Unnamed: 0,VACUNA,ANIO,MES,PROVINCIA,RANGO_EDAD,CANTIDAD_DE_PERSONAL,POBLACION,PRESUPUESTO,CANTIDAD,UBIGEO,ALFABETISMO,AGUA_POTABLE,DEMORA_CITA_horas,DEMORA_TRASLADO_horas,MES_NUM
36,AMA,2020,ENERO,ATALAYA,1_ANIO,0.033146,0.0,0.194177,0.002098,250200,0.13,0.556,0.56,0.06,1
40,AMA,2020,FEBRERO,ATALAYA,1_ANIO,0.033442,0.0,0.189663,0.02028,250200,0.128,0.444,0.41,0.153,2
52,AMA,2020,MARZO,ATALAYA,1_ANIO,0.033738,0.0,0.186726,0.015385,250200,0.143,0.207,6.988,0.119,3
24,AMA,2020,ABRIL,ATALAYA,1_ANIO,0.035218,0.0,0.204631,0.002098,250200,0.314,0.75,0.0,0.0,4
56,AMA,2020,MAYO,ATALAYA,1_ANIO,0.034034,0.0,0.175326,0.024476,250200,0.552,0.0,0.0,0.0,5


In [45]:
# Cast the year to string; the modelling cell later feeds it to pd.get_dummies.
df["ANIO"] = df["ANIO"].astype(str)

# Columns moved into the index so that `shift` leaves them in place; only the
# columns that stay out of the index (the target "CANTIDAD" among them) move.
columnas_fijas = [
    'VACUNA', 'ANIO', 'MES', 'PROVINCIA', 'RANGO_EDAD',
    'CANTIDAD_DE_PERSONAL', 'POBLACION', 'PRESUPUESTO',
    'UBIGEO', 'ALFABETISMO', 'AGUA_POTABLE', 'DEMORA_CITA_horas',
    'DEMORA_TRASLADO_horas', 'MES_NUM',
]

# Shift the non-index columns down by two rows within every
# (PROVINCIA, VACUNA) group, then stitch the groups back together.
# NOTE(review): shift(2) pairs each row's features with the value found two
# rows earlier in the group -- confirm this is the intended lag direction.
df_list = []
for item, df_group in df.groupby(['PROVINCIA', 'VACUNA']):
    df_group_shifted = df_group.set_index(columnas_fijas).shift(2).reset_index()
    df_list.append(df_group_shifted)
df_shifted_total = pd.concat(df_list)

In [46]:
# Exclude the PURUS province from the shifted dataset.
df = df_shifted_total.loc[df_shifted_total['PROVINCIA'].ne('PURUS')]

In [47]:
# Keep only the rows that have a target value; the two-row shift leaves the
# first rows of every group without one.
df = df[df['CANTIDAD'].notna()]

In [68]:
# Train a default XGBoost regressor on the shifted vaccination counts and
# report its error on a validation split, in scaled and in original units.

# Predictors (X) and target (y).
feature_cols = [
    'ANIO', 'MES', 'PROVINCIA', 'VACUNA', 'CANTIDAD_DE_PERSONAL', 'POBLACION',
    'PRESUPUESTO', 'ALFABETISMO', 'AGUA_POTABLE', 'DEMORA_CITA_horas',
    'DEMORA_TRASLADO_horas',
]
X = df[feature_cols]
y = df['CANTIDAD']

# One-hot encode the categorical predictors. Float dummies are requested so
# that the `.to_numpy()` conversion at the end of this cell produces a numeric
# array; boolean dummies (the default in recent pandas releases) mixed with
# float columns would convert to an object array instead.
X = pd.get_dummies(X, dtype=float)

# Hold out 10% as a test set, then split the remainder 80/20 into
# training and validation sets.
X_tv, X_test, y_tv, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_tv, y_tv, test_size=0.2, random_state=42)

# Fit the model with default hyper-parameters.
model = XGBRegressor()
model.fit(X_train, y_train)

# Predict on the validation split (the test split is not used in this cell).
y_pred = model.predict(X_val)

# Metrics in scaled units.
mse = mean_squared_error(y_val, y_pred)
print("Mean Squared Error stf:", mse)
r2 = r2_score(y_val, y_pred)
print("R2 std:", r2)


def inverse_scale_target(values):
    """Map scaled target values back to the original units of "CANTIDAD".

    `scaler` was fitted on every column in `cols_sc` at once, so its
    `inverse_transform` expects one column per entry of `cols_sc`. The values
    are repeated across that many columns and the column that belongs to
    "CANTIDAD" is returned, instead of hard-coding the column count and
    position.

    Parameters
    ----------
    values : array-like of shape (n_samples,)
        Target values in scaled units.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        The same values in original units.
    """
    column = np.asarray(values).reshape(-1, 1)
    padded = np.repeat(column, len(cols_sc), axis=1)
    return scaler.inverse_transform(padded)[:, cols_sc.index("CANTIDAD")]


# Metrics in original units.
y_val_ = inverse_scale_target(y_val)
y_pred_ = inverse_scale_target(y_pred)

mse_ = mean_squared_error(y_val_, y_pred_)
print("Mean Squared Error normal:", mse_)
r2_ = r2_score(y_val_, y_pred_)
print("R2 normal:", r2_)

# The cross-validation cell indexes these positionally with integer arrays,
# so hand them over as NumPy arrays.
X_tv, y_tv = X_tv.to_numpy(), y_tv.to_numpy()

Mean Squared Error stf: 0.0016871238850978871
R2 std: 0.8776334281729039
Mean Squared Error normal: 3449.999581654053
R2 normal: 0.8776334299811847


In [74]:
# 5-fold cross-validation of the default XGBoost regressor on the
# train+validation portion. X_tv / y_tv are indexed positionally below, so
# they must already be NumPy arrays (the training cell converts them).
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Per-fold MSE: scaled units first, original units second.
mse_scores_scl = []
mse_scores = []

for train_index, val_index in kf.split(X_tv):
    X_train_cv, y_train_cv = X_tv[train_index], y_tv[train_index]
    X_val_cv, y_val_cv = X_tv[val_index], y_tv[val_index]

    # Fit a fresh model on this fold and predict its held-out part.
    model_cv = XGBRegressor()
    model_cv.fit(X_train_cv, y_train_cv)
    y_pred_cv = model_cv.predict(X_val_cv)

    # Fold error in scaled units.
    mse_cv = mean_squared_error(y_val_cv, y_pred_cv)
    mse_scores_scl.append(mse_cv)

    # The scaler was fitted on four columns with the target first, so the
    # values are repeated across four columns and column 0 is read back.
    y_val_cv_ = scaler.inverse_transform(np.column_stack([y_val_cv] * 4))[:, 0]
    y_pred_cv_ = scaler.inverse_transform(np.column_stack([y_pred_cv] * 4))[:, 0]

    # Fold error in original units.
    mse_cv_ = mean_squared_error(y_val_cv_, y_pred_cv_)
    mse_scores.append(mse_cv_)

# Average the per-fold MSE values (no R2 is computed in this cell).
avg_mse_scl = np.asarray(mse_scores_scl).mean()
avg_mse = np.asarray(mse_scores).mean()

print("Average Mean Squared Error scl:", avg_mse_scl)
print("Average Mean Squared Error:", avg_mse)



Average Mean Squared Error scl: 0.002589857407229286
Average Mean Squared Error: 5295.999483515003


In [76]:
# Naive baselines in ORIGINAL units: predict the training mean / median for
# every validation row and measure the resulting MSE.

# Bring the targets back to original units. The scaler was fitted on four
# columns with the target first, hence four copies and column 0.
y_val_ = scaler.inverse_transform(np.column_stack([np.asarray(y_val)] * 4))[:, 0]
y_train_ = scaler.inverse_transform(np.column_stack([np.asarray(y_train)] * 4))[:, 0]

# Constants learned from the training targets only.
mean_target = y_train_.mean()
median_target = np.median(y_train_)

# Constant prediction vectors shaped and typed like the validation targets.
y_pred_mean = np.full(y_val_.shape, mean_target, dtype=y_val_.dtype)
y_pred_median = np.full(y_val_.shape, median_target, dtype=y_val_.dtype)

# Baseline errors.
mse_mean = mean_squared_error(y_val_, y_pred_mean)
mse_median = mean_squared_error(y_val_, y_pred_median)

print("Mean Baseline - MSE:", mse_mean)
print("Median Baseline - MSE:", mse_median)



Mean Baseline - MSE: 28194.533009005492
Median Baseline - MSE: 34280.42452830189


In [77]:
# Naive baselines in SCALED units: predict the training mean / median for
# every validation row and report MSE and R2.
# NOTE: this reuses the names mean_target, y_pred_mean, mse_mean, ... that the
# original-units baseline cell also assigns, so the values depend on which
# cell ran last.
mean_target = y_train.mean()
median_target = np.median(y_train)

# Constant prediction vectors, one entry per validation row.
y_pred_mean = np.full(len(y_val), mean_target, dtype=y_val.dtype)
y_pred_median = np.full(len(y_val), median_target, dtype=y_val.dtype)

# Mean baseline scores.
mse_mean = mean_squared_error(y_val, y_pred_mean)
r2_mean = r2_score(y_val, y_pred_mean)

# Median baseline scores.
mse_median = mean_squared_error(y_val, y_pred_median)
r2_median = r2_score(y_val, y_pred_median)

print("Mean Baseline - MSE:", mse_mean, "R2:", r2_mean)
print("Median Baseline - MSE:", mse_median, "R2:", r2_median)

Mean Baseline - MSE: 0.013787731922835101 R2: -1.9917666246538346e-05
Median Baseline - MSE: 0.0167638635279485 R2: -0.2158778194129638
