# Configuración Inicial

In [4]:
# Importar paquetes
import numpy as np
import geopandas as gpd
import pandas as pd
import shapely
from shapely import wkt
import statistics
import matplotlib.pyplot as plt
import zipfile

# Importar paquetes con los modelos de sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# Importar métricas
from sklearn.metrics import (mean_absolute_error as mae, mean_squared_error as mse, 
                             explained_variance_score as evs, r2_score as r2)                
from sklearn.metrics import (accuracy_score as acc, balanced_accuracy_score as bal_acc, 
                             f1_score as f1, roc_auc_score as roc_auc,
                             average_precision_score as ave_prec)                            
def mape(Y_actual, Y_Predicted):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Computes ``mean(|Y_actual - Y_Predicted| / |Y_actual|) * 100``.

    Parameters
    ----------
    Y_actual : array-like
        Observed values. Any input accepted by ``np.asarray`` works
        (list, tuple, NumPy array, pandas Series).
    Y_Predicted : array-like
        Predicted values; must broadcast against ``Y_actual``.

    Returns
    -------
    numpy.float64
        The MAPE in percent (e.g. ``25.0`` means 25 %).

    Notes
    -----
    Both inputs are converted to float arrays first, so the comparison is
    purely positional (no pandas index alignment) and plain Python
    sequences are supported.
    The ratio is undefined where ``Y_actual`` is 0: NumPy emits a
    RuntimeWarning and the result becomes ``inf`` or ``nan``.
    """
    actual = np.asarray(Y_actual, dtype=float)
    predicted = np.asarray(Y_Predicted, dtype=float)
    # Relative error is measured against the observed value, element by element.
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100

# Shared random seed: passed as random_state to train_test_split,
# RandomForestRegressor and KFold in the cells below.
r_state = 42

# Importar dataframes y crear conjuntos train y test

## Importar dataframes

In [7]:
# Locations of the 2011 and 2021 CSV inputs.
# NOTE(review): these are absolute paths on one specific Windows machine;
# the notebook will not run elsewhere without editing them.
path_2011 = r"C:\Users\Usuario\OneDrive\Escritorio\UOC\TFM\PEC3  - Implementacion\WorkStation\Datos\Variables\Procesados\Distritos\df_2011.csv"
path_2021 = r"C:\Users\Usuario\OneDrive\Escritorio\UOC\TFM\PEC3  - Implementacion\WorkStation\Datos\Variables\Procesados\Distritos\df_2021.csv"

# Both files are read with ";" as the field separator, 2011 first and then 2021.
df_2011, df_2021 = [
    pd.read_csv(csv_path, sep=";") for csv_path in (path_2011, path_2021)
]

## Conjunto train y test

In [10]:
# Hold out 20% of the 2011 rows for testing; the remaining 80% are used for training.
train, test = train_test_split(df_2011, test_size=0.2, random_state=r_state)

# Features: every column except the target "SES_11" and "NOMBRE".
# Target: the "SES_11" column flattened into a 1-D array.
x_train = train.drop(columns=["SES_11", "NOMBRE"], errors="ignore")
y_train = train.loc[:, train.columns == "SES_11"].to_numpy().ravel()

x_test = test.drop(columns=["SES_11", "NOMBRE"], errors="ignore")
y_test = test.loc[:, test.columns == "SES_11"].to_numpy().ravel()

#print(x_train)
#print(y_train)
#print(x_test)
#print(y_test)

# Construir RF

In [13]:
# Instantiate the random-forest regressor; only the seed is set explicitly,
# every other hyperparameter keeps the library default.
rf = RandomForestRegressor(
    random_state=r_state,
)

In [15]:
# Fit the forest on the training split. fit() returns the estimator itself,
# so rf_model is simply another name for the (now fitted) rf object.
rf.fit(x_train, y_train)
rf_model = rf

# Predict the target for the held-out test rows.
y_pred_rf = rf_model.predict(x_test)

In [17]:
# Score the test-set predictions with each metric, in this order:
# MAE, MSE, explained variance, R^2 and MAPE (percent).
mae_rf, mse_rf, evs_rf, r2_rf, mape_rf = (
    metric(y_test, y_pred_rf) for metric in (mae, mse, evs, r2, mape)
)

# One metric per output line.
print(f"mae_rf: {mae_rf}, \nmse_rf: {mse_rf}, \nevs_rf: {evs_rf}, \nr2_rf: {r2_rf}, \nmape_rf: {mape_rf}")

mae_rf: 0.9503884577625059, 
mse_rf: 1.7825831999712571, 
evs_rf: 0.326081342518606, 
r2_rf: -0.3300102337469859, 
mape_rf: 55.55173301521551


# Validación K-fold

In [20]:
# 5-fold cross-validation over the training split (shuffled, seeded with r_state).
kfold = KFold(n_splits=5, shuffle=True, random_state=r_state)

# The scorer is "neg_mean_squared_error", so each fold's score is the
# negated MSE: values are <= 0 and closer to 0 is better.
scores = cross_val_score(
    rf,
    x_train,
    y_train,
    cv=kfold,
    scoring='neg_mean_squared_error',
)

print(f"Mean Cross-Validation Score: {scores.mean()}")
print(f"Standard Deviation: {scores.std()}")

Mean Cross-Validation Score: -0.30238598147710716
Standard Deviation: 0.5206765785202331
