# Configuración Inicial

In [204]:
import geopandas as gpd
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Random seed passed as `random_state` to train_test_split and KFold further down,
# so the data splits are the same on every run.
r_state = 42

# Importar dataframes y crear conjuntos train y test

## Importar dataframes

In [208]:
# Semicolon-separated input tables, one per year.
# NOTE(review): these are absolute, machine-specific paths; consider deriving them
# from a configurable data directory so the notebook runs on other machines.
path_2011 = r"C:\Users\Usuario\OneDrive\Escritorio\UOC\TFM\PEC3  - Implementacion\WorkStation\Datos\Variables\Procesados\Secciones\df_2011.csv"
path_2021 = r"C:\Users\Usuario\OneDrive\Escritorio\UOC\TFM\PEC3  - Implementacion\WorkStation\Datos\Variables\Procesados\Secciones\df_2021.csv"


def load_section_table(path, year_suffix):
    """Read one yearly table and strip the year suffix from its year-specific columns.

    Parameters
    ----------
    path : str
        Location of a CSV file that uses ``;`` as field separator.
    year_suffix : str
        Two-digit year tag carried by the column names, e.g. ``"11"`` for
        ``SES_11``.

    Returns
    -------
    pandas.DataFrame
        The table with ``SES_<yy>``, ``he_pct_<yy>``, ``kw_pct_<yy>`` and
        ``median_price_inf_<yy>`` renamed to their suffix-free names, so that
        tables from different years share the same column names. Columns
        that are absent are simply not renamed.
    """
    base_names = ('SES', 'he_pct', 'kw_pct', 'median_price_inf')
    renames = {f"{name}_{year_suffix}": name for name in base_names}
    # rename() returns a new frame; nothing is mutated in place.
    return pd.read_csv(path, sep=";").rename(columns=renames)


df_2011 = load_section_table(path_2011, "11")
df_2021 = load_section_table(path_2021, "21")

## Conjunto train y test

In [211]:
# Hold out 20% of the 2011 rows for testing; the remaining 80% is the training set
train, test = train_test_split(df_2011, test_size=0.2, random_state=r_state)


def _split_features_target(frame, target="SES"):
    """Return ``(features, target)`` for ``frame``.

    ``features`` holds every column not named ``target``; the target part is
    selected with a column mask, so it is a DataFrame rather than a Series.
    """
    is_target = frame.columns == target
    return frame.loc[:, ~is_target], frame.loc[:, is_target]


x_train, y_train = _split_features_target(train)
x_test, y_test = _split_features_target(test)

# Crear Modelo de Regresión Lineal

In [214]:
# Import package:
from sklearn.linear_model import LinearRegression

# Train the linear regression on the training split
lm = LinearRegression()
lm.fit(x_train, y_train)

# Predict the target for the held-out test split
y_pred_lm = lm.predict(x_test)

In [216]:
# Import Regression metrics
from sklearn.metrics import (mean_absolute_error as mae, mean_squared_error as mse, 
                             explained_variance_score as evs, r2_score as r2)

def mape(Y_actual, Y_Predicted):
    """Mean absolute percentage error, expressed in percent.

    Both inputs are flattened to 1-D float arrays and compared position by
    position. This keeps a 1-D target and an ``(n, 1)`` prediction array from
    being broadcast to an ``(n, n)`` matrix (which would yield a wrong
    average), ignores any pandas index, and makes plain lists acceptable.

    Parameters
    ----------
    Y_actual : array-like
        Observed values (list, ndarray, Series or DataFrame).
    Y_Predicted : array-like
        Predicted values with the same number of elements as ``Y_actual``.

    Returns
    -------
    float
        ``mean(|actual - predicted| / |actual|) * 100``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.

    Notes
    -----
    Entries where the actual value is zero are not special-cased: NumPy
    division yields ``inf``/``nan`` for them, which propagates to the result.
    """
    actual = np.asarray(Y_actual, dtype=float).ravel()
    predicted = np.asarray(Y_Predicted, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"Y_actual has {actual.size} elements but Y_Predicted has {predicted.size}"
        )
    return float(np.mean(np.abs((actual - predicted) / actual)) * 100)

# Score the linear model's test-set predictions with each metric, in reporting order
mae_lm, mse_lm, evs_lm, r2_lm, mape_lm = (
    metric(y_test, y_pred_lm) for metric in (mae, mse, evs, r2, mape)
)

print(f"mae_lm: {mae_lm}, \nmse_lm: {mse_lm}, \nevs_lm: {evs_lm}, \nr2_lm: {r2_lm}, \nmape_lm: {mape_lm}")

mae_lm: 0.02543846176987255, 
mse_lm: 0.0010833749820197234, 
evs_lm: 0.9967295478139296, 
r2_lm: 0.9961304014853738, 
mape_lm: 7.665552616196288


## Kfold Validation

In [219]:
# 5-fold cross-validation on the training split, shuffled with the shared seed
kfold = KFold(n_splits=5, shuffle=True, random_state=r_state)
scores = cross_val_score(
    estimator=lm,
    X=x_train,
    y=y_train,
    cv=kfold,
    scoring='neg_mean_squared_error',
)
# The scorer returns negated MSE per fold, so values closer to zero are better
print(f"Mean Cross-Validation Score: {scores.mean()}")
print(f"Standard Deviation: {scores.std()}")

Mean Cross-Validation Score: -0.0013430075606360218
Standard Deviation: 0.0006554681288836973


# Usar la regresión lineal para predecir la gentrificación de 2021 

In [226]:
# Work on a copy so the predictions do not alter the loaded 2021 table
df_2021_lm = df_2021.copy()

# Predictors: every 2021 column except the target
X_21 = df_2021.loc[:, df_2021.columns != "SES"]

# Apply the already-fitted model to the 2021 predictors (no re-fitting happens here)
df_2021_lm['Predicted_SES_21'] = lm.predict(X_21)

In [228]:
# Show the 2021 table together with the added Predicted_SES_21 column
# (plain-text printout; pandas elides the middle rows with "..").
print(df_2021_lm)

     NUMSECCENS     15-19     20-34     35-64       65+  Casado/a  \
0        1001.0  0.000000  0.230435  0.739130  0.030435  0.365297   
1        2003.0  0.000000  0.201521  0.787072  0.011407  0.540541   
2        2004.0  0.000000  0.162162  0.762162  0.075676  0.596685   
3        2009.0  0.000000  0.195238  0.790476  0.014286  0.549020   
4        2013.0  0.003460  0.190311  0.771626  0.034602  0.583916   
..          ...       ...       ...       ...       ...       ...   
110     11003.0  0.003356  0.221477  0.758389  0.016779  0.589226   
111     11004.0  0.000000  0.158333  0.816667  0.025000  0.556962   
112     11006.0  0.009302  0.265116  0.711628  0.013953  0.521327   
113     11007.0  0.003205  0.182692  0.807692  0.006410  0.612179   
114     11010.0  0.008097  0.161943  0.813765  0.016194  0.595041   

     Divorciado/a o Separado/a  Soltero/a   Viudo/a  \
0                     0.132420   0.488584  0.013699   
1                     0.127413   0.324324  0.007722   
2     