## Random forest ##

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split

# Load the dataset and clean up its column names.
# The project root defaults to the original checkout location but can be
# overridden with the DSAP_ROOT environment variable, so the notebook can run
# on another machine without editing this cell.
ROOT = Path(
    os.environ.get("DSAP_ROOT")
    or "/Users/maximeducotterd/Desktop/DSAP_intercantonal_dynamics"
)
DATA_PATH = ROOT / "data" / "databasecsv.csv"
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH}; set DSAP_ROOT to the project root."
    )

# The file is read as semicolon-separated; header names are stripped of
# surrounding whitespace so columns can be selected by their exact name.
df = pd.read_csv(DATA_PATH, sep=";")
df.columns = df.columns.str.strip()

print("Colonnes dispo :", df.columns.tolist())

# Model inputs: the log_* covariates plus the three CLUSTER* columns.
# NOTE(review): CLUSTER0..2 look like one-hot cluster indicators — confirm.
feature_cols = [
    "log_rent_avg",
    "log_avg_income",
    "log_unemployment",
    "log_schockexposure",
    "CLUSTER0",
    "CLUSTER1",
    "CLUSTER2",
]

target_col = "migration_rate"

cols_needed = feature_cols + [target_col]

# Fail fast with an explicit message when the file does not contain every
# column the model needs (e.g. a renamed or misspelled header).
missing_cols = [col for col in cols_needed if col not in df.columns]
if missing_cols:
    raise KeyError(f"Columns missing from the dataset: {missing_cols}")

# Keep only the rows where every feature and the target are observed.
df = df.dropna(subset=cols_needed).copy()
if df.empty:
    raise ValueError(
        f"No complete rows left after dropping missing values in {cols_needed}"
    )

# to_numpy() is the pandas-recommended way to obtain the underlying arrays.
X = df[feature_cols].to_numpy()
y = df[target_col].to_numpy()

# Hold out 20% of the rows as a test set; shuffling is seeded so the
# partition is the same on every run.
# NOTE(review): the dataset has 'canton' and 'year' columns, so rows may be
# canton-year observations. A row-level random split would then place the same
# canton (and neighbouring years) in both sets — confirm this is intended.
split_options = {"test_size": 0.2, "random_state": 0, "shuffle": True}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train : {X_train.shape[0]} obs, Test : {X_test.shape[0]} obs")

# Random forest regressor: 500 trees with no depth limit, a fixed seed for
# reproducibility, and n_jobs=-1 to use all available processors.
rf_params = {
    "n_estimators": 500,
    "max_depth": None,
    "random_state": 0,
    "n_jobs": -1,
}

# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Score the fitted forest on the held-out rows.
y_pred = rf.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

# One print call, one metric per line.
print(
    f"Random Forest — RMSE = {rmse:.4f}",
    f"Random Forest — R²   = {r2:.4f}",
    sep="\n",
)

# Feature importances reported by the fitted forest, paired with their
# column names and ordered from most to least important.
importances = rf.feature_importances_
feat_imp = list(zip(feature_cols, importances))
feat_imp.sort(key=lambda pair: pair[1], reverse=True)

print("\nImportance des features (Random Forest) :")
for name, imp in feat_imp:
    print(f"  {name:20s} -> {imp:.3f}")


Colonnes dispo : ['canton', 'year', 'CLUSTER0', 'CLUSTER1', 'CLUSTER2', 'rent_avg', 'Z_score_rent', 'log_rent_avg', 'pop_cant', 'Solde_migratoire', 'migration_rate', 'migration_rate_zscore', 'immo_price_index', 'immoo_price_index_zscore', 'mortgage_rate', 'change_mrtgrate', 'logement_propr', 'debt_per_household_*1000', 'homeownership_rate', 'Z-score-debt', 'Z-score-ownrrate', 'avg_income', 'avg_income_zscore', 'log_avg_income', 'unemployment_rate', 'z-score_unemployment', 'log_unemployment', 'share_65plus', 'housing_constr_tot', 'housing_construction_pc', 'exposure_index', 'shock_exposure', 'log_schockexposure', 'shockexposure_zscore', 'delta_rent', 'delta_migration', 'delta_unemployment', 'delta_income']
Train : 208 obs, Test : 52 obs
Random Forest — RMSE = 0.2568
Random Forest — R²   = 0.3492

Importance des features (Random Forest) :
  log_unemployment     -> 0.328
  log_avg_income       -> 0.271
  log_rent_avg         -> 0.180
  log_schockexposure   -> 0.169
  CLUSTER2             