## Random Forest

In [4]:

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error


# -----------------------------
# 1) Load
# -----------------------------
# NOTE(review): machine-specific absolute path; repoint DATA_PATH when running elsewhere.
DATA_PATH = "/Users/david/Desktop/CH_ECON_V3.csv"
df = pd.read_csv(DATA_PATH)

TARGET = "econ_hires_per_1k"
if TARGET not in df.columns:
    # Fail here with a clear message instead of a bare KeyError further down.
    raise KeyError(f"Target column {TARGET!r} not found in {DATA_PATH}")

# -----------------------------
# 2) Features (from LASSO; exclude leakage)
# -----------------------------
candidate_cols = [
    "econ_emp_per_1k",
    "state_avg_earnings",
    "growth_emp_qoq",
    "growth_emp_yoy",
    "growth_earn_qoq",
    "STATE_% Children in Poverty",
    "STATE_% Low Birthweight",
    "STATE_Food Environment Index",
    "STATE_Primary Care Physicians Rate",
    "STATE_Social Association Rate",
    "STATE_% Smokers",
    "STATE_Preventable Hospitalization Rate",
    "STATE_Mentally Unhealthy Days",
]

# Columns that must never be used as predictors, even if added to the list above.
LEAKY = {"econ_hire_rate", "state_hires_total"}

# Surface requested features that are absent from the file: dropping them
# silently would change the model without anyone noticing.
missing_cols = [c for c in candidate_cols if c not in df.columns]
if missing_cols:
    print("WARNING: requested features not found in the data and skipped:", missing_cols)

X_cols = [c for c in candidate_cols if c in df.columns and c not in LEAKY]
if not X_cols:
    raise ValueError("None of the requested feature columns are available in the data.")

# -----------------------------
# 3) Numeric conversion + time index
# -----------------------------
# Coerce the period columns, the target and every feature to numeric;
# values that cannot be parsed become NaN.
for col in ["Year", "quarter", TARGET, *X_cols]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Keep only rows with a usable period and target, then collapse (Year, quarter)
# into one sortable number.
df = df.dropna(subset=["Year", "quarter", TARGET]).copy()
df["time_index"] = 4 * df["Year"] + df["quarter"]

# -----------------------------
# 4) Build modeling frame
#     (Do NOT drop rows for missing X; imputer will handle)
# -----------------------------
df_model = (
    df[[TARGET, "time_index"] + X_cols]
    .dropna(subset=[TARGET, "time_index"])
    .sort_values("time_index")
    .reset_index(drop=True)
)

y = df_model[TARGET].values
X_raw = df_model[X_cols].values

# -----------------------------
# 5) Time-based split (last 20%)
# -----------------------------
# NOTE(review): the cut is made on row position, so rows sharing one time_index
# can land on both sides of the boundary -- confirm this is acceptable.
split = int(len(df_model) * 0.8)
if split == 0:
    raise ValueError("Not enough rows to form a non-empty training split.")
tr = np.arange(0, split)
te = np.arange(split, len(df_model))

# Preprocess X (impute + scale).
# The pipeline is fit on the TRAINING rows only, so the medians, means and
# variances it learns carry no information from the held-out period. It is
# then applied to every row, which keeps the full matrix `X` available.
x_pre = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
x_pre.fit(X_raw[tr])
X = x_pre.transform(X_raw)

# SimpleImputer discards columns that were entirely missing during fit; that
# would silently misalign the matrix columns with X_cols.
if X.shape[1] != len(X_cols):
    raise ValueError(
        "A feature has no observed values in the training rows and was dropped "
        "by the imputer; remove it from X_cols or widen the training window."
    )

X_tr, X_te = X[tr], X[te]
y_tr, y_te = y[tr], y[te]

# -----------------------------
# 6) Fit Random Forest
# -----------------------------
# Baseline forest: 600 trees, at least 5 samples per leaf, fixed seed, all cores.
rf_settings = {
    "n_estimators": 600,
    "min_samples_leaf": 5,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestRegressor(**rf_settings).fit(X_tr, y_tr)

# -----------------------------
# 7) Evaluate
# -----------------------------
pred_tr = rf.predict(X_tr)
pred_te = rf.predict(X_te)

print("FEATURES USED:")
print(X_cols)

# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
for label, y_true, y_pred in (("TRAIN", y_tr, pred_tr), ("TEST", y_te, pred_te)):
    print(f"\n{label}:")
    print("  R2  :", r2_score(y_true, y_pred))
    print("  RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))


FEATURES USED:
['econ_emp_per_1k', 'state_avg_earnings', 'growth_emp_qoq', 'growth_emp_yoy', 'growth_earn_qoq', 'STATE_% Children in Poverty', 'STATE_% Low Birthweight', 'STATE_Food Environment Index', 'STATE_Primary Care Physicians Rate', 'STATE_Social Association Rate', 'STATE_% Smokers', 'STATE_Preventable Hospitalization Rate', 'STATE_Mentally Unhealthy Days']

TRAIN:
  R2  : 0.8869492831125159
  RMSE: 4.884658381125854

TEST:
  R2  : 0.4725399734051988
  RMSE: 8.364104568600691




In [6]:
# Hyper-parameter search space for the Random Forest grid search
# (4 x 4 x 3 = 48 combinations).
param_grid = dict(
    max_depth=[None, 10, 15, 20],
    min_samples_leaf=[3, 5, 10, 20],
    max_features=["sqrt", 0.5, 0.75],
)


In [8]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

# Five time-ordered folds: within each fold the validation rows come after
# the training rows in row order.
tscv = TimeSeriesSplit(n_splits=5)

# Base forest; the remaining hyper-parameters are supplied by param_grid.
rf = RandomForestRegressor(n_estimators=600, random_state=42, n_jobs=-1)

search_options = {
    "cv": tscv,
    "scoring": "neg_root_mean_squared_error",
    "n_jobs": -1,
    "verbose": 2,
}
grid = GridSearchCV(rf, param_grid, **search_options)
grid.fit(X_tr, y_tr)

print("Best params:", grid.best_params_)
# The scorer is the negated RMSE, so flip the sign to report a positive error.
print("CV RMSE:", -grid.best_score_)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best params: {'max_depth': 15, 'max_features': 0.75, 'min_samples_leaf': 5}
CV RMSE: 9.351551715846814


In [10]:
# Best configuration found by the grid search (GridSearchCV refits it on all of X_tr).
best_rf = grid.best_estimator_

pred_tr = best_rf.predict(X_tr)
pred_te = best_rf.predict(X_te)

print("\nTUNED RANDOM FOREST RESULTS")

# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
for label, y_true, y_pred in (("TRAIN", y_tr, pred_tr), ("TEST", y_te, pred_te)):
    print(f"\n{label}:")
    print("  R2  :", r2_score(y_true, y_pred))
    print("  RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))



TUNED RANDOM FOREST RESULTS

TRAIN:
  R2  : 0.8818783876233387
  RMSE: 4.993007503748372

TEST:
  R2  : 0.4996871062101609
  RMSE: 8.146020999350588




## XGBoost

In [14]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

from xgboost import XGBRegressor


In [20]:
# Re-create the chronological split by row position: first 80% train, rest test.
split = int(len(df_model) * 0.8)
train_rows = slice(0, split)
test_rows = slice(split, None)

X_tr, y_tr = X[train_rows], y[train_rows]
X_te, y_te = X[test_rows], y[test_rows]


In [22]:
# Gradient-boosted trees configuration, grouped by purpose.
xgb_params = {
    # boosting schedule
    "n_estimators": 800,
    "learning_rate": 0.03,
    # tree shape
    "max_depth": 4,
    "min_child_weight": 10,
    # row / column subsampling per tree
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # regularisation
    "gamma": 0.0,
    "reg_alpha": 0.0,
    "reg_lambda": 1.0,
    # objective and runtime
    "objective": "reg:squarederror",
    "random_state": 42,
    "n_jobs": -1,
}
xgb = XGBRegressor(**xgb_params)

In [24]:
# Fit the boosted model on the training rows and score both splits.
xgb.fit(X_tr, y_tr)
pred_tr = xgb.predict(X_tr)
pred_te = xgb.predict(X_te)

print("FEATURES USED:")
print(X_cols)

# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
for label, y_true, y_pred in (("TRAIN", y_tr, pred_tr), ("TEST", y_te, pred_te)):
    print(f"\n{label}:")
    print("  R2  :", r2_score(y_true, y_pred))
    print("  RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))


FEATURES USED:
['econ_emp_per_1k', 'state_avg_earnings', 'growth_emp_qoq', 'growth_emp_yoy', 'growth_earn_qoq', 'STATE_% Children in Poverty', 'STATE_% Low Birthweight', 'STATE_Food Environment Index', 'STATE_Primary Care Physicians Rate', 'STATE_Social Association Rate', 'STATE_% Smokers', 'STATE_Preventable Hospitalization Rate', 'STATE_Mentally Unhealthy Days']

TRAIN:
  R2  : 0.9425525332312072
  RMSE: 3.4820337541924022

TEST:
  R2  : 0.4773131744351913
  RMSE: 8.326173462301997




In [26]:
import pandas as pd

# Gain-based importance, keyed by the booster's own feature names.
# get_score() leaves out features that were never used in a split, so the
# table is built over every entry of X_cols and unused features get 0.0
# instead of silently disappearing.
gain_by_key = xgb.get_booster().get_score(importance_type="gain")

importance = pd.Series(
    {
        # The model was fit on a NumPy array, so the booster calls column i
        # "f{i}"; fall back to the real column name in case it was fit on a
        # DataFrame instead.
        name: gain_by_key.get(f"f{i}", gain_by_key.get(name, 0.0))
        for i, name in enumerate(X_cols)
    },
    dtype="float64",
).sort_values(ascending=False)

print("\nXGBoost feature importance (GAIN):")
print(importance)


XGBoost feature importance (GAIN):
econ_emp_per_1k                           1636.957886
growth_emp_qoq                             895.371582
STATE_Social Association Rate              799.890686
STATE_Primary Care Physicians Rate         550.269226
STATE_% Low Birthweight                    502.671021
growth_emp_yoy                             385.178223
STATE_% Children in Poverty                331.988983
growth_earn_qoq                            297.535461
STATE_% Smokers                            261.016998
STATE_Preventable Hospitalization Rate     256.037964
STATE_Food Environment Index               242.281677
STATE_Mentally Unhealthy Days              190.219315
state_avg_earnings                         161.952072
dtype: float64


**Next step: remove `econ_emp_per_1k`**

Re-fit the models without it to check whether the health/social variables still matter once the dominant control is removed.

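**Next step: lag the health variables**

Use t−1 or t−4 values (one quarter or one year earlier) instead of contemporaneous ones.

Predictors measured before the outcome support a stronger causal story.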