In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV

# -----------------------------
# Load data
# -----------------------------
df = pd.read_csv("/Users/david/Desktop/CH_ECON_V3.csv")

TARGET = "econ_hires_per_1k"
y = pd.to_numeric(df[TARGET], errors="coerce")

mask = y.notna()
df = df.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)

# -----------------------------
# Time index (for TimeSeriesSplit)
# -----------------------------
if "year" in df.columns and "quarter" in df.columns:
    q = df["quarter"].astype(str).str.replace("Q", "", regex=False)
    df["_time_index"] = pd.to_numeric(df["year"], errors="coerce") * 4 + pd.to_numeric(q, errors="coerce")
else:
    df["_time_index"] = np.arange(len(df))

df = df.sort_values("_time_index").reset_index(drop=True)

# -----------------------------
# Feature selection
# -----------------------------
drop_cols = {TARGET, "_time_index"}
categorical_cols = [c for c in ["state", "county", "industry", "year", "quarter"] if c in df.columns]
numeric_cols = [c for c in df.columns if c not in drop_cols and c not in categorical_cols]

for c in numeric_cols:
    if df[c].dtype == "object":
        df[c] = pd.to_numeric(df[c], errors="coerce")

X = df[numeric_cols + categorical_cols]

# -----------------------------
# Preprocessing + LASSO
# -----------------------------
preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), numeric_cols),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), categorical_cols)
    ],
    verbose_feature_names_out=False
)

model = Pipeline([
    ("preprocess", preprocess),
    ("lasso", LassoCV(
        cv=TimeSeriesSplit(n_splits=5),
        n_alphas=200,
        max_iter=30000,
        random_state=42
    ))
])

model.fit(X, y)

# -----------------------------
# Coefficient table (FINAL OUTPUT)
# -----------------------------
feature_names = model.named_steps["preprocess"].get_feature_names_out()
coefs = model.named_steps["lasso"].coef_

coef_table = (
    pd.DataFrame({
        "feature": feature_names,
        "coefficient": coefs
    })
    .assign(abs_coefficient=lambda d: d["coefficient"].abs())
    .sort_values("abs_coefficient", ascending=False)
    .reset_index(drop=True)
)

coef_table




Unnamed: 0,feature,coefficient,abs_coefficient
0,econ_hire_rate,12.445247,12.445247
1,econ_emp_per_1k,10.71541,10.71541
2,STATE_% Children in Poverty,-0.513073,0.513073
3,state_avg_earnings,-0.236653,0.236653
4,STATE_% Low Birthweight,-0.208205,0.208205
5,STATE_Food Environment Index,0.164597,0.164597
6,state_hires_total,0.159182,0.159182
7,growth_emp_qoq,0.153683,0.153683
8,growth_emp_yoy,0.121825,0.121825
9,STATE_Primary Care Physicians Rate,-0.099586,0.099586


In [26]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

from merf.merf import MERF


# -----------------------------
# 1) Load
# -----------------------------

TARGET = "econ_hires_per_1k"

# -----------------------------
# 2) Ensure numeric types for Year/quarter/target/features
# -----------------------------
df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
df["quarter"] = pd.to_numeric(df["quarter"], errors="coerce")
df[TARGET] = pd.to_numeric(df[TARGET], errors="coerce")

# LASSO-selected features (exclude leakage)
X_cols = [
    "econ_emp_per_1k",
    "state_avg_earnings",
    "growth_emp_qoq",
    "growth_emp_yoy",
    "growth_earn_qoq",
    "STATE_% Children in Poverty",
    "STATE_% Low Birthweight",
    "STATE_Food Environment Index",
    "STATE_Primary Care Physicians Rate",
    "STATE_Social Association Rate",
    "STATE_% Smokers",
    "STATE_Preventable Hospitalization Rate",
    "STATE_Mentally Unhealthy Days",
]

LEAKY = {"econ_hire_rate", "state_hires_total"}
X_cols = [c for c in X_cols if c in df.columns and c not in LEAKY]

for c in X_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Keep rows with essential time + target
df = df.dropna(subset=["Year", "quarter", TARGET]).copy()

# -----------------------------
# 3) Reconstruct state grouping (state blocks)
#    Assumption (matches your sheet): each state starts at minYear+minQuarter.
# -----------------------------
min_year = int(df["Year"].min())
min_q = int(df["quarter"].min())

# IMPORTANT: this uses the existing row order (state blocks are contiguous in your file)
start_of_state = (df["Year"] == min_year) & (df["quarter"] == min_q)

# Force the first row to start a group even if marker fails
start_of_state.iloc[0] = True

# state_group = 0,1,2,... per block
df["state_group"] = start_of_state.cumsum() - 1

print("Derived # of state groups:", df["state_group"].nunique())
print("First 10 group sizes:\n", df["state_group"].value_counts().sort_index().head(10))

# -----------------------------
# 4) Create time index for splitting
# -----------------------------
df["time_index"] = df["Year"] * 4 + df["quarter"]

# -----------------------------
# 5) Build modeling frame
#    (Do NOT drop rows with missing X; let SimpleImputer handle it.)
# -----------------------------
df_model = df[[TARGET, "state_group", "time_index"] + X_cols].copy()
df_model = df_model.dropna(subset=[TARGET, "state_group", "time_index"]).reset_index(drop=True)

# MERF inputs
y = df_model[TARGET].values
clusters = df_model["state_group"].astype(str)  # MUST be pandas Series
Z = np.ones((len(df_model), 1))                 # random intercept only

X_raw = df_model[X_cols].values

# Preprocess X
x_pre = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
X = x_pre.fit_transform(X_raw)

# -----------------------------
# 6) Time-based split (last 20% of time_index)
# -----------------------------
df_model = df_model.sort_values("time_index").reset_index(drop=True)

split = int(len(df_model) * 0.8)
tr = np.arange(0, split)
te = np.arange(split, len(df_model))

X_tr, X_te = X[tr], X[te]
y_tr, y_te = y[tr], y[te]
Z_tr, Z_te = Z[tr], Z[te]
cl_tr, cl_te = clusters.iloc[tr], clusters.iloc[te]

# -----------------------------
# 7) Fit MERF
# -----------------------------
rf = RandomForestRegressor(
    n_estimators=600,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)

merf = MERF(rf)
merf.fit(X_tr, Z_tr, cl_tr, y_tr)

# -----------------------------
# 8) Evaluate
# -----------------------------
pred_tr = merf.predict(X_tr, Z_tr, cl_tr)
pred_te = merf.predict(X_te, Z_te, cl_te)

print("\nFEATURES USED:")
print(X_cols)

print("\nTRAIN:")
print("  R2  :", r2_score(y_tr, pred_tr))
print("  RMSE:", mean_squared_error(y_tr, pred_tr, squared=False))

print("\nTEST:")
print("  R2  :", r2_score(y_te, pred_te))
print("  RMSE:", mean_squared_error(y_te, pred_te, squared=False))


Derived # of state groups: 50
First 10 group sizes:
 state_group
0    44
1    10
2    44
3    44
4    44
5    44
6    44
7    44
8    44
9    44
Name: count, dtype: int64


INFO     [merf.py:307] Training GLL is 6598.840007552962 at iteration 1.
INFO     [merf.py:307] Training GLL is 6543.059052563239 at iteration 2.
INFO     [merf.py:307] Training GLL is 6504.205413058526 at iteration 3.
INFO     [merf.py:307] Training GLL is 6476.706636652364 at iteration 4.
INFO     [merf.py:307] Training GLL is 6444.936594394965 at iteration 5.
INFO     [merf.py:307] Training GLL is 6430.6331927418105 at iteration 6.
INFO     [merf.py:307] Training GLL is 6421.123105953389 at iteration 7.
INFO     [merf.py:307] Training GLL is 6413.839452454632 at iteration 8.
INFO     [merf.py:307] Training GLL is 6409.208651099883 at iteration 9.
INFO     [merf.py:307] Training GLL is 6406.990171419416 at iteration 10.
INFO     [merf.py:307] Training GLL is 6407.040857963105 at iteration 11.
INFO     [merf.py:307] Training GLL is 6404.581430865687 at iteration 12.
INFO     [merf.py:307] Training GLL is 6402.447439740721 at iteration 13.
INFO     [merf.py:307] Training GLL is 6403.78


FEATURES USED:
['econ_emp_per_1k', 'state_avg_earnings', 'growth_emp_qoq', 'growth_emp_yoy', 'growth_earn_qoq', 'STATE_% Children in Poverty', 'STATE_% Low Birthweight', 'STATE_Food Environment Index', 'STATE_Primary Care Physicians Rate', 'STATE_Social Association Rate', 'STATE_% Smokers', 'STATE_Preventable Hospitalization Rate', 'STATE_Mentally Unhealthy Days']

TRAIN:
  R2  : 0.9320257712997496
  RMSE: 3.690130444741052

TEST:
  R2  : 0.2546166989054003
  RMSE: 11.641365630400133


