In [14]:
import pandas as pd
import geopandas as gpd

from libpysal.weights.util import full2W
import warnings
warnings.simplefilter(action='ignore')

import math
import matplotlib.pyplot as plt

import numpy as np
np.random.seed(42)
import statsmodels.api as sm
from numpy.linalg import eigh
from esda.moran import Moran

In [15]:
# Load the CBSA-level analysis table and the CBSA boundary layer (read via pyogrio).
df = pd.read_csv("../00_data/03_final/cbsa_level.csv")
gdf = gpd.read_file("../00_data/01_raw/tl_2024_us_cbsa", engine="pyogrio")
# Expose CBSAFP as a float64 `cbsacode` column on the boundary layer.
gdf["cbsacode"] = gdf["CBSAFP"].astype(np.float64)

In [16]:
# Count the distinct (non-missing) CBSA codes in the analysis table.
num_unique = df["cbsacode"].dropna().unique().size
print("cbsa:", num_unique)

cbsa: 875


In [17]:
# Derive per-CBSA centroid coordinates and land area, then attach them to `df`.
gdf_ll = gdf.to_crs("EPSG:4326").copy()

# Centroids are taken directly in geographic coordinates (EPSG:4326). GeoPandas may
# warn about this; it is kept on purpose so the coordinates match the old ones exactly.
centroids_ll = gdf_ll.geometry.centroid
gdf_ll["centroid"] = centroids_ll

# Read x/y from the series computed above. `gdf_ll.centroid` is the GeoDataFrame
# *property* (it recomputes centroids from the active geometry), not the "centroid"
# column, so the previous code silently computed every centroid three times.
gdf_ll["lon"] = centroids_ll.x
gdf_ll["lat"] = centroids_ll.y

# 4046.8564224 m^2 = 1 acre; assumes ALAND is in square metres -- TODO confirm.
gdf_ll["ALAND_acres"] = gdf_ll["ALAND"] / 4046.8564224

# Join on the CBSA code only. Dropping any previously attached copies first keeps the
# cell idempotent: re-running it no longer joins on float equality of lon/lat/area.
# `validate` fails loudly if the boundary layer ever has duplicate CBSA codes, which
# would otherwise duplicate rows of `df` without warning.
geo_cols = ["ALAND_acres", "lon", "lat"]
df = df.drop(columns=geo_cols, errors="ignore").merge(
    gdf_ll[["cbsacode", *geo_cols]],
    how="left",
    on="cbsacode",
    validate="many_to_one",
)

In [18]:

# ---------------------------
# centroids (projected)
# ---------------------------
def add_lonlat_centroids(gdf, proj_crs="EPSG:5070"):
    """
    Return a copy of ``gdf`` with centroid columns expressed in EPSG:4326.

    The centroids are computed on the geometries after reprojecting them to
    ``proj_crs`` and are then transformed to EPSG:4326. Three columns are
    added to the copy: ``centroid`` (point geometry), ``lon`` and ``lat``.
    The input frame itself is not modified.
    """
    # Centroid of each geometry, computed in the projected CRS.
    planar_centroids = gdf.to_crs(proj_crs).geometry.centroid
    # Same points, re-expressed as longitude/latitude.
    lonlat_centroids = gpd.GeoSeries(planar_centroids, crs=proj_crs).to_crs("EPSG:4326")

    result = gdf.copy()
    result["centroid"] = lonlat_centroids
    result["lon"] = lonlat_centroids.x
    result["lat"] = lonlat_centroids.y
    return result

# ---------------------------------------
# Fast kNN weights 
# ---------------------------------------

def construct_knn_weights(coords, k):
    """
    Build a dense, row-normalised k-nearest-neighbour weight matrix.

    Parameters
    ----------
    coords : array-like, shape (n, 2)
        ``[lon, lat]`` pairs in degrees.
    k : int
        Number of nearest neighbours per row. Values larger than ``n - 1`` are
        clamped to ``n - 1`` (a point cannot be its own neighbour); ``k < 1``
        yields an all-zero matrix.

    Returns
    -------
    numpy.ndarray, shape (n, n)
        For each row ``i`` the ``k`` nearest points get the weight
        ``1 - d_ij / d_ik`` where ``d_ik`` is the haversine distance to the
        k-th neighbour (so the k-th neighbour itself gets weight 0). The matrix
        is then averaged with its transpose and every row with a positive sum
        is divided by that sum. Because of this final row normalisation the
        result is in general NOT symmetric.
    """
    R = 6371.0  # sphere radius used for the haversine distance (km)
    coords = np.asarray(coords, dtype=float)
    lon = np.radians(coords[:, 0])[:, None]  # (n,1)
    lat = np.radians(coords[:, 1])[:, None]  # (n,1)

    dlon = lon.T - lon                       # (n,n)
    dlat = lat.T - lat                       # (n,n)

    a = (np.sin(dlat / 2.0) ** 2
         + np.cos(lat) @ np.cos(lat).T * np.sin(dlon / 2.0) ** 2)
    # Rounding can push `a` marginally outside [0, 1], which would turn
    # sqrt(1 - a) into NaN for (near-)antipodal pairs.
    a = np.clip(a, 0.0, 1.0)
    c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))
    dist_mat = R * c
    np.fill_diagonal(dist_mat, np.inf)       # a point is never its own neighbour

    n = dist_mat.shape[0]
    W = np.zeros((n, n), dtype=float)

    # With k >= n the infinite diagonal would enter the neighbour set and
    # produce inf/inf = NaN weights, so cap k at the number of other points.
    k_eff = min(int(k), n - 1)
    if k_eff < 1:
        return W

    # Stable sort: ties in distance are broken by column order.
    nbr_idx = np.argsort(dist_mat, axis=1, kind="stable")[:, :k_eff]   # (n,k)
    nbr_dist = np.take_along_axis(dist_mat, nbr_idx, axis=1)           # (n,k)
    thr = nbr_dist[:, -1]                                              # k-th distance

    # Rows whose threshold is zero (coincident points) or not finite (missing
    # coordinates) keep all-zero weights.
    rows = np.flatnonzero(np.isfinite(thr) & (thr > 0))
    W[rows[:, None], nbr_idx[rows]] = 1.0 - nbr_dist[rows] / thr[rows, None]

    W_sym = (W + W.T) / 2.0
    row_sums = W_sym.sum(axis=1, keepdims=True)
    nz = row_sums[:, 0] > 0
    W_sym[nz] /= row_sums[nz]
    return W_sym

In [19]:
# === Appendix-B forward selection ===
def _best_esf_candidate(
    eig_for_valid,
    ln_scal_vals,
    ln_factor_vals,
    W_lps_sub,
    selected_idx,
    remaining,
    permutations,
):
    """Fit one trial OLS per remaining eigenvector and return the best trial.

    Each trial regresses ``ln_factor_vals`` on a constant, ``ln_scal_vals``, the
    already selected eigenvectors and one candidate. The winner is the trial
    with the lowest residual Moran's I; near-ties are broken by the larger
    ``p_sim``. Returns a dict with keys ``I``, ``p``, ``j``, ``model`` and
    ``resid`` (``j`` is ``None`` when no candidate could be evaluated).
    """
    best = {"I": np.inf, "p": None, "j": None, "model": None, "resid": None}
    for j in remaining:
        xj = eig_for_valid[:, j]
        # Skip unusable candidates: non-finite entries or a constant vector.
        if not np.all(np.isfinite(xj)) or np.nanstd(xj) == 0:
            continue
        X = sm.add_constant(
            np.column_stack([ln_scal_vals, eig_for_valid[:, selected_idx + [j]]])
        )
        try:
            m = sm.OLS(ln_factor_vals, X).fit()
            r = m.resid
            Mi = Moran(r, W_lps_sub, permutations=permutations)
        except Exception:
            # Deliberate best-effort: a candidate whose fit or Moran test fails
            # is simply not considered.
            continue
        if (Mi.I < best["I"]) or (
            np.isclose(Mi.I, best["I"]) and (best["p"] is None or Mi.p_sim > best["p"])
        ):
            best.update({"I": Mi.I, "p": Mi.p_sim, "j": j, "model": m, "resid": r})
    return best


def forward_select_esf(
    eig_for_valid: np.ndarray,        # (n_valid, m_eigs)
    ln_scal_vals: np.ndarray,      # (n_valid,)
    ln_factor_vals: np.ndarray,       # (n_valid,)
    W_lps_sub,                        # PySAL weights for valid rows (full2W)
    alpha: float = 0.05,
    permutations: int = 999,          # Moran permutations; keep 999 for parity
    max_add: int = 100,
    tol_delta_I: float = 1e-6,
    seed: int = 42 
):
    """
    Appendix B-style greedy forward selection of ESF eigenvectors.

    Starting from the baseline ``ln_factor ~ 1 + ln_scal``, eigenvectors are
    added one at a time; at every step the candidate giving the lowest (signed)
    residual Moran's I is chosen.

    Parameters
    ----------
    eig_for_valid : (n_valid, m_eigs) array of candidate eigenvectors.
    ln_scal_vals, ln_factor_vals : (n_valid,) regressor and response. Pandas
        inputs are converted to NumPy so that every returned model exposes
        plain-array ``params`` / ``resid`` / ``conf_int()`` (positional indexing
        by callers is then unambiguous).
    W_lps_sub : spatial weights object passed to ``Moran``.
    alpha : selection stops once the permutation p-value is ``>= alpha``.
    permutations : number of permutations for each Moran test.
    max_add : maximum number of eigenvectors to add (``<= 0`` adds none).
    tol_delta_I : from the second eigenvector on, stop when Moran's I improves
        by less than this amount relative to the previous step. The first
        eigenvector is always accepted if any candidate can be evaluated.
    seed : the global NumPy RNG is re-seeded with this value on entry
        (side effect on ``np.random``).

    Returns
    -------
    (best_model, best_resid, selected_idx)
        Fitted statsmodels result, its residuals, and the list of selected
        eigenvector column indices in order of selection.
    """
    np.random.seed(seed)

    # Work on plain arrays: a pandas regressor would make the baseline model
    # pandas-wrapped while all later models are array-based.
    ln_scal_vals = np.asarray(ln_scal_vals, dtype=float)
    ln_factor_vals = np.asarray(ln_factor_vals, dtype=float)

    n_valid, m_eigs = eig_for_valid.shape
    selected_idx = []

    # Baseline
    Xb = sm.add_constant(ln_scal_vals)
    base = sm.OLS(ln_factor_vals, Xb).fit()
    resid = base.resid
    MI = Moran(resid, W_lps_sub, permutations=permutations)
    if (MI.p_sim >= alpha) or (m_eigs == 0):
        return base, resid, selected_idx

    remaining = list(range(m_eigs))
    best_model = base
    best_resid = resid
    last_I = MI.I

    adds = 0
    while remaining and adds < max_add:
        cand = _best_esf_candidate(
            eig_for_valid, ln_scal_vals, ln_factor_vals, W_lps_sub,
            selected_idx, remaining, permutations,
        )
        if cand["j"] is None:
            break

        # Early stop if I barely improves over the PREVIOUS step. `last_I` is
        # refreshed after every accepted eigenvector (it used to stay at the
        # baseline value after the first one, so the first check was too lax).
        if adds > 0 and (last_I - cand["I"]) < tol_delta_I:
            break

        selected_idx.append(cand["j"])
        remaining.remove(cand["j"])
        best_model = cand["model"]
        best_resid = cand["resid"]
        last_I = cand["I"]
        adds += 1
        if cand["p"] is not None and cand["p"] >= alpha:
            break

    return best_model, best_resid, selected_idx

In [20]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

def _kfold_rmse(X: np.ndarray, y: np.ndarray, folds: int, seed: int) -> float:
    """Mean per-fold RMSE of a no-intercept linear regression under shuffled K-fold CV.

    ``X`` is expected to already contain the intercept column, which is why the
    regression is fitted with ``fit_intercept=False``. Folds are shuffled with
    ``random_state=seed``; the result is the plain average of the fold RMSEs.
    """
    splitter = KFold(n_splits=folds, shuffle=True, random_state=seed)

    def _fold_rmse(train_idx, test_idx):
        # Fit on the training part, score on the held-out part.
        model = LinearRegression(fit_intercept=False)
        model.fit(X[train_idx], y[train_idx])
        predictions = model.predict(X[test_idx])
        return np.sqrt(mean_squared_error(y[test_idx], predictions))

    fold_scores = [_fold_rmse(train_idx, test_idx) for train_idx, test_idx in splitter.split(X)]
    return float(np.mean(fold_scores))

def _aicc(aic: float, n: int, p: int) -> float:
    """Small-sample corrected AIC."""
    return aic + (2 * p * (p + 1)) / (n - p - 1) if (n - p - 1) > 0 else np.inf


def calculate_FsAMIs(
    df: pd.DataFrame,
    factor_columns,
    population_column: str,
    cbsa_column: str = "cbsacode",
    alpha: float = 0.05,
    moran_permutations: int = 999,
    k_values = (7, 8, 10, 12, 15, 18, 22, 26, 30),
    seed: int = 42,
    cv_folds: int = 5,                 # folds for CV-RMSE / LASSO
    compute_lasso_diag: bool = True    # toggle LASSO residual diagnostic
):
    """
    Fit spatially filtered log-log scaling models for every factor and every k.

    For each ``factor`` and each ``k`` in ``k_values`` the function

    1. fits the baseline ``ln(factor) ~ 1 + ln(population)`` on the rows where
       factor and population are finite and positive and lon/lat are finite;
    2. tests the residuals with Moran's I on the kNN weights restricted to
       those rows;
    3. if the baseline p-value is below ``alpha``, runs ``forward_select_esf``
       on the eigenvectors of the doubly centred weight matrix;
    4. records model metrics (AIC, BIC, AICc, K-fold CV RMSE, optional LASSO
       diagnostic on the unused eigenvectors) and the ln(population) slope with
       its 95% confidence interval.

    Parameters
    ----------
    df : table with ``population_column``, the factor columns, ``cbsa_column``
        and the coordinate columns ``lon`` / ``lat``.
    factor_columns : iterable of column names to model.
    population_column : name of the scaling variable.
    cbsa_column : identifier column copied into the output.
    alpha : significance level for the Moran tests / forward selection.
    moran_permutations : permutations per Moran test.
    k_values : neighbour counts for which weights are built.
    seed : seeds the global NumPy RNG, the forward selection, the K-fold
        splitter and LassoCV.
    cv_folds : number of folds for the CV RMSE and LassoCV.
    compute_lasso_diag : when False the two LASSO columns are ``None``.

    Returns
    -------
    pandas.DataFrame
        One row per (valid observation, factor, k). ``FsAMI`` holds the
        residual of the final model; all other columns are constant within a
        (factor, k) group. An empty frame is returned for empty input.
    """
    np.random.seed(seed)
    n_all = len(df)
    print("Merged data rows:", n_all)
    if n_all == 0:
        return pd.DataFrame()

    # --- numeric copy ---
    df_num = df.copy()
    for col in [population_column, *factor_columns, "lon", "lat"]:
        if col in df_num.columns:
            df_num[col] = pd.to_numeric(df_num[col], errors="coerce")

    coords_all = df_num[["lon", "lat"]].to_numpy()

    # --- precompute KNN weights + eigenvectors cache ---
    # The centering projector does not depend on k, so build it once.
    H = np.eye(n_all) - np.ones((n_all, n_all)) / n_all
    k_cache = {}
    for k in k_values:
        W_sym_k = construct_knn_weights(coords_all, k=k)
        Omega = H @ W_sym_k @ H
        # The weights are row-normalised and therefore not symmetric, while
        # np.linalg.eigh only reads one triangle of its argument. Decompose the
        # symmetric part explicitly so the result is well defined.
        Omega = (Omega + Omega.T) / 2.0
        evals, evecs = np.linalg.eigh(Omega)
        pos = evals > 1e-12                      # keep positive eigenvalues only
        evals_pos = evals[pos]
        evecs_pos = evecs[:, pos]
        order = np.argsort(evals_pos)[::-1]      # largest eigenvalue first
        k_cache[k] = {
            "W_sym": W_sym_k,
            "evecs": evecs_pos[:, order],
            "evals": evals_pos[order],
        }

    # Everything below works on plain NumPy arrays with positional indices.
    # Passing a pandas Series into statsmodels makes the baseline result
    # pandas-wrapped, in which case `conf_int()[1]` selects the upper-bound
    # COLUMN and `resid[i]` is a label lookup -- both gave wrong values
    # whenever the baseline model was the final model.
    pop_arr = df_num[population_column].to_numpy(dtype=float, na_value=np.nan)
    lon_arr = df_num["lon"].to_numpy(dtype=float, na_value=np.nan)
    lat_arr = df_num["lat"].to_numpy(dtype=float, na_value=np.nan)
    cbsa_all = df_num[cbsa_column].to_numpy()
    with np.errstate(divide="ignore", invalid="ignore"):
        ln_pop_all = np.log(pop_arr)             # invalid rows are masked out below

    coord_ok = np.isfinite(lon_arr) & np.isfinite(lat_arr)
    pop_ok = np.isfinite(pop_arr) & (pop_arr > 0)

    all_rows = []

    # === per-factor loop ===
    for factor in factor_columns:
        fac_arr = df_num[factor].to_numpy(dtype=float, na_value=np.nan)
        valid_mask = np.isfinite(fac_arr) & (fac_arr > 0) & pop_ok & coord_ok
        if not valid_mask.any():
            print(f"[Skip] No valid data for {factor}")
            continue

        idx_valid = np.flatnonzero(valid_mask)
        ln_factor = np.log(fac_arr[idx_valid])
        ln_pop = ln_pop_all[idx_valid]
        cbsa_vals = cbsa_all[idx_valid]
        n = ln_factor.shape[0]

        # --- per-k loop ---
        for k in k_values:
            cache = k_cache[k]
            W_sub = cache["W_sym"][idx_valid][:, idx_valid]
            W_lps_sub = full2W(W_sub)
            W_lps_sub.transform = 'r'

            # baseline: y ~ 1 + ln_pop
            Xb = sm.add_constant(ln_pop)
            model_b = sm.OLS(ln_factor, Xb).fit()
            resid_b = model_b.resid
            MI_b = Moran(resid_b, W_lps_sub, permutations=moran_permutations)
            b_I, b_p = MI_b.I, MI_b.p_sim

            # forward-select ESF if needed
            eig_for_valid = cache["evecs"][idx_valid, :]
            best_model = model_b
            best_resid = resid_b
            sel_idx = []

            if b_p < alpha:
                best_model, best_resid, sel_idx = forward_select_esf(
                    eig_for_valid=eig_for_valid,
                    ln_scal_vals=ln_pop,
                    ln_factor_vals=ln_factor,
                    W_lps_sub=W_lps_sub,
                    alpha=alpha,
                    permutations=moran_permutations,
                    seed=seed,   # honour the caller's seed instead of the default
                )

            selected_m = len(sel_idx)
            resid_final = np.asarray(best_resid, dtype=float)

            # final Moran on ESF residuals
            MI_f = Moran(resid_final, W_lps_sub, permutations=moran_permutations)
            f_I, f_p = MI_f.I, MI_f.p_sim

            # --- final model metrics ---
            aic = best_model.aic
            bic = best_model.bic
            p = len(best_model.params)  # intercept + ln_pop + eigs
            aicc = _aicc(aic, n=n, p=p)

            # CV-RMSE for the final model (use the actual selected eigs);
            # the intercept column is explicit to match the statsmodels fit.
            if selected_m == 0:
                X_final = np.column_stack([np.ones(n), ln_pop])
            else:
                X_final = np.column_stack([np.ones(n), ln_pop, eig_for_valid[:, sel_idx]])
            cv_rmse_final = _kfold_rmse(X_final, ln_factor, folds=cv_folds, seed=seed)

            # LASSO residual diagnostic on UNUSED eigs (optional)
            lasso_R2_unused = None
            lasso_n_eigs = None
            if compute_lasso_diag:
                all_cols = np.arange(eig_for_valid.shape[1])
                unused = np.setdiff1d(all_cols, np.asarray(sel_idx, dtype=int))
                if unused.size > 0:
                    E_unused = eig_for_valid[:, unused]
                    pipe = make_pipeline(
                        StandardScaler(with_mean=True, with_std=True),
                        LassoCV(cv=cv_folds, random_state=seed, max_iter=20000)
                    )
                    pipe.fit(E_unused, resid_final)
                    resid_hat = pipe.predict(E_unused)
                    # R^2 on the whole set using refit alpha_ (diagnostic, not CV R^2)
                    lasso_R2_unused = float(
                        1.0 - np.var(resid_final - resid_hat) / np.var(resid_final)
                    )
                    # how many unused eigs selected by LASSO
                    coef = pipe.named_steps["lassocv"].coef_
                    lasso_n_eigs = int(np.sum(np.abs(coef) > 1e-8))
                else:
                    lasso_R2_unused = 0.0
                    lasso_n_eigs = 0

            # Position 1 is ln_pop (position 0 is the constant). np.asarray
            # makes the indexing positional whatever wrapper the result uses:
            # row 1 of conf_int is [lower, upper] for the slope.
            beta = np.asarray(best_model.params)[1]
            pval = np.asarray(best_model.pvalues)[1]
            ci_low, ci_up = np.asarray(best_model.conf_int(alpha=0.05))[1]

            # --- write out rows for every k (no best-k selection here) ---
            for cval, fsami in zip(cbsa_vals, resid_final):
                all_rows.append({
                    cbsa_column: cval,
                    "factor": factor,
                    "k": k,
                    # Moran
                    "baseline_moran_value": b_I,
                    "baseline_moran_p": b_p,
                    "final_moran_value": f_I,
                    "final_moran_p": f_p,
                    # ESF details
                    "selected_m_eigs": selected_m,
                    "FsAMI": fsami,
                    "beta": beta,
                    "p_value": pval,
                    "CI_lower": ci_low,
                    "CI_upper": ci_up,
                    # final model metrics
                    "AIC": aic,
                    "BIC": bic,
                    "AICc": aicc,
                    "CV_RMSE_final": cv_rmse_final,
                    "LASSO_R2_unused": lasso_R2_unused,
                    "LASSO_n_eigs_unused": lasso_n_eigs
                })

    return pd.DataFrame(all_rows)


In [21]:
# Display the column labels of `df` as the cell output.
df.columns

Index(['cbsacode', 'TotalPopulation', 'BINGE', 'CSMOKING', 'DEPRESSION',
       'DIABETES', 'LPA', 'OBESITY', 'adult_smoking', 'adult_obesity',
       'excessive_drinking', 'diabetes_prevalence', 'some_college',
       'unemployment', 'children_single_parent', 'mental_health_providers',
       'median_household_income', 'driving_alone_to_work', 'sti', 'FFR20',
       'gdp', 'coverage_50', 'coverage_60', 'coverage_70', 'coverage_80',
       'coverage_90', 'noise50n', 'noise60n', 'noise70n', 'noise80n',
       'noise90n', 'CANCER', 'VISION', 'MOBILITY', 'life_expectancy',
       'SELFCARE', 'DISABILITY', 'Park_Area_Acres', 'road_network_total',
       'auto_net_total', 'multimodal_net_total', 'pedestrian_net_total',
       'street_intersection_total', 'auto_intersections_total',
       'multimodal_3leg_intersections_total',
       'multimodal_4leg_intersections_total',
       'pedestrian_3leg_intersections_total',
       'pedestrian_4leg_intersections_total', 'Ac_Land', 'ALAND_acres', 'l

In [None]:
# First, add these imports at the top of your notebook
from joblib import Parallel, delayed
import os

if __name__ == "__main__":
    # Set master random seed
    MASTER_SEED = 42
    np.random.seed(MASTER_SEED)

    # Leave one core free. os.cpu_count() may return None when the count
    # cannot be determined, so fall back to a single job in that case.
    n_jobs = max(1, (os.cpu_count() or 1) - 1)

    factor_columns = ['life_expectancy','road_network_total', 'auto_net_total',
       'multimodal_net_total', 'pedestrian_net_total',
       'street_intersection_total', 'auto_intersections_total',
       'multimodal_3leg_intersections_total',
       'multimodal_4leg_intersections_total',
       'pedestrian_3leg_intersections_total',
       'pedestrian_4leg_intersections_total',]

    def process_factor(factor, seed):
        """Run calculate_FsAMIs for one factor with a reproducible seed."""
        # Imported here so the function is self-contained when it is shipped
        # to a worker process.
        import zlib

        # Built-in hash() of a str is salted per interpreter process, so it
        # differs between runs and between workers. CRC32 of the UTF-8 bytes
        # is stable, which makes the per-factor seed truly deterministic.
        worker_seed = seed + zlib.crc32(factor.encode("utf-8")) % 1000000
        np.random.seed(worker_seed)

        return calculate_FsAMIs(
            df=df,
            factor_columns=[factor],
            population_column="TotalPopulation",
            cbsa_column='cbsacode',
            alpha=0.05,
            moran_permutations=999,
            k_values=(5,6,7,8,9,10,11,12,13,14,15),
            seed=worker_seed  # Pass seed to calculate_FsAMIs
        )

    # One base seed per factor
    seeds = [MASTER_SEED + i for i in range(len(factor_columns))]

    # One task per factor, distributed over the worker processes
    results_list = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(process_factor)(factor, seed)
        for factor, seed in zip(factor_columns, seeds)
    )

    # Combine results
    results_df = pd.concat(results_list, ignore_index=True)

    # Save results
    results_df.to_csv('../00_data/04_output/FsAMIs_pop_allk_allmatrc_add_feature.csv',
                      index=False)

[Parallel(n_jobs=11)]: Using backend LokyBackend with 11 concurrent workers.


Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
Merged data rows:Merged data rows:  875875

Merged data rows: 875
Merged data rows: 875


[Parallel(n_jobs=11)]: Done   2 out of  11 | elapsed: 98.8min remaining: 444.5min
[Parallel(n_jobs=11)]: Done  11 out of  11 | elapsed: 125.3min finished


In [23]:
# Rows (any k) whose final-model residuals still have a Moran p-value below 0.05,
# and the factors those rows belong to.
still_autocorrelated = results_df["final_moran_p"].lt(0.05)
filtered_df = results_df.loc[still_autocorrelated]
unique_factors = filtered_df["factor"].unique()
print("Unique factors:", unique_factors)

Unique factors: ['life_expectancy' 'road_network_total' 'auto_net_total'
 'multimodal_net_total' 'pedestrian_net_total' 'street_intersection_total'
 'auto_intersections_total' 'multimodal_4leg_intersections_total'
 'pedestrian_4leg_intersections_total']


In [24]:
# Factors with at least one row whose ln_pop slope interval strictly contains 0.
factors_crossing_zero = (
    results_df
    .query("CI_lower < 0 and CI_upper > 0")["factor"]
    .unique()
)

print("Factors with ln_pop beta CI crossing 0:", factors_crossing_zero)

Factors with ln_pop beta CI crossing 0: []
