In [30]:
import pandas as pd
import geopandas as gpd

from libpysal.weights.util import full2W
import warnings
warnings.simplefilter(action='ignore')

import math
import matplotlib.pyplot as plt

import numpy as np
np.random.seed(42)
import statsmodels.api as sm
from numpy.linalg import eigh
from esda.moran import Moran

In [31]:
df = pd.read_csv("../00_data/03_final/cbsa_level.csv")
gdf = gpd.read_file("../00_data/01_raw/tl_2024_us_cbsa", engine="pyogrio")
gdf['cbsacode'] = gdf['CBSAFP'].astype('float64')

In [32]:
num_unique = df["cbsacode"].nunique()
print("cbsa:", num_unique)

cbsa: 875


In [33]:
gdf_ll = gdf.to_crs("EPSG:4326").copy()
gdf_ll["centroid"] = gdf_ll.geometry.centroid  # may warn; matches old coordinates exactly
gdf_ll["lon"] = gdf_ll.centroid.x
gdf_ll["lat"] = gdf_ll.centroid.y
gdf_ll["ALAND_acres"] = gdf_ll["ALAND"] / 4046.8564224
df = df.merge(gdf_ll[['ALAND_acres','cbsacode','lon','lat']],how='left')

In [34]:

# ---------------------------
# centroids (projected)
# ---------------------------
def add_lonlat_centroids(gdf, proj_crs="EPSG:5070"):
    """
    Compute centroids in a projected CRS, then bring them back to WGS84.
    """
    gdf_proj = gdf.to_crs(proj_crs).copy()
    cent_proj = gdf_proj.geometry.centroid  # planar centroid in meters
    cent_wgs = gpd.GeoSeries(cent_proj, crs=proj_crs).to_crs("EPSG:4326")
    out = gdf.copy()
    out["centroid"] = cent_wgs
    out["lon"] = cent_wgs.x
    out["lat"] = cent_wgs.y
    return out

# ---------------------------------------
# Fast kNN weights 
# ---------------------------------------

def construct_knn_weights(coords, k):
    # coords: (n,2) [lon, lat] in degrees
    R = 6371.0
    lon = np.radians(coords[:,0])[:,None]  # (n,1)
    lat = np.radians(coords[:,1])[:,None]  # (n,1)

    dlon = lon.T - lon                     # (n,n)
    dlat = lat.T - lat                     # (n,n)

    a = (np.sin(dlat/2.0)**2
         + np.cos(lat) @ np.cos(lat).T * np.sin(dlon/2.0)**2)
    c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))
    dist_mat = R * c
    np.fill_diagonal(dist_mat, np.inf)     # exclude self exactly like before

    n = dist_mat.shape[0]
    W = np.zeros((n,n), dtype=float)
    # identical neighbor selection and weights
    for i in range(n):
        row = dist_mat[i].copy()
        knn_idx = np.argsort(row, kind="stable")[:k]      # same (default quicksort) behavior
        thr = row[knn_idx[-1]]
        if thr > 0:
            W[i, knn_idx] = 1.0 - (dist_mat[i, knn_idx] / thr)

    W_sym = (W + W.T) / 2.0
    row_sums = W_sym.sum(axis=1, keepdims=True)
    nz = row_sums[:,0] > 0
    W_sym[nz] /= row_sums[nz]
    return W_sym

In [None]:
# === Appendix-B forward selection ===
def forward_select_esf(
    eig_for_valid: np.ndarray,        # (n_valid, m_eigs)
    ln_scal_vals: np.ndarray,      # (n_valid,)
    ln_factor_vals: np.ndarray,       # (n_valid,)
    W_lps_sub,                        # PySAL weights for valid rows (full2W)
    alpha: float = 0.05,
    permutations: int = 999,          # Moran permutations; keep 999 for parity
    max_add: int = 100,
    tol_delta_I: float = 1e-6,
    seed: int = 42 
):
    """
    Appendix B-style greedy forward selection minimizing Moran's I on residuals.
    Guards against None/constant/NaN eigenvectors and singular fits.
    Stops when p >= alpha, or ΔI is tiny, or max_add reached.
    Returns: (best_model, best_resid, selected_idx)
    """
    np.random.seed(seed)

    n_valid, m_eigs = eig_for_valid.shape
    selected_idx = []

    # Baseline
    Xb = sm.add_constant(ln_scal_vals)
    base = sm.OLS(ln_factor_vals, Xb).fit()
    resid = base.resid
    MI = Moran(resid, W_lps_sub, permutations=permutations)
    if (MI.p_sim >= alpha) or (m_eigs == 0):
        return base, resid, selected_idx

    remaining = list(range(m_eigs))
    best_model = base
    best_resid = resid
    last_I = MI.I

    # Step 1
    best = {"I": np.inf, "p": None, "j": None, "model": None, "resid": None}
    for j in remaining:
        xj = eig_for_valid[:, j]
        if not np.all(np.isfinite(xj)) or np.nanstd(xj) == 0:
            continue
        X = sm.add_constant(np.column_stack([ln_scal_vals, xj]))
        try:
            m = sm.OLS(ln_factor_vals, X).fit()
            r = m.resid
            Mi = Moran(r, W_lps_sub, permutations=permutations)
        except Exception:
            continue
        if (Mi.I < best["I"]) or (np.isclose(Mi.I, best["I"]) and (best["p"] is None or Mi.p_sim > best["p"])):
            best.update({"I": Mi.I, "p": Mi.p_sim, "j": j, "model": m, "resid": r})

    if best["j"] is None:
        return base, resid, selected_idx

    selected_idx.append(best["j"])
    if best["j"] in remaining:
        remaining.remove(best["j"])
    best_model = best["model"]
    best_resid = best["resid"]
    if best["p"] is not None and best["p"] >= alpha:
        return best_model, best_resid, selected_idx
    

    # Step 2+
    adds = 1
    while remaining and adds < max_add:
        cand = {"I": np.inf, "p": None, "j": None, "model": None, "resid": None}
        for j in remaining:
            X_eigs = eig_for_valid[:, selected_idx + [j]]
            if not np.all(np.isfinite(X_eigs[:, -1])) or np.nanstd(X_eigs[:, -1]) == 0:
                continue
            X = sm.add_constant(np.column_stack([ln_scal_vals, X_eigs]))
            try:
                m = sm.OLS(ln_factor_vals, X).fit()
                r = m.resid
                Mi = Moran(r, W_lps_sub, permutations=permutations)
            except Exception:
                continue
            if (Mi.I < cand["I"]) or (np.isclose(Mi.I, cand["I"]) and (cand["p"] is None or Mi.p_sim > cand["p"])):
                cand.update({"I": Mi.I, "p": Mi.p_sim, "j": j, "model": m, "resid": r})

        if cand["j"] is None:
            break

        # early stop if I barely improves
        if (last_I - cand["I"]) < tol_delta_I:
            break

        selected_idx.append(cand["j"])
        if cand["j"] in remaining:
            remaining.remove(cand["j"])
        best_model = cand["model"]
        best_resid = cand["resid"]
        last_I = cand["I"]
        adds += 1
        if cand["p"] is not None and cand["p"] >= alpha:
            break

    return best_model, best_resid, selected_idx

In [54]:
def calculate_FsAMIs(
    df: pd.DataFrame,
    factor_columns,
    population_column: str,
    cbsa_column: str = "cbsacode",
    alpha: float = 0.05,
    moran_permutations: int = 999,
    k_values = (7, 8, 10, 12, 15, 18, 22, 26, 30),
    seed: int = 42 
):
    np.random.seed(seed)
    n_all = len(df)
    print("Merged data rows:", n_all)
    if n_all == 0:
        return pd.DataFrame()

    # --- make numeric copy -------------------------------------------------
    df_num = df.copy()
    for col in [population_column, *factor_columns, "lon", "lat"]:
        if col in df_num.columns:
            df_num[col] = pd.to_numeric(df_num[col], errors="coerce")

    coords_all = df_num[["lon", "lat"]].to_numpy()
    k_cache = {}
    for k in k_values:
        W_sym_k = construct_knn_weights(coords_all, k=k)
        H = np.eye(n_all) - np.ones((n_all, n_all)) / n_all
        Omega = H @ W_sym_k @ H
        evals, evecs = np.linalg.eigh(Omega)
        pos = evals > 1e-12
        evals_pos = evals[pos]
        evecs_pos = evecs[:, pos]
        order = np.argsort(evals_pos)[::-1]
        evals_pos = evals_pos[order]
        evecs_pos = evecs_pos[:, order]
        k_cache[k] = {"W_sym": W_sym_k, "evecs": evecs_pos, "evals": evals_pos}

    ln_pop_all = np.log(df_num[population_column]).astype(float)
    all_rows = []

    # === Loop through each factor sequentially ===
    for factor in factor_columns:

        fac_vals = df_num[factor]
        pop_vals = df_num[population_column]
        coord_ok = (
            df_num["lon"].notna() & df_num["lat"].notna() &
            np.isfinite(df_num["lon"].to_numpy()) &
            np.isfinite(df_num["lat"].to_numpy())
        )
        valid_mask = (
            fac_vals.notna() & pop_vals.notna() &
            (fac_vals > 0) & (pop_vals > 0) &
            np.isfinite(fac_vals.to_numpy()) &
            np.isfinite(pop_vals.to_numpy()) &
            coord_ok
        )
        if not valid_mask.any():
            print(f"[Skip] No valid data for {factor}")
            continue

        idx_valid = np.where(valid_mask)[0]
        ln_factor = np.log(fac_vals.loc[valid_mask].to_numpy())
        ln_pop    = ln_pop_all[valid_mask.to_numpy()]
        cbsa_vals = df_num.loc[valid_mask, cbsa_column].to_numpy()

        evals_k = []
        for k in k_values:
            cache = k_cache[k]
            W_sub = cache["W_sym"][idx_valid][:, idx_valid]
            W_lps_sub = full2W(W_sub);  W_lps_sub.transform = 'r'

            # --- baseline model
            Xb = sm.add_constant(ln_pop)
            model_b = sm.OLS(ln_factor, Xb).fit()
            resid_b = model_b.resid
            MI_b = Moran(
                resid_b, W_lps_sub,
                permutations=moran_permutations
            )
            b_I, b_p = MI_b.I, MI_b.p_sim

            best_model = model_b
            best_resid = resid_b
            selected_m = 0

            # --- forward selection if baseline Moran p < α
            if b_p < alpha:
                eig_for_valid = cache["evecs"][idx_valid, :]
                best_model, best_resid, sel_idx = forward_select_esf(
                    eig_for_valid=eig_for_valid,
                    ln_scal_vals=ln_pop,
                    ln_factor_vals=ln_factor,
                    W_lps_sub=W_lps_sub,
                    alpha=alpha,
                    permutations=moran_permutations,
                )
                selected_m = len(sel_idx)

            # --- final Moran with
            MI_f = Moran(
                best_resid, W_lps_sub,
                permutations=moran_permutations
            )
            f_I, f_p = MI_f.I, MI_f.p_sim

            aic = best_model.aic
            beta = best_model.params[1]
            pval = best_model.pvalues[1]
            ci_low, ci_up = best_model.conf_int(alpha=0.05)[1]

            evals_k.append({
                "k": k,
                "final_p": f_p,
                "final_I": f_I,
                "baseline_p": b_p,
                "baseline_I": b_I,
                "aic": aic,
                "selected_m_eigs": selected_m,
                "beta": beta, "pval": pval, "ci_low": ci_low, "ci_up": ci_up,
                "resid": best_resid,
                "cbsa_vals": cbsa_vals
            })

        # --- choose best k
        feasible = [d for d in evals_k if d["final_p"] >= alpha]
        if feasible:
            best = min(feasible, key=lambda d: (d["aic"], d["k"], -d["final_p"]))
        else:
            best = min(evals_k, key=lambda d: (-d["final_p"], d["aic"], d["k"]))
        best_k = best["k"]

        # --- record all k
        for d in evals_k:
            k = d["k"]
            beta = d["beta"]; pval = d["pval"]; ci_low = d["ci_low"]; ci_up = d["ci_up"]
            b_I = d["baseline_I"]; b_p = d["baseline_p"]
            f_I = d["final_I"];    f_p = d["final_p"]
            aic = d["aic"];        selected_m = d["selected_m_eigs"]
            resid = d["resid"];    cvals = d["cbsa_vals"]
            is_best = (k == best_k)

            for i_local, cval in enumerate(cvals):
                all_rows.append({
                    cbsa_column: cval,
                    "factor": factor,
                    "k": k,
                    "is_best": is_best,
                    "best_k_overall": best_k,
                    "baseline_moran_value": b_I,
                    "baseline_moran_p": b_p,
                    "final_moran_value": f_I,
                    "final_moran_p": f_p,
                    "selected_m_eigs": selected_m,
                    "FsAMI": resid[i_local],
                    "beta": beta,
                    "p_value": pval,
                    "CI_lower": ci_low,
                    "CI_upper": ci_up,
                    "AIC": aic
                })

    return pd.DataFrame(all_rows)

In [55]:
# First, add these imports at the top of your notebook
from joblib import Parallel, delayed
import os

if __name__ == "__main__":
    # Set master random seed
    MASTER_SEED = 42
    np.random.seed(MASTER_SEED)
    
    # Set number of CPU cores to use
    n_jobs = max(1, os.cpu_count() - 1)
    
    factor_columns = ['BINGE',
        'CSMOKING',
        'DEPRESSION',
        'DIABETES',
        'LPA',
        'OBESITY',
        'adult_smoking',
        'adult_obesity',
        'excessive_drinking',
        'diabetes_prevalence',
        'some_college',
        'unemployment',
        'children_single_parent',
        'mental_health_providers',
        'median_household_income',
        'driving_alone_to_work',
        'sti',
        'FFR20',
        'gdp',
        'coverage_50',
        'coverage_60',
        'coverage_70',
        'coverage_80',
        'coverage_90',
        'noise50n',
        'noise60n',
        'noise70n',
        'noise80n',
        'noise90n',
        'CANCER',
        'VISION',
        'MOBILITY',
        'SELFCARE',
        'DISABILITY',
        'Park_Area_Acres',
        'ALAND_acres']
    
    def process_factor(factor, seed):
        # Each worker gets its own deterministic seed
        worker_seed = seed + hash(factor) % 1000000
        np.random.seed(worker_seed)
        
        return calculate_FsAMIs(
            df=df,
            factor_columns=[factor],
            population_column="TotalPopulation",
            cbsa_column='cbsacode',
            alpha=0.05,
            moran_permutations=999,
            k_values=(5,6,7,8,9,10,11,12,13,14,15),
            seed=worker_seed  # Pass seed to calculate_FsAMIs
        )

    # Create seeds for each factor
    seeds = [MASTER_SEED + i for i in range(len(factor_columns))]
    
    # Run parallel processing with seeds
    results_list = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(process_factor)(factor, seed) 
        for factor, seed in zip(factor_columns, seeds)
    )
    
    # Combine results
    results_df = pd.concat(results_list, ignore_index=True)
    
    # Save results
    results_df.to_csv(f'../00_data/04_output/FsAMIs_pop_allk_test2.csv', 
                      index=False)

[Parallel(n_jobs=11)]: Using backend LokyBackend with 11 concurrent workers.


Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
Merged data rows: 875
  Selected first eig 8 with I=0.596668, p=0.0010
  Selected first eig 8 with I=0.473236, p=0.0010
  Selected first eig 31 with I=0.374957, p=0.0010
  Selected first eig 8 with I=0.363889, p=0.0010
  Selected first eig 21 with I=0.646039, p=0.0010
  Selected first eig 31 with I=0.287169, p=0.0010
  Selected first eig 3 with I=0.707655, p=0.0010
  Selected first eig 8 with I=0.332603, p=0.0010
  Selected first eig 21 with I=0.519695, p=0.0010
  Selected first eig 8 with I=0.561843, p=0.0010
  Selected first eig 42 with I=0.343059, p=0.0010
  Selected eig 30 with I=0.572029, p=0.0010
  Selected eig 8 with I=0.348332, p=0.0010
  Selected eig 18 with I=0.447971, p=0.0010
  Selected eig 31 with I=0.304834, p=0.0010
  Selected eig 32 with I=0.269660, p=0.

 There are 2 disconnected components.
 There is 1 island with id: 367.


  Selected eig 149 with I=0.091933, p=0.0040
  Selected eig 4 with I=0.079886, p=0.0010
  Selected eig 20 with I=0.247066, p=0.0010
  Selected eig 41 with I=0.117889, p=0.0010
  Selected eig 148 with I=0.153379, p=0.0010
  Selected eig 33 with I=0.037524, p=0.0100
  Selected eig 48 with I=0.059724, p=0.0010
  Selected eig 58 with I=0.124304, p=0.0010
  Selected eig 103 with I=0.095192, p=0.0010
  Selected eig 47 with I=0.206336, p=0.0010
  Selected first eig 42 with I=0.236018, p=0.0010
  Selected eig 46 with I=0.085473, p=0.0020
  Selected eig 143 with I=0.074525, p=0.0020
  Selected eig 19 with I=0.235564, p=0.0010
  Selected eig 78 with I=0.109594, p=0.0010
  Selected eig 137 with I=0.146922, p=0.0010
  Selected eig 50 with I=0.033619, p=0.0160
  Selected eig 43 with I=0.052581, p=0.0020
  Selected eig 97 with I=0.116325, p=0.0010
  Selected eig 32 with I=0.090384, p=0.0010
  Selected eig 62 with I=0.218770, p=0.0010
  Selected eig 24 with I=0.199771, p=0.0010
  Selected eig 87 with

[Parallel(n_jobs=11)]: Done  36 out of  36 | elapsed: 213.1min finished


In [39]:
filtered_df = results_df[results_df['final_moran_p'] < 0.05]
unique_factors = filtered_df['factor'].unique()
print("Unique factors:", unique_factors)

Unique factors: ['BINGE' 'CSMOKING' 'DIABETES' 'LPA' 'OBESITY' 'adult_smoking'
 'adult_obesity' 'diabetes_prevalence' 'some_college' 'unemployment'
 'children_single_parent' 'median_household_income'
 'driving_alone_to_work' 'sti' 'gdp' 'coverage_50' 'coverage_60'
 'coverage_70' 'coverage_80' 'noise50n' 'noise70n' 'noise80n' 'noise90n'
 'VISION' 'MOBILITY' 'SELFCARE' 'DISABILITY' 'Park_Area_Acres']


In [40]:
factors_crossing_zero = results_df[
    (results_df['CI_lower'] < 0) & (results_df['CI_upper'] > 0)
]['factor'].unique()

print("Factors with ln_pop beta CI crossing 0:", factors_crossing_zero)

Factors with ln_pop beta CI crossing 0: []
