In [1]:
import pandas as pd
import geopandas as gpd

from libpysal.weights.util import full2W
import warnings
warnings.simplefilter(action='ignore')

import math
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("../00_data/13_final/cbsa_level.csv")
gdf = gpd.read_file("../00_data/01_raw/tl_2024_us_cbsa", engine="pyogrio")
gdf['cbsacode'] = gdf['CBSAFP'].astype('float64')

In [3]:
num_unique = df["cbsacode"].nunique()
print("cbsa:", num_unique)

cbsa: 875


In [4]:
gdf_ll = gdf.to_crs("EPSG:4326").copy()
gdf_ll["centroid"] = gdf_ll.geometry.centroid  # may warn; matches old coordinates exactly
gdf_ll["lon"] = gdf_ll.centroid.x
gdf_ll["lat"] = gdf_ll.centroid.y
gdf_ll["ALAND_acres"] = gdf_ll["ALAND"] / 4046.8564224
df = df.merge(gdf_ll[['ALAND_acres','cbsacode','lon','lat']],how='left')

In [5]:
import numpy as np
import pandas as pd
import geopandas as gpd
import statsmodels.api as sm
from numpy.linalg import eigh
from esda.moran import Moran
from sklearn.preprocessing import StandardScaler


# ---------------------------
# Safe centroids (projected)
# ---------------------------
def add_lonlat_centroids(gdf, proj_crs="EPSG:5070"):
    """
    Compute centroids in a projected CRS, then bring them back to WGS84.
    """
    gdf_proj = gdf.to_crs(proj_crs).copy()
    cent_proj = gdf_proj.geometry.centroid  # planar centroid in meters
    cent_wgs = gpd.GeoSeries(cent_proj, crs=proj_crs).to_crs("EPSG:4326")
    out = gdf.copy()
    out["centroid"] = cent_wgs
    out["lon"] = cent_wgs.x
    out["lat"] = cent_wgs.y
    return out

# ---------------------------------------
# Fast kNN weights 
# ---------------------------------------

def construct_knn_weights(coords, k):
    # coords: (n,2) [lon, lat] in degrees
    R = 6371.0
    lon = np.radians(coords[:,0])[:,None]  # (n,1)
    lat = np.radians(coords[:,1])[:,None]  # (n,1)

    dlon = lon.T - lon                     # (n,n)
    dlat = lat.T - lat                     # (n,n)

    a = (np.sin(dlat/2.0)**2
         + np.cos(lat) @ np.cos(lat).T * np.sin(dlon/2.0)**2)
    c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))
    dist_mat = R * c
    np.fill_diagonal(dist_mat, np.inf)     # exclude self exactly like before

    n = dist_mat.shape[0]
    W = np.zeros((n,n), dtype=float)
    # identical neighbor selection and weights
    for i in range(n):
        row = dist_mat[i].copy()
        knn_idx = np.argsort(row)[:k]      # same (default quicksort) behavior
        thr = row[knn_idx[-1]]
        if thr > 0:
            W[i, knn_idx] = 1.0 - (dist_mat[i, knn_idx] / thr)

    W_sym = (W + W.T) / 2.0
    row_sums = W_sym.sum(axis=1, keepdims=True)
    nz = row_sums[:,0] > 0
    W_sym[nz] /= row_sums[nz]
    return W_sym

In [6]:
# === Robust Appendix-B forward selection ===
def forward_select_esf(
    eig_for_valid: np.ndarray,        # (n_valid, m_eigs)
    ln_scal_vals: np.ndarray,      # (n_valid,)
    ln_factor_vals: np.ndarray,       # (n_valid,)
    W_lps_sub,                        # PySAL weights for valid rows (full2W)
    alpha: float = 0.05,
    permutations: int = 999,          # Moran permutations; keep 999 for parity
    max_add: int = 100,
    tol_delta_I: float = 1e-6,
):
    """
    Appendix B-style greedy forward selection minimizing Moran's I on residuals.
    Guards against None/constant/NaN eigenvectors and singular fits.
    Stops when p >= alpha, or ΔI is tiny, or max_add reached.
    Returns: (best_model, best_resid, selected_idx)
    """
    import numpy as np
    import statsmodels.api as sm
    from esda.moran import Moran

    n_valid, m_eigs = eig_for_valid.shape
    selected_idx = []

    # Baseline
    Xb = sm.add_constant(ln_scal_vals)
    base = sm.OLS(ln_factor_vals, Xb).fit()
    resid = base.resid
    MI = Moran(resid, W_lps_sub, permutations=permutations)
    if (MI.p_sim >= alpha) or (m_eigs == 0):
        return base, resid, selected_idx

    remaining = list(range(m_eigs))
    best_model = base
    best_resid = resid
    last_I = MI.I

    # Step 1
    best = {"I": np.inf, "p": None, "j": None, "model": None, "resid": None}
    for j in remaining:
        xj = eig_for_valid[:, j]
        if not np.all(np.isfinite(xj)) or np.nanstd(xj) == 0:
            continue
        X = sm.add_constant(np.column_stack([ln_scal_vals, xj]))
        try:
            m = sm.OLS(ln_factor_vals, X).fit()
            r = m.resid
            Mi = Moran(r, W_lps_sub, permutations=permutations)
        except Exception:
            continue
        if (Mi.I < best["I"]) or (np.isclose(Mi.I, best["I"]) and (best["p"] is None or Mi.p_sim > best["p"])):
            best.update({"I": Mi.I, "p": Mi.p_sim, "j": j, "model": m, "resid": r})

    if best["j"] is None:
        return base, resid, selected_idx

    selected_idx.append(best["j"])
    if best["j"] in remaining:
        remaining.remove(best["j"])
    best_model = best["model"]
    best_resid = best["resid"]
    if best["p"] is not None and best["p"] >= alpha:
        return best_model, best_resid, selected_idx

    # Step 2+
    adds = 1
    while remaining and adds < max_add:
        cand = {"I": np.inf, "p": None, "j": None, "model": None, "resid": None}
        for j in remaining:
            X_eigs = eig_for_valid[:, selected_idx + [j]]
            if not np.all(np.isfinite(X_eigs[:, -1])) or np.nanstd(X_eigs[:, -1]) == 0:
                continue
            X = sm.add_constant(np.column_stack([ln_scal_vals, X_eigs]))
            try:
                m = sm.OLS(ln_factor_vals, X).fit()
                r = m.resid
                Mi = Moran(r, W_lps_sub, permutations=permutations)
            except Exception:
                continue
            if (Mi.I < cand["I"]) or (np.isclose(Mi.I, cand["I"]) and (cand["p"] is None or Mi.p_sim > cand["p"])):
                cand.update({"I": Mi.I, "p": Mi.p_sim, "j": j, "model": m, "resid": r})

        if cand["j"] is None:
            break

        # early stop if I barely improves
        if (last_I - cand["I"]) < tol_delta_I:
            break

        selected_idx.append(cand["j"])
        if cand["j"] in remaining:
            remaining.remove(cand["j"])
        best_model = cand["model"]
        best_resid = cand["resid"]
        last_I = cand["I"]
        adds += 1
        if cand["p"] is not None and cand["p"] >= alpha:
            break

    return best_model, best_resid, selected_idx

In [7]:
from joblib import Parallel, delayed

def calculate_FsAMIs(
    df: pd.DataFrame,
    factor_columns,
    population_column: str,
    cbsa_column: str = "cbsacode",
    alpha: float = 0.05,
    moran_permutations_search: int = 199,
    moran_permutations_final: int = 999,
    n_jobs: int = -1,
    backend: str = "loky",
    k_values = (7, 8, 10, 12, 15, 18, 22, 26, 30),
):
    n_all = len(df)
    print("Merged data rows:", n_all)
    if n_all == 0:
        return pd.DataFrame()

    # --- make numeric copy -----------------------------------------------  # <<<
    df_num = df.copy()                                                      # <<<
    for col in [population_column, *factor_columns, "lon", "lat"]:          # <<<
        if col in df_num.columns:                                           # <<<
            df_num[col] = pd.to_numeric(df_num[col], errors="coerce")       # <<<

    coords_all = df_num[["lon", "lat"]].to_numpy()
    k_cache = {}
    for k in k_values:
        W_sym_k = construct_knn_weights(coords_all, k=k)
        H = np.eye(n_all) - np.ones((n_all, n_all)) / n_all
        Omega = H @ W_sym_k @ H
        evals, evecs = np.linalg.eigh(Omega)
        pos = evals > 1e-12
        evals_pos = evals[pos]
        evecs_pos = evecs[:, pos]
        order = np.argsort(evals_pos)[::-1]
        evals_pos = evals_pos[order]
        evecs_pos = evecs_pos[:, order]
        k_cache[k] = {"W_sym": W_sym_k, "evecs": evecs_pos, "evals": evals_pos}

    ln_pop_all = np.log(df_num[population_column]).astype(float)            # <<<

    def _run_one_factor(factor: str):
        rows = []

        fac_vals = df_num[factor]                                           # <<<
        pop_vals = df_num[population_column]                                # <<<
        coord_ok = (
            df_num["lon"].notna() & df_num["lat"].notna() &
            np.isfinite(df_num["lon"].to_numpy()) &
            np.isfinite(df_num["lat"].to_numpy())
        )
        valid_mask = (
            fac_vals.notna() & pop_vals.notna() &
            (fac_vals > 0) & (pop_vals > 0) &
            np.isfinite(fac_vals.to_numpy()) &
            np.isfinite(pop_vals.to_numpy()) &
            coord_ok
        )
        if not valid_mask.any():
            print(f"[Skip] No valid data for {factor}")
            return rows

        idx_valid = np.where(valid_mask)[0]
        ln_factor = np.log(fac_vals.loc[valid_mask].to_numpy())             # <<<
        ln_pop    = ln_pop_all[valid_mask.to_numpy()]                        # <<<
        cbsa_vals = df_num.loc[valid_mask, cbsa_column].to_numpy()          # <<<

        evals_k = []
        for k in k_values:
            cache = k_cache[k]
            W_sub = cache["W_sym"][idx_valid][:, idx_valid]
            W_lps_sub = full2W(W_sub);  W_lps_sub.transform = 'r'

            Xb = sm.add_constant(ln_pop)
            model_b = sm.OLS(ln_factor, Xb).fit()
            resid_b = model_b.resid
            MI_b = Moran(resid_b, W_lps_sub, permutations=moran_permutations_search)
            b_I, b_p = MI_b.I, MI_b.p_sim

            best_model = model_b
            best_resid = resid_b
            selected_m = 0
            if b_p < alpha:
                eig_for_valid = cache["evecs"][idx_valid, :]
                best_model, best_resid, sel_idx = forward_select_esf(
                    eig_for_valid=eig_for_valid,
                    ln_scal_vals=ln_pop,
                    ln_factor_vals=ln_factor,
                    W_lps_sub=W_lps_sub,
                    alpha=alpha,
                    permutations=moran_permutations_search
                )
                selected_m = len(sel_idx)

            MI_f = Moran(best_resid, W_lps_sub, permutations=moran_permutations_final)
            f_I, f_p = MI_f.I, MI_f.p_sim
            aic = best_model.aic
            beta = best_model.params[1]
            pval = best_model.pvalues[1]
            ci_low, ci_up = best_model.conf_int(alpha=0.05)[1]

            evals_k.append({
                "k": k,
                "final_p": f_p,
                "final_I": f_I,
                "baseline_p": b_p,
                "baseline_I": b_I,
                "aic": aic,
                "selected_m_eigs": selected_m,
                "beta": beta, "pval": pval, "ci_low": ci_low, "ci_up": ci_up,
                "resid": best_resid,
                "cbsa_vals": cbsa_vals
            })

        feasible = [d for d in evals_k if d["final_p"] >= alpha]
        if feasible:
            # primary: lowest AIC; tie-breakers: smaller k then higher final_p
            best = min(feasible, key=lambda d: (d["aic"], d["k"], -d["final_p"]))
        else:
            # fallback when nothing clears p>α:
            # prioritize largest p, then lowest AIC, then smaller k
            best = min(evals_k, key=lambda d: (-d["final_p"], d["aic"], d["k"]))

        best_k = best["k"]
        beta = best["beta"]; pval = best["pval"]; ci_low = best["ci_low"]; ci_up = best["ci_up"]
        b_I = best["baseline_I"]; b_p = best["baseline_p"]
        f_I = best["final_I"];    f_p = best["final_p"]
        aic = best["aic"];        selected_m = best["selected_m_eigs"]
        resid = best["resid"];    cvals = best["cbsa_vals"]

        for i_local, cval in enumerate(cvals):
            rows.append({
                cbsa_column: cval,
                "factor": factor,
                "best_k": best_k,
                "baseline_moran_value": b_I,
                "baseline_moran_p": b_p,
                "final_moran_value": f_I,
                "final_moran_p": f_p,
                "selected_m_eigs": selected_m,
                "FsAMI": resid[i_local],
                "beta": beta,
                "p_value": pval,
                "CI_lower": ci_low,
                "CI_upper": ci_up,
                "AIC": aic
            })
        return rows

    results_lists = Parallel(n_jobs=n_jobs, backend=backend, verbose=0)(
        delayed(_run_one_factor)(factor) for factor in factor_columns
    )
    flat_rows = [row for rows in results_lists for row in (rows or [])]
    return pd.DataFrame(flat_rows)

In [8]:
df

Unnamed: 0,cbsacode,TotalPopulation,BINGE,CSMOKING,DEPRESSION,DIABETES,LPA,OBESITY,adult_smoking,adult_obesity,...,coverage_90,noise50n,noise60n,noise70n,noise80n,noise90n,Park_Area_Acres,ALAND_acres,lon,lat
0,10100.0,42037,8802.089,5942.090,7630.409,3886.342,9800.073,14822.801,5446.675000,11129.643000,...,22.000000,3717.083034,607.189173,44.967758,6.824776,3.068829,550.773301,1.816967e+06,-98.695954,45.521579
1,10140.0,77038,12865.346,11555.700,20954.336,9937.902,19105.424,28427.022,11515.519999,21818.879999,...,10.000000,3744.330966,733.378889,107.629504,7.306492,1.801527,104080.141400,1.216937e+06,-123.828621,47.144409
2,10180.0,179308,33724.319,27437.232,43682.746,22629.731,49045.578,64315.187,22934.902001,48604.362001,...,167.000000,19453.578040,5776.179243,1259.546092,134.775549,21.393130,3252.752494,1.755849e+06,-99.717678,32.449690
3,10220.0,38141,5263.458,7475.636,11327.877,5377.881,12433.966,16400.630,6250.512000,10719.336000,...,3.000000,2481.170945,552.701071,101.550631,3.799527,0.091602,87.794807,4.610259e+05,-96.684376,34.728045
4,10300.0,98567,16559.256,16066.421,23754.647,11335.205,26021.688,41496.707,15798.168001,30589.590001,...,14.000000,5334.595050,801.912541,59.860893,7.036913,0.545557,741.377599,4.797683e+05,-84.066367,41.895084
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
870,49660.0,426279,70236.063,90823.818,107573.664,67051.467,128537.013,177133.518,84594.889997,129905.928995,...,111.310003,35991.149063,8202.722856,1981.787831,130.858132,15.124541,5621.076757,6.586099e+05,-80.767277,41.196197
871,49700.0,182813,33171.767,28898.647,41820.750,22149.607,48250.135,58527.698,19167.600000,39108.972000,...,128.000000,24140.378031,6757.409025,2719.702058,85.952259,11.647516,888.207710,7.902092e+05,-121.517893,39.155307
872,49740.0,207842,35748.824,27642.986,40529.190,32215.510,65054.546,84176.010,24739.689001,62914.176003,...,81.197429,42701.812167,7451.757500,835.325759,24.067545,3.407328,434.539877,3.528844e+06,-113.905615,32.769620
873,49780.0,86113,15328.114,16533.696,22217.154,12141.933,24628.318,35392.443,17409.743999,26281.375999,...,29.000000,5791.755148,1330.386560,353.883381,27.677967,1.952414,15833.810080,4.253093e+05,-81.944383,39.965422


In [9]:
df.columns

Index(['cbsacode', 'TotalPopulation', 'BINGE', 'CSMOKING', 'DEPRESSION',
       'DIABETES', 'LPA', 'OBESITY', 'adult_smoking', 'adult_obesity',
       'excessive_drinking', 'diabetes_prevalence', 'some_college',
       'unemployment', 'children_single_parent', 'mental_health_providers',
       'median_household_income', 'driving_alone_to_work', 'sti', 'FFR20',
       'gdp', 'coverage_50', 'coverage_60', 'coverage_70', 'coverage_80',
       'coverage_90', 'noise50n', 'noise60n', 'noise70n', 'noise80n',
       'noise90n', 'Park_Area_Acres', 'ALAND_acres', 'lon', 'lat'],
      dtype='object')

In [10]:
if __name__ == "__main__":

    factor_columns = ['BINGE', 'CSMOKING', 'DEPRESSION',
       'DIABETES', 'LPA', 'OBESITY', 'adult_smoking', 'adult_obesity',
       'excessive_drinking', 'diabetes_prevalence', 'some_college',
       'unemployment', 'children_single_parent', 'mental_health_providers',
       'median_household_income', 'driving_alone_to_work', 'sti', 'FFR20',
       'gdp', 'coverage_50', 'coverage_60', 'coverage_70', 'coverage_80',
       'coverage_90', 'noise50n', 'noise60n', 'noise70n', 'noise80n',
       'noise90n', 'Park_Area_Acres']

    results_df = calculate_FsAMIs(
        df=df,
        factor_columns=factor_columns,
        population_column="TotalPopulation",
        cbsa_column='cbsacode',
        alpha=0.05,
        k_values=(5,6,7,8,9,10,11,12,13,14,15)
    )

    print(results_df.head(10))

Merged data rows: 875


 There are 2 disconnected components.
 There is 1 island with id: 367.


   cbsacode factor  best_k  baseline_moran_value  baseline_moran_p  \
0   10100.0  BINGE      12              0.647656             0.005   
1   10140.0  BINGE      12              0.647656             0.005   
2   10180.0  BINGE      12              0.647656             0.005   
3   10220.0  BINGE      12              0.647656             0.005   
4   10300.0  BINGE      12              0.647656             0.005   
5   10420.0  BINGE      12              0.647656             0.005   
6   10460.0  BINGE      12              0.647656             0.005   
7   10500.0  BINGE      12              0.647656             0.005   
8   10540.0  BINGE      12              0.647656             0.005   
9   10580.0  BINGE      12              0.647656             0.005   

   final_moran_value  final_moran_p  selected_m_eigs     FsAMI      beta  \
0           0.021401          0.083               42  0.023807  1.019515   
1           0.021401          0.083               42  0.034397  1.019515   
2

In [11]:
results_df

Unnamed: 0,cbsacode,factor,best_k,baseline_moran_value,baseline_moran_p,final_moran_value,final_moran_p,selected_m_eigs,FsAMI,beta,p_value,CI_lower,CI_upper,AIC
0,10100.0,BINGE,12,0.647656,0.005,0.021401,0.083,42,0.023807,1.019515,0.000000e+00,1.014879,1.024150,-1833.959834
1,10140.0,BINGE,12,0.647656,0.005,0.021401,0.083,42,0.034397,1.019515,0.000000e+00,1.014879,1.024150,-1833.959834
2,10180.0,BINGE,12,0.647656,0.005,0.021401,0.083,42,0.009526,1.019515,0.000000e+00,1.014879,1.024150,-1833.959834
3,10220.0,BINGE,12,0.647656,0.005,0.021401,0.083,42,-0.095589,1.019515,0.000000e+00,1.014879,1.024150,-1833.959834
4,10300.0,BINGE,12,0.647656,0.005,0.021401,0.083,42,-0.071233,1.019515,0.000000e+00,1.014879,1.024150,-1833.959834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25997,49660.0,Park_Area_Acres,11,0.293146,0.005,0.020656,0.090,16,-0.655686,1.165831,6.291230e-90,1.065197,1.266464,3520.646441
25998,49700.0,Park_Area_Acres,11,0.293146,0.005,0.020656,0.090,16,-2.663308,1.165831,6.291230e-90,1.065197,1.266464,3520.646441
25999,49740.0,Park_Area_Acres,11,0.293146,0.005,0.020656,0.090,16,-3.628451,1.165831,6.291230e-90,1.065197,1.266464,3520.646441
26000,49780.0,Park_Area_Acres,11,0.293146,0.005,0.020656,0.090,16,2.830239,1.165831,6.291230e-90,1.065197,1.266464,3520.646441


In [12]:
results_df.to_csv(f'../00_data/14_output/FsAMIs_0903_pop_ko.csv',index=False)

In [13]:
results_df

Unnamed: 0,cbsacode,factor,best_k,baseline_moran_value,baseline_moran_p,final_moran_value,final_moran_p,selected_m_eigs,FsAMI,beta,p_value,CI_lower,CI_upper,AIC
0,10100.0,BINGE,12,0.647656,0.005,0.021401,0.083,42,0.023807,1.019515,0.000000e+00,1.014879,1.024150,-1833.959834
1,10140.0,BINGE,12,0.647656,0.005,0.021401,0.083,42,0.034397,1.019515,0.000000e+00,1.014879,1.024150,-1833.959834
2,10180.0,BINGE,12,0.647656,0.005,0.021401,0.083,42,0.009526,1.019515,0.000000e+00,1.014879,1.024150,-1833.959834
3,10220.0,BINGE,12,0.647656,0.005,0.021401,0.083,42,-0.095589,1.019515,0.000000e+00,1.014879,1.024150,-1833.959834
4,10300.0,BINGE,12,0.647656,0.005,0.021401,0.083,42,-0.071233,1.019515,0.000000e+00,1.014879,1.024150,-1833.959834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25997,49660.0,Park_Area_Acres,11,0.293146,0.005,0.020656,0.090,16,-0.655686,1.165831,6.291230e-90,1.065197,1.266464,3520.646441
25998,49700.0,Park_Area_Acres,11,0.293146,0.005,0.020656,0.090,16,-2.663308,1.165831,6.291230e-90,1.065197,1.266464,3520.646441
25999,49740.0,Park_Area_Acres,11,0.293146,0.005,0.020656,0.090,16,-3.628451,1.165831,6.291230e-90,1.065197,1.266464,3520.646441
26000,49780.0,Park_Area_Acres,11,0.293146,0.005,0.020656,0.090,16,2.830239,1.165831,6.291230e-90,1.065197,1.266464,3520.646441


In [16]:
filtered_df = results_df[results_df['final_moran_p'] < 0.05]
unique_factors = filtered_df['factor'].unique()
print("Unique factors:", unique_factors)

Unique factors: []


In [17]:
factors_crossing_zero = results_df[
    (results_df['CI_lower'] < 0) & (results_df['CI_upper'] > 0)
]['factor'].unique()

print("Factors with ln_pop beta CI crossing 0:", factors_crossing_zero)

Factors with ln_pop beta CI crossing 0: []
