In [2]:

import numpy as np
import pandas as pd
import geopandas as gpd
import statsmodels.api as sm
from numpy.linalg import eigh
from sklearn.preprocessing import StandardScaler
from esda.moran import Moran
from libpysal.weights import full2W
from joblib import Parallel, delayed
import warnings
warnings.simplefilter(action='ignore')


In [3]:
df = pd.read_csv("../00_data/03_combined/all_factor_72_12.csv")
gdf = gpd.read_file("../00_data/01_raw/tl_2024_us_cbsa", engine="pyogrio")
gdf['cbsacode'] = gdf['CBSAFP'].astype('float64')

In [4]:
num_unique = df["cbsacode"].nunique()
print("cbsa:", num_unique)

cbsa: 72


In [5]:

gdf["ALAND_acres"] = gdf["ALAND"] / 4046.8564224
df = df.merge(gdf[['ALAND_acres','cbsacode']],how='left')

In [8]:


# -----------------------------
# Vectorized pairwise haversine (km)
# -----------------------------
def haversine_matrix(coords_deg: np.ndarray) -> np.ndarray:
    """
    coords_deg: (n,2) [lon, lat] in degrees
    returns: (n,n) symmetric km distances, diag = 0
    """
    R = 6371.0
    lon = np.radians(coords_deg[:, 0])[:, None]
    lat = np.radians(coords_deg[:, 1])[:, None]
    dlon = lon.T - lon
    dlat = lat.T - lat
    a = (np.sin(dlat/2.0)**2 +
         (np.cos(lat) @ np.cos(lat).T) * (np.sin(dlon/2.0)**2))
    c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))
    D = R * c
    np.fill_diagonal(D, 0.0)
    return D

# -----------------------------
# Composite spatio-temporal distance
# -----------------------------
def composite_distance_matrix(
    coords_deg: np.ndarray,
    years: np.ndarray,
    gamma_km_per_year: float = 100.0,
    mode: str = "quadratic",
) -> np.ndarray:
    """
    Build composite distance D_comp from geographic distance (km) and time (years).
    - mode="quadratic": D = sqrt(D_geo^2 + (gamma*|dt|)^2)
    - mode="additive":  D = D_geo + gamma*|dt|
    returns: (n,n), diag = 0
    """
    D_geo = haversine_matrix(coords_deg)
    dt = np.abs(years[:, None] - years[None, :])
    if mode == "quadratic":
        D = np.sqrt(D_geo**2 + (gamma_km_per_year * dt)**2)
    elif mode == "additive":
        D = D_geo + gamma_km_per_year * dt
    else:
        raise ValueError("mode must be 'quadratic' or 'additive'")
    np.fill_diagonal(D, 0.0)
    return D

# -----------------------------
# kNN weights from a distance matrix
# -----------------------------
def construct_knn_weights_from_D(D: np.ndarray, k: int) -> np.ndarray:
    """
    Construct kNN weights from any distance matrix D (diag should be 0).
    weight = 1 - d_ij / d_k, using the k-th neighbor distance per row.
    Symmetrize, then row-normalize. Returns dense (n,n) float64.
    """
    n = D.shape[0]
    D_inf = D.copy()
    np.fill_diagonal(D_inf, np.inf)

    W = np.zeros((n, n), dtype=float)
    for i in range(n):
        row = D_inf[i]
        knn_idx = np.argsort(row)[:k]
        thr = row[knn_idx[-1]]
        if thr > 0:
            W[i, knn_idx] = 1.0 - (D[i, knn_idx] / thr)

    W_sym = (W + W.T) / 2.0
    rs = W_sym.sum(axis=1, keepdims=True)
    mask = rs[:, 0] > 0
    W_sym[mask] = W_sym[mask] / rs[mask]
    return W_sym

# -----------------------------
# Robust Appendix-B forward selection 
# -----------------------------
def forward_select_esf(
    eig_for_valid: np.ndarray,
    ln_pop_vals: np.ndarray,
    ln_factor_vals: np.ndarray,
    W_lps_sub,
    alpha: float = 0.05,
    permutations: int = 199,
    max_add: int = 50,
    tol_delta_I: float = 1e-6,
):
    """
    Greedy forward selection of ESF eigenvectors minimizing Moran's I on residuals.
    """
    Xb = sm.add_constant(ln_pop_vals)
    base = sm.OLS(ln_factor_vals, Xb).fit()
    resid = base.resid
    MI = Moran(resid, W_lps_sub, permutations=permutations)
    if (MI.p_sim >= alpha) or (eig_for_valid.shape[1] == 0):
        return base, resid, []

    remaining = list(range(eig_for_valid.shape[1]))
    best_model, best_resid, last_I = base, resid, MI.I
    selected_idx = []

    # Step 1
    best = {"I": np.inf, "p": None, "j": None, "model": None, "resid": None}
    for j in remaining:
        xj = eig_for_valid[:, j]
        if not np.all(np.isfinite(xj)) or np.nanstd(xj) == 0:  # skip bad/constant cols
            continue
        X = sm.add_constant(np.column_stack([ln_pop_vals, xj]))
        try:
            m = sm.OLS(ln_factor_vals, X).fit()
            r = m.resid
            Mi = Moran(r, W_lps_sub, permutations=permutations)
        except Exception:
            continue
        if (Mi.I < best["I"]) or (np.isclose(Mi.I, best["I"]) and (best["p"] is None or Mi.p_sim > best["p"])):
            best.update({"I": Mi.I, "p": Mi.p_sim, "j": j, "model": m, "resid": r})

    if best["j"] is None:
        return base, resid, []
    selected_idx.append(best["j"]); remaining.remove(best["j"])
    best_model, best_resid, last_I = best["model"], best["resid"], best["I"]
    if best["p"] is not None and best["p"] >= alpha:
        return best_model, best_resid, selected_idx

    # Step 2+
    adds = 1
    while remaining and adds < max_add:
        cand = {"I": np.inf, "p": None, "j": None, "model": None, "resid": None}
        for j in remaining:
            X_eigs = eig_for_valid[:, selected_idx + [j]]
            if not np.all(np.isfinite(X_eigs[:, -1])) or np.nanstd(X_eigs[:, -1]) == 0:
                continue
            X = sm.add_constant(np.column_stack([ln_pop_vals, X_eigs]))
            try:
                m = sm.OLS(ln_factor_vals, X).fit()
                r = m.resid
                Mi = Moran(r, W_lps_sub, permutations=permutations)
            except Exception:
                continue
            if (Mi.I < cand["I"]) or (np.isclose(Mi.I, cand["I"]) and (cand["p"] is None or Mi.p_sim > cand["p"])):
                cand.update({"I": Mi.I, "p": Mi.p_sim, "j": j, "model": m, "resid": r})

        if cand["j"] is None:
            break
        if (last_I - cand["I"]) < tol_delta_I:
            break

        selected_idx.append(cand["j"]); remaining.remove(cand["j"])
        best_model, best_resid, last_I = cand["model"], cand["resid"], cand["I"]
        adds += 1
        if cand["p"] is not None and cand["p"] >= alpha:
            break

    return best_model, best_resid, selected_idx

# -----------------------------
# Main: FsAMI with composite spatio-temporal kNN (parallel + auto-k)
# -----------------------------
def calculate_FsAMIs(
    df: pd.DataFrame,
    gdf: gpd.GeoDataFrame,
    factor_columns,
    population_column: str,
    area_column: str,
    year_column: str,                     # <--- add your year column name (e.g., "year")
    cbsa_column: str = "cbsacode",
    alpha: float = 0.05,
    moran_permutations_search: int = 199, # fewer perms while searching
    moran_permutations_final: int = 999,  # higher perms for final reporting
    n_jobs: int = -1,
    backend: str = "loky",
    k_values = (7, 8, 10, 12, 15, 18, 22, 26, 30),  # candidate k list
    gamma_km_per_year: float = 500.0,     # space-time tradeoff (km per year)
    composite_mode: str = "quadratic",    # "quadratic" or "additive"
):
    """
    Parallel FsAMI with automatic k selection per factor using a composite
    spatio-temporal kNN (geography + time).
    Selection rule per factor:
      1) pick the smallest k with final Moran's p >= alpha;
      2) if tie, pick the one with lowest AIC;
      3) if none reach p >= alpha, pick the k with largest p (tie -> smaller k, then lower AIC).

    Each (city, year) row is treated as an independent location.
    Requirements in df: columns [cbsa_column, year_column, population_column, area_column, factor_columns...]
    Requirements in gdf: geometry for each cbsa; used to fetch lon/lat (EPSG:4326 centroid).
    """
    # 1) Merge lon/lat to df (repeat coords across years)
    gdf_ll = gdf.to_crs("EPSG:4326").copy()
    gdf_ll["centroid"] = gdf_ll.geometry.centroid  # same behavior as before
    gdf_ll["lon"] = gdf_ll.centroid.x
    gdf_ll["lat"] = gdf_ll.centroid.y

    df_all = df.merge(gdf_ll[[cbsa_column, "lon", "lat"]], on=cbsa_column, how="left").reset_index(drop=True)

    # Ensure we have all required columns
    needed = [cbsa_column, year_column, "lon", "lat", population_column, area_column]
    missing = [c for c in needed if c not in df_all.columns]
    if missing:
        raise ValueError(f"Missing required columns in df/gdf merge: {missing}")

    n_all = len(df_all)
    print("Merged spatio-temporal rows:", n_all)
    if n_all == 0:
        return pd.DataFrame()

    coords_all = df_all[["lon", "lat"]].to_numpy()
    years_all = df_all[year_column].to_numpy().astype(float)

    # 2) Build composite spatio-temporal distance ONCE, then reuse for all k
    D_comp = composite_distance_matrix(
        coords_deg=coords_all,
        years=years_all,
        gamma_km_per_year=gamma_km_per_year,
        mode=composite_mode
    )

    # 3) Precompute W_sym and ESF basis for each k once (cache shared by all factors)
    k_cache = {}
    H = np.eye(n_all) - np.ones((n_all, n_all)) / n_all
    for k in k_values:
        W_sym_k = construct_knn_weights_from_D(D_comp, k=k)
        Omega = H @ W_sym_k @ H
        evals, evecs = eigh(Omega)
        evecs = StandardScaler().fit_transform(evecs)
        idx = np.argsort(np.abs(evals))[::-1]
        evecs = evecs[:, idx]
        k_cache[k] = {"W_sym": W_sym_k, "evecs": evecs}

    # 4) Precompute ln_Density once (NaN where invalid)
    ln_density_all = np.where(
        (df_all[population_column] > 0) & (df_all[area_column] > 0),
        np.log(df_all[population_column] / df_all[area_column]),
        np.nan
    ).astype(float)

    # 5) Worker: run one factor (search over k and pick best)
    def _run_one_factor(factor: str):
        rows = []

        if factor not in df_all.columns:
            print(f"[Warning] {factor} not in data, skip.")
            return rows

        valid_mask = (
            df_all[factor].notnull()
            & (df_all[factor] > 0)
            & np.isfinite(ln_density_all)
        )
        if not np.any(valid_mask):
            print(f"[Skip] No valid data for {factor}")
            return rows

        idx_valid = np.where(valid_mask)[0]
        ln_factor = np.log(df_all.loc[valid_mask, factor].to_numpy())
        ln_density = ln_density_all[valid_mask.to_numpy()]
        cbsa_vals = df_all.loc[valid_mask, cbsa_column].to_numpy()
        year_vals = df_all.loc[valid_mask, year_column].to_numpy()

        evals_k = []
        for k in k_values:
            cache = k_cache[k]
            W_sub = cache["W_sym"][idx_valid][:, idx_valid]
            W_lps_sub = full2W(W_sub);  W_lps_sub.transform = 'r'

            # Baseline
            Xb = sm.add_constant(ln_density)
            model_b = sm.OLS(ln_factor, Xb).fit()
            resid_b = model_b.resid
            MI_b = Moran(resid_b, W_lps_sub, permutations=moran_permutations_search)
            b_I, b_p = MI_b.I, MI_b.p_sim

            # Forward selection
            best_model = model_b
            best_resid = resid_b
            selected_m = 0
            if b_p < alpha:
                eig_for_valid = cache["evecs"][idx_valid, :]
                best_model, best_resid, sel_idx = forward_select_esf(
                    eig_for_valid=eig_for_valid,
                    ln_pop_vals=ln_density,
                    ln_factor_vals=ln_factor,
                    W_lps_sub=W_lps_sub,
                    alpha=alpha,
                    permutations=moran_permutations_search
                )
                selected_m = len(sel_idx)

            # Final stats with more permutations
            MI_f = Moran(best_resid, W_lps_sub, permutations=moran_permutations_final)
            f_I, f_p = MI_f.I, MI_f.p_sim
            aic = best_model.aic
            beta = best_model.params[1]
            pval = best_model.pvalues[1]
            ci_low, ci_up = best_model.conf_int(alpha=0.05)[1]

            evals_k.append({
                "k": k,
                "final_p": f_p,
                "final_I": f_I,
                "baseline_p": b_p,
                "baseline_I": b_I,
                "aic": aic,
                "selected_m_eigs": selected_m,
                "beta": beta, "pval": pval, "ci_low": ci_low, "ci_up": ci_up,
                "resid": best_resid,
                "cbsa_vals": cbsa_vals,
                "year_vals": year_vals
            })

        # Choose best k: smallest k with final_p >= alpha; tie -> lowest AIC
        feasible = [d for d in evals_k if d["final_p"] >= alpha]
        if len(feasible) == 0:
            best = sorted(evals_k, key=lambda d: (-d["final_p"], d["k"], d["aic"]))[0]
        else:
            min_k = min(d["k"] for d in feasible)
            candidates = [d for d in feasible if d["k"] == min_k]
            best = min(candidates, key=lambda d: d["aic"])

        best_k = best["k"]
        beta = best["beta"]; pval = best["pval"]; ci_low = best["ci_low"]; ci_up = best["ci_up"]
        b_I = best["baseline_I"]; b_p = best["baseline_p"]
        f_I = best["final_I"];    f_p = best["final_p"]
        aic = best["aic"];        selected_m = best["selected_m_eigs"]
        resid = best["resid"];    cvals = best["cbsa_vals"]; yvals = best["year_vals"]

        for i_local, (cval, yval) in enumerate(zip(cvals, yvals)):
            rows.append({
                cbsa_column: cval,
                year_column: int(yval),
                "factor": factor,
                "best_k": best_k,
                "baseline_moran_value": b_I,
                "baseline_moran_p": b_p,
                "final_moran_value": f_I,
                "final_moran_p": f_p,
                "selected_m_eigs": selected_m,
                "FsAMI": resid[i_local],
                "beta": beta,
                "p_value": pval,
                "CI_lower": ci_low,
                "CI_upper": ci_up,
                "AIC": aic,
                "gamma_km_per_year": gamma_km_per_year,
                "composite_mode": composite_mode
            })
        return rows

    # 6) Parallel over factors and flatten
    results_lists = Parallel(n_jobs=n_jobs, backend=backend, verbose=0)(
        delayed(_run_one_factor)(factor) for factor in factor_columns
    )
    flat_rows = [row for rows in results_lists for row in (rows or [])]
    return pd.DataFrame(flat_rows)

In [9]:
if __name__ == "__main__":

    factor_columns = [ 'adult_smoking',
       'adult_obesity', 'excessive_drinking', 'diabetes_prevalence',
       'no_time_activity', 'some_college', 'unemployment',
       'children_single_parent', 'mental_health_providers',
       'median_household_income', 'driving_alone_to_work', 'sti', 'gdp',
       'Park_Area_Acres', 'Weighted_RESP', 'noise70n', 'noise80n', 'noise90n',
       'smo_obe_dia', 'smo_obe', 'smo_dia', 'obe_dia', 'depressed']

    results_df = calculate_FsAMIs(
        df=df, 
        gdf=gdf, 
        factor_columns=factor_columns,
        population_column="population",
        area_column='ALAND_acres',
        year_column="Year",
        cbsa_column='cbsacode',
        alpha=0.05,
        k_values=(10, 14, 18, 22, 26, 30)
    )

    print(results_df.head(10))

Merged spatio-temporal rows: 864


 There are 6 disconnected components.
 There are 4 disconnected components.
 There are 4 disconnected components.
 There are 3 disconnected components.
 There are 2 disconnected components.
 There are 6 disconnected components.
 There are 4 disconnected components.
 There are 4 disconnected components.
 There are 3 disconnected components.
 There are 2 disconnected components.
 There are 6 disconnected components.
 There are 4 disconnected components.
 There are 4 disconnected components.
 There are 3 disconnected components.
 There are 2 disconnected components.
 There are 6 disconnected components.
 There are 4 disconnected components.
 There are 4 disconnected components.
 There are 3 disconnected components.
 There are 2 disconnected components.


   cbsacode  Year         factor  best_k  baseline_moran_value  \
0     10740  2011  adult_smoking      10              0.297763   
1     11260  2011  adult_smoking      10              0.297763   
2     12060  2011  adult_smoking      10              0.297763   
3     12260  2011  adult_smoking      10              0.297763   
4     12420  2011  adult_smoking      10              0.297763   
5     12580  2011  adult_smoking      10              0.297763   
6     12940  2011  adult_smoking      10              0.297763   
7     13740  2011  adult_smoking      10              0.297763   
8     13820  2011  adult_smoking      10              0.297763   
9     13900  2011  adult_smoking      10              0.297763   

   baseline_moran_p  final_moran_value  final_moran_p  selected_m_eigs  \
0             0.005           0.026549          0.071               12   
1             0.005           0.026549          0.071               12   
2             0.005           0.026549          0.0

In [10]:
results_df

Unnamed: 0,cbsacode,Year,factor,best_k,baseline_moran_value,baseline_moran_p,final_moran_value,final_moran_p,selected_m_eigs,FsAMI,beta,p_value,CI_lower,CI_upper,AIC,gamma_km_per_year,composite_mode
0,10740,2011,adult_smoking,10,0.297763,0.005,0.026549,0.071,12,0.657353,0.937520,4.787102e-214,0.894485,0.980555,1353.889795,500.0,quadratic
1,11260,2011,adult_smoking,10,0.297763,0.005,0.026549,0.071,12,1.414961,0.937520,4.787102e-214,0.894485,0.980555,1353.889795,500.0,quadratic
2,12060,2011,adult_smoking,10,0.297763,0.005,0.026549,0.071,12,0.371112,0.937520,4.787102e-214,0.894485,0.980555,1353.889795,500.0,quadratic
3,12260,2011,adult_smoking,10,0.297763,0.005,0.026549,0.071,12,-0.342882,0.937520,4.787102e-214,0.894485,0.980555,1353.889795,500.0,quadratic
4,12420,2011,adult_smoking,10,0.297763,0.005,0.026549,0.071,12,-0.324687,0.937520,4.787102e-214,0.894485,0.980555,1353.889795,500.0,quadratic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17395,45820,2022,depressed,10,0.297052,0.005,0.024964,0.083,9,-0.346585,0.964426,7.322820e-258,0.926852,1.002000,1259.305388,500.0,quadratic
17396,46140,2022,depressed,10,0.297052,0.005,0.024964,0.083,9,0.428617,0.964426,7.322820e-258,0.926852,1.002000,1259.305388,500.0,quadratic
17397,47260,2022,depressed,10,0.297052,0.005,0.024964,0.083,9,-0.042374,0.964426,7.322820e-258,0.926852,1.002000,1259.305388,500.0,quadratic
17398,48620,2022,depressed,10,0.297052,0.005,0.024964,0.083,9,-0.248329,0.964426,7.322820e-258,0.926852,1.002000,1259.305388,500.0,quadratic


In [11]:
results_df.to_csv(f'../00_data/14_output/FsAMIs_72.csv',index=False)

In [12]:
filtered_df = results_df[results_df['final_moran_p'] <= 0.05]
unique_factors = filtered_df['factor'].unique()
print("Unique factors:", unique_factors)

Unique factors: ['sti']


In [None]:
factors_crossing_zero = results_df[
    (results_df['CI_lower'] < 0) & (results_df['CI_upper'] > 0)
]['factor'].unique()

print("Factors with ln_pop beta CI crossing 0:", factors_crossing_zero)

Factors with ln_pop beta CI crossing 0: []


In [None]:
df_factor = results_df.drop_duplicates(subset=["factor"]).copy()

# Exclude 'Weighted_RESP' and 'health_pca' from the factors
df_factor = df_factor[~df_factor["factor"].isin(['Weighted_RESP', 'health_pca'])]

df_factor.sort_values("factor", inplace=True)
factors = df_factor["factor"].values
x = np.arange(len(factors))

# Moran’s I
baseline_y = df_factor["baseline_moran_value"].values
final_y = df_factor["final_moran_value"].values
baseline_p = df_factor["baseline_moran_p"].values
final_p = df_factor["final_moran_p"].values

plt.figure(figsize=(10, 5))

plt.plot(x, baseline_y, 'o-', color='blue', label="Baseline Moran's I")
plt.plot(x, final_y, 'o-', color='yellow', label="Final Moran's I")

y_offset = 0.01
for xx, yy, pp in zip(x, baseline_y, baseline_p):
    plt.text(xx, yy + y_offset, f"p={pp:.3f}", 
             ha='center', va='bottom', color='black', fontsize=8)

for xx, yy, pp in zip(x, final_y, final_p):
    plt.text(xx, yy - y_offset, f"p={pp:.3f}",
             ha='center', va='top', color='black', fontsize=8)

plt.xticks(x, factors, rotation=45, ha='right')
plt.xlim([-0.5, len(x)-0.5])

plt.ylabel("Moran's I")
plt.title("Baseline vs. Final Moran's I by Factor")

plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()


In [None]:
df_plot = results_df[["factor", "ln_pop_beta", "ln_pop_CI_lower", "ln_pop_CI_upper"]].copy()

# Exclude 'health_pca' factor
df_plot = df_plot[df_plot["factor"] != "health_pca"].copy()

# Remove duplicate entries based on the 'factor' column
df_plot.drop_duplicates(subset="factor", inplace=True)

# Define custom order for factors
custom_order = [
    'depressed',
    'suicide',
    'some_college',
    'single_parent',
    'driving_alone',
    'median_household_income',
    'mental_health_providers',
    'Park_Area_Acres',
    'unemployment',
    'social_associations'
]

# Filter the factors to include only those in the custom order
df_plot = df_plot[df_plot["factor"].isin(custom_order)].copy()

# Set the custom order as categorical and ordered
df_plot["factor"] = pd.Categorical(df_plot["factor"], categories=custom_order, ordered=True)
df_plot.sort_values("factor", inplace=True)

# Extract data for plotting
factors = df_plot["factor"].values
beta = df_plot["ln_pop_beta"].values
ci_low = df_plot["ln_pop_CI_lower"].values
ci_up = df_plot["ln_pop_CI_upper"].values

# Calculate the error bars
err_lower = beta - ci_low
err_upper = ci_up - beta
x = np.arange(len(factors))  # x-axis values (0 to N-1)

# Increase figure size for better readability
plt.figure(figsize=(12, 6))

plt.errorbar(
    x, beta,
    yerr=[err_lower, err_upper],
    fmt='o',
    color='blue',
    ecolor='black',
    capsize=4
)

# Add factor names as x-ticks and format the plot
plt.xticks(x, factors, rotation=45, ha='right')
plt.ylabel("ln_pop_beta")
plt.title("Final ln_pop_beta + CI (Exclude health_pca)")
plt.axhline(y=0, color='gray', linestyle='--', linewidth=1)  # Optional: Add a line at y=0

# Add description to the right side of the plot
plt.text(
    0, -0.7, 
    "factors 'Weighted_RESP' & 'health_pca' not significant.", 
    ha='left', va='center', 
    transform=plt.gca().transAxes, 
    fontsize=10, color='black'
)

plt.tight_layout()
plt.show()