In [25]:
import matplotlib.pyplot as plt 
import seaborn as sns 
import numpy as np
import pandas as pd
from scipy import stats

import constants as c
import helpers as h

from logger import setup_logger 
log = setup_logger("fair-model-tests")
log.setLevel("INFO")
log.info("Modules loaded.")

[34m2025-02-05 22:22:54 - fair-model-tests - INFO - Modules loaded.[0m


In [26]:
analysis_df = pd.read_csv(c.CURRENT_NO_COVARIATES_DF)
analysis_df = h.add_helper_cols(analysis_df)
analysis_df = h.add_demo_cols(analysis_df)


[34m2025-02-05 22:22:54 - analysis-helpers - INFO - Found 192 tracts with at least one FloodNet sensor.[0m
[34m2025-02-05 22:22:54 - analysis-helpers - INFO - Found 2171 311 requests.[0m
[34m2025-02-05 22:22:54 - analysis-helpers - INFO - Found 878 tracts with at least one 311 report.[0m
[34m2025-02-05 22:22:54 - analysis-helpers - INFO - Found 1001 tracts with no DEP flooding.[0m


In [27]:
# run HSIC between p_y and each demographic variable 
demo_variables = ['frac_white', 'frac_black', 'frac_hispanic', 'frac_asian', 'frac_hs', 'frac_bachelors', 'frac_grad', 'frac_children', 'frac_elderly', 'total_population', 'frac_internet', 'frac_smartphone', 'median_household_income', 'frac_limited_english']


In [28]:
import numpy as np
from scipy.stats import gamma
from sklearn.metrics.pairwise import rbf_kernel

def centering_matrix(n):
    """Create a centering matrix of size n x n"""
    return np.eye(n) - np.ones((n, n)) / n

def rbf_kernel_width(X):
    """Compute median distance for RBF kernel width"""
    if len(X.shape) == 1:
        X = X.reshape(-1, 1)
    pairwise_distances = np.sqrt(np.sum((X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2, axis=2))
    return np.median(pairwise_distances[pairwise_distances > 0])

def hsic_test(X, Y, alpha=0.05):
    """
    Perform HSIC independence test between X and Y.
    
    Parameters:
    -----------
    X : array-like, shape (n_samples,) or (n_samples, n_features)
        First variable
    Y : array-like, shape (n_samples,) or (n_samples, n_features)
        Second variable
    alpha : float, optional (default=0.05)
        Significance level
        
    Returns:
    --------
    test_stat : float
        HSIC test statistic
    threshold : float
        Critical value at significance level alpha
    p_value : float
        p-value of the test
    independent : bool
        True if variables are independent at significance level alpha
    """
    # Ensure inputs are 2D arrays
    if len(X.shape) == 1:
        X = X.reshape(-1, 1)
    if len(Y.shape) == 1:
        Y = Y.reshape(-1, 1)
        
    n = X.shape[0]
    
    # Compute kernel widths using median heuristic
    sigma_x = rbf_kernel_width(X)
    sigma_y = rbf_kernel_width(Y)
    
    # Compute centered kernel matrices
    H = centering_matrix(n)
    K = rbf_kernel(X, gamma=1/(2*sigma_x**2))
    L = rbf_kernel(Y, gamma=1/(2*sigma_y**2))
    
    Kc = H @ K @ H
    Lc = H @ L @ H
    
    # Compute test statistic
    test_stat = np.sum(Kc * Lc) / n**2
    
    # Compute approximate null distribution parameters
    mean_approx = (1 + np.trace(Kc @ Kc) * np.trace(Lc @ Lc) / n**2) / n
    var_approx = 2 * np.trace(Kc @ Kc @ Lc @ Lc) / n**4
    
    # Compute p-value and threshold using gamma approximation
    k = mean_approx**2 / var_approx
    theta = var_approx / mean_approx
    p_value = 1 - gamma.cdf(test_stat, k, scale=theta)
    threshold = gamma.ppf(1-alpha, k, scale=theta)
    
    return {
        'test_statistic': test_stat,
        'threshold': threshold,
        'p_value': p_value,
        'independent': test_stat < threshold
    }

# Example usage with your flood risk data:
def test_flood_risk_independence(risk_scores, demographic_var, group_name=None):
    """
    Test independence between flood risk scores and a demographic variable
    
    Parameters:
    -----------
    risk_scores : array-like
        Predicted flood risk scores
    demographic_var : array-like
        Demographic variable to test against
    group_name : str, optional
        Name of demographic variable for printing results
    """
    result = hsic_test(risk_scores, demographic_var)
    
    if group_name:
        print(f"\nHSIC Test Results for {group_name}")
        print("-" * 50)
        print(f"Test statistic: {result['test_statistic']:.6f}")
        print(f"Critical value: {result['threshold']:.6f}")
        print(f"p-value: {result['p_value']:.6f}")
        print(f"Independent at α=0.05: {result['independent']}")
    
    return result



In [29]:
# drop rows with total population == 0 
analysis_df = analysis_df[analysis_df['total_population'] > 0]


for demo_var in demo_variables:
    # drop rows with missing values
    for_anl = analysis_df.dropna(subset=[demo_var])
    test_flood_risk_independence(for_anl['p_y'].values, for_anl[demo_var].values, group_name=demo_var)


HSIC Test Results for frac_white
--------------------------------------------------
Test statistic: 0.000836
Critical value: 13.601644
p-value: 1.000000
Independent at α=0.05: True

HSIC Test Results for frac_black
--------------------------------------------------
Test statistic: 0.000182
Critical value: 12.000932
p-value: 1.000000
Independent at α=0.05: True

HSIC Test Results for frac_hispanic
--------------------------------------------------
Test statistic: 0.000246
Critical value: 11.308503
p-value: 1.000000
Independent at α=0.05: True

HSIC Test Results for frac_asian
--------------------------------------------------
Test statistic: 0.000300
Critical value: 9.761414
p-value: 1.000000
Independent at α=0.05: True

HSIC Test Results for frac_hs
--------------------------------------------------
Test statistic: 0.000857
Critical value: 9.526468
p-value: 1.000000
Independent at α=0.05: True

HSIC Test Results for frac_bachelors
--------------------------------------------------
Tes