In [22]:
import glob
import os
import traceback
from math import sqrt
from typing import Tuple, Dict

import numpy as np
import pandas as pd
from scipy import stats

In [23]:
def calculate_cohens_d(group1: np.ndarray, group2: np.ndarray) -> float:
    """
    Cohen's d: difference of the two group means divided by the pooled
    standard deviation.

    Sample variances use ddof=1 and are weighted by (n - 1) before being
    pooled over (n1 + n2 - 2) degrees of freedom. The sign follows
    mean(group1) - mean(group2).
    """
    size_a = len(group1)
    size_b = len(group2)

    # Sum of the (n - 1)-weighted sample variances of both groups.
    weighted_variance_sum = (
        (size_a - 1) * np.var(group1, ddof=1)
        + (size_b - 1) * np.var(group2, ddof=1)
    )
    pooled_sd = sqrt(weighted_variance_sum / (size_a + size_b - 2))

    mean_gap = np.mean(group1) - np.mean(group2)
    return mean_gap / pooled_sd

In [24]:
def calculate_hodges_g(group1: np.ndarray, group2: np.ndarray) -> float:
    """
    Small-sample bias-corrected Cohen's d (the statistic usually called
    Hedges' g; the function keeps its existing name for callers).

    Returns Cohen's d multiplied by 1 - 3 / (4 * N - 9), where N is the
    combined number of observations in both groups.
    """
    effect = calculate_cohens_d(group1, group2)

    total_n = len(group1) + len(group2)
    small_sample_factor = 1 - (3 / (4 * total_n - 9))

    return effect * small_sample_factor

In [25]:
def calculate_statistics(series1: np.ndarray, series2: np.ndarray) -> Dict:
    """
    Compute association and effect-size measures for two series.

    Returns a dict with the Pearson correlation and its p-value, the squared
    Pearson correlation, Kendall's tau and its p-value, and the two effect
    sizes produced by calculate_cohens_d / calculate_hodges_g (both called
    with series1 as the first group).
    """
    r_value, r_pvalue = stats.pearsonr(series1, series2)
    tau_value, tau_pvalue = stats.kendalltau(series1, series2)

    summary = {}
    summary['pearson_correlation'] = r_value
    summary['pearson_pvalue'] = r_pvalue
    # Coefficient of determination, taken as the squared Pearson correlation.
    summary['r_squared'] = r_value ** 2
    summary['kendall_tau'] = tau_value
    summary['kendall_pvalue'] = tau_pvalue
    summary['cohens_d'] = calculate_cohens_d(series1, series2)
    summary['hodges_g'] = calculate_hodges_g(series1, series2)
    return summary

In [26]:
# Column layout of the frame returned by analyze_pca_group_statistics. Used so
# that an empty result still exposes the columns downstream cells group by.
_RESULT_COLUMNS = [
    'pca_group', 'metric_type', 'threshold', 'window', 'comparison',
    'pearson_correlation', 'pearson_pvalue', 'r_squared',
    'kendall_tau', 'kendall_pvalue', 'cohens_d', 'hodges_g',
]

# One entry per metric family:
# (glob pattern, value column in the CSV, 'metric_type' label, wording for the skip message)
_METRIC_SPECS = (
    ("netdense_*.csv", 'network_density', 'Network Density', 'network density'),
    ("cluscoeff_*.csv", 'clustering_coefficient', 'Clustering Coefficient', 'clustering coefficient'),
)


def _parse_threshold_and_window(filename: str):
    """
    Split a metric file name around its 'threshold_' marker.

    Returns (threshold, window): the last '_'-separated token before the
    marker and the text after it up to the first '.'. Returns None when the
    marker is absent, so the caller can skip the file instead of crashing.
    """
    parts = filename.split('threshold_')
    if len(parts) < 2:
        return None
    return parts[0].split('_')[-1], parts[1].split('.')[0]


def _collect_metric_rows(dir_path: str, group_name: str,
                         disease_dfs: Dict[str, pd.DataFrame],
                         pattern: str, value_column: str,
                         metric_label: str, skip_label: str) -> list:
    """
    Build result rows for every CSV in dir_path matching pattern.

    Each file is compared against every frame in disease_dfs on their shared
    'date' values; one row (a dict) is produced per successful comparison.
    """
    rows = []

    # glob.escape keeps characters such as '[' in the directory name literal;
    # sorting makes the row order independent of filesystem listing order.
    search_pattern = os.path.join(glob.escape(dir_path), pattern)
    for file_path in sorted(glob.glob(search_pattern)):
        filename = os.path.basename(file_path)

        parsed = _parse_threshold_and_window(filename)
        if parsed is None:
            print(f"Skipping {filename} - file name does not contain 'threshold_'")
            continue
        thresh, window = parsed

        metric_df = pd.read_csv(file_path)

        # Skip if all zeros
        if np.all(metric_df[value_column] == 0):
            print(f"Skipping {filename} - {skip_label} contains all zeros")
            continue

        metric_df['date'] = pd.to_datetime(metric_df['date'])

        # Compare with each disease metric
        for disease_name, disease_df in disease_dfs.items():
            merged_df = pd.merge(metric_df, disease_df, on='date', how='inner')

            if len(merged_df) == 0:
                print(f"No overlapping dates found for {filename} and {disease_name}")
                continue

            metric_series = merged_df[value_column].values
            # The disease values are read from the last column of the merged
            # frame, i.e. the last non-'date' column contributed by disease_df.
            disease_series = merged_df.iloc[:, -1].values

            try:
                stats_results = calculate_statistics(disease_series, metric_series)
            except Exception as e:
                # Best effort: report the failing comparison and keep going.
                print(f"Error processing {filename}: {str(e)}")
                print(traceback.format_exc())
                continue

            rows.append({
                'pca_group': group_name,
                'metric_type': metric_label,
                'threshold': thresh,
                'window': window,
                'comparison': disease_name,
                **stats_results
            })

    return rows


def analyze_pca_group_statistics(base_path: str, disease_dfs: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Analyze statistical measures for PCA group metrics against disease cases.

    Parameters
    ----------
    base_path : str
        Directory whose sub-directories starting with 'pc' are scanned for
        'netdense_*.csv' and 'cluscoeff_*.csv' files.
    disease_dfs : Dict[str, pd.DataFrame]
        Maps a comparison label to a frame with a 'date' column; the last
        column of the merged frame is used as the disease series.

    Returns
    -------
    pd.DataFrame
        One row per (PCA group, metric file, disease frame) comparison with
        the identifying columns plus everything calculate_statistics returns.
        When nothing could be compared, an empty frame with the expected
        columns is returned.

    Files that are all zeros, have no overlapping dates, or lack the
    'threshold_' marker in their name are skipped with a printed message.
    """
    results = []

    # Get all PCA group directories (sorted for a reproducible row order)
    pca_dirs = sorted(
        d for d in os.listdir(base_path)
        if d.startswith('pc') and os.path.isdir(os.path.join(base_path, d))
    )

    for pca_dir in pca_dirs:
        # Get group name without pc#_ prefix
        group_name = '_'.join(pca_dir.split('_')[1:])
        dir_path = os.path.join(base_path, pca_dir)

        print(f"\nProcessing PCA group: {group_name}")

        # Network density files first, then clustering coefficient files.
        for pattern, value_column, metric_label, skip_label in _METRIC_SPECS:
            results.extend(
                _collect_metric_rows(dir_path, group_name, disease_dfs,
                                     pattern, value_column, metric_label, skip_label)
            )

    if not results:
        return pd.DataFrame(columns=_RESULT_COLUMNS)
    return pd.DataFrame(results)

In [27]:
confirmed_df = pd.read_csv("../gt_stat_analysis/disease_confirmed_daily_cases.csv")
active_df = pd.read_csv("../gt_stat_analysis/disease_active_cases.csv")

In [28]:
confirmed_df['date'] = pd.to_datetime(confirmed_df['date'])
active_df['date'] = pd.to_datetime(active_df['date'])

In [29]:
confirmed_df['confirmed_cases'] = confirmed_df['confirmed_cases'] / 1000
active_df['active_cases'] = active_df['active_cases'] / 100000

In [30]:
# Disease series to compare against each network metric. The dict key becomes
# the 'comparison' label in the results; analyze_pca_group_statistics merges
# each frame on 'date' and reads the disease values from the last column.
disease_dfs = {
    'Confirmed Cases': confirmed_df,
    'Active Cases': active_df
}

In [31]:
# Root directory scanned for 'pc*' sub-directories holding the
# netdense_*.csv / cluscoeff_*.csv metric files.
base_path = "../gt_netdense_cluscoeff"

In [32]:
# Run every metric-file / disease-series comparison; progress and skipped
# files are printed as the directories are processed.
all_results = analyze_pca_group_statistics(base_path, disease_dfs)


Processing PCA group: MSVFaceWearing&Others-0.5
Skipping netdense_MSVFaceWearing&Others-0.5_0.8threshold_30day.csv - network density contains all zeros
Skipping cluscoeff_MSVFaceWearing&Others-0.5_0.8threshold_15day.csv - clustering coefficient contains all zeros
Skipping cluscoeff_MSVFaceWearing&Others-0.5_0.8threshold_30day.csv - clustering coefficient contains all zeros
Skipping cluscoeff_MSVFaceWearing&Others-0.5_0.6threshold_30day.csv - clustering coefficient contains all zeros

Processing PCA group: RSVSymptoms&NewNormalProtocols1-0.6

Processing PCA group: RSVFaceWearing&Others-0.5
Skipping netdense_RSVFaceWearing&Others-0.5_0.8threshold_30day.csv - network density contains all zeros
Skipping cluscoeff_RSVFaceWearing&Others-0.5_0.8threshold_15day.csv - clustering coefficient contains all zeros
Skipping cluscoeff_RSVFaceWearing&Others-0.5_0.6threshold_30day.csv - clustering coefficient contains all zeros
Skipping cluscoeff_RSVFaceWearing&Others-0.5_0.8threshold_30day.csv - clust

In [33]:
# Persist the full comparison table (without the index) in the working directory.
all_results.to_csv('pca_statistical_analysis.csv', index=False)

In [34]:
# (printed label, result column) pairs, in the order they are reported.
summary_fields = [
    ("Pearson Correlation", 'pearson_correlation'),
    ("Pearson P-value", 'pearson_pvalue'),
    ("R-squared", 'r_squared'),
    ("Kendall's Tau", 'kendall_tau'),
    ("Kendall P-value", 'kendall_pvalue'),
    ("Cohen's d", 'cohens_d'),
    ("Hodges' g", 'hodges_g'),
]

grouped_results = all_results.groupby(['metric_type', 'comparison'])

# Plain arithmetic means per (metric type, disease series) pair.
# NOTE(review): an averaged p-value is only a descriptive figure, not a
# combined significance test.
print("\nSummary Statistics:")
for (metric, comparison), group in grouped_results:
    print(f"\n{metric} vs {comparison}:")
    print("Average Results:")
    for label, column in summary_fields:
        print(f"{label}: {group[column].mean():.3f}")

# Display top correlations
print("\nTop 10 strongest correlations (by absolute Pearson correlation):")
top_correlations = (
    all_results
    .assign(abs_correlation=all_results['pearson_correlation'].abs())
    .sort_values('abs_correlation', ascending=False)
)
display_columns = ['pca_group', 'metric_type', 'threshold', 'window',
                   'comparison', 'pearson_correlation', 'pearson_pvalue']
print(top_correlations.head(10)[display_columns])


Summary Statistics:

Clustering Coefficient vs Active Cases:
Average Results:
Pearson Correlation: -0.108
Pearson P-value: 0.093
R-squared: 0.029
Kendall's Tau: -0.116
Kendall P-value: 0.117
Cohen's d: 1.257
Hodges' g: 1.256

Clustering Coefficient vs Confirmed Cases:
Average Results:
Pearson Correlation: -0.081
Pearson P-value: 0.121
R-squared: 0.018
Kendall's Tau: -0.116
Kendall P-value: 0.136
Cohen's d: 1.340
Hodges' g: 1.338

Network Density vs Active Cases:
Average Results:
Pearson Correlation: -0.021
Pearson P-value: 0.199
R-squared: 0.037
Kendall's Tau: -0.029
Kendall P-value: 0.166
Cohen's d: 0.224
Hodges' g: 0.224

Network Density vs Confirmed Cases:
Average Results:
Pearson Correlation: -0.016
Pearson P-value: 0.199
R-squared: 0.028
Kendall's Tau: -0.048
Kendall P-value: 0.145
Cohen's d: 1.133
Hodges' g: 1.132

Top 10 strongest correlations (by absolute Pearson correlation):
                               pca_group             metric_type threshold  \
58             RSVFaceW