In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.spatial.distance import correlation
import glob
import os

In [2]:
def calculate_statistics(metric_data, disease_data):
    """
    Summarise the relationship between a network metric and a disease series.

    Parameters
    ----------
    metric_data, disease_data : sequence of numbers
        The two samples to compare. They are handed unchanged to
        ``scipy.stats.pearsonr`` / ``scipy.stats.kendalltau`` and to
        ``np.mean`` / ``np.var``.

    Returns
    -------
    dict
        ``'pearson'`` and ``'kendall'`` each map to a dict holding
        ``'correlation'`` and ``'p_value'``; ``'p_value'`` repeats the Pearson
        p-value at the top level; ``'r_squared'`` is the squared Pearson
        coefficient; ``'cohens_d'`` and ``'hedges_g'`` are the effect sizes
        computed from the pooled sample standard deviation.
    """
    # Correlation measures (linear and rank based)
    pearson_r, pearson_p = stats.pearsonr(metric_data, disease_data)
    tau, tau_p = stats.kendalltau(metric_data, disease_data)

    # Cohen's d: mean difference expressed in pooled standard deviations
    size_metric = len(metric_data)
    size_disease = len(disease_data)
    dof = size_metric + size_disease - 2
    weighted_var_metric = (size_metric - 1) * np.var(metric_data, ddof=1)
    weighted_var_disease = (size_disease - 1) * np.var(disease_data, ddof=1)
    pooled_sd = np.sqrt((weighted_var_metric + weighted_var_disease) / dof)
    mean_gap = np.mean(metric_data) - np.mean(disease_data)
    effect_d = mean_gap / pooled_sd

    # Hedges' g: Cohen's d multiplied by the small-sample correction factor
    small_sample_factor = 1 - (3 / (4 * dof - 1))
    effect_g = effect_d * small_sample_factor

    summary = {}
    summary['pearson'] = {'correlation': pearson_r, 'p_value': pearson_p}
    summary['kendall'] = {'correlation': tau, 'p_value': tau_p}
    summary['p_value'] = pearson_p
    summary['r_squared'] = pearson_r ** 2
    summary['cohens_d'] = effect_d
    summary['hedges_g'] = effect_g
    return summary

In [3]:
def analyze_metric_file(filepath, confirmed_cases, active_cases, metric_col,
                        confirmed_scale=10000, active_scale=100000):
    """
    Analyze a single metric file against both disease metrics.

    Parameters
    ----------
    filepath : str
        Path to a metric CSV. The file name (minus a trailing ``.csv``) must
        consist of exactly four underscore-separated fields, which are
        reported as ``metric_type``, ``analysis_type``, ``threshold`` and
        ``window``.
    confirmed_cases : pandas.DataFrame
        Must provide ``date`` and ``confirmed_cases`` columns. Not modified.
    active_cases : pandas.DataFrame
        Must provide ``date`` and ``active_cases`` columns. Not modified.
    metric_col : str
        Name of the metric column to read from the metric CSV.
    confirmed_scale, active_scale : number, optional
        Divisors applied to the disease columns before the statistics are
        computed. The defaults reproduce the previously hard-coded values.

    Returns
    -------
    dict
        File-name fields, ``n_observations`` and ``date_range`` (both taken
        from the confirmed-cases merge only), plus the outputs of
        ``calculate_statistics`` under ``confirmed_cases_stats`` and
        ``active_cases_stats``.

    Raises
    ------
    ValueError
        If the file name does not have four fields, or if either inner merge
        on ``date`` yields no rows.
    KeyError
        If ``metric_col`` is not a column of the metric CSV.
    """
    # Extract file information from filename. Only a trailing '.csv' is
    # removed, so a '.csv' occurring elsewhere in the name is left alone.
    filename = os.path.basename(filepath)
    stem = filename[:-len('.csv')] if filename.endswith('.csv') else filename
    name_parts = stem.split('_')
    if len(name_parts) != 4:
        raise ValueError(
            f"Expected '<metric>_<analysis>_<threshold>_<window>.csv', got '{filename}'"
        )
    metric_type, analysis_type, threshold, window = name_parts

    # Read the metric data
    metric_df = pd.read_csv(filepath)
    if metric_col not in metric_df.columns:
        raise KeyError(
            f"Column '{metric_col}' not found in {filename}; "
            f"available columns: {list(metric_df.columns)}"
        )

    # Convert dates to datetime. The disease frames are shared by every call,
    # so work on copies (assign returns a new frame) instead of overwriting
    # the caller's 'date' column.
    metric_df['date'] = pd.to_datetime(metric_df['date'])
    confirmed = confirmed_cases.assign(date=pd.to_datetime(confirmed_cases['date']))
    active = active_cases.assign(date=pd.to_datetime(active_cases['date']))

    # Merge with disease data (only dates present on both sides are kept)
    merged_confirmed = pd.merge(metric_df, confirmed, on='date', how='inner')
    merged_active = pd.merge(metric_df, active, on='date', how='inner')
    if merged_confirmed.empty or merged_active.empty:
        raise ValueError(f"No overlapping dates between {filename} and the disease data")

    # Scale disease data
    merged_confirmed['confirmed_cases'] = merged_confirmed['confirmed_cases'] / confirmed_scale
    merged_active['active_cases'] = merged_active['active_cases'] / active_scale

    # Calculate statistics
    confirmed_stats = calculate_statistics(
        merged_confirmed[metric_col],
        merged_confirmed['confirmed_cases']
    )

    active_stats = calculate_statistics(
        merged_active[metric_col],
        merged_active['active_cases']
    )

    first_day = merged_confirmed['date'].min().date()
    last_day = merged_confirmed['date'].max().date()

    return {
        'filename': filename,
        'metric_type': metric_type,
        'analysis_type': analysis_type,
        'threshold': threshold,
        'window': window,
        'n_observations': len(merged_confirmed),
        'date_range': f"{first_day} to {last_day}",
        'confirmed_cases_stats': confirmed_stats,
        'active_cases_stats': active_stats
    }

In [4]:
def analyze_all_metrics(base_dir, confirmed_cases, active_cases, metric_col=None):
    """
    Analyze all metric files in the directory.

    Parameters
    ----------
    base_dir : str or os.PathLike
        Directory whose ``*.csv`` files are analysed.
    confirmed_cases, active_cases : pandas.DataFrame
        Disease data, passed through unchanged to ``analyze_metric_file``.
    metric_col : str, optional
        Metric column to read from each file. When omitted it is inferred
        from the directory name: ``'clustering_coefficient'`` if the name
        contains ``'cluscoeff'``, otherwise ``'network_density'``.

    Returns
    -------
    list of dict
        One ``analyze_metric_file`` result per successfully processed file,
        in sorted path order. Files that raise are reported on stdout and
        skipped.
    """
    base_path = os.fspath(base_dir)

    if metric_col is None:
        # Look at the final path component first: a parent folder such as
        # 'gt_netdense_cluscoeff' contains both markers and must not decide.
        dir_name = os.path.basename(os.path.normpath(base_path))
        if 'cluscoeff' in dir_name:
            metric_col = 'clustering_coefficient'
        elif 'netdense' in dir_name:
            metric_col = 'network_density'
        else:
            # Neither marker in the directory name: fall back to the
            # whole-path rule so existing layouts keep working.
            metric_col = 'clustering_coefficient' if 'cluscoeff' in base_path else 'network_density'

    # Get all CSV files in the directory; glob order is arbitrary, so sort
    # to make the result order reproducible between runs.
    file_pattern = os.path.join(base_path, "*.csv")
    all_files = sorted(glob.glob(file_pattern))
    if not all_files:
        # A wrong or empty directory would otherwise just produce [] silently.
        print(f"No CSV files found in {base_path}")

    results = []
    for filepath in all_files:
        try:
            results.append(
                analyze_metric_file(filepath, confirmed_cases, active_cases, metric_col)
            )
        except Exception as e:
            # Deliberately best-effort: one bad file must not abort the batch.
            print(f"Error processing {filepath}: {type(e).__name__}: {e}")

    return results

In [5]:
# Load the two disease tables that are later passed to analyze_all_metrics
confirmed_cases, active_cases = (
    pd.read_csv(csv_name)
    for csv_name in ('disease_confirmed_daily_cases.csv', 'disease_active_cases.csv')
)

In [6]:
# Clustering coefficient: analyse every CSV in this directory against the disease data
cluscoeff_dir = "../gt_netdense_cluscoeff/gt_cluscoeff_rsvmsv_15or30day"
print("\nAnalyzing Clustering Coefficient Files:")
cluscoeff_results = analyze_all_metrics(
    base_dir=cluscoeff_dir,
    confirmed_cases=confirmed_cases,
    active_cases=active_cases,
)


Analyzing Clustering Coefficient Files:


In [7]:
# Network density: analyse every CSV in this directory against the disease data
# NOTE(review): this path is relative to the current directory, while
# cluscoeff_dir points below ../gt_netdense_cluscoeff — confirm it is intended.
netdense_dir = "./gt_netdense_rsvmsv_15or30day"
print("\nAnalyzing Network Density Files:")
netdense_results = analyze_all_metrics(
    base_dir=netdense_dir,
    confirmed_cases=confirmed_cases,
    active_cases=active_cases,
)


Analyzing Network Density Files:


In [8]:
# Display results
def display_results(results):
    """
    Print a plain-text report for each analysed metric file.

    Parameters
    ----------
    results : iterable of dict
        Entries shaped like the return value of ``analyze_metric_file``:
        ``filename``, ``analysis_type``, ``threshold``, ``window``,
        ``date_range``, ``n_observations`` plus ``confirmed_cases_stats`` and
        ``active_cases_stats``, each holding the statistics dictionary
        produced by ``calculate_statistics``.

    Returns
    -------
    None
        Everything is written to stdout; each file's report ends with a line
        of 80 ``=`` characters.
    """
    # Both disease series are reported with the same layout, so the section
    # is driven from this table instead of being written out twice.
    sections = (
        ("\nConfirmed Cases Statistics:", 'confirmed_cases_stats'),
        ("\nActive Cases Statistics:", 'active_cases_stats'),
    )

    for result in results:
        print(f"\nFile: {result['filename']}")
        print(f"Analysis Type: {result['analysis_type']}")
        print(f"Threshold: {result['threshold']}")
        print(f"Window: {result['window']}")
        print(f"Time Period: {result['date_range']}")
        print(f"Number of observations: {result['n_observations']}")

        for heading, stats_key in sections:
            print(heading)
            # Named metric_stats so the imported scipy `stats` module is not shadowed.
            metric_stats = result[stats_key]
            print(f"  Pearson Correlation: {metric_stats['pearson']['correlation']:.10f}")
            print(f"  P-value: {metric_stats['pearson']['p_value']:.10f}")
            print(f"  R-squared: {metric_stats['r_squared']:.10f}")
            print(f"  Kendall's tau Correlation: {metric_stats['kendall']['correlation']:.10f}")
            # This second "P-value" line belongs to Kendall's tau.
            print(f"  P-value: {metric_stats['kendall']['p_value']:.10f}")
            print(f"  Cohen's d: {metric_stats['cohens_d']:.10f}")
            print(f"  Hedges' g: {metric_stats['hedges_g']:.10f}")
        print("="*80)

In [9]:
# Report both result sets, each under its own heading
for heading, metric_results in (
    ("\nClustering Coefficient Results:", cluscoeff_results),
    ("\nNetwork Density Results:", netdense_results),
):
    print(heading)
    display_results(metric_results)


Clustering Coefficient Results:

File: cluscoeff_msv_0.8_30day.csv
Analysis Type: msv
Threshold: 0.8
Window: 30day
Time Period: 2020-04-14 to 2021-03-16
Number of observations: 337

Confirmed Cases Statistics:
  Pearson Correlation: -0.1027498703
  P-value: 0.0595354532
  R-squared: 0.0105575358
  Kendall's tau Correlation: -0.1678594554
  P-value: 0.0001599854
  Cohen's d: -1.4548919406
  Hedges' g: -1.4532675730

Active Cases Statistics:
  Pearson Correlation: -0.1360489569
  P-value: 0.0124235057
  R-squared: 0.0185093187
  Kendall's tau Correlation: -0.2025696382
  P-value: 0.0000051871
  Cohen's d: -2.0279197462
  Hedges' g: -2.0256556006

File: cluscoeff_msv_0.5_30day.csv
Analysis Type: msv
Threshold: 0.5
Window: 30day
Time Period: 2020-04-14 to 2021-03-16
Number of observations: 337

Confirmed Cases Statistics:
  Pearson Correlation: -0.1116230286
  P-value: 0.0405681494
  R-squared: 0.0124597005
  Kendall's tau Correlation: -0.1153868229
  P-value: 0.0039685172
  Cohen's d: -1