In [1]:
import pandas as pd
import numpy as np
from typing import Tuple, List, Dict
from scipy.spatial.distance import euclidean
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy.ma as ma
import glob
import os

In [2]:
def create_temporal_constraint_mask(length: int, radius: int) -> np.ndarray:
    """
    Create a mask matrix for temporal constraints.
    Points outside the radius window will be masked with True
    """
    mask = np.zeros((length, length), dtype=bool)
    
    for i in range(length):
        for j in range(length):
            if abs(i - j) > radius:
                mask[i, j] = True
                
    return mask

In [3]:
def compute_dtw_with_temporal_constraint(series1: np.ndarray, series2: np.ndarray, radius: int) -> Tuple[float, List]:
    """
    Compute DTW with strict temporal constraints
    """
    # Normalize series to [0,1] range
    s1_norm = (series1 - np.min(series1)) / (np.max(series1) - np.min(series1))
    s2_norm = (series2 - np.min(series2)) / (np.max(series2) - np.min(series2))
    
    n, m = len(s1_norm), len(s2_norm)
    
    # Create cost matrix
    cost_matrix = np.zeros((n, m))
    for i in range(n):
        for j in range(m):
            cost_matrix[i, j] = abs(s1_norm[i] - s2_norm[j])
    
    # Create accumulated cost matrix with temporal constraint
    D = np.full((n, m), np.inf)
    D[0, 0] = cost_matrix[0, 0]
    
    # Create temporal constraint mask
    temporal_mask = create_temporal_constraint_mask(max(n, m), radius)
    temporal_mask = temporal_mask[:n, :m]
    
    # Apply temporal constraint
    for i in range(n):
        for j in range(m):
            if temporal_mask[i, j]:
                continue
                
            if i == 0 and j == 0:
                continue
                
            candidates = []
            if i > 0:
                candidates.append(D[i-1, j])
            if j > 0:
                candidates.append(D[i, j-1])
            if i > 0 and j > 0:
                candidates.append(D[i-1, j-1])
            
            if candidates:
                D[i, j] = cost_matrix[i, j] + min(candidates)
    
    # Backtrack to find the warping path
    path = []
    i, j = n-1, m-1
    path.append((i, j))
    
    while i > 0 or j > 0:
        candidates = []
        if i > 0:
            candidates.append((D[i-1, j], i-1, j))
        if j > 0:
            candidates.append((D[i, j-1], i, j-1))
        if i > 0 and j > 0:
            candidates.append((D[i-1, j-1], i-1, j-1))
            
        _, i, j = min(candidates, key=lambda x: x[0])
        path.append((i, j))
    
    path.reverse()
    
    return D[-1, -1], path

In [4]:
def plot_dtw_alignment(series1: np.ndarray, series2: np.ndarray, dates: np.ndarray,
                      metric_type: str, value_type: str, source: str, threshold: str,
                      window: str, comparison: str, radius: int) -> None:
    """
    Plot the DTW alignment between network metrics and disease cases
    """
    # Normalize series for visualization
    s1_norm = (series1 - np.min(series1)) / (np.max(series1) - np.min(series1))
    s2_norm = (series2 - np.min(series2)) / (np.max(series2) - np.min(series2))
    
    # Compute DTW with temporal constraint
    distance, path = compute_dtw_with_temporal_constraint(series1, series2, radius)
    
    # Create figure
    fig, ax = plt.subplots(figsize=(20, 10))
    
    # Plot the first time series at the top
    ax.plot(dates, s1_norm + 1.5, label=f'{comparison}', color='blue', linewidth=2)
    
    # Plot the second time series at the bottom
    metric_label = f"{value_type} ({source}, thresh={threshold}, window={window})"
    ax.plot(dates, s2_norm, label=metric_label, color='red', linewidth=2)
    
    # Draw matching lines between points
    path = np.array(path)
    for i, j in path[::3]:  # Plot every 3rd line to reduce visual clutter
        ax.plot([dates[i], dates[j]], [s1_norm[i] + 1.5, s2_norm[j]], 
                'gray', alpha=0.9, linestyle='--')
    
    # Customize the plot
    ax.set_title(f'DTW Alignment (±{radius} days): {metric_type}\n{value_type} vs {comparison}', 
                 fontsize=25, pad=20)
    ax.legend(fontsize=13, loc='upper right')
    
    # Remove y-axis ticks and labels
    ax.set_yticks([])
    ax.set_ylabel('')
    
    # Format x-axis dates
    formatter = mdates.DateFormatter('%B %Y')
    ax.xaxis.set_major_formatter(formatter)
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    
    # Rotate and align the tick labels
    ax.tick_params(axis='x', labelsize=15)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    
    # Adjust layout
    plt.tight_layout()
    
    # Save the plot
    filename = f"dtw_alignment_{metric_type.lower()}_{source.lower()}_{threshold}_{window}_{comparison.lower()}_r{radius}.png"
    plt.savefig(filename.replace('&', 'and'), bbox_inches='tight', dpi=300)
    plt.close()

In [5]:
def analyze_network_metrics(base_path: str, disease_dfs: Dict[str, pd.DataFrame], 
                          radii: List[int]) -> pd.DataFrame:
    """
    Analyze network metrics against disease cases
    """
    results = []
    
    # Process network density files
    density_path = os.path.join(base_path, "gt_netdense_rsvmsv_15or30day")
    pattern = "netdense_*.csv"
    density_files = glob.glob(os.path.join(density_path, pattern))
    
    for file_path in density_files:
        filename = os.path.basename(file_path)
        _, source, threshold, window = filename.replace('.csv', '').split('_')
        window = window.replace('day', '')
        
        print(f"\nProcessing Network Density: {filename}")
        
        # Load network density data
        metric_df = pd.read_csv(file_path)
        
        # Skip if all zeros
        if np.all(metric_df['network_density'] == 0):
            print(f"Skipping {filename} - contains all zeros")
            continue
            
        metric_df['date'] = pd.to_datetime(metric_df['date'])
        
        # Compare with each disease metric
        for disease_name, disease_df in disease_dfs.items():
            merged_df = pd.merge(metric_df, disease_df, on='date', how='inner')
            
            if len(merged_df) == 0:
                print(f"No overlapping dates found for {filename} and {disease_name}")
                continue
                
            metric_series = merged_df['network_density'].values
            disease_series = merged_df.iloc[:, -1].values
            dates = merged_df['date'].values
            
            # Compute DTW for each radius
            for radius in radii:
                try:
                    dtw_score, _ = compute_dtw_with_temporal_constraint(
                        disease_series, metric_series, radius
                    )
                    
                    results.append({
                        'metric_type': 'Network Density',
                        'source': source.upper(),
                        'threshold': threshold,
                        'window': window,
                        'comparison': disease_name,
                        'radius': radius,
                        'dtw_score': dtw_score
                    })
                    
                    # Generate visualization
                    plot_dtw_alignment(disease_series, metric_series, dates,
                                     'Network Density', 'Network Density', source.upper(),
                                     threshold, window, disease_name, radius)
                    
                except Exception as e:
                    print(f"Error processing {filename} with radius {radius}: {str(e)}")
    
    # Process clustering coefficient files
    coeff_path = os.path.join(base_path, "gt_cluscoeff_rsvmsv_15or30day")
    pattern = "cluscoeff_*.csv"
    coeff_files = glob.glob(os.path.join(coeff_path, pattern))
    
    for file_path in coeff_files:
        filename = os.path.basename(file_path)
        _, source, threshold, window = filename.replace('.csv', '').split('_')
        window = window.replace('day', '')
        
        print(f"\nProcessing Clustering Coefficient: {filename}")
        
        # Load clustering coefficient data
        metric_df = pd.read_csv(file_path)
        
        # Skip if all zeros
        if np.all(metric_df['clustering_coefficient'] == 0):
            print(f"Skipping {filename} - contains all zeros")
            continue
            
        metric_df['date'] = pd.to_datetime(metric_df['date'])
        
        # Compare with each disease metric
        for disease_name, disease_df in disease_dfs.items():
            merged_df = pd.merge(metric_df, disease_df, on='date', how='inner')
            
            if len(merged_df) == 0:
                print(f"No overlapping dates found for {filename} and {disease_name}")
                continue
                
            metric_series = merged_df['clustering_coefficient'].values
            disease_series = merged_df.iloc[:, -1].values
            dates = merged_df['date'].values
            
            # Compute DTW for each radius
            for radius in radii:
                try:
                    dtw_score, _ = compute_dtw_with_temporal_constraint(
                        disease_series, metric_series, radius
                    )
                    
                    results.append({
                        'metric_type': 'Clustering Coefficient',
                        'source': source.upper(),
                        'threshold': threshold,
                        'window': window,
                        'comparison': disease_name,
                        'radius': radius,
                        'dtw_score': dtw_score
                    })
                    
                    # Generate visualization
                    plot_dtw_alignment(disease_series, metric_series, dates,
                                     'Clustering Coefficient', 'Clustering Coefficient',
                                     source.upper(), threshold, window, disease_name, radius)
                    
                except Exception as e:
                    print(f"Error processing {filename} with radius {radius}: {str(e)}")
    
    return pd.DataFrame(results)

In [6]:
# Load disease data
confirmed_df = pd.read_csv("../gt_stat_analysis/disease_confirmed_daily_cases.csv")
active_df = pd.read_csv("../gt_stat_analysis/disease_active_cases.csv")

In [7]:
# Convert dates to datetime
confirmed_df['date'] = pd.to_datetime(confirmed_df['date'])
active_df['date'] = pd.to_datetime(active_df['date'])

# Create disease dataframes dictionary
disease_dfs = {
    'Confirmed Cases': confirmed_df,
    'Active Cases': active_df
}

In [8]:
# Define radii for analysis
radii = [7, 15, 20, 30, 50]

In [9]:
# Base path for network metrics
base_path = "../gt_netdense_cluscoeff"

In [10]:
# Analyze network metrics
print("Processing network metrics")
all_results = analyze_network_metrics(base_path, disease_dfs, radii)

Processing network metrics

Processing Network Density: netdense_msv_0.5_30day.csv

Processing Network Density: netdense_msv_0.8_30day.csv

Processing Network Density: netdense_rsv_0.6_30day.csv

Processing Network Density: netdense_rsv_0.4_15day.csv

Processing Network Density: netdense_msv_0.4_30day.csv

Processing Network Density: netdense_msv_0.6_15day.csv

Processing Network Density: netdense_rsv_0.8_15day.csv

Processing Network Density: netdense_rsv_0.5_15day.csv

Processing Network Density: netdense_msv_0.8_15day.csv

Processing Network Density: netdense_msv_0.5_15day.csv

Processing Network Density: netdense_rsv_0.4_30day.csv

Processing Network Density: netdense_rsv_0.6_15day.csv

Processing Network Density: netdense_msv_0.6_30day.csv

Processing Network Density: netdense_msv_0.4_15day.csv

Processing Network Density: netdense_rsv_0.5_30day.csv

Processing Network Density: netdense_rsv_0.8_30day.csv

Processing Clustering Coefficient: cluscoeff_msv_0.8_30day.csv

Processing C

In [11]:
# Save results sorted by radius and DTW score
results_sorted = all_results.sort_values(['radius', 'dtw_score'])
results_sorted.to_csv('network_dtw_results_multi_radius.csv', index=False)

# Save results sorted only by DTW score
results_sorted_by_score = all_results.sort_values('dtw_score', ascending=True)
results_sorted_by_score.to_csv('network_dtw_results_sorted.csv', index=False)

In [12]:
for radius in radii:
    radius_results = results_sorted[results_sorted['radius'] == radius]
    print(f"\nTop 30 most similar pairs for radius {radius} days (lowest DTW scores):")
    print(radius_results.head(30)[['metric_type', 'source', 'threshold', 'window', 
                                  'comparison', 'dtw_score']])

# Display top 10 most similar pairs overall
print("\nTop 10 most similar pairs overall (lowest DTW scores):")
print(results_sorted_by_score.head(10)[['metric_type', 'source', 'threshold', 'window', 
                                       'comparison', 'radius', 'dtw_score']])


Top 30 most similar pairs for radius 7 days (lowest DTW scores):
                metric_type source threshold window       comparison  \
110         Network Density    RSV       0.6     15  Confirmed Cases   
140         Network Density    RSV       0.5     30  Confirmed Cases   
50          Network Density    MSV       0.6     15  Confirmed Cases   
0           Network Density    MSV       0.5     30  Confirmed Cases   
20          Network Density    RSV       0.6     30  Confirmed Cases   
60          Network Density    RSV       0.8     15  Confirmed Cases   
280  Clustering Coefficient    MSV       0.4     15  Confirmed Cases   
220  Clustering Coefficient    RSV       0.5     15  Confirmed Cases   
180  Clustering Coefficient    RSV       0.4     15  Confirmed Cases   
120         Network Density    MSV       0.6     30  Confirmed Cases   
10          Network Density    MSV       0.8     30  Confirmed Cases   
240  Clustering Coefficient    MSV       0.5     15  Confirmed Cases  