In [24]:
import pandas as pd
import numpy as np
from typing import Tuple, List, Dict
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import glob
import os

In [25]:
def compute_directional_dtw(google_trends: np.ndarray, disease: np.ndarray, radius: int) -> Tuple[float, List]:
    """
    Compute a band-constrained, directional DTW alignment (one point to many).

    Both series are min-max normalised to [0, 1]. Every index ``i`` of the
    first series is matched to exactly one index ``j`` of the second series
    with ``|i - j| <= radius``. The match chosen for ``i`` may sit at most
    ``radius`` positions away (in either direction) from the match chosen for
    ``i - 1``. The cost of a match is the absolute difference of the
    normalised values, and the alignment minimises the accumulated cost.

    Args:
        google_trends: First series (the one every point of which is matched).
        disease: Second series (the one being matched against).
        radius: Half-width of the allowed index window.

    Returns:
        ``(best_cost, best_path)`` where ``best_cost`` is the smallest
        accumulated cost in the last row of the cost matrix and ``best_path``
        is a list of ``(i, j)`` index pairs, one per ``i``. If no alignment
        can reach the last point of the first series inside the band,
        ``best_cost`` is ``inf`` and ``best_path`` is empty.

    Raises:
        ValueError: If either series is empty.
    """
    def _min_max_scale(values) -> np.ndarray:
        # Scale to [0, 1]; a constant series maps to all zeros instead of
        # producing 0/0 = NaN, which would poison every accumulated cost.
        arr = np.asarray(values, dtype=float)
        if arr.size == 0:
            raise ValueError("compute_directional_dtw requires non-empty series")
        lowest = np.min(arr)
        span = np.max(arr) - lowest
        if span == 0:
            return np.zeros_like(arr)
        return (arr - lowest) / span

    gt_norm = _min_max_scale(google_trends)
    disease_norm = _min_max_scale(disease)

    n = len(gt_norm)       # Length of the first series
    m = len(disease_norm)  # Length of the second series

    # D[i, j]: best accumulated cost of an alignment ending with the match (i, j).
    # back[i, j]: the j matched to i - 1 on that best alignment; -1 means the
    # cell was never reached. Row 0 points at itself to mark a path start.
    # Back-pointers replace storing (and copying) a full path list per cell.
    D = np.full((n, m), np.inf)
    back = np.full((n, m), -1, dtype=np.intp)

    for i in range(n):
        # Candidate matches for i lie inside the band around the diagonal
        start_j = max(0, i - radius)
        end_j = min(m, i + radius + 1)

        for j in range(start_j, end_j):
            cost = abs(gt_norm[i] - disease_norm[j])
            if i == 0:
                D[i, j] = cost
                back[i, j] = j
                continue

            # Cheapest predecessor among the previous row's cells within radius of j.
            # The slice always contains at least column j, so it is never empty.
            lo = max(0, j - radius)
            prev_costs = D[i - 1, lo:min(m, j + radius + 1)]
            k = int(np.argmin(prev_costs))  # first minimum wins on ties
            min_prev_cost = prev_costs[k]
            if min_prev_cost != np.inf:
                D[i, j] = cost + min_prev_cost
                back[i, j] = lo + k

    # Pick the cheapest end point in the last row
    final_row = D[n - 1, :]
    best_end = int(np.argmin(final_row))
    best_cost = final_row[best_end]

    # Walk the back-pointers from the end point to row 0 to rebuild the path
    best_path = []
    if back[n - 1, best_end] >= 0:
        j = best_end
        for i in range(n - 1, -1, -1):
            best_path.append((i, j))
            j = int(back[i, j])
        best_path.reverse()

    return best_cost, best_path

In [None]:
def plot_directional_dtw(network_metric: np.ndarray, disease: np.ndarray, dates: np.ndarray,
                        metric_type: str, source: str, threshold: str, window: str,
                        comparison: str, radius: int) -> None:
    """
    Plot the directional DTW alignment between two series and save it as a PNG.

    Both series are min-max normalised, the disease series is drawn shifted up
    by 1.5 so the two curves do not overlap, and dashed grey connectors are
    drawn for the matches of every second index of the network metric. The
    figure is written to the current working directory under a name built from
    the arguments (with '&' replaced by 'and') and is always closed afterwards,
    even if plotting or saving fails.

    Args:
        network_metric: Series whose points are matched (drawn in red).
        disease: Series being matched against (drawn in blue).
        dates: Date values shared by both series; must be accepted by
            ``pd.Timestamp`` and support subtraction.
        metric_type: Metric label used in the title, legend and file name.
        source: Source label used in the title, legend and file name.
        threshold: Threshold label used in the title, legend and file name.
        window: Window label; only used in the file name.
        comparison: Legend label of the disease series; also used in the file name.
        radius: DTW radius passed to ``compute_directional_dtw``.
    """
    def _min_max_scale(values: np.ndarray) -> np.ndarray:
        # Scale to [0, 1]; a constant series becomes a flat line at 0 instead
        # of 0/0 = NaN (which matplotlib would silently not draw).
        lowest = np.min(values)
        span = np.max(values) - lowest
        if span == 0:
            return np.zeros(len(values))
        return (values - lowest) / span

    # Normalize series
    net_norm = _min_max_scale(network_metric)
    disease_norm = _min_max_scale(disease)

    # Compute DTW (only the path is needed for drawing)
    _, path = compute_directional_dtw(network_metric, disease, radius)

    # Create figure
    fig, ax = plt.subplots(figsize=(20, 10))
    try:
        # Plot time series; the disease curve is offset vertically by 1.5
        ax.plot(dates, disease_norm + 1.5, label=f'{comparison}', color='blue', linewidth=2)
        ax.plot(dates, net_norm, label=f'{metric_type} ({source}, thresh={threshold})', color='red', linewidth=2)

        # Draw matching lines for every second point of the network metric
        for i, j in path:
            if i % 2 == 0:
                ax.plot([dates[i], dates[j]], [net_norm[i], disease_norm[j] + 1.5],
                        'gray', alpha=0.9, linestyle='--')

        # Convert dates to pandas Timestamp for string formatting
        start_date = pd.Timestamp(dates[0]).strftime('%Y-%m-%d')
        end_date = pd.Timestamp(dates[-1]).strftime('%Y-%m-%d')
        timeline_info = f"{start_date} to {end_date}"

        # The covered date range goes in the parentheses after the source
        ax.set_title(f'Directional DTW Alignment (±{radius} days): {metric_type}\n{source} ({timeline_info}), threshold={threshold}',
                     fontsize=25, pad=20)
        ax.legend(fontsize=13, loc='upper right')

        # Remove y-axis ticks and labels (values are normalised and offset, so they carry no meaning)
        ax.set_yticks([])
        ax.set_ylabel('')

        # Format x-axis: one tick per month
        formatter = mdates.DateFormatter('%B %Y')
        ax.xaxis.set_major_formatter(formatter)
        ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
        ax.tick_params(axis='x', labelsize=15)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

        # Set x-axis limits with padding
        date_range = dates[-1] - dates[0]
        padding = pd.Timedelta(days=(date_range / pd.Timedelta('1D')) * 0.05)  # 5% padding
        ax.set_xlim(dates[0] - padding, dates[-1] + padding)

        # Adjust layout
        fig.tight_layout()

        # Save plot
        filename = f"dtw_directional_{metric_type.lower()}_{source.lower()}_{threshold}_{window}day_{comparison.lower()}_r{radius}.png"
        fig.savefig(filename.replace('&', 'and'), bbox_inches='tight', dpi=300)
    finally:
        # Close this specific figure even on failure; the caller catches
        # exceptions and keeps looping, so leaked figures would pile up.
        plt.close(fig)

In [27]:
def _analyze_metric_files(metric_dir: str, pattern: str, value_column: str, metric_label: str,
                          disease_dfs: Dict[str, pd.DataFrame], radii: List[int]) -> List[Dict]:
    """
    Run directional DTW for every metric CSV in ``metric_dir`` matching ``pattern``.

    Returns one result record per (file, disease table, radius) that was
    processed without error.
    """
    records = []

    # Sorted so runs are reproducible; glob order depends on the filesystem
    for file_path in sorted(glob.glob(os.path.join(metric_dir, pattern))):
        filename = os.path.basename(file_path)

        # Expected name: <prefix>_<source>_<threshold>_<window>day.csv
        name_parts = os.path.splitext(filename)[0].split('_')
        if len(name_parts) != 4:
            # A stray file must not abort the whole analysis
            print(f"Skipping {filename} - unexpected file name format")
            continue
        _, source, threshold, window = name_parts
        window = window.replace('day', '')

        print(f"\nProcessing {metric_label}: {filename}")

        # Load metric data
        metric_df = pd.read_csv(file_path)

        # Skip if all zeros
        if np.all(metric_df[value_column] == 0):
            print(f"Skipping {filename} - contains all zeros")
            continue

        metric_df['date'] = pd.to_datetime(metric_df['date'])

        # Compare with each disease metric
        for disease_name, disease_df in disease_dfs.items():
            # Filter disease data to match the metric's timeframe
            disease_filtered = disease_df[
                (disease_df['date'] >= metric_df['date'].min()) &
                (disease_df['date'] <= metric_df['date'].max())
            ].copy()

            # Ensure date alignment. DTW and the plot both assume chronological
            # order; the stable sort leaves already-ordered data unchanged.
            merged_df = pd.merge(metric_df, disease_filtered, on='date', how='inner')
            merged_df = merged_df.sort_values('date', kind='stable')

            if len(merged_df) == 0:
                print(f"No overlapping dates found for {filename} and {disease_name}")
                continue

            metric_series = merged_df[value_column].values
            # NOTE(review): takes the LAST column of the merged frame as the disease
            # values - assumes the disease table's value column comes last; confirm.
            disease_series = merged_df.iloc[:, -1].values
            dates = merged_df['date'].values

            # Compute DTW for each radius
            for radius in radii:
                try:
                    dtw_score, _ = compute_directional_dtw(
                        metric_series, disease_series, radius
                    )

                    records.append({
                        'metric_type': metric_label,
                        'source': source.upper(),
                        'threshold': threshold,
                        'window': window,
                        'comparison': disease_name,
                        'radius': radius,
                        'dtw_score': dtw_score
                    })

                    # Generate visualization
                    plot_directional_dtw(metric_series, disease_series, dates,
                                       metric_label, source.upper(), threshold,
                                       window, disease_name, radius)

                except Exception as e:
                    # Best effort: report the failure and continue with the next radius
                    print(f"Error processing {filename} with radius {radius}: {str(e)}")

    return records


def analyze_network_metrics(base_path: str, disease_dfs: Dict[str, pd.DataFrame], 
                          radii: List[int]) -> pd.DataFrame:
    """
    Analyze network metrics using directional DTW.

    Processes the network density CSVs first, then the clustering coefficient
    CSVs. Each metric file is compared against every disease table for every
    radius, and a plot is saved for each combination.

    Args:
        base_path: Folder containing the ``gt_netdense_rsvmsv_15or30day`` and
            ``gt_cluscoeff_rsvmsv_15or30day`` subfolders.
        disease_dfs: Mapping of display label -> disease DataFrame with a
            datetime ``date`` column.
        radii: DTW radii to evaluate.

    Returns:
        DataFrame with one row per (file, disease table, radius) and the columns
        metric_type, source, threshold, window, comparison, radius, dtw_score.
        Empty if nothing could be processed.
    """
    results = []

    # Process network density files
    results.extend(_analyze_metric_files(
        os.path.join(base_path, "gt_netdense_rsvmsv_15or30day"), "netdense_*.csv",
        'network_density', 'Network Density', disease_dfs, radii
    ))

    # Process clustering coefficient files
    results.extend(_analyze_metric_files(
        os.path.join(base_path, "gt_cluscoeff_rsvmsv_15or30day"), "cluscoeff_*.csv",
        'clustering_coefficient', 'Clustering Coefficient', disease_dfs, radii
    ))

    return pd.DataFrame(results)

In [28]:
# Load the two disease case tables (paths are relative to the notebook's working directory).
confirmed_df = pd.read_csv("../gt_stat_analysis/disease_confirmed_daily_cases.csv")
active_df = pd.read_csv("../gt_stat_analysis/disease_active_cases.csv")

In [29]:
# Parse the 'date' columns to datetime so analyze_network_metrics can compare and merge
# them with the (also datetime-parsed) metric dates.
confirmed_df['date'] = pd.to_datetime(confirmed_df['date'])
active_df['date'] = pd.to_datetime(active_df['date'])

In [30]:
# Display label -> disease table. The label becomes the 'comparison' value in the
# results and is used in the plot legend and the saved plot's file name.
disease_dfs = {
    'Confirmed Cases': confirmed_df,
    'Active Cases': active_df
}

In [31]:
# DTW radii to evaluate; analyze_network_metrics computes one score (and saves one plot) per radius.
radii = [7, 15, 20, 30, 50]

In [32]:
# Root folder expected to contain the gt_netdense_rsvmsv_15or30day and
# gt_cluscoeff_rsvmsv_15or30day subfolders read by analyze_network_metrics.
base_path = "../gt_netdense_cluscoeff"

In [33]:
# Run the full analysis: one DTW score and one saved PNG per (metric file, disease table, radius).
print("Processing network metrics")
all_results = analyze_network_metrics(base_path, disease_dfs, radii)

Processing network metrics

Processing Network Density: netdense_msv_0.5_30day.csv

Processing Network Density: netdense_msv_0.8_30day.csv

Processing Network Density: netdense_rsv_0.6_30day.csv

Processing Network Density: netdense_rsv_0.4_15day.csv

Processing Network Density: netdense_msv_0.4_30day.csv

Processing Network Density: netdense_msv_0.6_15day.csv

Processing Network Density: netdense_rsv_0.8_15day.csv

Processing Network Density: netdense_rsv_0.5_15day.csv

Processing Network Density: netdense_msv_0.8_15day.csv

Processing Network Density: netdense_msv_0.5_15day.csv

Processing Network Density: netdense_rsv_0.4_30day.csv

Processing Network Density: netdense_rsv_0.6_15day.csv

Processing Network Density: netdense_msv_0.6_30day.csv

Processing Network Density: netdense_msv_0.4_15day.csv

Processing Network Density: netdense_rsv_0.5_30day.csv

Processing Network Density: netdense_rsv_0.8_30day.csv

Processing Clustering Coefficient: cluscoeff_msv_0.8_30day.csv

Processing C

In [34]:
# Move 'dtw_score' to the last position, keeping every other column in its existing order.
columns = list(all_results.columns[all_results.columns != 'dtw_score'])
columns.append('dtw_score')
all_results = all_results.loc[:, columns]

In [36]:
# Order rows by ascending DTW score (smallest accumulated distance first) and write them to CSV.
results_sorted = all_results.sort_values(by='dtw_score')
results_sorted.to_csv('network_directional_dtw_results.csv', index=False)