In [21]:
import pandas as pd
import numpy as np
from dcor import distance_correlation
import os
from datetime import datetime, timedelta

In [22]:
def create_directory(directory):
    """Ensure ``directory`` exists, creating it (with missing parents) if needed.

    A confirmation line is printed only when the directory is actually
    created; nothing happens when the path already exists.
    """
    # Guard clause: nothing to do when the path is already present.
    if os.path.exists(directory):
        return

    os.makedirs(directory)
    print(f"Created directory: {directory}")

In [23]:
def compute_distance_correlation_matrix(data, tags):
    """
    Compute the pairwise distance correlation matrix for the given columns.

    Each column named in ``tags`` is looked up and converted to a float
    array exactly once; ``distance_correlation`` is then evaluated for every
    pair ``(i, j)`` with ``j <= i``. Only the lower triangle (including the
    diagonal) is filled; entries above the diagonal are left at 0.

    Args:
        data: Mapping-like object (e.g. a DataFrame) for which
            ``data[tag].values`` yields the samples of column ``tag``.
        tags: Sequence of column names; its order defines the row/column
            order of the result.

    Returns:
        A ``(len(tags), len(tags))`` NumPy array. Any pair whose column
        lookup, float conversion or correlation call fails is reported on
        stdout and stored as NaN rather than aborting the whole matrix.
    """
    n = len(tags)
    matrix = np.zeros((n, n))

    # Convert every column once instead of once per pair. A failure is
    # remembered per column so each affected pair can still be reported.
    arrays = [None] * n
    failures = [None] * n
    for k, tag in enumerate(tags):
        try:
            arrays[k] = np.array(data[tag].values, dtype=float)
        except Exception as e:
            failures[k] = e

    # Only compute lower triangular part (including diagonal)
    for i in range(n):
        for j in range(i + 1):  # Only go up to i (lower triangle)
            # The row column is checked first, so its error is the one
            # reported when both columns of a pair are unusable.
            error = failures[i] if failures[i] is not None else failures[j]
            if error is None:
                try:
                    matrix[i, j] = distance_correlation(arrays[i], arrays[j])
                    continue
                except Exception as e:
                    error = e
            print(f"Error computing correlation for {tags[i]} and {tags[j]}: {str(error)}")
            matrix[i, j] = np.nan

    return matrix

In [24]:
def create_correlation_matrices(
    window_size,
    input_path="../gt_preprocessed_data/gt_rsv_stitched/3_gt_rescaled_rsv.csv",
    output_dir=None,
    data_start_date=datetime(2020, 3, 16),
    end_date=datetime(2021, 3, 16),
):
    """Create one rolling-window correlation matrix CSV per day.

    For every day from ``data_start_date + (window_size - 1) days`` through
    ``end_date`` (inclusive), the rows whose ``date`` falls inside the
    trailing ``window_size``-day window are selected, a matrix is computed
    with ``compute_distance_correlation_matrix`` and written to
    ``<output_dir>/rsv_distance_correlation_YYYY_MM_DD.csv``.

    Args:
        window_size: Window length in days; must be at least 1.
        input_path: CSV file with a ``date`` column; every other column is
            treated as a tag.
        output_dir: Destination directory. Defaults to
            ``./gt_dcorr_matrices_rsv_{window_size}day``.
        data_start_date: First day that may belong to a window.
        end_date: Last day (inclusive) for which a matrix is produced.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1 day, got {window_size}")

    # Read the data
    df = pd.read_csv(input_path)
    df['date'] = pd.to_datetime(df['date'])

    # Get list of tags (column names except 'date')
    tags = [col for col in df.columns if col != 'date']

    # Create output directory
    if output_dir is None:
        output_dir = f"./gt_dcorr_matrices_rsv_{window_size}day"
    create_directory(output_dir)

    # Loop-invariant values: the window span and the smallest accepted window.
    window_span = timedelta(days=window_size - 1)
    first_matrix_date = data_start_date + window_span
    # Accept either full window or one day less
    min_window_size = window_size - 1  # 29 for 30-day, 14 for 15-day

    print(f"Window size: {window_size} days")
    print(f"Data start date: {data_start_date.strftime('%Y-%m-%d')}")
    print(f"First matrix date: {first_matrix_date.strftime('%Y-%m-%d')}")
    print(f"End date: {end_date.strftime('%Y-%m-%d')}")

    current_date = first_matrix_date
    while current_date <= end_date:
        window_start = current_date - window_span
        # Boolean-mask selection already yields a new frame, so no copy is needed.
        in_window = (df['date'] >= window_start) & (df['date'] <= current_date)
        window_data = df[in_window]

        if len(window_data) >= min_window_size:
            print(f"Processing matrix for: {current_date.strftime('%Y-%m-%d')}")
            print(f"Using {len(window_data)}-day window from: {window_start.strftime('%Y-%m-%d')} to {current_date.strftime('%Y-%m-%d')}")

            # Compute correlation matrix
            corr_matrix = compute_distance_correlation_matrix(window_data, tags)

            # Create DataFrame for the correlation matrix
            corr_df = pd.DataFrame(corr_matrix, index=tags, columns=tags)

            # Save to CSV
            date_str = current_date.strftime('%Y_%m_%d')
            filename = f"rsv_distance_correlation_{date_str}.csv"
            filepath = os.path.join(output_dir, filename)
            corr_df.to_csv(filepath)

            print(f"Created correlation matrix for {date_str}")
        else:
            print(f"Warning: Window too small for {current_date.strftime('%Y-%m-%d')} "
                  f"(found {len(window_data)} days, needed at least {min_window_size})")

        current_date += timedelta(days=1)

In [25]:
# Generate the rolling matrices once per window length (in days)
for window_size in (15, 30):
    print(f"\nProcessing {window_size}-day window")
    create_correlation_matrices(window_size)


Processing 15-day window
Created directory: ./gt_dcorr_matrices_rsv_15day
Window size: 15 days
Data start date: 2020-03-16
First matrix date: 2020-03-30
End date: 2021-03-16
Processing matrix for: 2020-03-30
Using 15-day window from: 2020-03-16 to 2020-03-30
Created correlation matrix for 2020_03_30
Processing matrix for: 2020-03-31
Using 15-day window from: 2020-03-17 to 2020-03-31
Created correlation matrix for 2020_03_31
Processing matrix for: 2020-04-01
Using 15-day window from: 2020-03-18 to 2020-04-01
Created correlation matrix for 2020_04_01
Processing matrix for: 2020-04-02
Using 15-day window from: 2020-03-19 to 2020-04-02
Created correlation matrix for 2020_04_02
Processing matrix for: 2020-04-03
Using 15-day window from: 2020-03-20 to 2020-04-03
Created correlation matrix for 2020_04_03
Processing matrix for: 2020-04-04
Using 15-day window from: 2020-03-21 to 2020-04-04
Created correlation matrix for 2020_04_04
Processing matrix for: 2020-04-05
Using 15-day window from: 202