In [None]:
import pandas as pd
import os
from datetime import datetime
from collections import Counter

In [None]:
def get_time_period(date_str):
    """Return the label of the three-month period that a date falls in.

    Args:
        date_str: Date formatted as 'YYYY-MM-DD'.

    Returns:
        'April-June 2020', 'July-September 2020', 'October-December 2020'
        or 'January-March 2021' when the date lies inside the matching
        range (both end dates included); 'Other' for any other date.

    Raises:
        ValueError: if date_str does not match '%Y-%m-%d' (from strptime).
    """
    parsed = datetime.strptime(date_str, '%Y-%m-%d')

    # (first day, last day, label); checked in order, first match wins.
    period_ranges = (
        (datetime(2020, 4, 1), datetime(2020, 6, 30), 'April-June 2020'),
        (datetime(2020, 7, 1), datetime(2020, 9, 30), 'July-September 2020'),
        (datetime(2020, 10, 1), datetime(2020, 12, 31), 'October-December 2020'),
        (datetime(2021, 1, 1), datetime(2021, 3, 31), 'January-March 2021'),
    )
    for first_day, last_day, label in period_ranges:
        if first_day <= parsed <= last_day:
            return label
    return 'Other'

def analyze_connections(adj_matrix):
    """List the keyword pairs marked as connected in an adjacency DataFrame.

    The first column of adj_matrix is skipped (e.g. the 'Unnamed: 0' label
    column produced by read_csv); the remaining column names are the
    keywords. Only cells strictly below the diagonal are inspected, and a
    cell counts as a connection when it equals 1.
    Assumes rows appear in the same order as the keyword columns; this is
    not checked here.

    Args:
        adj_matrix: pandas DataFrame holding the adjacency matrix.

    Returns:
        A list of 2-tuples of keyword names, each tuple sorted, in
        row-then-column visiting order.
    """
    names = list(adj_matrix.columns[1:])

    # Column offset of +1 steps over the leading label column.
    return [
        tuple(sorted([names[row], names[col]]))
        for row in range(1, len(names))
        for col in range(row)
        if adj_matrix.iloc[row, col + 1] == 1
    ]

def find_triads(adj_matrix):
    """Find triads (triangles) in the network described by an adjacency DataFrame.

    The first column of adj_matrix is skipped (e.g. the 'Unnamed: 0' label
    column produced by read_csv); the remaining column names are the
    keywords. Only cells strictly below the diagonal are inspected, and a
    cell counts as an edge when it equals 1. Three keywords i > j > k form
    a triad when the edges (i, j), (i, k) and (j, k) are all present.
    Assumes rows appear in the same order as the keyword columns; this is
    not checked here.

    Args:
        adj_matrix: pandas DataFrame holding the adjacency matrix.

    Returns:
        A list of 3-tuples of keyword names, each tuple sorted, in the
        order the (i, j, k) index triples are visited.
    """
    keywords = list(adj_matrix.columns[1:])  # Skip 'Unnamed: 0'

    # Extract the keyword columns once instead of doing a scalar .iloc
    # lookup per cell inside the triple loop; cells[r, c] is the value
    # that adj_matrix.iloc[r, c + 1] would return.
    cells = adj_matrix.iloc[:, 1:].to_numpy()

    triads = []

    # Look for triangles in lower triangular part
    for i in range(2, len(keywords)):
        for j in range(1, i):
            # The i-j edge does not depend on k, so test it once here and
            # skip the whole inner loop when it is absent.
            if not (cells[i, j] == 1):
                continue
            for k in range(j):
                if cells[i, k] == 1 and cells[j, k] == 1:
                    triads.append(tuple(sorted([keywords[i], keywords[j], keywords[k]])))

    return triads

In [None]:
def process_network_files(base_dir, window_size, threshold):
    """Tally keyword pairs and triads per time period for one threshold folder.

    Every '.csv' file in '<base_dir>/<threshold>_threshold' is visited in
    sorted name order. The last three '_'-separated parts of the file name
    are joined with '-' (after dropping '.csv') and passed to
    get_time_period. Files whose period is not one of the four tracked
    periods are skipped without being read.

    NOTE(review): a later cell in this notebook defines
    process_network_files again; after a top-to-bottom run that later
    definition is the one in effect. Consider keeping only one.

    Args:
        base_dir: Directory containing the '<threshold>_threshold' folders.
        window_size: Accepted for the caller's convenience; not used here.
        threshold: Value used to build the '<threshold>_threshold' folder name.

    Returns:
        (period_connections, period_triads): two dicts mapping each tracked
        period label to a Counter of keyword pairs / keyword triads.
    """
    threshold_dir = os.path.join(base_dir, f"{threshold}_threshold")

    # One Counter per tracked period, for pairs and for triads.
    tracked_periods = (
        'April-June 2020',
        'July-September 2020',
        'October-December 2020',
        'January-March 2021',
    )
    period_connections = {label: Counter() for label in tracked_periods}
    period_triads = {label: Counter() for label in tracked_periods}

    for file in sorted(os.listdir(threshold_dir)):
        if not file.endswith('.csv'):
            continue

        # The file name is expected to end in '<year>_<month>_<day>.csv'.
        name_tail = file.split('_')[-3:]
        date_str = f"{name_tail[0]}-{name_tail[1]}-{name_tail[2].replace('.csv', '')}"
        period = get_time_period(date_str)

        # 'Other' (or any untracked label) is ignored.
        if period not in period_connections:
            continue

        adj_matrix = pd.read_csv(os.path.join(threshold_dir, file))

        period_connections[period].update(analyze_connections(adj_matrix))
        period_triads[period].update(find_triads(adj_matrix))

    return period_connections, period_triads

In [None]:
def print_results(period_connections, period_triads, window_size, threshold):
    """Print the top keyword pairs and triads for each period.

    A banner naming the window size and threshold is printed first. Then,
    for every period key of period_connections that has at least one pair
    or one triad, the ten most common pairs and the ten most common triads
    are listed with their counts. Periods with no pairs and no triads are
    skipped.

    Args:
        period_connections: dict mapping period label to a Counter of
            keyword pairs.
        period_triads: dict mapping period label to a Counter of keyword
            triads; looked up with the keys of period_connections.
        window_size: Shown in the banner as '<window_size>-day window'.
        threshold: Shown in the banner.
    """
    banner = '=' * 80
    print(f"\n{banner}")
    print(f"Results for {window_size}-day window, threshold {threshold}")
    print(f"{banner}")

    for period in period_connections.keys():
        # Nothing recorded for this period: print nothing at all.
        if not (period_connections[period] or period_triads[period]):
            continue

        print(f"\n{period}")
        print("-" * len(period))

        print("\nMost Common Keyword Pairs:")
        top_pairs = period_connections[period].most_common(10)
        for pair, count in top_pairs:
            first, second = pair[0], pair[1]
            print(f"{first} - {second}: {count} connections")

        print("\nMost Common Triads:")
        top_triads = period_triads[period].most_common(10)
        for triad, count in top_triads:
            first, second, third = triad[0], triad[1], triad[2]
            print(f"{first} - {second} - {third}: {count} occurrences")

        print("\n" + "-"*40)

In [None]:
def process_network_files(base_dir, window_size, threshold):
    """Tally keyword pairs and triads per time period for one threshold folder.

    Every '.csv' file in '<base_dir>/<threshold>_threshold' is visited in
    sorted name order. The last three '_'-separated parts of the file name
    are joined with '-' (after dropping '.csv') and passed to
    get_time_period. Files whose period is not one of the four tracked
    periods are skipped without being read.

    NOTE(review): an earlier cell in this notebook already defines
    process_network_files; because this cell runs later, this definition
    shadows it. Consider keeping only one.

    Args:
        base_dir: Directory containing the '<threshold>_threshold' folders.
        window_size: Accepted for the caller's convenience; not used here.
        threshold: Value used to build the '<threshold>_threshold' folder name.

    Returns:
        (period_connections, period_triads): two dicts mapping each tracked
        period label to a Counter of keyword pairs / keyword triads.
    """
    threshold_dir = os.path.join(base_dir, f"{threshold}_threshold")

    # One Counter per tracked period, for pairs and for triads.
    tracked_periods = (
        'April-June 2020',
        'July-September 2020',
        'October-December 2020',
        'January-March 2021',
    )
    period_connections = {label: Counter() for label in tracked_periods}
    period_triads = {label: Counter() for label in tracked_periods}

    for file in sorted(os.listdir(threshold_dir)):
        if not file.endswith('.csv'):
            continue

        # The file name is expected to end in '<year>_<month>_<day>.csv'.
        name_tail = file.split('_')[-3:]
        date_str = f"{name_tail[0]}-{name_tail[1]}-{name_tail[2].replace('.csv', '')}"
        period = get_time_period(date_str)

        # 'Other' (or any untracked label) is ignored.
        if period not in period_connections:
            continue

        adj_matrix = pd.read_csv(os.path.join(threshold_dir, file))

        period_connections[period].update(analyze_connections(adj_matrix))
        period_triads[period].update(find_triads(adj_matrix))

    return period_connections, period_triads

In [17]:
# Input folders (one per rolling-window length) and the thresholds to sweep.
rsv_dirs = [
    "../gt_corr_adj_matrix/gt_adj_matrices_rsv_normal_15day",
    "../gt_corr_adj_matrix/gt_adj_matrices_rsv_normal_30day",
]
thresholds = [0.4, 0.5, 0.6, 0.8]

# Process and report every (folder, threshold) combination.
for rsv_dir in rsv_dirs:
    # The window length is taken from the folder name; anything that is
    # not a '15day' folder is reported as a 30-day window.
    if '15day' in rsv_dir:
        window_size = '15'
    else:
        window_size = '30'

    for threshold in thresholds:
        period_connections, period_triads = process_network_files(
            rsv_dir, window_size, threshold
        )
        print_results(period_connections, period_triads, window_size, threshold)


Results for 15-day window, threshold 0.4

April-June 2020
---------------

Most Common Keyword Pairs:
Quarantine - ecq: 85 connections
social distancing - work from home: 82 connections
fever - headache: 81 connections
fever - flu: 80 connections
cough - fever: 80 connections
flu - rashes: 78 connections
ecq - masks: 78 connections
Frontliners - fever: 75 connections
fever - social distancing: 74 connections
ecq - flu: 73 connections

Most Common Triads:
Frontliners - social distancing - work from home: 60 occurrences
cough - ecq - fever: 57 occurrences
ecq - flu - rashes: 56 occurrences
Frontliners - cough - flu: 56 occurrences
Frontliners - fever - flu: 56 occurrences
cough - fever - flu: 55 occurrences
ecq - fever - flu: 55 occurrences
cough - ecq - flu: 54 occurrences
ecq - fever - masks: 54 occurrences
cough - flu - rashes: 54 occurrences

----------------------------------------

July-September 2020
-------------------

Most Common Keyword Pairs:
Quarantine - ecq: 92 connections