In [49]:
import pandas as pd
import os
from itertools import combinations
from datetime import datetime

In [50]:
def load_adj_matrix(filepath):
    """Read an adjacency matrix from a CSV file.

    The first column of the CSV is used as the row index of the result.

    Parameters
    ----------
    filepath : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed matrix, indexed by the values of the first CSV column.
    """
    matrix = pd.read_csv(filepath, index_col=0)
    return matrix

In [51]:
def get_subgraph_indices(df, group_tags):
    """Translate row labels into integer row positions of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix whose row index contains the tags.
    group_tags : iterable
        Labels to look up in ``df.index``.

    Returns
    -------
    list of int
        Position of each tag in ``df.index``, in the order the tags were
        given. If a label occurs more than once in the index, the position
        of its first occurrence is used.

    Raises
    ------
    ValueError
        If one or more tags are not present in ``df.index``; the message
        lists every missing tag.
    """
    # Materialise once so a one-shot iterator can be scanned twice below.
    tags = list(group_tags)

    # Build the label -> position lookup a single time instead of rebuilding
    # and linearly scanning list(df.index) for every tag. setdefault keeps
    # the first occurrence of a duplicated label.
    positions = {}
    for position, label in enumerate(df.index):
        positions.setdefault(label, position)

    missing = [tag for tag in tags if tag not in positions]
    if missing:
        raise ValueError(f"Tags not found in matrix index: {missing}")

    return [positions[tag] for tag in tags]

In [52]:
def calculate_network_density(adj_matrix, group_indices):
    """Fraction of node pairs in a group that are connected.

    For every unordered pair of positions in ``group_indices`` the entry at
    (larger position, smaller position) of ``adj_matrix`` is read, i.e. only
    the lower triangle is consulted. A pair counts as an edge when that
    entry equals 1.

    Parameters
    ----------
    adj_matrix : pandas.DataFrame
        Adjacency matrix addressed positionally via ``.iloc``.
    group_indices : sequence of int
        Row/column positions of the group's nodes.

    Returns
    -------
    float or int
        Number of edges divided by n*(n-1)/2, or 0 when the group has fewer
        than two nodes.
    """
    node_count = len(group_indices)
    pair_capacity = (node_count * (node_count - 1)) / 2
    if pair_capacity == 0:
        return 0

    linked_pairs = sum(
        1
        for a, b in combinations(group_indices, 2)
        if adj_matrix.iloc[max(a, b), min(a, b)] == 1
    )
    return linked_pairs / pair_capacity

In [53]:
def calculate_clustering_coefficient(adj_matrix, group_indices):
    """Fraction of 3-node combinations in a group that form a triangle.

    Every unordered triple of positions in ``group_indices`` is examined.
    The triple is "closed" when all three of its pairwise entries, read from
    the lower triangle of ``adj_matrix`` (row = larger position), equal 1.
    The result is closed triples divided by all triples.

    NOTE(review): the denominator is every 3-node combination, not only
    triples that already share at least two edges, so this is not the usual
    transitivity ratio - confirm this is the intended definition.

    Parameters
    ----------
    adj_matrix : pandas.DataFrame
        Adjacency matrix addressed positionally via ``.iloc``.
    group_indices : sequence of int
        Row/column positions of the group's nodes.

    Returns
    -------
    float or int
        Closed triples / total triples, or 0 for groups of fewer than three
        nodes.
    """
    if len(group_indices) < 3:
        return 0

    def lower_entry(a, b):
        # Always address the lower triangle: row is the larger position.
        return adj_matrix.iloc[max(a, b), min(a, b)]

    triad_total = 0
    triad_closed = 0
    for i, j, k in combinations(group_indices, 3):
        triad_total += 1
        # Read all three entries up front, then test them together.
        links = [lower_entry(i, j), lower_entry(j, k), lower_entry(i, k)]
        if all(link == 1 for link in links):
            triad_closed += 1

    if triad_total > 0:
        return triad_closed / triad_total
    return 0

In [54]:
def extract_date_from_filename(filename):
    """Build a ``year-month-day`` string from an underscore-separated name.

    The name is split on ``_``; the third, fourth and fifth pieces are taken
    as year, month and day. Anything from the first ``.`` onward in the day
    piece (such as a file extension) is dropped.

    Parameters
    ----------
    filename : str
        A name with at least five ``_``-separated pieces.

    Returns
    -------
    str
        The three pieces joined with dashes, unchanged otherwise.

    Raises
    ------
    IndexError
        If the name has fewer than five ``_``-separated pieces.
    """
    tokens = filename.split('_')
    date_fields = (
        tokens[2],
        tokens[3],
        tokens[4].partition('.')[0],
    )
    return '-'.join(date_fields)

In [55]:
def _write_metric_csv(values_by_date, column_name, out_path):
    """Write one metric's {date string: value} mapping as a date-sorted CSV."""
    metric_df = pd.DataFrame({column_name: values_by_date})
    metric_df.index.name = 'date'
    # Parse the date strings so rows sort chronologically, not lexically.
    metric_df.index = pd.to_datetime(metric_df.index)
    metric_df = metric_df.sort_index()
    metric_df.to_csv(out_path)


def process_matrices(base_dir, threshold, groups, window_size, output_dir="."):
    """Compute per-date group metrics for one threshold and write them to CSV.

    Every ``.csv`` file in ``<base_dir>/<threshold>_threshold`` is loaded as
    an adjacency matrix. For each group, the network density and clustering
    coefficient of the group's sub-graph are computed and stored under the
    date extracted from the file name. Two CSV files are then written per
    group: ``netdense_<group>_<threshold>threshold_<window_size>day.csv`` and
    ``cluscoeff_<group>_<threshold>threshold_<window_size>day.csv``, each
    indexed by date in ascending order.

    Parameters
    ----------
    base_dir : str
        Directory containing the ``<threshold>_threshold`` sub-directory.
    threshold : float or str
        Used to build the sub-directory name and the output file names.
    groups : dict
        Maps a group name to the list of tags forming that group.
    window_size : str or int
        Used only in the output file names.
    output_dir : str, optional
        Directory the CSV files are written to; created if it does not
        exist. Defaults to the current working directory.

    Raises
    ------
    FileNotFoundError
        If the threshold sub-directory does not exist.
    """
    full_path = os.path.join(base_dir, f"{threshold}_threshold")

    results = {group: {'density': {}, 'clustering': {}} for group in groups}

    # os.listdir returns entries in arbitrary order; sorting makes the run
    # deterministic if two files ever resolve to the same date key.
    for filename in sorted(os.listdir(full_path)):
        if not filename.endswith('.csv'):
            continue

        date = extract_date_from_filename(filename)
        adj_matrix = load_adj_matrix(os.path.join(full_path, filename))

        for group_name, tags in groups.items():
            indices = get_subgraph_indices(adj_matrix, tags)
            results[group_name]['density'][date] = calculate_network_density(
                adj_matrix, indices
            )
            results[group_name]['clustering'][date] = calculate_clustering_coefficient(
                adj_matrix, indices
            )

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Create output files: one density and one clustering CSV per group.
    for group_name in groups:
        density_filename = f"netdense_{group_name}_{threshold}threshold_{window_size}day.csv"
        _write_metric_csv(
            results[group_name]['density'],
            'network_density',
            os.path.join(output_dir, density_filename),
        )

        clustering_filename = f"cluscoeff_{group_name}_{threshold}threshold_{window_size}day.csv"
        _write_metric_csv(
            results[group_name]['clustering'],
            'clustering_coefficient',
            os.path.join(output_dir, clustering_filename),
        )

In [56]:
# Threshold values; each one names a "<value>_threshold" sub-directory
# inside every base directory.
thresholds = [0.4, 0.5, 0.6, 0.8]

# Input directories. The driver loop inspects each path for "msv" and
# "15day" to choose the group set and the window size.
base_dirs = [
    "../gt_corr_adj_matrix/gt_adj_matrices_msv_15day",
    "../gt_corr_adj_matrix/gt_adj_matrices_msv_30day",
    "../gt_corr_adj_matrix/gt_adj_matrices_rsv_normal_15day",
    "../gt_corr_adj_matrix/gt_adj_matrices_rsv_normal_30day",
]

In [57]:
# Tag groups. Each key is a group name (written with dashes rather than
# underscores) that ends up in the output file names; each value is the list
# of tags looked up in the adjacency matrix's row index.
MSV_GROUPS = {
    "MSVSymptoms-0.6": ["cough", "fever", "sipon"],
    "MSVNewNormalProtocols1-0.6": ["ecq", "Quarantine"],
    "MSVSymptoms&HealthProtocols1-0.6": [
        "flu", "headache", "lagnat", "rashes", "ubo",
        "face shield", "Frontliners", "masks", "social distancing",
    ],
    "MSVSymptoms-0.5": ["flu", "cough", "fever", "sipon", "ubo"],
    "MSVNewNormalProtocols1-0.5": ["Quarantine", "Frontliners", "ecq"],
    "MSVFaceWearing&Others-0.6": [
        "masks", "face shield", "headache", "lagnat", "rashes",
        "social distancing",
    ],
}

RSV_GROUPS = {
    "RSVSymptoms&NewNormalProtocols1-0.6": [
        "flu", "cough", "fever", "Quarantine", "work from home",
    ],
    "RSVSymptoms&NewNormalProtocols2-0.6": [
        "face shield", "headache", "lagnat", "rashes", "sipon",
        "ubo", "Frontliners", "masks", "social distancing",
    ],
    "RSVSymptoms&NewNormalProtocols-0.5": [
        "social distancing", "work from home", "Quarantine",
        "headache", "fever", "cough", "flu",
    ],
    # NOTE(review): this key starts with "MSV" although it lives in
    # RSV_GROUPS - confirm whether "RSV" was intended. Left unchanged
    # because the key becomes part of the output file names.
    "MSVFaceWearing&Others-0.5": [
        "face shield", "masks", "Frontliners", "ubo", "sipon", "rashes",
        "lagnat",
    ],
}

In [58]:
# Process every base directory at every threshold. The window size and the
# group set are derived from substrings of the directory path.
for base_dir in base_dirs:
    if "15day" in base_dir:
        window_size = "15"
    else:
        window_size = "30"

    if "msv" in base_dir:
        groups = MSV_GROUPS
    else:
        groups = RSV_GROUPS

    for threshold in thresholds:
        process_matrices(base_dir, threshold, groups, window_size)