In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from math import comb

In [2]:
main_dirs = [
    "../gt_corr_adj_matrix/gt_adj_matrices_msv_15day",
    "../gt_corr_adj_matrix/gt_adj_matrices_msv_30day",
    "../gt_corr_adj_matrix/gt_adj_matrices_rsv_normal_15day",
    "../gt_corr_adj_matrix/gt_adj_matrices_rsv_normal_30day"
]

In [3]:
thresholds = [0.4, 0.5, 0.6, 0.8]

In [4]:
def calculate_clustering_coefficient(adj_matrix, debug=False):
    """
    Calculate global clustering coefficient with fixed denominator.
    Uses total possible triplets in 15-node network as denominator.
    """
    n = 15
    # Total possible triplets in a 15-node network: C(15,3)
    total_possible_triplets = comb(n, 3)  # This equals 455
    total_triangles = 0
    
    if debug:
        print("\nDebugging Clustering Coefficient Calculation:")
        print(f"Total possible triplets (denominator): {total_possible_triplets}")
        print("============================================")
    
    # Store triangle information for debugging
    triangle_details = []
    
    # Look at each possible triplet of nodes
    for i in range(1, n):
        for j in range(i):
            for k in range(j):
                # Get the three relevant edges
                edge_ij = adj_matrix.iloc[i, j]
                edge_ik = adj_matrix.iloc[i, k]
                edge_jk = adj_matrix.iloc[j, k]
                
                # If all three edges exist, it's a triangle
                if edge_ij == 1 and edge_ik == 1 and edge_jk == 1:
                    total_triangles += 1
                    
                    if debug:
                        triangle_info = {
                            'nodes': (i, j, k),
                            'node_names': (adj_matrix.index[i], adj_matrix.index[j], adj_matrix.index[k]),
                            'edges': {
                                f'{adj_matrix.index[i]}-{adj_matrix.index[j]}': edge_ij,
                                f'{adj_matrix.index[i]}-{adj_matrix.index[k]}': edge_ik,
                                f'{adj_matrix.index[j]}-{adj_matrix.index[k]}': edge_jk
                            }
                        }
                        triangle_details.append(triangle_info)
    
    # Calculate coefficient using fixed denominator
    coef = total_triangles / total_possible_triplets
    
    if debug:
        print(f"\nTotal triangles found: {total_triangles}")
        print(f"Clustering coefficient: {coef}")
        if triangle_details:
            print("\nTriangles found:")
            for idx, triangle in enumerate(triangle_details, 1):
                print(f"\nTriangle {idx}:")
                print(f"Nodes: {triangle['node_names']}")
                print("Edges:")
                for edge, value in triangle['edges'].items():
                    print(f"  {edge}: {value}")
        print("\n============================================")
    
    return coef, {'triangles': total_triangles, 'possible_triplets': total_possible_triplets, 'details': triangle_details} if debug else None

In [5]:
def process_adjacency_matrices(main_dir, threshold):
    """Process all adjacency matrices with debugging."""
    threshold_dir = os.path.join(main_dir, f"{threshold}_threshold")
    
    if not os.path.exists(threshold_dir):
        print(f"Directory not found: {threshold_dir}")
        return pd.DataFrame()
    
    dates = []
    coefficients = []
    
    adj_files = sorted([f for f in os.listdir(threshold_dir) if f.endswith('.csv')])
    
    for adj_file in adj_files:
        date_parts = adj_file.split('_')[-3:]
        formatted_date = f"{date_parts[0]}-{date_parts[1]}-{date_parts[2].replace('.csv', '')}"
        
        print(f"\nProcessing {adj_file}")
        
        try:
            file_path = os.path.join(threshold_dir, adj_file)
            adj_matrix = pd.read_csv(file_path, index_col=0)
            
            if adj_matrix.shape != (15, 15):
                print(f"Warning: Matrix size is {adj_matrix.shape}, expected (15, 15)")
                continue
            
            # Calculate coefficient with debugging for high values
            coef, debug_info = calculate_clustering_coefficient(adj_matrix, debug=(coef > 0.5 if 'coef' in locals() else False))
            
            dates.append(formatted_date)
            coefficients.append(coef)
            print(f"Processed successfully: coefficient = {coef}")
            
        except Exception as e:
            print(f"Error processing {adj_file}: {str(e)}")
            continue
    
    return pd.DataFrame({
        'date': dates,
        'clustering_coefficient': coefficients
    })

In [6]:
def process_adjacency_matrices(main_dir, threshold):
    """Process all adjacency matrices with debugging"""
    threshold_dir = os.path.join(main_dir, f"{threshold}_threshold")
    
    if not os.path.exists(threshold_dir):
        print(f"Directory not found: {threshold_dir}")
        return pd.DataFrame()
    
    dates = []
    coefficients = []
    
    adj_files = sorted([f for f in os.listdir(threshold_dir) if f.endswith('.csv')])
    
    for adj_file in adj_files:
        date_parts = adj_file.split('_')[-3:]
        formatted_date = f"{date_parts[0]}-{date_parts[1]}-{date_parts[2].replace('.csv', '')}"
        
        print(f"\nProcessing {adj_file}")
        
        try:
            file_path = os.path.join(threshold_dir, adj_file)
            adj_matrix = pd.read_csv(file_path, index_col=0)
            
            if adj_matrix.shape != (15, 15):
                print(f"Warning: Matrix size is {adj_matrix.shape}, expected (15, 15)")
                continue
            
            # Calculate coefficient with debugging for high values
            coef, debug_info = calculate_clustering_coefficient(adj_matrix, debug=(coef > 0.5 if 'coef' in locals() else False))
            
            dates.append(formatted_date)
            coefficients.append(coef)
            print(f"Processed successfully: coefficient = {coef}")
            
        except Exception as e:
            print(f"Error processing {adj_file}: {str(e)}")
            continue
    
    return pd.DataFrame({
        'date': dates,
        'clustering_coefficient': coefficients
    })

In [7]:
# Process each directory and threshold
for main_dir in main_dirs:
    matrix_type = 'msv' if 'msv' in main_dir else 'rsv'
    window_size = '15' if '15day' in main_dir else '30'
    
    print(f"\nProcessing {main_dir}")
    
    for threshold in thresholds:
        print(f"\nProcessing threshold {threshold}")
        
        try:
            results_df = process_adjacency_matrices(main_dir, threshold)
            
            if not results_df.empty:
                results_df = results_df.sort_values('date')
                output_filename = f"cluscoeff_{matrix_type}_{threshold}_{window_size}day.csv"
                results_df.to_csv(output_filename, index=False)
                print(f"Created {output_filename} with {len(results_df)} entries")
            else:
                print(f"No results generated for {main_dir} threshold {threshold}")
            
        except Exception as e:
            print(f"Error processing {main_dir} with threshold {threshold}: {str(e)}")

print("\nClustering coefficient calculation completed")


Processing ../gt_corr_adj_matrix/gt_adj_matrices_msv_15day

Processing threshold 0.4

Processing msv_0.4_2020_03_30.csv
Processed successfully: coefficient = 0.45494505494505494

Processing msv_0.4_2020_03_31.csv
Processed successfully: coefficient = 0.5714285714285714

Processing msv_0.4_2020_04_01.csv

Debugging Clustering Coefficient Calculation:
Total possible triplets (denominator): 455

Total triangles found: 224
Clustering coefficient: 0.49230769230769234

Triangles found:

Triangle 1:
Nodes: ('fever', 'cough', 'flu')
Edges:
  fever-cough: 1.0
  fever-flu: 1.0
  cough-flu: 1.0

Triangle 2:
Nodes: ('headache', 'fever', 'flu')
Edges:
  headache-fever: 1.0
  headache-flu: 1.0
  fever-flu: 1.0

Triangle 3:
Nodes: ('lagnat', 'cough', 'flu')
Edges:
  lagnat-cough: 1.0
  lagnat-flu: 1.0
  cough-flu: 1.0

Triangle 4:
Nodes: ('lagnat', 'fever', 'flu')
Edges:
  lagnat-fever: 1.0
  lagnat-flu: 1.0
  fever-flu: 1.0

Triangle 5:
Nodes: ('lagnat', 'fever', 'cough')
Edges:
  lagnat-fever: 1.0