In [73]:
import pandas as pd
import numpy as np
from scipy import stats
import glob
import os

In [74]:
def is_all_zeros(filepath, value_column):
    """Report whether every entry of one CSV column equals zero.

    Args:
        filepath: Path of the CSV file, read with ``pd.read_csv``.
        value_column: Name of the column to inspect.

    Returns:
        The result of ``np.all`` over the element-wise ``== 0`` comparison:
        truthy only when no entry of the column differs from zero.
    """
    column = pd.read_csv(filepath)[value_column]
    zero_mask = column == 0
    return np.all(zero_mask)

In [75]:
# Load the DTW results table; later cells read its 'dtw_score' and 'tag' columns.
dtw_results = pd.read_csv('dtw_results_multi_radius_sorted.csv')

In [76]:
# Test overall dtw_scores
# Shapiro-Wilk is run on the whole 'dtw_score' column; the Yes/No line applies
# a 0.05 cut-off to the returned p-value.
print("Shapiro-Wilk test for all DTW scores:")
stat, p_value = stats.shapiro(dtw_results['dtw_score'])
print(f"Statistic: {stat:.10f}")
print(f"p-value: {p_value:.10e}")
print(f"Normally distributed? {'Yes' if p_value > 0.05 else 'No'}\n")

# Test dtw_scores for each tag separately
print("Shapiro-Wilk test for each tag:")
for tag in sorted(dtw_results['tag'].unique()):
    tag_scores = dtw_results.loc[dtw_results['tag'] == tag, 'dtw_score']
    stat, p_value = stats.shapiro(tag_scores)
    print(f"\nTag: {tag}")
    # ".10f" / ".10e" request ten digits of precision, matching the overall
    # report above. The previous "10f" / "10e" specs set a minimum field width
    # of ten instead, which padded the value and kept the default precision.
    print(f"Statistic: {stat:.10f}")
    print(f"p-value: {p_value:.10e}")
    print(f"Normally distributed? {'Yes' if p_value > 0.05 else 'No'}")

Shapiro-Wilk test for all DTW scores:
Statistic: 0.9924630931
p-value: 1.3314606321e-01
Normally distributed? Yes

Shapiro-Wilk test for each tag:

Tag: Frontliners
Statistic:   0.868773
p-value: 1.118067e-02
Normally distributed? No

Tag: Quarantine
Statistic:   0.899522
p-value: 4.039762e-02
Normally distributed? No

Tag: cough
Statistic:   0.903223
p-value: 4.741292e-02
Normally distributed? No

Tag: ecq
Statistic:   0.912105
p-value: 6.991162e-02
Normally distributed? Yes

Tag: face shield
Statistic:   0.917475
p-value: 8.860138e-02
Normally distributed? Yes

Tag: fever
Statistic:   0.921416
p-value: 1.055049e-01
Normally distributed? Yes

Tag: flu
Statistic:   0.933882
p-value: 1.833481e-01
Normally distributed? Yes

Tag: headache
Statistic:   0.977608
p-value: 8.996215e-01
Normally distributed? Yes

Tag: lagnat
Statistic:   0.931678
p-value: 1.663301e-01
Normally distributed? Yes

Tag: masks
Statistic:   0.987563
p-value: 9.931399e-01
Normally distributed? Yes

Tag: rashes
Statis

In [77]:
# Root directory under which the network-density and clustering-coefficient
# CSV folders are globbed in the next cell.
base_path = "../gt_netdense_cluscoeff"

In [78]:
# Network density: list every matching CSV, then keep the ones whose
# 'network_density' column is not entirely zero.
density_pattern = os.path.join(base_path, "gt_netdense_rsvmsv_15or30day/netdense_*.csv")
density_files = glob.glob(density_pattern)
valid_density_files = []
for density_path in density_files:
    if not is_all_zeros(density_path, 'network_density'):
        valid_density_files.append(density_path)

# Clustering coefficient: same procedure on the 'clustering_coefficient' column.
coeff_pattern = os.path.join(base_path, "gt_cluscoeff_rsvmsv_15or30day/cluscoeff_*.csv")
coeff_files = glob.glob(coeff_pattern)
valid_coeff_files = []
for coeff_path in coeff_files:
    if not is_all_zeros(coeff_path, 'clustering_coefficient'):
        valid_coeff_files.append(coeff_path)

In [79]:
# Load the network-statistics DTW results; later cells read its 'source',
# 'threshold', 'window', 'metric_type' and 'dtw_score' columns.
network_results = pd.read_csv('network_dtw_results_sorted.csv')

In [80]:
# Turn each valid file name into a (source, threshold, window, metric) key.
# File names are expected to split into exactly four '_'-separated fields once
# '.csv' is removed; anything else raises ValueError on unpacking.
# NOTE(review): threshold and window stay strings here, while the filter cell
# compares them against values read from network_results - confirm the dtypes
# actually match, otherwise no row will be selected.
valid_combos = []
for path in valid_density_files + valid_coeff_files:
    filename = os.path.basename(path)
    metric_label = 'Network Density' if 'netdense_' in filename else 'Clustering Coefficient'
    fields = filename.replace('.csv', '').split('_')
    _, source, threshold, window = fields
    combo = (source.upper(), threshold, window.replace('day', ''), metric_label)
    valid_combos.append(combo)

In [81]:
# Keep only the rows whose (source, threshold, window, metric_type) key
# appears in valid_combos.
row_is_valid = network_results.apply(
    lambda row: (row['source'], row['threshold'], row['window'], row['metric_type']) in valid_combos,
    axis=1,
)
filtered_network = network_results[row_is_valid]

In [82]:
# Shapiro-Wilk on every network-statistics DTW score.
# NOTE(review): filtered_network is built in the previous cell, but this cell
# tests the unfiltered network_results - confirm which one is intended.
print("\nShapiro-Wilk test for all network statistics DTW scores:")
stat, p_value = stats.shapiro(network_results['dtw_score'])
print(
    f"Statistic: {stat:.4f}",
    f"p-value: {p_value:.4e}",
    f"Normally distributed? {'Yes' if p_value > 0.05 else 'No'}\n",
    sep="\n",
)

# Same test, repeated once per metric type in sorted order.
print("Shapiro-Wilk test for each network metric type:")
metric_types = sorted(network_results['metric_type'].unique())
for metric in metric_types:
    is_metric = network_results['metric_type'] == metric
    metric_scores = network_results.loc[is_metric, 'dtw_score']
    stat, p_value = stats.shapiro(metric_scores)
    print(
        f"\nMetric Type: {metric}",
        f"Statistic: {stat:.4f}",
        f"p-value: {p_value:.4e}",
        f"Normally distributed? {'Yes' if p_value > 0.05 else 'No'}",
        sep="\n",
    )


Shapiro-Wilk test for all network statistics DTW scores:
Statistic: 0.9441
p-value: 1.2079e-09
Normally distributed? No

Shapiro-Wilk test for each network metric type:

Metric Type: Clustering Coefficient
Statistic: 0.9425
p-value: 4.3337e-06
Normally distributed? No

Metric Type: Network Density
Statistic: 0.9615
p-value: 2.0007e-04
Normally distributed? No


In [83]:
# Load the PCA-group DTW results; later cells read its 'pca_group',
# 'metric_type' and 'dtw_score' columns.
pca_results = pd.read_csv('pca_dtw_results_sorted.csv')

In [84]:
# Directory whose 'pc*' sub-directories are scanned for metric CSVs.
pca_base = "../gt_pca_corr_adj_matrix"
# Accumulates (pca_dir, filepath) pairs for files that pass the zero check.
valid_pca_files = []

In [85]:
# Reset the accumulator here so that re-running this cell does not append the
# same (pca_dir, filepath) pairs a second time and inflate the reported count.
valid_pca_files = []

# (glob pattern, value column) pairs checked inside every PCA directory,
# network density first and clustering coefficient second.
pca_metric_patterns = [
    ("netdense_*.csv", 'network_density'),
    ("cluscoeff_*.csv", 'clustering_coefficient'),
]

# Only sub-directories of pca_base whose name starts with 'pc' are scanned.
pca_dirs = [
    d for d in os.listdir(pca_base)
    if os.path.isdir(os.path.join(pca_base, d)) and d.startswith('pc')
]

for pca_dir in pca_dirs:
    dir_path = os.path.join(pca_base, pca_dir)
    for pattern, value_column in pca_metric_patterns:
        for f in glob.glob(os.path.join(dir_path, pattern)):
            # Keep the file only if its value column has a non-zero entry.
            if not is_all_zeros(f, value_column):
                valid_pca_files.append((pca_dir, f))

In [86]:
# NOTE(review): this re-reads the same CSV that was already loaded into
# pca_results in cell In [83]; the reload looks redundant - confirm and drop.
pca_results = pd.read_csv('pca_dtw_results_sorted.csv')

In [87]:
# Build one (group name, metric label) key per valid PCA file.
valid_pca_combos = []
for pca_dir, f in valid_pca_files:
    filename = os.path.basename(f)
    # Everything after the first '_' of the directory name (drops the pc#_ prefix).
    group_name = pca_dir.partition('_')[2]
    metric_label = 'Network Density' if 'netdense_' in filename else 'Clustering Coefficient'
    valid_pca_combos.append((group_name, metric_label))

In [88]:
# Keep only the rows whose (pca_group, metric_type) key appears in valid_pca_combos.
pca_row_is_valid = pca_results.apply(
    lambda row: (row['pca_group'], row['metric_type']) in valid_pca_combos,
    axis=1,
)
filtered_pca = pca_results[pca_row_is_valid]

In [89]:
# Shapiro-Wilk on every PCA-group DTW score.
# NOTE(review): filtered_pca is built in the previous cell, but this cell
# tests the unfiltered pca_results - confirm which one is intended.
print("\nShapiro-Wilk test for all PCA groups DTW scores:")
stat, p_value = stats.shapiro(pca_results['dtw_score'])
print(
    f"Statistic: {stat:.4f}",
    f"p-value: {p_value:.4e}",
    f"Normally distributed? {'Yes' if p_value > 0.05 else 'No'}\n",
    sep="\n",
)

# Same test, repeated once per PCA group in sorted order.
print("Shapiro-Wilk test for each PCA group:")
pca_groups = sorted(pca_results['pca_group'].unique())
for group in pca_groups:
    in_group = pca_results['pca_group'] == group
    group_scores = pca_results.loc[in_group, 'dtw_score']
    stat, p_value = stats.shapiro(group_scores)
    print(
        f"\nPCA Group: {group}",
        f"Statistic: {stat:.4f}",
        f"p-value: {p_value:.4e}",
        f"Normally distributed? {'Yes' if p_value > 0.05 else 'No'}",
        sep="\n",
    )


Shapiro-Wilk test for all PCA groups DTW scores:
Statistic: 0.9628
p-value: 4.0952e-15
Normally distributed? No

Shapiro-Wilk test for each PCA group:

PCA Group: MSVFaceWearing&Others-0.5
Statistic: 0.9574
p-value: 7.9018e-04
Normally distributed? No

PCA Group: MSVSymptoms&NewNormalProtocols-0.6
Statistic: 0.9648
p-value: 1.1294e-03
Normally distributed? No

PCA Group: MSVSymptoms-0.5
Statistic: 0.9223
p-value: 1.4266e-07
Normally distributed? No

PCA Group: RSVFaceWearing&Others-0.5
Statistic: 0.9742
p-value: 2.1011e-02
Normally distributed? No

PCA Group: RSVSymptoms&NewNormalProtocols-0.5
Statistic: 0.9419
p-value: 3.8717e-06
Normally distributed? No

PCA Group: RSVSymptoms&NewNormalProtocols1-0.6
Statistic: 0.9427
p-value: 4.4480e-06
Normally distributed? No

PCA Group: RSVSymptoms&NewNormalProtocols2-0.6
Statistic: 0.9724
p-value: 1.4268e-02
Normally distributed? No


In [90]:
# Report how many files the zero check removed from each metric family.
print("\n\nSummary of Zero-Value Files:")
n_density_excluded = len(density_files) - len(valid_density_files)
n_coeff_excluded = len(coeff_files) - len(valid_coeff_files)
# For PCA this is the number of files that passed the zero check,
# i.e. len(valid_pca_files), not the number excluded.
n_pca_kept = len(valid_pca_files)
print(f"Network Density files excluded: {n_density_excluded}")
print(f"Clustering Coefficient files excluded: {n_coeff_excluded}")
print(f"Total PCA files processed: {n_pca_kept}")



Summary of Zero-Value Files:
Network Density files excluded: 0
Clustering Coefficient files excluded: 0
Total PCA files processed: 98
