In [52]:
import pandas as pd
import numpy as np
from scipy import stats
from scikit_posthocs import posthoc_dunn

In [53]:
def perform_kruskal_and_posthoc(data: pd.DataFrame, variable: str, group_name: str,
                                value_col: str = 'dtw_score', alpha: float = 0.05) -> dict:
    """
    Perform Kruskal-Wallis H test and Dunn's post-hoc test if significant

    The rows of ``data`` are split by the levels of ``variable`` and the
    ``value_col`` samples are compared with ``scipy.stats.kruskal``. When the
    omnibus p-value is below ``alpha``, Dunn's pairwise test (Bonferroni
    adjusted) is run, its p-value matrix is written to
    ``posthoc_<group_name>_<variable>.csv`` (relative path) and every pair
    with an adjusted p-value below ``alpha`` is printed.

    Parameters
    ----------
    data : pd.DataFrame
        Table containing the grouping column and the value column.
    variable : str
        Name of the grouping column.
    group_name : str
        Label used in the printed report and in the post-hoc CSV file name
        (lower-cased, spaces replaced by underscores).
    value_col : str, default 'dtw_score'
        Name of the column holding the values to compare.
    alpha : float, default 0.05
        Significance threshold applied to both the omnibus and pairwise tests.

    Returns
    -------
    dict
        Keys ``'Variable'``, ``'H_statistic'``, ``'p_value'`` and
        ``'Significant'``. If fewer than two groups are available the
        statistic and p-value are NaN and ``'Significant'`` is False.
    """

    # Rows without a group label or a score cannot be ranked. Left in place,
    # a single NaN score makes stats.kruskal return NaN (nan_policy='propagate').
    clean = data.dropna(subset=[variable, value_col])

    # One array of scores per level of the grouping variable
    samples = [grp[value_col].values for _, grp in clean.groupby(variable)]

    print(f"\nKruskal-Wallis test for {variable} in {group_name}:")

    # stats.kruskal raises ValueError with fewer than two samples; report the
    # variable as untestable instead of aborting the caller's loop.
    if len(samples) < 2:
        print(f"Skipped: need at least two groups, found {len(samples)}")
        return {
            'Variable': variable,
            'H_statistic': float('nan'),
            'p_value': float('nan'),
            'Significant': False
        }

    # Perform Kruskal-Wallis H test
    h_stat, p_value = stats.kruskal(*samples)
    significant = bool(p_value < alpha)

    print(f"H statistic: {h_stat:.4f}")
    print(f"p-value: {p_value:.4e}")
    print(f"Significant difference? {'Yes' if significant else 'No'}")

    # Perform post-hoc test if significant
    if significant:
        print(f"\nPost-hoc Dunn test for {variable}:")
        posthoc = posthoc_dunn(clean, val_col=value_col, group_col=variable, p_adjust='bonferroni')

        # Save post-hoc results
        posthoc.to_csv(f'posthoc_{group_name.lower().replace(" ", "_")}_{variable}.csv')

        # Print significant pairwise differences; `idx < col` visits each
        # unordered pair of the symmetric matrix exactly once.
        print(f"\nSignificant pairwise differences (p < {alpha}):")
        for idx in posthoc.index:
            for col in posthoc.columns:
                if idx < col:
                    pair_p = posthoc.loc[idx, col]
                    if pair_p < alpha:
                        print(f"{idx} vs {col}: p = {pair_p:.4e}")

    return {
        'Variable': variable,
        'H_statistic': h_stat,
        'p_value': p_value,
        'Significant': significant
    }

In [54]:
# Load the DTW scores for the tag comparisons
tags_results = pd.read_csv('dtw_results_multi_radius_sorted.csv')

# Run one Kruskal-Wallis test (plus post-hoc when significant) per grouping column
tags_variables = ['source', 'tag', 'comparison', 'radius']
tags_tests = [
    perform_kruskal_and_posthoc(tags_results, column, "Tags")
    for column in tags_variables
]

# Persist the per-variable summary
tags_summary = pd.DataFrame(tags_tests)
tags_summary.to_csv('tags_kruskal_results.csv', index=False)


Kruskal-Wallis test for source in Tags:
H statistic: 184.1287
p-value: 6.0815e-42
Significant difference? Yes

Post-hoc Dunn test for source:

Significant pairwise differences (p < 0.05):
MSV vs RSV: p = 6.0815e-42

Kruskal-Wallis test for tag in Tags:
H statistic: 26.1666
p-value: 2.4653e-02
Significant difference? Yes

Post-hoc Dunn test for tag:

Significant pairwise differences (p < 0.05):

Kruskal-Wallis test for comparison in Tags:
H statistic: 1.4770
p-value: 2.2425e-01
Significant difference? No

Kruskal-Wallis test for radius in Tags:
H statistic: 19.1988
p-value: 7.1831e-04
Significant difference? Yes

Post-hoc Dunn test for radius:

Significant pairwise differences (p < 0.05):
7 vs 50: p = 4.8210e-04
15 vs 50: p = 1.5262e-02


In [55]:
# Load the DTW scores for the network statistics
network_results = pd.read_csv('network_dtw_results_sorted.csv')

# Run one Kruskal-Wallis test (plus post-hoc when significant) per grouping column
network_variables = [
    'metric_type',
    'source',
    'threshold',
    'window',
    'comparison',
    'radius',
]
network_tests = [
    perform_kruskal_and_posthoc(network_results, column, "Network_Statistics")
    for column in network_variables
]

# Persist the per-variable summary
network_summary = pd.DataFrame(network_tests)
network_summary.to_csv('network_kruskal_results.csv', index=False)


Kruskal-Wallis test for metric_type in Network_Statistics:
H statistic: 26.4387
p-value: 2.7203e-07
Significant difference? Yes

Post-hoc Dunn test for metric_type:

Significant pairwise differences (p < 0.05):
Clustering Coefficient vs Network Density: p = 2.7203e-07

Kruskal-Wallis test for source in Network_Statistics:
H statistic: 4.0971
p-value: 4.2958e-02
Significant difference? Yes

Post-hoc Dunn test for source:

Significant pairwise differences (p < 0.05):
MSV vs RSV: p = 4.2958e-02

Kruskal-Wallis test for threshold in Network_Statistics:
H statistic: 27.7149
p-value: 4.1685e-06
Significant difference? Yes

Post-hoc Dunn test for threshold:

Significant pairwise differences (p < 0.05):
0.4 vs 0.8: p = 5.0113e-04
0.5 vs 0.8: p = 6.2024e-06
0.6 vs 0.8: p = 1.2553e-03

Kruskal-Wallis test for window in Network_Statistics:
H statistic: 1.6594
p-value: 1.9768e-01
Significant difference? No

Kruskal-Wallis test for comparison in Network_Statistics:
H statistic: 154.8028
p-value: 1

In [56]:
# Load the DTW scores for the PCA groups
pca_results = pd.read_csv('pca_dtw_results_sorted.csv')

# Run one Kruskal-Wallis test (plus post-hoc when significant) per grouping column
pca_variables = [
    'pca_group',
    'metric_type',
    'threshold',
    'window',
    'comparison',
    'radius',
]
pca_tests = [
    perform_kruskal_and_posthoc(pca_results, column, "PCA_Groups")
    for column in pca_variables
]

# Persist the per-variable summary
pca_summary = pd.DataFrame(pca_tests)
pca_summary.to_csv('pca_kruskal_results.csv', index=False)


Kruskal-Wallis test for pca_group in PCA_Groups:
H statistic: 62.5934
p-value: 1.3357e-11
Significant difference? Yes

Post-hoc Dunn test for pca_group:

Significant pairwise differences (p < 0.05):
MSVFaceWearing&Others-0.5 vs MSVSymptoms&NewNormalProtocols-0.6: p = 4.1219e-03
MSVFaceWearing&Others-0.5 vs RSVFaceWearing&Others-0.5: p = 3.2094e-06
MSVFaceWearing&Others-0.5 vs RSVSymptoms&NewNormalProtocols2-0.6: p = 2.1579e-05
MSVSymptoms&NewNormalProtocols-0.6 vs RSVSymptoms&NewNormalProtocols-0.5: p = 4.5758e-03
MSVSymptoms&NewNormalProtocols-0.6 vs RSVSymptoms&NewNormalProtocols1-0.6: p = 4.4134e-02
MSVSymptoms-0.5 vs RSVFaceWearing&Others-0.5: p = 4.0667e-03
MSVSymptoms-0.5 vs RSVSymptoms&NewNormalProtocols2-0.6: p = 1.7640e-02
RSVFaceWearing&Others-0.5 vs RSVSymptoms&NewNormalProtocols-0.5: p = 2.1950e-06
RSVFaceWearing&Others-0.5 vs RSVSymptoms&NewNormalProtocols1-0.6: p = 4.8857e-05
RSVSymptoms&NewNormalProtocols-0.5 vs RSVSymptoms&NewNormalProtocols2-0.6: p = 1.7198e-05
RSVSym