In [9]:
import numpy as np
from scipy.stats import f_oneway, kruskal
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multicomp import MultiComparison



# Per-model evaluation results: one parquet file per quantisation level
# (bits per weight, BPW). Each file is read for an `answer` column and for
# `model_answer_0` .. `model_answer_{N_RUNS - 1}` columns.
DATA_DIR = '/Users/carlborg/Desktop/model_performance_data'  # single place to repoint the data location
N_RUNS = 500   # number of model_answer_{i} columns scored per file
ALPHA = 0.05   # significance level gating (and used by) the post hoc test


def run_accuracies(df, n_runs=N_RUNS):
    """Return the accuracy of each run stored in ``df``.

    For every run index ``i`` in ``range(n_runs)`` the column
    ``model_answer_{i}`` is compared case-insensitively with the ``answer``
    column; the accuracy is the fraction of rows that match.

    Args:
        df: DataFrame with an ``answer`` column and ``model_answer_{i}``
            columns holding strings.
        n_runs: Number of ``model_answer_{i}`` columns to score.

    Returns:
        A list of ``n_runs`` floats in ``[0, 1]``.

    Raises:
        ValueError: If ``df`` has no rows (accuracy would be undefined).
        KeyError: If a required column is missing.
    """
    n_rows = len(df)
    if n_rows == 0:
        raise ValueError('cannot compute accuracy on an empty DataFrame')

    # Loop-invariant: lower-case the reference answers once, not once per run.
    expected = df['answer'].str.lower()
    # Missing values compare as "not equal", so a missing answer counts as
    # incorrect instead of crashing on `.lower()`.
    return [
        float((expected == df[f'model_answer_{run}'].str.lower()).sum() / n_rows)
        for run in range(n_runs)
    ]


# File suffixes, in the same order as the `groups` labels below.
bpw_levels = ['2.5', '3.0', '4.0', '5.0', '6.0', '16.0']

model_1, model_2, model_3, model_4, model_5, model_6 = [
    pd.read_parquet(f'{DATA_DIR}/citizen_model_{bpw}.parquet') for bpw in bpw_levels
]

accuracy_2, accuracy_3, accuracy_4, accuracy_5, accuracy_6, accuracy_16 = [
    run_accuracies(df)
    for df in (model_1, model_2, model_3, model_4, model_5, model_6)
]

# One row per model, one column per run; `groups` labels the rows in order.
data = np.array([accuracy_2, accuracy_3, accuracy_4, accuracy_5, accuracy_6, accuracy_16])
groups = ['2.5 BPW', '3.0 BPW', '4.0 BPW', '5.0 BPW', '6.0 BPW', '16.0 BPW']

# Omnibus test: one-way ANOVA across the six models
statistic, p_value = f_oneway(*data)

print('ANOVA Results:')
print(f'F-statistic: {statistic:.4f}')
print(f'P-value: {p_value:.4f}')

# Run the Tukey HSD post hoc test only if the ANOVA is significant
if p_value < ALPHA:
    # Long format: one (Value, Group) row per run per model
    values = np.concatenate(data)  # rows of `data` laid end to end
    group_labels = np.repeat(groups, data.shape[1])  # each label repeated once per run, matching `values`
    df_long = pd.DataFrame({'Value': values, 'Group': group_labels})

    # Pairwise comparison of group means with family-wise error control
    mc = MultiComparison(df_long['Value'], df_long['Group'])
    result = mc.tukeyhsd(alpha=ALPHA)

    print('\nPairwise comparisons (Tukey HSD):')
    print(result)


ANOVA Results:
F-statistic: 113761.7259
P-value: 0.0000

Pairwise comparisons (Turkey HSD):
 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2 meandiff p-adj  lower   upper  reject
------------------------------------------------------
16.0 BPW 2.5 BPW  -0.2067   0.0 -0.2078 -0.2055   True
16.0 BPW 3.0 BPW  -0.1812   0.0 -0.1824   -0.18   True
16.0 BPW 4.0 BPW  -0.0154   0.0 -0.0166 -0.0142   True
16.0 BPW 5.0 BPW  -0.0032   0.0 -0.0044  -0.002   True
16.0 BPW 6.0 BPW   0.0092   0.0   0.008  0.0104   True
 2.5 BPW 3.0 BPW   0.0255   0.0  0.0243  0.0267   True
 2.5 BPW 4.0 BPW   0.1913   0.0  0.1901  0.1925   True
 2.5 BPW 5.0 BPW   0.2034   0.0  0.2023  0.2046   True
 2.5 BPW 6.0 BPW   0.2159   0.0  0.2147  0.2171   True
 3.0 BPW 4.0 BPW   0.1658   0.0  0.1646   0.167   True
 3.0 BPW 5.0 BPW    0.178   0.0  0.1768  0.1792   True
 3.0 BPW 6.0 BPW   0.1904   0.0  0.1892  0.1916   True
 4.0 BPW 5.0 BPW   0.0122   0.0   0.011  0.0133   True
 4.0 BPW 6.0 BPW   0.0246   

In [21]:
# Generation-speed measurements: one `speed_{bpw}bpw` column per quantisation level.
df_speed = pd.read_parquet("/Users/carlborg/Desktop/model_performance_data/speed.parquet")

# Derive both the column names and the group labels from one list so that the
# rows of `data` and the entries of `groups` cannot drift out of alignment.
bpw_levels = ['2.5', '3.0', '4.0', '5.0', '6.0', '16.0']
data = np.array([df_speed[f"speed_{bpw}bpw"] for bpw in bpw_levels])
groups = [f'{bpw} BPW' for bpw in bpw_levels]


# Omnibus test: Kruskal-Wallis H test (rank-based, no normality assumption)
statistic, p_value = kruskal(*data)

print('Kruskal - Wallis Results:')
# scipy.stats.kruskal returns the H statistic, not an F statistic
print(f'H-statistic: {statistic:.4f}')
print(f'P-value: {p_value:.4f}')

# Run the post hoc test only if the Kruskal-Wallis test is significant
if p_value < 0.05:
    # Long format: one (Value, Group) row per measurement per model
    values = np.concatenate(data)  # rows of `data` laid end to end
    # Repeat each label once per measurement; taken from the data itself rather
    # than hard-coding the sample count.
    group_labels = np.repeat(groups, data.shape[1])
    df_long = pd.DataFrame({'Value': values, 'Group': group_labels})

    # NOTE(review): Tukey HSD is a parametric comparison of means, while the
    # omnibus test above is rank-based. If Kruskal-Wallis was chosen because the
    # speeds are not normally distributed, a rank-based post hoc (e.g.
    # `mc.allpairtest(stats.mannwhitneyu, method='holm')`) may be more
    # appropriate - confirm the intended methodology.
    mc = MultiComparison(df_long['Value'], df_long['Group'])
    result = mc.tukeyhsd()

    print('\nPairwise comparisons (Tukey HSD):')
    print(result)


Kruskal - Wallis Results:
F-statistic: 529.7065
P-value: 0.0000

Pairwise comparisons (Turkey HSD):
   Multiple Comparison of Means - Tukey HSD, FWER=0.05   
 group1   group2 meandiff p-adj   lower    upper   reject
---------------------------------------------------------
16.0 BPW 2.5 BPW 124.0779    0.0 120.0924 128.0634   True
16.0 BPW 3.0 BPW 109.2125    0.0  105.227  113.198   True
16.0 BPW 4.0 BPW  97.3776    0.0  93.3921 101.3631   True
16.0 BPW 5.0 BPW  93.3692    0.0  89.3837  97.3547   True
16.0 BPW 6.0 BPW  84.2865    0.0   80.301   88.272   True
 2.5 BPW 3.0 BPW -14.8654    0.0 -18.8509 -10.8799   True
 2.5 BPW 4.0 BPW -26.7003    0.0 -30.6858 -22.7148   True
 2.5 BPW 5.0 BPW -30.7087    0.0 -34.6942 -26.7232   True
 2.5 BPW 6.0 BPW -39.7914    0.0 -43.7769 -35.8059   True
 3.0 BPW 4.0 BPW -11.8349    0.0 -15.8204  -7.8494   True
 3.0 BPW 5.0 BPW -15.8433    0.0 -19.8288 -11.8578   True
 3.0 BPW 6.0 BPW  -24.926    0.0 -28.9115 -20.9405   True
 4.0 BPW 5.0 BPW  -4.0084 0.04