In [8]:
import numpy as np
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp
import pandas as pd

# Data from the tables (metrics for each scenario in the two groups)
# Rows correspond to different metrics (e.g., Mean Absolute Deviation Distance)
# Columns correspond to different scenarios within the group
data_adv_fair= np.array([
    [0.2000, 0.1000, 2.5318],     # Contestability: Mean Absolute Deviation Distance
    [2.0000, 1.0000, 1.0662],     # Contestability: L1 Distance
    [2.0000, 1.0000, 1.0044],     # Contestability: L2 Distance
    [1.0000, 1.0000, 1.0000],     # Contestability: L-infty Distance
    [0.8462, 0.9231, 0.8462],     # Contestability: Sparsity
    [0.5774, 0.5774, 0.5763],     # Contestability: Mahalanobis Distance
])
data_adv_fair_recourse = np.array([
    [30825823.6386, 1.4449, 1.1949], # Recourse: Mean Absolute Deviation Distance
    [1.2330, 1.0464, 1.0375],     # Recourse: L1 Distance
    [0.9999, 1.0022, 1.0014],     # Recourse: L2 Distance
    [0.9628, 1.0000, 1.0000],     # Recourse: L-infty Distance
    [0.8182, 0.8182, 0.8182],     # Recourse: Sparsity
    [0.0207, 0.5779, 0.5769]      # Recourse: Mahalanobis Distance
])

data_adv_unfair = np.array([
    [2.2127, 4898648.3875, 261.8989],  # Contestability: Mean Absolute Deviation Distance
    [0.4552, 1.1959, 1.6851],          # Contestability: L1 Distance
    [0.2072, 1.0384, 1.4694],          # Contestability: L2 Distance
    [0.4552, 1.0000, 1.0000],          # Contestability: L-infty Distance
    [0.9231, 0.8462, 0.8462],          # Contestability: Sparsity
    [0.0687, 0.5774, 0.5806],          # Contestability: Mahalanobis Distance
])
data_adv_unfair_recourse = np.array([
    [107.6252, 1.7001, 200.6375],      # Recourse: Mean Absolute Deviation Distance
    [0.2811, 1.2640, 1.5243],          # Recourse: L1 Distance
    [0.0790, 1.0697, 1.2749],          # Recourse: L2 Distance
    [0.2811, 1.0000, 1.0000],          # Recourse: L-infty Distance
    [0.9231, 0.8462, 0.8462],          # Recourse: Sparsity
    [0.0028, 0.6163, 0.5798]           # Recourse: Mahalanobis Distance
])

# Combine both datasets for Friedman's Test
# Each row represents a scenario (Scenario 1, 2, 3) and columns represent different metrics

data_combined = np.concatenate([data_adv_fair, data_adv_unfair], axis=1)
data_combined_recourse = np.concatenate([data_adv_fair_recourse, data_adv_unfair_recourse], axis=1)

# Step 3: Perform Friedman's Test
friedman_stat, p_value = friedmanchisquare(*data_combined.T)
friedman_stat_recourse, p_value_recourse = friedmanchisquare(*data_combined_recourse.T)

print('Friedman Test for difference in Counterfactual explanations (contestability) for the three scenarios:')
print(f'Statistic: {friedman_stat}, p-value: {p_value}')
print('Friedman Test for difference in Counterfactual explanations (recourse) for the three scenarios:')
print(f'Statistic: {friedman_stat_recourse}, p-value: {p_value_recourse}')

# Step 4: If p < 0.05, perform Nemenyi's test
if p_value < 0.05:
    print('Nemenyi Test (Post-Hoc):')
    nemenyi_results = sp.posthoc_nemenyi_friedman(data_combined.T)
    print(nemenyi_results)

    # Highlighting significant results
    sig_results = nemenyi_results.applymap(lambda x: '**' if x < 0.05 else '')
    print('Significant Differences (p < 0.05):')
    print(sig_results)
else:
    print("No significant differences were found across the algorithms using Friedman's test.")

if p_value_recourse < 0.05:
    print('Nemenyi Test (Post-Hoc) (Recourse):')
    nemenyi_results_recourse = sp.posthoc_nemenyi_friedman(data_combined_recourse.T)
    print(nemenyi_results_recourse)

    # Highlighting significant results
    sig_results_recourse = nemenyi_results_recourse.applymap(lambda x: '**' if x < 0.05 else '')
    print('Significant Differences (p < 0.05) (Recourse):')
    print(sig_results_recourse)
else:
    print("No significant differences were found across the algorithms using Friedman's test (Recourse).")


Friedman Test for difference in Counterfactual explanations (contestability) for the three scenarios:
Statistic: 8.657142857142832, p-value: 0.12354615724886574
Friedman Test for difference in Counterfactual explanations (recourse) for the three scenarios:
Statistic: 12.282051282051274, p-value: 0.031120880525240337
No significant differences were found across the algorithms using Friedman's test.
Nemenyi Test (Post-Hoc) (Recourse):
          0         1         2         3         4         5
0  1.000000  0.840572  0.256521  0.075349  0.015147  0.001000
1  0.840572  1.000000  0.900000  0.617769  0.296154  0.006853
2  0.256521  0.900000  1.000000  0.900000  0.885131  0.133062
3  0.075349  0.617769  0.900000  1.000000  0.900000  0.385739
4  0.015147  0.296154  0.885131  0.900000  1.000000  0.706891
5  0.001000  0.006853  0.133062  0.385739  0.706891  1.000000
Significant Differences (p < 0.05) (Recourse):
    0   1 2 3   4   5
0              **  **
1                  **
2               

  sig_results_recourse = nemenyi_results_recourse.applymap(lambda x: '**' if x < 0.05 else '')
