In [1]:
import sys
import os

# Add the parent directory to the sys.path to avoid 'ModuleNotFoundError'
sys.path.append(os.path.abspath(os.path.join('..')))

import numpy as np
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.anova import AnovaRM
from statsmodels.stats.multicomp import pairwise_tukeyhsd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams["font.family"] = ['serif']

from src.helpers import load_json
from src.paths import paths
from src.config import MODEL_NAMES

## 1. Check whether the metrics are normally distributed

In [19]:
def extract_metric_scores(metrics, metric_name):
    """Collect the values of one metric for every model in ``MODEL_NAMES``.

    Args:
        metrics: Mapping indexed by model name; each value is an iterable of
            mappings that contain ``metric_name`` as a key.
        metric_name: Key of the metric to pull out of every entry.

    Returns:
        dict: ``{model_name: [value, ...]}`` with one list per model, in
        ``MODEL_NAMES`` order.

    Raises:
        KeyError: If a model or the metric key is missing from ``metrics``.
    """
    return {
        name: [entry[metric_name] for entry in metrics[name]]
        for name in MODEL_NAMES
    }

# Shapiro-Wilk test for normality
def sw_test(data, verbose=0, alpha=0.05):
    """Run a Shapiro-Wilk normality test on ``data``.

    Args:
        data: Sequence of sample values passed to ``scipy.stats.shapiro``.
        verbose: If truthy, print the test statistic and p-value.
        alpha: Significance level. The null hypothesis of normality is
            retained when the p-value is strictly greater than ``alpha``.
            Defaults to 0.05, the previously hard-coded threshold.

    Returns:
        Truthy if normality is NOT rejected at level ``alpha``, falsy otherwise.
    """
    stat, p_value = stats.shapiro(data)
    if verbose:
        print(f"Shapiro-Wilk Test: (Statistic: {stat:.3f}, p-value: {p_value:.3f})")
    return p_value > alpha

# Kolmogorov-Smirnov test for normality
def ks_test(data, verbose=0, alpha=0.05):
    """Run a one-sample Kolmogorov-Smirnov test of ``data`` against a normal law.

    The reference distribution is a normal whose mean and standard deviation
    are estimated from ``data`` itself (``np.mean`` / ``np.std``).

    NOTE(review): because the parameters are fitted to the same sample, the
    standard KS p-value is known to be too lenient; the Lilliefors variant
    corrects for this. Results are left unchanged here so that previously
    reported numbers stay reproducible -- confirm whether to switch.

    Args:
        data: Sequence of sample values.
        verbose: If truthy, print the test statistic and p-value.
        alpha: Significance level. The null hypothesis of normality is
            retained when the p-value is strictly greater than ``alpha``.
            Defaults to 0.05, the previously hard-coded threshold.

    Returns:
        Truthy if normality is NOT rejected at level ``alpha``, falsy otherwise.
    """
    stat, p_value = stats.kstest(data, 'norm', args=(np.mean(data), np.std(data)))
    if verbose:
        print(f"Kolmogorov-Smirnov Test: (Statistic: {stat:.3f}, p-value: {p_value:.3f})")
    return p_value > alpha

# Visual inspection: Q-Q plot and Histogram
def plot_distribution(data, model_name, metric_name):
    """Show a normal Q-Q plot and a histogram (with KDE) of ``data`` side by side.

    Args:
        data: Sample values to visualise.
        model_name: Model label used in both panel titles.
        metric_name: Metric label used in both panel titles.
    """
    _, (qq_ax, hist_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: sample quantiles against theoretical normal quantiles.
    stats.probplot(data, dist="norm", plot=qq_ax)
    qq_ax.set_title(f"Q-Q Plot for {model_name} ({metric_name})")

    # Right panel: histogram with a kernel density estimate overlaid.
    sns.histplot(data, kde=True, ax=hist_ax)
    hist_ax.set_title(f"Histogram for {model_name} ({metric_name})")

    plt.tight_layout()
    plt.show()

def check_normality(metric_scores, verbose=0):
    """Apply both normality tests to every model's scores and tabulate the verdicts.

    Args:
        metric_scores: Mapping from model name to that model's score values;
            must contain every name in ``MODEL_NAMES``.
        verbose: Forwarded to ``sw_test`` / ``ks_test``; if truthy the model
            name and the raw test results are printed as well.

    Returns:
        pandas.DataFrame: One row per model with the outcome of each test and
        an overall label that is 'Normal' only when both tests pass.
    """
    rows = []

    for name in MODEL_NAMES:
        label = name.upper()
        scores = metric_scores[name]

        if verbose:
            print(label)

        passes_sw = sw_test(scores, verbose=verbose)
        passes_ks = ks_test(scores, verbose=verbose)
        verdict = 'Normal' if (passes_sw and passes_ks) else 'Not Normal'

        rows.append({
            'Model': label,
            'Shapiro-Wilk Normal': passes_sw,
            'Kolmogorov-Smirnov Normal': passes_ks,
            'Overall Normality': verdict,
        })

        if verbose:
            # Blank line separating the output of consecutive models.
            print('')

    return pd.DataFrame(rows)

In [20]:
# Load the stored metric results from the location registered under 'metric_fold_path'.
# Downstream code indexes this as metrics[model_name] -> iterable of dicts keyed by
# metric name (presumably one dict per cross-validation fold -- TODO confirm).
metrics = load_json(paths.get('metric_fold_path'))
# Metric keys that the normality checks and statistical tests below iterate over.
metrics_to_analyze = [
    'accuracy',
    'roc_auc',
    'f1_score',
]

In [21]:
# Run the normality checks for every model, one metric at a time.
for metric_name in metrics_to_analyze:
    print(f"\nAnalyzing {metric_name.upper()}...\n")
    metric_scores = extract_metric_scores(metrics, metric_name)
    # verbose=1 additionally prints each model's raw test statistic and p-value.
    results_df = check_normality(metric_scores, verbose=1)
    print(results_df)


Analyzing ACCURACY...

CATBOOST
Shapiro-Wilk Test: (Statistic: 0.947, p-value: 0.712)
Kolmogorov-Smirnov Test: (Statistic: 0.277, p-value: 0.753)

XGBOOST
Shapiro-Wilk Test: (Statistic: 0.978, p-value: 0.922)
Kolmogorov-Smirnov Test: (Statistic: 0.186, p-value: 0.982)

LGBM
Shapiro-Wilk Test: (Statistic: 0.915, p-value: 0.495)
Kolmogorov-Smirnov Test: (Statistic: 0.221, p-value: 0.922)

RF
Shapiro-Wilk Test: (Statistic: 0.968, p-value: 0.859)
Kolmogorov-Smirnov Test: (Statistic: 0.176, p-value: 0.990)

SVM
Shapiro-Wilk Test: (Statistic: 0.929, p-value: 0.588)
Kolmogorov-Smirnov Test: (Statistic: 0.286, p-value: 0.720)

LR
Shapiro-Wilk Test: (Statistic: 0.958, p-value: 0.797)
Kolmogorov-Smirnov Test: (Statistic: 0.243, p-value: 0.864)

      Model  Shapiro-Wilk Normal  Kolmogorov-Smirnov Normal Overall Normality
0  CATBOOST                 True                       True            Normal
1   XGBOOST                 True                       True            Normal
2      LGBM         

## 2. Statistical tests

### 2.1. Comparing Model Performance

In [5]:
def prepare_data_for_anova(metric_scores, metric_name):
    """Reshape per-model score lists into long format for the statistical tests.

    Args:
        metric_scores: Mapping from model name to a list of scores. All lists
            must have the same length; position ``i`` is treated as fold ``i + 1``.
        metric_name: Name given to the value column of the result.

    Returns:
        pandas.DataFrame: Long-format frame with columns ``'Fold'``, ``'Model'``
        and ``metric_name`` -- one row per (model, fold) pair.
    """
    data = pd.DataFrame(metric_scores)
    # Remember the model columns before the fold id column is appended.
    model_columns = list(data.columns)
    # Derive the fold ids from the actual number of rows instead of assuming
    # exactly five folds, which failed for any other fold count.
    data['Fold'] = list(range(1, len(data) + 1))
    data_long = pd.melt(
        data,
        id_vars=['Fold'],
        value_vars=model_columns,
        var_name='Model',
        value_name=metric_name,
    )
    return data_long

def perform_repeated_measures_anova(data, metric_name):
    """Fit a repeated-measures ANOVA with 'Fold' as subject and 'Model' as within factor.

    Args:
        data: Long-format DataFrame containing the columns ``'Fold'``,
            ``'Model'`` and ``metric_name``.
        metric_name: Name of the dependent-variable column.

    Returns:
        The summary object produced by the fitted ``AnovaRM`` result.
    """
    model = AnovaRM(data=data, depvar=metric_name, subject='Fold', within=['Model'])
    fitted = model.fit()
    return fitted.summary()

def perform_tukey_hsd(data, metric_name):
    """Run pairwise Tukey HSD comparisons between models at a 0.05 family-wise level.

    Only the score column and the 'Model' labels are handed to the test; the
    'Fold' column is not used here.

    Args:
        data: Long-format DataFrame containing ``'Model'`` and ``metric_name``.
        metric_name: Name of the column holding the scores to compare.

    Returns:
        The result object returned by ``pairwise_tukeyhsd``.
    """
    scores = data[metric_name]
    model_labels = data['Model']
    return pairwise_tukeyhsd(endog=scores, groups=model_labels, alpha=0.05)

def wilcoxon_test(model1_scores, model2_scores):
    """Paired Wilcoxon signed-rank test between two score sequences (exact p-value).

    Args:
        model1_scores: Scores of the first model.
        model2_scores: Scores of the second model, paired element-wise with
            ``model1_scores``.

    Returns:
        tuple: ``(statistic, p_value)`` as reported by ``scipy.stats.wilcoxon``.
    """
    outcome = stats.wilcoxon(model1_scores, model2_scores, method='exact')
    return outcome.statistic, outcome.pvalue

def non_parametric_pairwise_comparisons(metric_scores):
    """Run ``wilcoxon_test`` on every unordered pair of models.

    Each p-value is compared against 0.05 on its own; no multiple-comparison
    adjustment is applied in this function.

    Args:
        metric_scores: Mapping from model name to that model's scores.

    Returns:
        pandas.DataFrame: One row per model pair with the test statistic, the
        p-value and a ``'Significant'`` flag (``p-value < 0.05``).
    """
    names = list(metric_scores)
    rows = []

    for idx in range(len(names)):
        first = names[idx]
        # Only pair with models that come later, so each pair appears once.
        for second in names[idx + 1:]:
            statistic, p = wilcoxon_test(metric_scores[first], metric_scores[second])
            row = {
                'Model 1': first,
                'Model 2': second,
                'Wilcoxon Statistic': statistic,
                'p-value': p,
                'Significant': p < 0.05,
            }
            rows.append(row)

    return pd.DataFrame(rows)

def analyze_metric(metric_scores, metric_name):
    """Run and print the parametric comparison of models for one metric.

    The scores are reshaped to long format, then a repeated-measures ANOVA and
    a Tukey HSD post-hoc test are run; each result is printed as soon as it is
    available. The non-parametric alternative
    (``non_parametric_pairwise_comparisons``) is intentionally not run here.

    Args:
        metric_scores: Mapping from model name to that model's scores.
        metric_name: Name of the metric being analysed.

    Returns:
        tuple: ``(anova_summary, tukey_result)``.
    """
    long_scores = prepare_data_for_anova(metric_scores, metric_name)

    # Omnibus test across all models.
    anova_summary = perform_repeated_measures_anova(long_scores, metric_name)
    print(anova_summary)

    # Post-hoc pairwise comparisons.
    tukey_result = perform_tukey_hsd(long_scores, metric_name)
    print(tukey_result)

    return anova_summary, tukey_result

In [6]:
# Analyze each metric: rebuild the per-model score lists for the metric, then
# run the ANOVA and Tukey HSD comparison (analyze_metric prints both results).
for metric_name in metrics_to_analyze:
    print(f"\nAnalyzing {metric_name.upper()}...\n")
    metric_scores = extract_metric_scores(metrics, metric_name)
    # The returned (anova_summary, tukey_result) tuple is not kept here.
    analyze_metric(metric_scores, metric_name)


Analyzing ACCURACY...

               Anova
      F Value Num DF  Den DF Pr > F
-----------------------------------
Model  6.0849 5.0000 20.0000 0.0014



 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2 meandiff p-adj   lower  upper  reject
------------------------------------------------------
catboost    lgbm   0.0001    1.0 -0.0877 0.0878  False
catboost      lr  -0.0659 0.2238 -0.1537 0.0218  False
catboost      rf  -0.0403 0.7157  -0.128 0.0475  False
catboost     svm  -0.0624 0.2744 -0.1502 0.0253  False
catboost xgboost  -0.0147 0.9949 -0.1025  0.073  False
    lgbm      lr   -0.066 0.2229 -0.1537 0.0218  False
    lgbm      rf  -0.0403 0.7143 -0.1281 0.0474  False
    lgbm     svm  -0.0625 0.2734 -0.1502 0.0253  False
    lgbm xgboost  -0.0148 0.9947 -0.1026 0.0729  False
      lr      rf   0.0257 0.9417 -0.0621 0.1134  False
      lr     svm   0.0035    1.0 -0.0843 0.0913  False
      lr xgboost   0.0512 0.4824 -0.0366 0.1389  False
      rf     svm  -0.0222 0.9682 -0.1099 0.0656  False
      rf xgboost   0.0255 0.9429 -0.0622 0.1133  False
     svm xgboost   0.0477 0.5572 -0.0401 0.1354  False
----------

### 2.2. Comparing the Impact of Features Using SHAP Values