In [1]:
import sys
import os

# Add the parent directory to the sys.path to avoid 'ModuleNotFoundError'
sys.path.append(os.path.abspath(os.path.join('..')))

import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import friedmanchisquare
from scikit_posthocs import posthoc_nemenyi_friedman

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams["font.family"] = ['serif']

from src.helpers import load_json, load_pickle
from src.paths import paths
from src.config import MODEL_NAMES

## 1. Check if metrics have normal distribution

In [2]:
metrics_dict = load_json(paths.get('metric_fold_path'))
metrics_to_analyze = [
    'accuracy',
    'roc_auc',
    'f1_score',
    'precision',
    'recall',
    'specificity'
]

In [3]:
def check_normality(metrics_dict, metrics_to_analyze):
    results = []

    for model, metrics_list in metrics_dict.items():
        for metric in metrics_to_analyze:
            values = [metrics[metric] for metrics in metrics_list]
            stat, p_value = stats.shapiro(values)
            result = {
                'model': model,
                'metric': metric,
                'statistic': stat,
                'p_value': p_value,
                'normality': 'normal' if p_value > 0.05 else 'not normal'
            }
            results.append(result)
    
    results_df = pd.DataFrame(results)
    return results_df

normality_results_df = check_normality(metrics_dict, metrics_to_analyze)
normality_results_df

Unnamed: 0,model,metric,statistic,p_value,normality
0,catboost,accuracy,0.946522,0.712336,normal
1,catboost,roc_auc,0.838784,0.161588,normal
2,catboost,f1_score,0.980866,0.9392,normal
3,catboost,precision,0.940116,0.666758,normal
4,catboost,recall,0.80299,0.085693,normal
5,catboost,specificity,0.910757,0.472151,normal
6,xgboost,accuracy,0.977634,0.921584,normal
7,xgboost,roc_auc,0.800256,0.081425,normal
8,xgboost,f1_score,0.963147,0.829707,normal
9,xgboost,precision,0.946566,0.712645,normal


## 2. Statistical tests

### 2.1. Comparing Model Performance

In [4]:
def extract_metric_scores(metrics_dict, metric_name):
    metric_scores = {}
    for model_name in MODEL_NAMES:
        metric_scores[model_name] = [score[metric_name] for score in metrics_dict[model_name]]
    return metric_scores

def perform_friedman_test(metrics_dict, metric_name, verbose=0):
    # Prepare the data in the format required for the Friedman test
    metric_scores = extract_metric_scores(metrics_dict, metric_name)
    scores = [metric_scores[model_name] for model_name in metric_scores]
    
    # Perform the Friedman test
    stat, p_value = friedmanchisquare(*scores)
    
    # Print the result
    print(f"Friedman test result for {metric_name}:")
    if verbose:
        print(f"Test Statistic: {stat}")
        print(f"P-Value: {p_value}")
    
    if p_value < 0.05:
        print(f"Significant differences")
    else:
        print(f"No significant differences")
    print('')


In [5]:
metrics_to_analyze = ['accuracy', 'roc_auc', 'f1_score', 'precision', 'recall', 'specificity']

for metric_name in metrics_to_analyze:
    perform_friedman_test(metrics_dict, metric_name)

Friedman test result for accuracy:
Significant differences

Friedman test result for roc_auc:
Significant differences

Friedman test result for f1_score:
Significant differences

Friedman test result for precision:
Significant differences

Friedman test result for recall:
No significant differences

Friedman test result for specificity:
Significant differences



In [6]:
def identify_different_models(metrics_dict, metric_name, verbose=0):
    metric_scores = extract_metric_scores(metrics_dict, metric_name)
    scores = np.array([metric_scores[model_name] for model_name in metric_scores])
            
    # Perform the Nemenyi post-hoc test
    nemenyi_results = posthoc_nemenyi_friedman(scores.T)
    
    print(f"\nNemenyi post-hoc test results for {metric_name}:")
    if verbose:
        print(nemenyi_results)
    
    significant_pairs = np.where(nemenyi_results < 0.05)
    for i in range(len(significant_pairs[0])):
        model1 = list(metric_scores.keys())[significant_pairs[0][i]]
        model2 = list(metric_scores.keys())[significant_pairs[1][i]]
        print(f"Nemenyi post-hoc test found significant differences between {model1} and {model2}")
        
    print('')


In [7]:
metrics_to_analyze = ['accuracy', 'roc_auc', 'f1_score', 'precision', 'specificity']

for metric_name in metrics_to_analyze:
    identify_different_models(metrics_dict, metric_name, verbose=0)


Nemenyi post-hoc test results for accuracy:
Nemenyi post-hoc test found significant differences between lgbm and lr
Nemenyi post-hoc test found significant differences between lr and lgbm




Nemenyi post-hoc test results for roc_auc:
Nemenyi post-hoc test found significant differences between catboost and lr
Nemenyi post-hoc test found significant differences between lr and catboost


Nemenyi post-hoc test results for f1_score:
Nemenyi post-hoc test found significant differences between lgbm and lr
Nemenyi post-hoc test found significant differences between lr and lgbm


Nemenyi post-hoc test results for precision:
Nemenyi post-hoc test found significant differences between lgbm and svm
Nemenyi post-hoc test found significant differences between svm and lgbm


Nemenyi post-hoc test results for specificity:



### 2.2. Comparing the Impact of Features Using SHAP Values

In [8]:
shap_values = load_pickle(paths.get('shap_values_path'))
print(len(shap_values), len(shap_values['catboost']), shap_values['catboost'][0].shape)

6 5 (54, 20)
