In [1]:
import numpy as np
from statsmodels.stats.contingency_tables import mcnemar

In [2]:
def mcnemar_test(y_true, y_pred_m1, y_pred_m2):
    """
    Perform McNemar's test to compare the predictions of two models.

    Parameters:
    ----------
    y_true : array-like
        The ground truth labels for the dataset.
    y_pred_m1 : array-like
        Predictions from the first model (e.g. logistic regression).
    y_pred_m2 : array-like
        Predictions from the second model (e.g. decision tree, random forest, etc.).

    All three inputs are converted to NumPy arrays and flattened, so lists and
    column vectors of shape (n, 1) are accepted alongside 1-D arrays.

    Returns:
    -------
    float
        The p-value from McNemar's test, indicating whether the two models' predictions
        are statistically significantly different.

    Raises:
    ------
    ValueError
        If the three inputs do not contain the same number of elements.

    Notes:
    -----
    - The contingency table that is printed and passed to the test is laid out as:
        [[both_correct + both_incorrect, model1_correct_model2_incorrect],
         [model1_incorrect_model2_correct, 0]]
      Both concordant counts are merged into the top-left cell; McNemar's test is
      based on the two off-diagonal (discordant) cells.
    - The exact form of the test is always used (``exact=True``).
    - If the p-value is below a significance threshold (e.g., 0.05), it suggests that
      the two models have statistically different performance.
    """
    # Flatten everything to 1-D first. Without this, comparing an (n, 1) label array
    # with (n,) predictions broadcasts to an (n, n) grid and silently inflates every
    # count, and plain Python lists collapse to a single True/False comparison.
    y_true = np.asarray(y_true).ravel()
    y_pred_m1 = np.asarray(y_pred_m1).ravel()
    y_pred_m2 = np.asarray(y_pred_m2).ravel()

    if not (y_true.size == y_pred_m1.size == y_pred_m2.size):
        raise ValueError(
            "y_true, y_pred_m1 and y_pred_m2 must have the same number of elements, "
            f"got {y_true.size}, {y_pred_m1.size} and {y_pred_m2.size}"
        )

    # Per-sample correctness of each model, computed once and reused below.
    m1_correct = y_pred_m1 == y_true
    m2_correct = y_pred_m2 == y_true

    both_correct = np.sum(m1_correct & m2_correct)
    # Model 1 correct, model 2 incorrect
    m1_only_correct = np.sum(m1_correct & ~m2_correct)
    # Model 1 incorrect, model 2 correct
    m2_only_correct = np.sum(~m1_correct & m2_correct)
    both_incorrect = np.sum(~m1_correct & ~m2_correct)

    # Contingency table (layout documented in the Notes section above)
    contingency_table = np.array([[both_correct + both_incorrect, m1_only_correct],
                                  [m2_only_correct, 0]])

    print(contingency_table)

    # Perform McNemar test
    return mcnemar(contingency_table, exact=True).pvalue

In [3]:
list_n = [9, 12, 15, 18]
list_models = ['rf', 'nn', 'cluster']

# The ground-truth labels and the 'logreg' reference predictions depend only on n,
# so read them from disk once per n instead of once per (model, n) pair.
reference_by_n = {
    n: (
        np.load('../../Datasets_Train_Test_Split/kryptonite-%s-y_test.npy' % n),
        np.load('../../Datasets_Train_Test_Split/kryptonite_%s_pred_%s.npy' % (n, 'logreg')),
    )
    for n in list_n
}

# Compare each model in list_models against the 'logreg' predictions for every n,
# printing the contingency table (from mcnemar_test) followed by the p-value.
for model_name in list_models:
    for n in list_n:
        y_true, y_pred_m1 = reference_by_n[n]
        y_pred_m2 = np.load('../../Datasets_Train_Test_Split/kryptonite_%s_pred_%s.npy' % (n, model_name))

        p_value = mcnemar_test(y_true, y_pred_m1, y_pred_m2)
        print(f'model:[{model_name}], n: [{n}], McNemar p-val: [{p_value}]')

[[1835   77]
 [1688    0]]
model:[rf], n: [9], McNemar p-val: [0.0]
[[2473  130]
 [2197    0]]
model:[rf], n: [12], McNemar p-val: [0.0]
[[3091 1421]
 [1488    0]]
model:[rf], n: [15], McNemar p-val: [0.22106191159593794]
[[4478 1362]
 [1360    0]]
model:[rf], n: [18], McNemar p-val: [0.9847082878742756]
[[1836   76]
 [1688    0]]
model:[nn], n: [9], McNemar p-val: [0.0]
[[2350   81]
 [2369    0]]
model:[nn], n: [12], McNemar p-val: [0.0]
[[3047  113]
 [2840    0]]
model:[nn], n: [15], McNemar p-val: [0.0]
[[3650  113]
 [3437    0]]
model:[nn], n: [18], McNemar p-val: [0.0]
[[1848   79]
 [1673    0]]
model:[cluster], n: [9], McNemar p-val: [0.0]
[[2548  955]
 [1297    0]]
model:[cluster], n: [12], McNemar p-val: [6.052485159065259e-13]
[[3275 1381]
 [1344    0]]
model:[cluster], n: [15], McNemar p-val: [0.49043076211320874]
[[3823 1711]
 [1666    0]]
model:[cluster], n: [18], McNemar p-val: [0.4489593312574476]
