### Example Code

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from statsmodels.stats.contingency_tables import mcnemar

# Generate a synthetic binary classification dataset
X, y = make_classification(n_samples=500, n_features=20, random_state=42)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train[:,0:5], y_train)

# Train Decision Tree Classifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

# Get predictions from both models
lr_preds = lr_model.predict(X_test[:,0:5])
dt_preds = dt_model.predict(X_test)

# Build the contingency table
# Both correct
both_correct = np.sum((lr_preds == y_test) & (dt_preds == y_test))
# Logistic correct, Decision Tree incorrect
lr_correct_dt_incorrect = np.sum((lr_preds == y_test) & (dt_preds != y_test))
# Logistic incorrect, Decision Tree correct
lr_incorrect_dt_correct = np.sum((lr_preds != y_test) & (dt_preds == y_test))
# Both incorrect
both_incorrect = np.sum((lr_preds != y_test) & (dt_preds != y_test))

# Contingency table
contingency_table = np.array([[both_correct + both_incorrect, lr_correct_dt_incorrect],
                              [lr_incorrect_dt_correct, 0]])

# Perform McNemar test
result = mcnemar(contingency_table, exact=True)

# Output the test results
print("McNemar test p-value:", result.pvalue)
if result.pvalue < 0.05:
    print("The performance difference between the models is statistically significant.")
else:
    print("The performance difference between the models is not statistically significant.")

print(contingency_table)

McNemar test p-value: 0.0002771615982055664
The performance difference between the models is statistically significant.
[[126   3]
 [ 21   0]]


### Create Train-Test Split for McNemar Test

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

RUN_TRAIN_TEST_SPLIT = False

list_n = [9, 12, 15, 18]

save_loc = f'Datasets_McNemar_Test/'

if RUN_TRAIN_TEST_SPLIT
    for n in list_n:
        X = np.load('Datasets/kryptonite-%s-X.npy' % (n))
        y = np.load('Datasets/kryptonite-%s-y.npy' % (n))

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 20% of all data for final model evaluation

        print(f'n: [{n}]', X_train.shape, X_test.shape, y_train.shape, y_test.shape)

        np.save('%skryptonite-%s-X_train.npy' % (save_loc, n), X_train)
        np.save('%skryptonite-%s-X_test.npy' % (save_loc, n), X_test)
        np.save('%skryptonite-%s-y_train.npy' % (save_loc, n), y_train)
        np.save('%skryptonite-%s-y_test.npy' % (save_loc, n), y_test)



n: [9] (14400, 9) (3600, 9) (14400,) (3600,)
n: [12] (19200, 12) (4800, 12) (19200,) (4800,)
n: [15] (24000, 15) (6000, 15) (24000,) (6000,)
n: [18] (28800, 18) (7200, 18) (28800,) (7200,)


### Test from Results

In [None]:
def mcnemar_test(y_true, y_pred_m1, y_pred_m2):
    both_correct = np.sum((y_pred_m1 == y_true) & (y_pred_m2 == y_true))
    # Logistic correct, Decision Tree incorrect
    lr_correct_dt_incorrect = np.sum((y_pred_m1 == y_true) & (y_pred_m2 != y_true))
    # Logistic incorrect, Decision Tree correct
    lr_incorrect_dt_correct = np.sum((y_pred_m1 != y_true) & (y_pred_m2 == y_true))
    # Both incorrect
    both_incorrect = np.sum((y_pred_m1 != y_true) & (y_pred_m2 != y_true))

    # Contingency table
    contingency_table = np.array([[both_correct + both_incorrect, lr_correct_dt_incorrect],
                                [lr_incorrect_dt_correct, 0]])

    # Perform McNemar test
    result = mcnemar(contingency_table, exact=True)

    return result

