# Model Selection Methods

This notebooks demonstrates methods for selecting models that perform on the same domain and task.

- Wilcoxon signed-rank test
- McNemar's Test

In [1]:
import tensorflow as tf
from sklearn.model_selection import KFold
import numpy as np

## Train Models for Image Classification

In [2]:
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

In [3]:
X_full = np.concatenate([x_train, x_test])
Y_full = np.concatenate([y_train, y_test])

### Model 1: Fully Connected Neural Network

In [None]:
def create_model_1():

    model_1 = tf.keras.models.Sequential([
      tf.keras.layers.Flatten(input_shape=(28, 28)),
      tf.keras.layers.Dense(8, activation='relu'),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Dense(10, activation='softmax')
    ])

    model_1.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model_1

### Model 2: Fully Connected Neural Network

In [5]:
def create_model_2():
    model_2 = tf.keras.models.Sequential([
      tf.keras.layers.Flatten(input_shape=(28, 28)),
      tf.keras.layers.Dense(32, activation='relu'),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Dense(10, activation='softmax')
    ])

    model_2.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model_2

### Model 3: Convolutional Neural Network

## Wilcoxon Signed-Rank Test

In [77]:
model_1_scores = []
model_2_scores = []

In [85]:
# Runt 5-fold cross-validation and save accuracy scores

n_split=3

for fold_index, (train_index,test_index) in enumerate(KFold(n_split).split(X_full)):
    x_train, x_test = X_full[train_index], X_full[test_index]
    y_train, y_test = Y_full[train_index], Y_full[test_index]
    
    
    model_1 = create_model_1()
    model_2 = create_model_2()
    
    print(f"Model 1, Fold {fold_index}")
    model_1.fit(x_train, y_train, epochs=3)
    print(f"Model 2, Fold {fold_index}")
    model_2.fit(x_train, y_train, epochs=3)
    
    model_1_scores.append(model_1.evaluate(x_test,  y_test, verbose=0)[1])
    model_2_scores.append(model_2.evaluate(x_test,  y_test, verbose=0)[1])

Model 1, Fold 0
Train on 46666 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Model 2, Fold 0
Train on 46666 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Model 1, Fold 1
Train on 46667 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Model 2, Fold 1
Train on 46667 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Model 1, Fold 2
Train on 46667 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Model 2, Fold 2
Train on 46667 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [87]:
print(model_1_scores)
print(model_2_scores)

[0.8986886, 0.8987271, 0.91239876, 0.8991172, 0.89199847, 0.89962715]
[0.9589869, 0.9612137, 0.9705139, 0.94424444, 0.94565636, 0.94595635]


In [88]:
# Use wilcoxon to calculate p-value

from scipy.stats import wilcoxon
stat, p = wilcoxon(model_1_scores, model_2_scores, zero_method='zsplit')



In [90]:
# p-value
p

0.027707849358079864

- Since p-value < 0.05 we can reject the hypothesis that there is no significant difference between both models
- Model 2 performs better

## McNemar's Test

In [6]:
from mlxtend.evaluate import mcnemar_table, mcnemar
mcnemar_p_values = []

In [7]:
# Runt 5-fold cross-validation and save accuracy scores

n_split=3

for fold_index, (train_index,test_index) in enumerate(KFold(n_split).split(X_full)):
    x_train, x_test = X_full[train_index], X_full[test_index]
    y_train, y_test = Y_full[train_index], Y_full[test_index]
    
    
    model_1 = create_model_1()
    model_2 = create_model_2()
    
    print(f"\nModel 1, Fold {fold_index}")
    model_1.fit(x_train, y_train, epochs=3)
    y_predict_1 = model_1.predict_classes(x_test)
    print(y_predict_1)
    exit(0)
    print(f"\nModel 2, Fold {fold_index}")
    model_2.fit(x_train, y_train, epochs=3)
    y_predict_2 = model_2.predict_classes(x_test)
    
    # Calculate p value
    tb = mcnemar_table(y_target = y_test, 
                       y_model1 = y_predict_1, 
                       y_model2 = y_predict_2)
    chi2, p = mcnemar(ary=tb, exact=True)
    print(f"p-value: {p}")
    mcnemar_p_values.append(p)


Model 1, Fold 0
Train on 46666 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[5 0 4 ... 0 3 1]

Model 2, Fold 0
Train on 46666 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
p-value: 1.707573476987988e-231

Model 1, Fold 1
Train on 46667 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[7 2 8 ... 1 6 3]

Model 2, Fold 1
Train on 46667 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
p-value: 9.206628758778992e-205

Model 1, Fold 2
Train on 46667 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[5 4 6 ... 4 5 6]

Model 2, Fold 2
Train on 46667 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
p-value: 2.1780472413873312e-222


In [8]:
mcnemar_p_values

[1.707573476987988e-231, 9.206628758778992e-205, 2.1780472413873312e-222]

- McNemar p-values are << 0.05 for all folds
- Model 2 performs better