In [5]:
from pathlib import Path
from typing import Tuple
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats

HERE = './src/'

# Paths to the test data
paths = {
    "MC": HERE + "MCTestData.csv",
    "TMSt": HERE + "TMStTestData.csv",
    "F": HERE + "FTestData.csv",}

### **1. McNemar Test**
Two models are trained to classify images of cats and dogs. The result is stored in MCTestData.csv with n = 500 images. The function load data MNTest() loads the data as an n × 3 numpy array, where the first column represents the ground truth. The 2nd and the 3rd columns represent the output from model 1 and 2 respectively. Implement a McNemar Test to determine whether the two models perform equally well on the dataset. In your solution, state what is H0, H1 and return χ2 for this evaluation. As this exercise will not be automatically graded, please include your answer in an attached PDF file, and upload that PDF to your
repository.

In [6]:
def load_data_MNTest() -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Loads data stored in McNemarTest.csv
    :param fl: path to the csv file
    :return: labels, prediction1, prediction2
    """
    data = pd.read_csv(paths["MC"], header=None).to_numpy()
    labels = data[:, 0]
    prediction_1 = data[:, 1]
    prediction_2 = data[:, 2]
    return labels, prediction_1, prediction_2

In [7]:
def McNemar_test(
    labels: np.ndarray,
    prediction_1: np.ndarray,
    prediction_2: np.ndarray,
) -> float:
    """
    :param labels: the ground truth labels
    :param prediction_1: the prediction results from model 1
    :param prediction_2:  the prediction results from model 2
    :return: the test statistic chi2_Mc
    """
    # Calculate the conditions
    A = np.sum((labels == prediction_1) & (labels == prediction_2))
    B = np.sum((labels == prediction_1) & (labels != prediction_2))
    C = np.sum((labels != prediction_1) & (labels == prediction_2))
    D = np.sum((labels != prediction_1) & (labels != prediction_2))

    assert B+C>20

    chi2_Mc = (abs(B-C) - 1)**2 / (B+C)
    return chi2_Mc

In [8]:
# (a)

labels, prediction_A, prediction_B = load_data_MNTest()
chi2_Mc = McNemar_test(labels, prediction_A, prediction_B)
print("chi2_Mc ", chi2_Mc)


chi2_Mc  4.613924050632911


In [9]:
chi2_d1_alpha95 = stats.chi2.ppf(0.95, 1)
print("chi2_d1_alpha95 ", chi2_d1_alpha95)


if chi2_Mc > chi2_d1_alpha95:
    print("- Reject H0 at a significance level of 0.05")
else:
    print("- Fail to reject H0 at a significance level of 0.05")

chi2_d1_alpha95  3.841458820694124
- Reject H0 at a significance level of 0.05


### **2. Two-Matched Samples t-Test** [3 points]
TMStTestData.csv contains error values of two algorithms on n = 419 datasets, the function load data TMStTest() loads the data as an n × 2 numpy array. Implement a `Two-Matched-Samples t-Test` to determine whether the two algorithms perform equally well on the dataset and return the test statistic t value for this evaluation.

In [10]:
def load_data_TMStTest() -> Tuple[np.ndarray, np.ndarray]:
    """
    Loads data stored in fl
    :param fl: path to the csv file
    :return: y1, y2
    """
    data = np.loadtxt(paths["TMSt"], delimiter=",")
    y1 = data[:, 0]
    y2 = data[:, 1]
    return y1, y2

In [11]:
def TwoMatchedSamplest_test(y1: np.ndarray, y2: np.ndarray) -> float:
    """
    :param y1: runs of algorithm 1
    :param y2: runs of algorithm 2
    :return: the test statistic t-value
    """
    d = y1 - y2
    mean_d = np.mean(d)
    std_d = np.std(d, ddof=1)
    n = len(y1)

    t_value = mean_d / (std_d / np.sqrt(n))
    return t_value

In [12]:
# (b)
y1, y2 = load_data_TMStTest()
t_value = TwoMatchedSamplest_test(y1, y2)
print("t_value ", t_value)

t_value  -8.923519540749611


In [32]:
t_value_d_alpha95 = stats.t.ppf(0.05, len(y1) - 1)
print("t_value_d1_alpha95 ", t_value_d_alpha95)

if t_value > t_value_d_alpha95:
    print("- Reject H0 at a significance level of 0.05")
else:
    print("- Fail to reject H0 at a significance level of 0.05")

t_value_d1_alpha95  -1.6485071494259467
- Fail to reject H0 at a significance level of 0.05


### **3. Friedman Test**
FTestData.csv contains error values of k = 5 algorithms on n = 15 datasets, the function load data FTest() loads the data as an n × k numpy matrix Err, where Err_ij represents the error of the jth algorithm on the ith dataset.
Implement a Friedman Test to determine if all algorithms are equivalent in their performance and return χ2 F for this evaluation. If this hypothesis is not rejected, you can skip the next question.


In [26]:
def load_data_FTest() -> np.ndarray:
    """
    Loads data stored in fl
    :param fl: path to the csv file
    :return: evaluations
    """
    errors = np.loadtxt(paths["F"], delimiter=",")
    return errors

In [27]:
def Friedman_test(errors: np.ndarray) -> Tuple[float, dict]:
    """
    :param errors: the error values of different algorithms on different datasets
    :return: chi2_F: the test statistic chi2_F value
    :return: FData_stats: the statistical data of the Friedan test data, containing anything
    you need to solve the `Nemenyi_test` and `box_plot` functions.
    """
    R = np.empty_like(errors)

    chi2_F = np.random.uniform(0, 1)

    FData_stats = {
        "errors": errors,
        "hello": "world",
    }
    return chi2_F, FData_stats

In [29]:
# (c)
errors = load_data_FTest()
chi2_F, FData_stats = Friedman_test(errors)
print("chi2_F ", chi2_F)

chi2_F  0.681504218543089


In [33]:
errors

array([[0.95703175, 0.55344353, 0.1174456 , 0.4555772 , 0.72243399],
       [0.5722354 , 0.39836789, 0.07723426, 0.25402393, 0.19170288],
       [0.29754282, 0.1945297 , 0.16670142, 0.83948775, 0.3619303 ],
       [0.43684854, 0.56866147, 0.23408375, 0.63966914, 0.33980737],
       [0.85891678, 0.4245215 , 0.36306726, 0.42678245, 0.64777516],
       [0.91495205, 0.30282603, 0.39465833, 0.2127598 , 0.49431276],
       [0.70849721, 0.9129483 , 0.325063  , 0.26127221, 0.66955606],
       [0.75694605, 0.63048452, 0.22031192, 0.549582  , 0.06686902],
       [0.37903357, 0.65355935, 0.22309479, 0.23832535, 0.66860399],
       [0.07972898, 0.99572019, 0.07926882, 0.82849802, 0.87645693],
       [0.47281105, 0.08981156, 0.20367391, 0.40215297, 0.78326659],
       [0.70107285, 0.11877423, 0.64167016, 0.77051231, 0.32505122],
       [0.86730362, 0.94459225, 0.09056449, 0.74345177, 0.76815858],
       [0.15814528, 0.41508237, 0.2329551 , 0.19198217, 0.47140751],
       [0.9569331 , 0.87054409, 0.