Useful link to understand McNemar's Test

https://machinelearningmastery.com/mcnemars-test-for-machine-learning/

Joey's paper: https://medinform.jmir.org/2022/9/e37770/pdf

McNemar's paper: https://link.springer.com/article/10.1007/BF02295996




In [None]:
%%time
import os
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
import re
import nltk
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
np.random.seed(42)
import nltk
import warnings
warnings.filterwarnings('ignore')

CPU times: user 1.82 s, sys: 324 ms, total: 2.15 s
Wall time: 5.81 s


## Significance Testing with Transformer Models

> Indented block



In [None]:
# Display names and prediction CSV files of the transformer models.
# The two lists are index-aligned: model_names[k] labels model_paths[k].
model_names = [
    'BERT',
    'RoBERTa',
    'Distil-BERT',
    'ELECTRA',
    'MPNet',
]

model_paths = [
    'Bert-Predictions.csv',
    'RoBerta-Predictions.csv',
    'Distil-BERT-Predictions.csv',
    'ELECTRA-Predictions.csv',
    'MPNet-Predictions.csv',
]


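The McNemar statistic is calculated as follows:

* statistic = (Yes/No - No/Yes)^2 / (Yes/No + No/Yes), where Yes/No and No/Yes are the two discordant (off-diagonal) cell counts of the 2x2 contingency table.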

Syntax:

mcnemar(table, exact=True, correction=True)

Parameters:

* table: The square contingency table.
* exact = True: The binomial distribution will be used.
* exact = False: The chi-square distribution will be used.
* correction = True: The continuity correction is used. As a rule, this correction is applied when any cell count in the table is not more than 4.

In [None]:

from scipy.stats import chi2_contingency
# Example of calculating the mcnemar test
from statsmodels.stats.contingency_tables import mcnemar

def create_contingency_table(list1, list2):
    """Run McNemar's test on two paired sequences of binary labels.

    Builds the 2x2 table ``[[a, b], [c, d]]`` where ``a`` counts positions
    holding (1, 1), ``b`` (1, 0), ``c`` (0, 1) and ``d`` (0, 0), prints the
    table, then prints the test statistic and p-value.

    Args:
        list1: 0/1 labels of the first model, one entry per sample.
        list2: 0/1 labels of the second model, aligned with ``list1``.

    Returns:
        The p-value from ``mcnemar(table, exact=False, correction=False)``.

    Raises:
        ValueError: If the two sequences do not have the same length.
    """
    # The test is defined on paired observations; a length mismatch means
    # the pairing is broken, so fail loudly instead of truncating silently.
    if len(list1) != len(list2):
        raise ValueError(
            'list1 and list2 must have the same length, got %d and %d'
            % (len(list1), len(list2))
        )

    a, b, c, d = 0, 0, 0, 0
    for first, second in zip(list1, list2):
        # Pairs containing anything other than 0/1 are not counted.
        if first == 1 and second == 1:
            a += 1
        elif first == 1 and second == 0:
            b += 1
        elif first == 0 and second == 1:
            c += 1
        elif first == 0 and second == 0:
            d += 1

    contingency_table = [[a, b],
                         [c, d]]
    print(contingency_table)

    # NOTE(review): when b + c == 0 the chi-square statistic is 0/0;
    # confirm how statsmodels reports the p-value in that case.
    result = mcnemar(contingency_table, exact=False, correction=False)

    # summarize the finding
    print(f'statistic={result.statistic:.3f}, p-value={result.pvalue:.3f}')
    return result.pvalue



In [None]:
from statsmodels.stats.contingency_tables import mcnemar

def p_value_calculattion(md1_df, md2_df, target_cols):
    """Print per-label McNemar results and the mean p-value for two models.

    For every column name in ``target_cols`` the column name is printed and
    the matching columns of both frames are passed to
    ``create_contingency_table``. The mean of the returned p-values is
    printed at the end, followed by a separator line.

    Args:
        md1_df: Frame with one column per label for the first model.
        md2_df: Frame with one column per label for the second model.
        target_cols: Column names to compare; each must exist in both frames.

    Returns:
        None. All results are written to stdout.
    """
    p_values = []
    for col in target_cols:
        print(col)
        model1_lst = md1_df[col].tolist()
        model2_lst = md2_df[col].tolist()
        p_values.append(create_contingency_table(model1_lst, model2_lst))

    # Average over the number of labels actually tested rather than a
    # hard-coded 6, so the mean stays correct when target_cols changes.
    average = sum(p_values) / len(p_values) if p_values else float('nan')
    print("Average P-Value: ", average)
    print("----------------------")


In [None]:
path = '/content/drive/MyDrive/Omar/AAAI-Submission/ChatGPT-Exps/'
import ast

from sklearn.metrics import cohen_kappa_score

target_cols= ['AL', 'CDU', 'MA','OTH', 'PPE', 'TP']

## Fit the binarizer once on the full label set. Using transform() below
## (not fit_transform()) keeps the output columns fixed to target_cols even
## when a model never predicts one of the labels.
multilabel = MultiLabelBinarizer()
multilabel.fit([target_cols])

## Reading the XLNet model predictions (the model every other one is
## compared against); parsed and binarized once, outside the loop.
model1_path = path + 'XLNet-Predictions.csv'
model1_df = pd.read_csv(model1_path)
model1_labels = model1_df['Predictions'].apply(ast.literal_eval)
model1_pred = multilabel.transform(model1_labels)
md1_df = pd.DataFrame(model1_pred, columns=multilabel.classes_)

for model_name, model_file in zip(model_names, model_paths):
    print(model_name)
    ## Reading the dataFrame
    model2_path = path + model_file
    model2_df = pd.read_csv(model2_path)

    ## Getting the labels (stored in the CSV as Python list literals)
    model2_labels = model2_df['Predictions'].apply(ast.literal_eval)

    ## Doing Multilabel Binarization with the shared, already-fitted binarizer
    model2_pred = multilabel.transform(model2_labels)
    md2_df = pd.DataFrame(model2_pred, columns=multilabel.classes_)

    ## Calling the function to calculate p-value
    p_value_calculattion(md1_df, md2_df, target_cols)


BERT
AL
[[61, 14], [6, 428]]
statistic=3.200, p-value=0.074
CDU
[[126, 29], [11, 343]]
statistic=8.100, p-value=0.004
MA
[[113, 52], [26, 318]]
statistic=8.667, p-value=0.003
OTH
[[14, 19], [4, 472]]
statistic=9.783, p-value=0.002
PPE
[[138, 89], [11, 271]]
statistic=60.840, p-value=0.000
TP
[[125, 14], [15, 355]]
statistic=0.034, p-value=0.853
Average P-Value:  0.1559585028009706
----------------------
RoBERTa
AL
[[67, 8], [10, 424]]
statistic=0.222, p-value=0.637
CDU
[[126, 29], [11, 343]]
statistic=8.100, p-value=0.004
MA
[[147, 18], [46, 298]]
statistic=12.250, p-value=0.000
OTH
[[16, 17], [10, 466]]
statistic=1.815, p-value=0.178
PPE
[[178, 49], [26, 256]]
statistic=7.053, p-value=0.008
TP
[[131, 8], [25, 345]]
statistic=8.758, p-value=0.003
Average P-Value:  0.138528395381633
----------------------
Distil-BERT
AL
[[57, 18], [9, 425]]
statistic=3.000, p-value=0.083
CDU
[[128, 27], [17, 337]]
statistic=2.273, p-value=0.132
MA
[[109, 56], [29, 315]]
statistic=8.576, p-value=0.003
OT

## Significance Testing with ChatGPT Models

In [None]:
# Display names and prediction CSV files of the ChatGPT prompting variants.
# The two lists are index-aligned: model_names[k] labels model_paths[k].
model_names = [
    'Zero-Shot-Short',
    'Zero-Shot-Long',
    'Few-Shot-Short',
    'Few-Shot-Long',
    'CoT',
]

model_paths = [
    'F1-ZS-S-ChatGPT-Predictions.csv',
    'F1-ZS-L-1-ChatGPT-Predictions.csv',
    'F1-FS-S-1-ChatGPT-Predictions.csv',
    'F1-FS-L-2-ChatGPT-Predictions.csv',
    'F1-CoT-2-ChatGPT-Predictions.csv',
]


In [None]:
path = '/content/drive/MyDrive/Omar/AAAI-Submission/ChatGPT-Exps/'
import ast

from sklearn.metrics import cohen_kappa_score

target_cols= ['AL', 'CDU', 'MA','OTH', 'PPE', 'TP']

## Fit the binarizer once on the full label set. Using transform() below
## (not fit_transform()) keeps the output columns fixed to target_cols even
## when a model never predicts one of the labels.
multilabel = MultiLabelBinarizer()
multilabel.fit([target_cols])

## Reading the XLNet model predictions (the model every other one is
## compared against); parsed and binarized once, outside the loop.
model1_path = path + 'XLNet-Predictions.csv'
model1_df = pd.read_csv(model1_path)
model1_labels = model1_df['Predictions'].apply(ast.literal_eval)
model1_pred = multilabel.transform(model1_labels)
md1_df = pd.DataFrame(model1_pred, columns=multilabel.classes_)

for model_name, model_file in zip(model_names, model_paths):
    print(model_name)
    ## Reading the dataFrame
    model2_path = path + model_file
    model2_df = pd.read_csv(model2_path)

    ## Getting the labels (stored in the CSV as Python list literals);
    ## the ChatGPT files keep them in the 'ChatGPT-Prediction' column.
    model2_labels = model2_df['ChatGPT-Prediction'].apply(ast.literal_eval)

    ## Doing Multilabel Binarization with the shared, already-fitted binarizer
    model2_pred = multilabel.transform(model2_labels)
    md2_df = pd.DataFrame(model2_pred, columns=multilabel.classes_)

    ## Calling the function to calculate p-value
    p_value_calculattion(md1_df, md2_df, target_cols)


Zero-Shot-Short
AL
[[19, 56], [1, 433]]
statistic=53.070, p-value=0.000
CDU
[[123, 32], [84, 270]]
statistic=23.310, p-value=0.000
MA
[[49, 116], [24, 320]]
statistic=60.457, p-value=0.000
OTH
[[27, 6], [193, 283]]
statistic=175.724, p-value=0.000
PPE
[[21, 206], [8, 274]]
statistic=183.196, p-value=0.000
TP
[[59, 80], [29, 341]]
statistic=23.862, p-value=0.000
Average P-Value:  4.0221760628490473e-07
----------------------
Zero-Shot-Long
AL
[[44, 31], [16, 418]]
statistic=4.787, p-value=0.029
CDU
[[78, 77], [32, 322]]
statistic=18.578, p-value=0.000
MA
[[103, 62], [64, 280]]
statistic=0.032, p-value=0.859
OTH
[[28, 5], [165, 311]]
statistic=150.588, p-value=0.000
PPE
[[60, 167], [11, 271]]
statistic=136.719, p-value=0.000
TP
[[90, 49], [47, 323]]
statistic=0.042, p-value=0.838
Average P-Value:  0.28758840496284227
----------------------
Few-Shot-Short
AL
[[61, 14], [65, 369]]
statistic=32.924, p-value=0.000
CDU
[[110, 45], [59, 295]]
statistic=1.885, p-value=0.170
MA
[[152, 13], [196,

## Significance Testing with ML Models

In [None]:
# Display names and prediction CSV files of the classical / non-transformer models.
# The two lists are index-aligned: model_names[k] labels model_paths[k].
model_names = [
    'LR',
    'NBSVM',
    'BiGRU',
    'FastText',
]

model_paths = [
    'LR-Predictions.csv',
    'NBSVM-Predictions.csv',
    'BiGRU-Predictions.csv',
    'FastText-Predictions.csv',
]


In [None]:
path = '/content/drive/MyDrive/Omar/AAAI-Submission/ChatGPT-Exps/'
import ast

from sklearn.metrics import cohen_kappa_score

target_cols= ['AL', 'CDU', 'MA','OTH', 'PPE', 'TP']

## Fit the binarizer once on the full label set. Using transform() below
## (not fit_transform()) keeps the output columns fixed to target_cols even
## when a model never predicts one of the labels.
multilabel = MultiLabelBinarizer()
multilabel.fit([target_cols])

## Reading the XLNet model predictions (the model every other one is
## compared against); parsed and binarized once, outside the loop.
model1_path = path + 'XLNet-Predictions.csv'
model1_df = pd.read_csv(model1_path)
model1_labels = model1_df['Predictions'].apply(ast.literal_eval)
model1_pred = multilabel.transform(model1_labels)
md1_df = pd.DataFrame(model1_pred, columns=multilabel.classes_)

for model_name, model_file in zip(model_names, model_paths):
    print(model_name)
    ## Reading the dataFrame
    model2_path = path + model_file
    model2_df = pd.read_csv(model2_path)

    ## Getting the labels (stored in the CSV as Python list literals)
    model2_labels = model2_df['Predictions'].apply(ast.literal_eval)

    ## Doing Multilabel Binarization with the shared, already-fitted binarizer
    model2_pred = multilabel.transform(model2_labels)
    md2_df = pd.DataFrame(model2_pred, columns=multilabel.classes_)

    ## Calling the function to calculate p-value
    p_value_calculattion(md1_df, md2_df, target_cols)


LR
AL
[[53, 22], [29, 405]]
statistic=0.961, p-value=0.327
CDU
[[87, 68], [41, 313]]
statistic=6.688, p-value=0.010
MA
[[114, 51], [76, 268]]
statistic=4.921, p-value=0.027
OTH
[[0, 33], [1, 475]]
statistic=30.118, p-value=0.000
PPE
[[156, 71], [78, 204]]
statistic=0.329, p-value=0.566
TP
[[86, 53], [34, 336]]
statistic=4.149, p-value=0.042
Average P-Value:  0.16186731763027906
----------------------
NBSVM
AL
[[46, 29], [14, 420]]
statistic=5.233, p-value=0.022
CDU
[[104, 51], [57, 297]]
statistic=0.333, p-value=0.564
MA
[[125, 40], [100, 244]]
statistic=25.714, p-value=0.000
OTH
[[1, 32], [1, 475]]
statistic=29.121, p-value=0.000
PPE
[[187, 40], [106, 176]]
statistic=29.836, p-value=0.000
TP
[[109, 30], [43, 327]]
statistic=2.315, p-value=0.128
Average P-Value:  0.11899948917491925
----------------------
BiGRU
AL
[[56, 19], [18, 416]]
statistic=0.027, p-value=0.869
CDU
[[120, 35], [23, 331]]
statistic=2.483, p-value=0.115
MA
[[113, 52], [55, 289]]
statistic=0.084, p-value=0.772
OTH
[[