# Statistical analysis of the fine-tuning results

## Load modules

In [5]:
from google.colab import drive
import json as js
import pandas as pd
import random
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

## Set Working directory

In this work, Google Colab was used and therefore Google Drive was also used to load data. If a different environment is used, the working directory must be adapted.

In [6]:
# Mount Google Drive into the Colab filesystem (only works inside a Google Colab runtime).
drive.mount('/content/drive/')
# Switch the working directory so that relative paths such as 'Results.xlsx' resolve inside this folder.
%cd /content/drive/My Drive/Colab Notebooks/

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/Colab Notebooks


This code can be customised to set your own WD.

In [7]:
#import os
#path = "/this/is/my/wd"
#os.chdir(path)

## Load results data

In [8]:
# Read the 'results' sheet and strip stray whitespace from the header names
# so that columns can be addressed by their clean names further down.
df = (
    pd.read_excel('Results.xlsx', sheet_name='results')
    .pipe(lambda frame: frame.set_axis(frame.columns.str.strip(), axis=1))
)
df

Unnamed: 0,Name,pretrained_model,Labels,ANLI_Train,MNLI_Train,NLI Fever_Train,VitaminC_Train,SciFact_Train,DocNLI_Train,SNLI_Train,...,train_batchsize,warmup_ratio,weight_decay,Train_epochs,Train_loss,Validation_loss,Accuracy,Precision,Recall,F1
0,model10_deberta_base_colab,deberta_v3,3,1,1,1,0,0,0,1,...,16,0.1,0.06,3,0.335,0.712672,0.8064,0.7976,0.8064,0.799
1,model10_roberta,roberta_base,2,0,0,0,2,2,0,2,...,16,0.1,0.08,5,0.175,0.723186,0.3718,0.5696,0.7715,0.6554
2,model11_roberta,roberta_base,2,0,0,0,2,2,0,2,...,16,0.1,0.03,3,0.3037,0.591004,0.482,0.4878,0.7186,0.5811
3,model12_deberta_base_colab,deberta_v3,2,0,0,0,0,0,0,1,...,16,0.1,0.06,3,0.0484,1.03833,0.6774,0.6444,0.8222,0.7225
4,model12_roberta,roberta_base,2,0,0,0,2,2,0,2,...,16,0.05,0.05,5,0.694,0.693119,0.4574,0.5873,0.5968,0.592
5,model13_roberta,roberta_base,2,0,0,0,2,2,0,2,...,16,0.15,0.05,5,0.1946,0.68721,0.6078,0.5775,0.8066,0.6731
6,model14_deberta_base_colab,deberta_v3,2,0,0,0,2,2,0,2,...,8,0.1,0.01,3,0.1248,0.528552,0.706,0.6893,0.706,0.6892
7,model14_roberta,roberta_base,3,1,1,1,0,0,0,1,...,16,0.15,0.1,3,0.3633,1.821438,0.326,0.324353,0.326,0.322951
8,model15_deberta_base_colab,deberta_v3,3,0,0,0,1,0,0,0,...,8,0.1,0.01,3,0.2607,0.764862,0.8332,0.830273,0.8332,0.831446
9,model15_roberta,roberta_base,2,0,0,1,0,0,0,0,...,16,0.15,0.01,3,0.2118,1.75739,0.473054,0.467698,0.373054,0.450267


In [9]:
# Checking the unique values in the relevant columns
print(df['pretrained_model'].unique())
print(df['Labels'].unique())
print(df['Dataset'].unique())
print(df['Multiple'].unique())
print(df['Train_epochs'].unique())
print(df['warmup_ratio'].unique())
print(df['train_batchsize'].unique())
print(df['weight_decay'].unique())


['deberta_v3' 'roberta_base' 'bert_base']
[3 2]
['B4' 'B1' 'SNLI' 'VC' 'FEVERNLI' 'B2' 'ANLI' 'B3']
[1 0]
[3 5]
[0.1  0.05 0.15]
[16  8]
[0.06 0.08 0.03 0.05 0.01 0.1  0.04]


Overview of frequencies per variable

In [10]:
# Columns whose level frequencies are reported, in reporting order.
frequency_columns = [
    'Labels',
    'Dataset',
    'Multiple',
    'Train_epochs',
    'pretrained_model',
    'weight_decay',
    'warmup_ratio',
    'train_batchsize',
]

# Build every frequency table up front (a missing column fails before anything is printed).
frequency_tables = {column: df[column].value_counts() for column in frequency_columns}

# Keep the individual tables available under their own names.
labels_counts = frequency_tables['Labels']
dataset_counts = frequency_tables['Dataset']
multiple_counts = frequency_tables['Multiple']
train_epochs_counts = frequency_tables['Train_epochs']
pretrained_model_counts = frequency_tables['pretrained_model']
weight_decay_counts = frequency_tables['weight_decay']
warmup_ratio_counts = frequency_tables['warmup_ratio']
train_batchsize_counts = frequency_tables['train_batchsize']

# Frequency of characteristics
for column, table in frequency_tables.items():
    print(f"\nFrequency of characteristics in '{column}':")
    print(table)



Frequency of characteristics in 'Labels':
Labels
2    20
3    18
Name: count, dtype: int64

Frequency of characteristics in 'Dataset':
Dataset
B1          9
B2          6
VC          4
SNLI        4
ANLI        4
FEVERNLI    4
B3          4
B4          3
Name: count, dtype: int64

Frequency of characteristics in 'Multiple':
Multiple
1    22
0    16
Name: count, dtype: int64

Frequency of characteristics in 'Train_epochs':
Train_epochs
3    23
5    15
Name: count, dtype: int64

Frequency of characteristics in 'pretrained_model':
pretrained_model
roberta_base    15
bert_base       12
deberta_v3      11
Name: count, dtype: int64

Frequency of characteristics in 'weight_decay':
weight_decay
0.06    8
0.01    7
0.08    6
0.10    5
0.05    5
0.04    5
0.03    2
Name: count, dtype: int64

Frequency of characteristics in 'warmup_ratio':
warmup_ratio
0.10    25
0.15     9
0.05     4
Name: count, dtype: int64

Frequency of characteristics in 'train_batchsize':
train_batchsize
16    21
8     17


# ANOVA
In this step, an ANOVA is performed to test whether the mean model accuracy differs significantly between the levels of each selected parameter.

In [11]:
# Factors entering the ANOVA, each listed exactly once. The original formula
# contained C(train_batchsize) twice; the duplicate was redundant and is dropped.
# The order is kept so the rows of the ANOVA table appear in the same sequence.
anova_factors = [
    'Labels',
    'Dataset',
    'Train_epochs',
    'train_batchsize',
    'pretrained_model',
    'Multiple',
    'weight_decay',
    'warmup_ratio',
]

# Wrap every factor in C(...) so that numerically coded columns (e.g. Labels,
# weight_decay) are modelled as categorical levels rather than continuous values.
anova_formula = 'Accuracy ~ ' + ' + '.join(f'C({factor})' for factor in anova_factors)

# Fit an ordinary least squares model and derive the Type II ANOVA table from it.
# NOTE(review): with 38 runs and 22 nominal parameters the residual df should be 16,
# but the table reports 17 - this suggests one factor is collinear with another
# (possibly Multiple with Dataset); confirm before interpreting those two rows.
model = ols(anova_formula, data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print(anova_table)

                       sum_sq    df          F    PR(>F)
C(Labels)            0.010482   1.0   1.503533  0.236843
C(Dataset)           0.131861   7.0   2.702127  0.044556
C(Train_epochs)      0.001715   1.0   0.246060  0.626215
C(train_batchsize)   0.000101   1.0   0.014454  0.905714
C(pretrained_model)  0.221364   2.0  15.876871  0.000129
C(Multiple)          0.043199   1.0   6.196673  0.023455
C(weight_decay)      0.104432   6.0   2.496731  0.064337
C(warmup_ratio)      0.004982   2.0   0.357340  0.704665
Residual             0.118512  17.0        NaN       NaN


# Tukey HSD
A Tukey HSD test is then carried out for the three factors that showed a significant effect in the ANOVA (Dataset, Multiple and pretrained_model).

In [12]:
def run_tukey_hsd(label, groups):
    """Run Tukey's HSD test of Accuracy across `groups`, print it under `label`, and return the result."""
    result = pairwise_tukeyhsd(endog=df['Accuracy'], groups=groups, alpha=0.05)
    print(f"\nTukey's HSD Test '{label}':")
    print(result)
    return result


# Tukey's HSD test for the factor 'Dataset'
tukey_dataset = run_tukey_hsd('Dataset', df['Dataset'])

# Tukey's HSD test for the factor 'Mixed' (stored in the column 'Multiple')
tukey_Mixed = run_tukey_hsd('Mixed', df['Multiple'])

# Tukey's HSD test for the factor 'Pretrained_model'
tukey_pretrained_model = run_tukey_hsd('Pretrained_model', df['pretrained_model'])



Tukey's HSD Test 'Dataset':
  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj   lower  upper  reject
-------------------------------------------------------
    ANLI       B1  -0.1198 0.8766 -0.4111 0.1714  False
    ANLI       B2  -0.1727 0.6276 -0.4855 0.1402  False
    ANLI       B3  -0.0481 0.9998 -0.3909 0.2946  False
    ANLI       B4  -0.0217    1.0 -0.3919 0.3485  False
    ANLI FEVERNLI  -0.1839 0.6586 -0.5267 0.1588  False
    ANLI     SNLI  -0.1416 0.8742 -0.4843 0.2011  False
    ANLI       VC  -0.1309 0.9121 -0.4736 0.2118  False
      B1       B2  -0.0528 0.9971 -0.3083 0.2026  False
      B1       B3   0.0717 0.9918 -0.2196 0.3629  False
      B1       B4   0.0981 0.9727  -0.225 0.4213  False
      B1 FEVERNLI  -0.0641 0.9958 -0.3554 0.2271  False
      B1     SNLI  -0.0218    1.0 -0.3131 0.2695  False
      B1       VC  -0.0111    1.0 -0.3023 0.2802  False
      B2       B3   0.1245 0.8937 -0.1884 0.4374  False
      B2       B4  