In [1]:
# Mount Google Drive into the Colab runtime so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the question-answering fine-tuning folder;
# the result files are opened later via paths relative to this directory.
%cd /content/drive/MyDrive/BachelorThesis/CookBERT/finetuning_for_downstream_tasks/question_answering/

Mounted at /content/drive
/content/drive/MyDrive/BachelorThesis/CookBERT/finetuning_for_downstream_tasks/question_answering


In [2]:
!pip install scikit-posthocs researchpy

Installing collected packages: scikit-posthocs, researchpy
Successfully installed researchpy-0.3.2 scikit-posthocs-0.6.7


In [3]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from scipy.stats import f_oneway # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html
import scikit_posthocs as sp # https://scikit-posthocs.readthedocs.io/en/latest/intro/
import researchpy as rp
import json

  import pandas.util.testing as tm


In [4]:
models = ["CookBERT", "FoodBERT", "bert-base-uncased"]
folds = 10 # number of folds for cross validation


def load_fold_results(models, folds, base_dir='model_output'):
  """Collect the per-fold prediction metrics of every model into one DataFrame.

  For each model and each fold index in ``range(folds)`` the file
  ``{base_dir}/{model}/predict_fold_{fold}_results.json`` is read. Files that
  do not exist are skipped (and reported once at the end); any other problem,
  such as invalid JSON or a missing metric key, is raised instead of being
  silently ignored.

  Args:
    models: iterable of model directory names.
    folds: number of cross-validation folds to look for per model.
    base_dir: directory that contains one sub-directory per model.

  Returns:
    DataFrame with the columns ``model``, ``fold``, ``f1``, ``exact_match``
    and ``samples``, one row per result file found. The columns are present
    even when no file was found.

  Raises:
    KeyError: a result file lacks one of the expected metric keys.
    json.JSONDecodeError: a result file is not valid JSON.
  """
  columns = ['model', 'fold', 'f1', 'exact_match', 'samples']
  records = []
  missing = []
  for model in models:
    for fold in range(folds):
      path = f'{base_dir}/{model}/predict_fold_{fold}_results.json'
      try:
        with open(path) as f:
          fold_result = json.load(f)
      except FileNotFoundError: # if file not found just continue with the next one
        missing.append(path)
        continue
      records.append({
          'model': model,
          'fold': fold,
          'f1': fold_result['test_f1'],
          'exact_match': fold_result['test_exact_match'],
          'samples': fold_result['predict_samples']
      })
  if missing:
    # Make skipped folds visible so an incomplete comparison is not overlooked.
    print(f'Skipped {len(missing)} missing result file(s): {missing}')
  # Build the frame once from all records instead of appending row by row.
  return pd.DataFrame(records, columns=columns)


result_df = load_fold_results(models, folds)

In [None]:
# Show the collected per-fold results (model, fold, f1, exact_match, samples).
result_df

In [7]:
# One-way ANOVA on the per-fold F1 scores of the three models.
f1_scores = result_df['f1']
model_names = result_df['model']
anova = f_oneway(
    f1_scores[model_names == 'FoodBERT'],
    f1_scores[model_names == 'bert-base-uncased'],
    f1_scores[model_names == 'CookBERT'],
)
print(anova)

# Pairwise t-tests between the models with Bonferroni adjustment.
no_context_model_posthoc = sp.posthoc_ttest(
    result_df,
    val_col='f1',
    group_col='model',
    p_adjust='bonferroni',
)
display(no_context_model_posthoc)

# Descriptive statistics of F1 per model (N, mean, SD, SE, 95% interval).
f1_summary = rp.summary_cont(f1_scores.groupby(model_names))
display(f1_summary)

F_onewayResult(statistic=26.18273367584176, pvalue=4.768529935718259e-07)


Unnamed: 0,CookBERT,FoodBERT,bert-base-uncased
CookBERT,1.0,0.000558,0.072057
FoodBERT,0.000558,1.0,3e-06
bert-base-uncased,0.072057,3e-06,1.0






Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CookBERT,10,30.6353,1.5938,0.504,29.4952,31.7754
FoodBERT,10,27.5065,1.3882,0.439,26.5135,28.4996
bert-base-uncased,10,32.3945,1.5987,0.5056,31.2508,33.5382


In [None]:
# Mean exact-match and F1 score per model across all folds.
mean_metrics = {
    'exact_match': ['mean'],
    'f1': ['mean'],
}
result_df.groupby(['model']).agg(mean_metrics)

Unnamed: 0_level_0,exact_match,f1
Unnamed: 0_level_1,mean,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2
CookBERT,12.509578,30.63528
FoodBERT,10.806192,27.506528
bert-base-uncased,14.059678,32.394497


In [None]:
# Descriptive statistics (including the 95% confidence interval) of the
# exact match score, grouped by model.
exact_match_by_model = result_df['exact_match'].groupby(result_df['model'])
display(rp.summary_cont(exact_match_by_model))





Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CookBERT,10,12.5096,1.267,0.4007,11.6032,13.416
FoodBERT,10,10.8062,1.0318,0.3263,10.0681,11.5443
bert-base-uncased,10,14.0597,1.4901,0.4712,12.9938,15.1256
