In [None]:
# Mount Google Drive so the files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory; the relative paths used further down
# (e.g. model_output/...) are resolved against it.
%cd /content/drive/MyDrive/BachelorThesis/CookBERT/finetuning_for_downstream_tasks/text_classification/

Mounted at /content/drive
/content/drive/MyDrive/BachelorThesis/CookBERT/finetuning_for_downstream_tasks/text_classification


# 2. Load and prepare data

In [None]:
# Install the third-party statistics packages that the import cell relies on
# (scikit_posthocs and researchpy).
# NOTE(review): versions are not pinned — consider pinning them for reproducibility.
!pip install scikit-posthocs researchpy

In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from scipy.stats import f_oneway # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html
import scikit_posthocs as sp # https://scikit-posthocs.readthedocs.io/en/latest/intro/
import researchpy as rp

# Class names of the classification task, in the order in which they are
# passed to classification_report and written to the result table.
labels = [
    "Knowledge",
    "Cooking technique",
    "Amount",
    "Meal",
    "Equipment",
    "Miscellaneous",
    "Ingredient",
    "Preparation",
    "Recipe",
    "Time",
    "Temperature",
]

# confusion_matrix(data.ground_trouth, data.prediction)

  import pandas.util.testing as tm


In [None]:
models = ["CookBERT", "FoodBERT", "bert-base-uncased"]
conditions = ["no_context", "1_prev_utterance"]
folds = 10 # number of folds for cross validation

# Collect one record per (model, condition, fold, class) and build the frame once
# at the end. Growing a DataFrame row by row copies the whole frame on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.
result_columns = ["info_need", "model", "condition", "f1", "precision", "recall"]
result_records = []

for model in models:
  for condition in conditions:
    for fold in range(folds):
      # load single result file of one fold (tab separated, with the columns
      # 'ground_trouth' and 'prediction')
      cl_results = pd.read_csv(f'model_output/{model}/{condition}/predict_results_{fold}.txt', delimiter="\t")
      # per-class precision / recall / f1 of this fold as a nested dict
      cl_report = classification_report(cl_results.ground_trouth, cl_results.prediction, labels=labels, output_dict=True)
      for label in labels:
        result_records.append({
          "info_need": label,
          "model": model,
          "condition": condition,
          "f1": cl_report[label]['f1-score'],
          "precision": cl_report[label]['precision'],
          "recall": cl_report[label]['recall']
        })

# Passing the columns explicitly keeps the expected schema even if no record was collected.
result_df = pd.DataFrame(result_records, columns=result_columns)

In [None]:
result_df # one row per class (11) per fold (10) per condition (2) per model (3)

Unnamed: 0,info_need,model,condition,f1,precision,recall
0,Knowledge,CookBERT,no_context,0.222222,0.250000,0.200000
1,Cooking technique,CookBERT,no_context,0.461538,0.387097,0.571429
2,Amount,CookBERT,no_context,0.947368,0.937500,0.957447
3,Meal,CookBERT,no_context,0.000000,0.000000,0.000000
4,Equipment,CookBERT,no_context,0.250000,0.250000,0.250000
...,...,...,...,...,...,...
655,Ingredient,bert-base-uncased,1_prev_utterance,0.348837,0.750000,0.227273
656,Preparation,bert-base-uncased,1_prev_utterance,0.611940,0.719298,0.532468
657,Recipe,bert-base-uncased,1_prev_utterance,0.576271,0.435897,0.850000
658,Time,bert-base-uncased,1_prev_utterance,0.844444,0.730769,1.000000


# Analysis for condition 'No Context'

In [None]:
# Model comparison restricted to the 'no_context' condition.
no_context_results = result_df[result_df['condition']=='no_context']

# Omnibus test: one-way ANOVA over the F1 samples of the three models.
no_context_model_comparison = f_oneway(*[
    no_context_results.loc[no_context_results['model'] == model_name, 'f1']
    for model_name in ('CookBERT', 'FoodBERT', 'bert-base-uncased')
])
print(no_context_model_comparison)

# Pairwise t-tests between the models, p-values Bonferroni-adjusted.
no_context_model_posthoc = sp.posthoc_ttest(
    no_context_results,
    val_col='f1',
    group_col='model',
    p_adjust='bonferroni',
)
display(no_context_model_posthoc)

# Descriptive statistics of the F1 scores per model.
display(
    rp.summary_cont(
        no_context_results['f1'].groupby(no_context_results['model'])
    )
)

F_onewayResult(statistic=1.5789635379047495, pvalue=0.2077566090518626)


Unnamed: 0,CookBERT,FoodBERT,bert-base-uncased
CookBERT,1.0,0.274533,0.630904
FoodBERT,0.274533,1.0,1.0
bert-base-uncased,0.630904,1.0,1.0






Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CookBERT,110,0.5072,0.2741,0.0261,0.4554,0.559
FoodBERT,110,0.4432,0.286,0.0273,0.3892,0.4973
bert-base-uncased,110,0.4615,0.265,0.0253,0.4115,0.5116


# Analysis for condition '1 previous utterance'

In [None]:
# Model comparison restricted to the '1_prev_utterance' condition.
one_prev_utt_results = result_df[result_df['condition']=='1_prev_utterance']

# Omnibus test: one-way ANOVA over the F1 samples of the three models.
one_prev_utt_model_comparison = f_oneway(*[
    one_prev_utt_results.loc[one_prev_utt_results['model'] == model_name, 'f1']
    for model_name in ('CookBERT', 'FoodBERT', 'bert-base-uncased')
])
print(one_prev_utt_model_comparison)

# Pairwise t-tests between the models, p-values Bonferroni-adjusted.
one_prev_utt_model_posthoc = sp.posthoc_ttest(
    one_prev_utt_results,
    val_col='f1',
    group_col='model',
    p_adjust='bonferroni',
)
display(one_prev_utt_model_posthoc)

# Descriptive statistics of the F1 scores per model.
display(
    rp.summary_cont(
        one_prev_utt_results['f1'].groupby(one_prev_utt_results['model'])
    )
)

F_onewayResult(statistic=8.931954264756968, pvalue=0.00016716720884432833)


Unnamed: 0,CookBERT,FoodBERT,bert-base-uncased
CookBERT,1.0,0.000103,0.062466
FoodBERT,0.000103,1.0,0.177768
bert-base-uncased,0.062466,0.177768,1.0






Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CookBERT,110,0.5405,0.2705,0.0258,0.4893,0.5916
FoodBERT,110,0.3809,0.2885,0.0275,0.3264,0.4355
bert-base-uncased,110,0.4538,0.2814,0.0268,0.4006,0.507


# Analysis of overall model performance

In [None]:
# Model comparison over all results, i.e. both conditions pooled.
# Omnibus test: one-way ANOVA over the F1 samples of the three models.
overall_model_comparison = f_oneway(*[
    result_df.loc[result_df['model'] == model_name, 'f1']
    for model_name in ('CookBERT', 'FoodBERT', 'bert-base-uncased')
])
print(overall_model_comparison)

# Pairwise t-tests between the models, p-values Bonferroni-adjusted.
overall_model_posthoc = sp.posthoc_ttest(
    result_df,
    val_col='f1',
    group_col='model',
    p_adjust='bonferroni',
)
display(overall_model_posthoc)

# Descriptive statistics of the F1 scores per model.
display(
    rp.summary_cont(
        result_df['f1'].groupby(result_df['model'])
    )
)

F_onewayResult(statistic=9.000103877793624, pvalue=0.0001392805954747044)


Unnamed: 0,CookBERT,FoodBERT,bert-base-uncased
CookBERT,1.0,0.000105,0.033649
FoodBERT,0.000105,1.0,0.267011
bert-base-uncased,0.033649,0.267011,1.0






Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CookBERT,220,0.5238,0.2722,0.0184,0.4877,0.56
FoodBERT,220,0.4121,0.2883,0.0194,0.3738,0.4504
bert-base-uncased,220,0.4577,0.2728,0.0184,0.4214,0.4939


# Results after 10-fold cross validation for each model

In [None]:
# Split the combined result table into one frame per model.
cookBERT_results = result_df.loc[result_df['model'].eq('CookBERT')]
bert_base_uncased_results = result_df.loc[result_df['model'].eq('bert-base-uncased')]
foodBERT_results = result_df.loc[result_df['model'].eq('FoodBERT')]

Mean CookBERT:  0.5238433870089669
Mean BERTbase_uncased:  0.4576794496389906
Mean FoodBERT:  0.4120717911622522


## CookBERT

In [None]:
# Mean F1 of CookBERT over all of its result rows.
cookbert_f1 = cookBERT_results['f1']
print("Mean CookBERT: ", cookbert_f1.mean())

# Descriptive statistics of the F1 scores, separately for each condition.
cookbert_f1_by_condition = cookBERT_results.groupby(['condition'])['f1']
rp.summary_cont(cookbert_f1_by_condition)





Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1_prev_utterance,110,0.5405,0.2705,0.0258,0.4893,0.5916
no_context,110,0.5072,0.2741,0.0261,0.4554,0.559


## BERT base uncased

In [None]:
# Mean F1 of bert-base-uncased over all of its result rows.
bert_base_uncased_f1 = bert_base_uncased_results['f1']
print("Mean BERTbase_uncased: ", bert_base_uncased_f1.mean())

# Descriptive statistics of the F1 scores, separately for each condition.
bert_base_uncased_f1_by_condition = bert_base_uncased_results.groupby(['condition'])['f1']
rp.summary_cont(bert_base_uncased_f1_by_condition)





Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1_prev_utterance,110,0.4538,0.2814,0.0268,0.4006,0.507
no_context,110,0.4615,0.265,0.0253,0.4115,0.5116


## FoodBERT

In [None]:
# Mean F1 of FoodBERT over all of its result rows.
foodbert_f1 = foodBERT_results['f1']
print("Mean FoodBERT: ", foodbert_f1.mean())

# Descriptive statistics of the F1 scores, separately for each condition.
foodbert_f1_by_condition = foodBERT_results.groupby(['condition'])['f1']
rp.summary_cont(foodbert_f1_by_condition)





Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1_prev_utterance,110,0.3809,0.2885,0.0275,0.3264,0.4355
no_context,110,0.4432,0.286,0.0273,0.3892,0.4973


# Mean precision, recall and f1 of each model for each condition

In [None]:
# Mean precision, recall and F1 for every (model, condition) combination.
result_df.groupby(['model', 'condition']).agg(
    {metric: ['mean'] for metric in ('precision', 'recall', 'f1')}
)

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean
model,condition,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
CookBERT,1_prev_utterance,0.522574,0.592996,0.540458
CookBERT,no_context,0.485845,0.55645,0.507229
FoodBERT,1_prev_utterance,0.368875,0.444949,0.380941
FoodBERT,no_context,0.42405,0.498075,0.443203
bert-base-uncased,1_prev_utterance,0.462885,0.498406,0.45381
bert-base-uncased,no_context,0.479385,0.486841,0.461549


# Differences between conditions

In [None]:
# Comparison of the two input conditions for CookBERT only.
CookBERT_results = result_df[result_df['model']=='CookBERT']

# One-way ANOVA over the F1 samples of the two conditions.
condition_comparison = f_oneway(*[
    CookBERT_results.loc[CookBERT_results['condition'] == condition_name, 'f1']
    for condition_name in ('1_prev_utterance', 'no_context')
])
print(condition_comparison)

# Pairwise t-test between the conditions, p-values Bonferroni-adjusted.
condition_posthoc = sp.posthoc_ttest(
    CookBERT_results,
    val_col='f1',
    group_col='condition',
    p_adjust='bonferroni',
)
display(condition_posthoc)

# Descriptive statistics of the F1 scores per condition.
display(
    rp.summary_cont(
        CookBERT_results['f1'].groupby(CookBERT_results['condition'])
    )
)

F_onewayResult(statistic=0.8188998154262772, pvalue=0.3665010230982727)


Unnamed: 0,no_context,1_prev_utterance
no_context,1.0,0.366501
1_prev_utterance,0.366501,1.0






Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1_prev_utterance,110,0.5405,0.2705,0.0258,0.4893,0.5916
no_context,110,0.5072,0.2741,0.0261,0.4554,0.559


In [None]:
The one-way analysis of variance (ANOVA) is used to determine whether there are any statistically significant differences between the means of three or more independent (unrelated) groups. This guide will provide a brief introduction to the one-way ANOVA, including the assumptions of the test and when you should use this test. If you are familiar with the one-way ANOVA, but would like to carry out a one-way ANOVA analysis, go to our guide: One-way ANOVA in SPSS Statistics.

What does this test do?
The one-way ANOVA compares the means between the groups you are interested in and determines whether any of those means are statistically significantly different from each other. Specifically, it tests the null hypothesis:

One-way ANOVA Null Hypothesis:

$$H_0: \mu_1 = \mu_2 = \mu_3 = \dots = \mu_k$$

where µ = group mean and k = number of groups. If, however, the one-way ANOVA returns a statistically significant result, we accept the alternative hypothesis (HA), which is that there are at least two group means that are statistically significantly different from each other.

At this point, it is important to realize that the one-way ANOVA is an omnibus test statistic and cannot tell you which specific groups were statistically significantly different from each other, only that at least two groups were. To determine which specific groups differed from each other, you need to use a post hoc test. Post hoc tests are described later in this guide.

