# Connect to Google Drive

In [None]:
# Mount Google Drive inside the Colab runtime under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Change the working directory to the NER fine-tuning folder; the result files
# are opened later via a path relative to this directory ('model_output/...').
%cd /content/drive/MyDrive/BachelorThesis/CookBERT/finetuning_for_downstream_tasks/named_entity_recognition/

Mounted at /content/drive
/content/drive/MyDrive/BachelorThesis/CookBERT/finetuning_for_downstream_tasks/named_entity_recognition


# Installations and Imports

In [None]:
!pip install scikit-posthocs researchpy



In [None]:
import pandas as pd
import json
from scipy.stats import f_oneway # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html
import scikit_posthocs as sp # https://scikit-posthocs.readthedocs.io/en/latest/intro/
import researchpy as rp

# Preparations for Evaluation

## Load the Data

In [None]:
models = ["CookBERT", "bert-base-uncased", "FoodBERT"]
tasks = ["food-classification", "foodon", "hansard-closest", "hansard-parent", "snomedct"]
num_folds = 10 # number of cross validation folds


def load_cv_results(models, tasks, num_folds, base_dir='model_output'):
  """Collect the per-fold prediction metrics of every model/task pair into one DataFrame.

  For each combination the file
  ``{base_dir}/{model}/{task}/predict_fold_{fold}_results.json`` is read and its
  ``predict_f1``, ``predict_precision``, ``predict_recall`` and ``predict_accuracy``
  entries are stored as one row.

  Args:
    models: iterable of model directory names.
    tasks: iterable of task directory names.
    num_folds: number of cross-validation folds; folds ``0 .. num_folds - 1`` are read.
    base_dir: directory that contains the per-model result folders.

  Returns:
    DataFrame with the columns model, task, fold, f1, precision, recall, accuracy
    (one row per result file found; empty but with these columns if none is found).

  Raises:
    json.JSONDecodeError: if a result file exists but is not valid JSON.
    KeyError: if a result file lacks one of the expected metric entries.
  """
  columns = ['model', 'task', 'fold', 'f1', 'precision', 'recall', 'accuracy']
  records = []
  missing = []
  for model in models:
    for task in tasks:
      for fold in range(num_folds):
        path = f'{base_dir}/{model}/{task}/predict_fold_{fold}_results.json'
        try:
          with open(path) as f:
            fold_result = json.load(f)
        except FileNotFoundError:
          # A fold without a result file is skipped, but only this error is tolerated:
          # corrupt or incomplete files must not silently vanish from the statistics.
          missing.append(path)
          continue
        records.append({
            'model': model,
            'task': task,
            'fold': fold,
            'f1': fold_result['predict_f1'],
            'precision': fold_result['predict_precision'],
            'recall': fold_result['predict_recall'],
            'accuracy': fold_result['predict_accuracy']
        })
  if missing:
    print(f'Skipped {len(missing)} missing result file(s), e.g. {missing[0]}')
  # Build the frame once from all records instead of appending row by row
  # (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
  return pd.DataFrame(records, columns=columns)


# join all the cross-validation results in one dataframe
result_df = load_cv_results(models, tasks, num_folds)
result_df

Unnamed: 0,model,task,fold,f1,precision,recall,accuracy
0,CookBERT,food-classification,0.0,0.948454,0.929787,0.967885,0.987716
1,CookBERT,food-classification,1.0,0.951709,0.930998,0.973363,0.988292
2,CookBERT,food-classification,2.0,0.943590,0.925553,0.962343,0.986926
3,CookBERT,food-classification,3.0,0.939956,0.916933,0.964166,0.989171
4,CookBERT,food-classification,4.0,0.937750,0.918345,0.957993,0.987805
...,...,...,...,...,...,...,...
145,FoodBERT,snomedct,5.0,0.510009,0.538603,0.484298,0.956021
146,FoodBERT,snomedct,6.0,0.540670,0.526398,0.555738,0.960843
147,FoodBERT,snomedct,7.0,0.542158,0.546618,0.537770,0.961407
148,FoodBERT,snomedct,8.0,0.538653,0.537313,0.540000,0.960850


## Methods for Evaluation

In [None]:
# method for evaluating the data for a certain task, for all tasks ("all"), or for all
# ner-tasks excluding the food-nofood classification ("ner-only")
# includes one-way anova and posthoc t-test to compare all models present in the data
def evaluate_for_task(df_with_all_results, task="all", metric="f1"):
  """Run and display the statistical comparison of the models for one selection of tasks.

  Displays, in this order: a one-way ANOVA over the `metric` values of every model
  in the selection, the Bonferroni-adjusted post-hoc t-test matrix, and a per-model
  summary of the metric.

  Args:
    df_with_all_results: DataFrame with at least the columns 'model', 'task' and `metric`.
    task: "all" keeps every row, "ner-only" drops the rows whose task is
      "food-classification", any other value keeps only the rows of that task.
    metric: name of the column that is compared between the models.

  Raises:
    ValueError: if the selection is empty or contains fewer than two models.
  """
  if task == "all":
    task_results = df_with_all_results
  elif task == "ner-only":
    task_results = df_with_all_results[df_with_all_results['task'] != "food-classification"]
  else:
    task_results = df_with_all_results[df_with_all_results['task'] == task]
  if task_results.empty:
    raise ValueError(f"No results found for task {task!r}")

  # One sample per model. The omnibus test must cover the same groups as the
  # post-hoc test below; previously the FoodBERT sample was commented out, so the
  # ANOVA compared only two of the three models.
  samples = [scores for _, scores in task_results.groupby('model', sort=False)[metric]]
  if len(samples) < 2:
    raise ValueError("At least two models are required for the comparison")
  anova = f_oneway(*samples)
  print("Anova: ")
  display(anova)
  posthoc = sp.posthoc_ttest(task_results, val_col=metric, group_col='model',p_adjust='bonferroni')
  print("\n\nPosthoc: ")
  display(posthoc)
  print("\n\nSummary: ")
  display(rp.summary_cont(task_results[metric].groupby(task_results['model'])))

--- 
# Evaluation
---
Notes:
- The tasks/different tagging styles used for evaluation were proposed in the paper "[A Fine-Tuned Bidirectional Encoder Representations From Transformers Model for Food Named-Entity Recognition: Algorithm Development and Validation](https://www.researchgate.net/publication/353789336_A_Fine-Tuned_Bidirectional_Encoder_Representations_From_Transformers_Model_for_Food_Named-Entity_Recognition_Algorithm_Development_and_Validation)" by Riste Stojanov, Gorjan Popovski, Gjorgjina Cenikj, Barbara Koroušić Seljak, Tome Eftimov (2021). Their procedure was adopted for this work. Each "task" section below starts with a short description of the task (the descriptions were taken from their paper).
- The authors also published the FoodBase corpus tagged with these 5 different ontologies they used (see [here](https://github.com/ds4food/FoodNer)).

## Task-specific Performance

### Food-classification
This task distinguishes food entities from non-food entities. All food phrases annotated in FoodBase were labeled with the tag FOOD.

In [None]:
# Statistical evaluation restricted to the "food-classification" rows.
evaluate_for_task(result_df, task="food-classification")

Anova: 


F_onewayResult(statistic=26.65751631768714, pvalue=6.531706693863314e-05)



Posthoc: 


Unnamed: 0,CookBERT,bert-base-uncased,FoodBERT
CookBERT,1.0,0.0001959512,1.372926e-11
bert-base-uncased,0.0001959512,1.0,4.758076e-09
FoodBERT,1.372926e-11,4.758076e-09,1.0




Summary: 




Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CookBERT,10,0.9447,0.0041,0.0013,0.9417,0.9476
FoodBERT,10,0.8953,0.0089,0.0028,0.889,0.9017
bert-base-uncased,10,0.9329,0.0059,0.0019,0.9287,0.9371


### Foodon
This was performed for distinguishing 205 classes, where the classes are semantic tags from the [FoodOn ontology](https://foodon.org/). For each food phrase in FoodBase, the corresponding FoodOn class was selected based on the [FoodOntoMap](https://zenodo.org/record/2635437#.YhuGoejMJjV) mappings.


In [None]:
# Statistical evaluation restricted to the "foodon" rows.
evaluate_for_task(result_df, task="foodon")

Anova: 


F_onewayResult(statistic=17.48208324741538, pvalue=0.0005612282667801357)



Posthoc: 


Unnamed: 0,CookBERT,bert-base-uncased,FoodBERT
CookBERT,1.0,0.001684,3.122529e-07
bert-base-uncased,0.001683685,1.0,0.000112924
FoodBERT,3.122529e-07,0.000113,1.0




Summary: 




Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CookBERT,10,0.7342,0.0211,0.0067,0.7191,0.7493
FoodBERT,10,0.5985,0.0459,0.0145,0.5656,0.6313
bert-base-uncased,10,0.6894,0.0265,0.0084,0.6704,0.7083


### Hansard-closest
This was performed for distinguishing 92 classes from the [Hansard hierarchy](https://www.english-corpora.org/hansard/). In this task, for each
food phrase in FoodBase, the closest Hansard tag to the food phrase was chosen as annotation. The closest tag was selected using the minimum cosine distance between the BERT embedding of the food phrase and the BERT embeddings of the Hansard tag labels.

In [None]:
# Statistical evaluation restricted to the "hansard-closest" rows.
evaluate_for_task(result_df, task="hansard-closest")

### Hansard-parent
This was performed for distinguishing 48 classes from the [Hansard corpus](https://www.english-corpora.org/hansard/). In this task, the parent semantic tags from the Hansard hierarchy that correspond to the food phrases in FoodBase were selected. In cases with multiple different parent tags present for the food phrase, the first occurring parent was selected.

In [None]:
# Statistical evaluation restricted to the "hansard-parent" rows.
evaluate_for_task(result_df, task="hansard-parent")

Anova: 


F_onewayResult(statistic=6.235986592060603, pvalue=0.022438411653473433)



Posthoc: 


Unnamed: 0,CookBERT,bert-base-uncased,FoodBERT
CookBERT,1.0,0.06731523,1.25969e-09
bert-base-uncased,0.06731523,1.0,3.26943e-09
FoodBERT,1.25969e-09,3.26943e-09,1.0




Summary: 




Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CookBERT,10,0.8583,0.0159,0.005,0.8469,0.8697
FoodBERT,10,0.7401,0.0263,0.0083,0.7213,0.759
bert-base-uncased,10,0.8431,0.0108,0.0034,0.8354,0.8508


### Snomedct
This was performed for distinguishing 207 classes, where the classes are semantic tags from the SNOMED CT ontology. In this task, [FoodOntoMap](https://zenodo.org/record/2635437#.YhuGoejMJjV) was used to obtain the SNOMED CT class for the food phrase.

In [None]:
# Statistical evaluation restricted to the "snomedct" rows.
evaluate_for_task(result_df, task="snomedct")

Anova: 


F_onewayResult(statistic=17.42896102117906, pvalue=0.0005692135069610121)



Posthoc: 


Unnamed: 0,CookBERT,bert-base-uncased,FoodBERT
CookBERT,1.0,0.001708,6.706296e-09
bert-base-uncased,0.001707641,1.0,1.258617e-06
FoodBERT,6.706296e-09,1e-06,1.0




Summary: 




Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CookBERT,10,0.7187,0.0263,0.0083,0.6999,0.7375
FoodBERT,10,0.5267,0.0489,0.0155,0.4917,0.5617
bert-base-uncased,10,0.6662,0.0298,0.0094,0.6449,0.6875


## Overall Performance

### All Tasks
Overall performance of the three models over all tasks, food-classification included

In [None]:
# Statistical evaluation over the rows of every task.
evaluate_for_task(result_df, task="all")

        model                 task  fold  ...  precision    recall  accuracy
0    CookBERT  food-classification   0.0  ...   0.929787  0.967885  0.987716
1    CookBERT  food-classification   1.0  ...   0.930998  0.973363  0.988292
2    CookBERT  food-classification   2.0  ...   0.925553  0.962343  0.986926
3    CookBERT  food-classification   3.0  ...   0.916933  0.964166  0.989171
4    CookBERT  food-classification   4.0  ...   0.918345  0.957993  0.987805
..        ...                  ...   ...  ...        ...       ...       ...
145  FoodBERT             snomedct   5.0  ...   0.538603  0.484298  0.956021
146  FoodBERT             snomedct   6.0  ...   0.526398  0.555738  0.960843
147  FoodBERT             snomedct   7.0  ...   0.546618  0.537770  0.961407
148  FoodBERT             snomedct   8.0  ...   0.537313  0.540000  0.960850
149  FoodBERT             snomedct   9.0  ...   0.506000  0.440767  0.955222

[150 rows x 7 columns]
Anova: 


F_onewayResult(statistic=1.9665411122982266, pvalue=0.16397526965986317)



Posthoc: 


Unnamed: 0,CookBERT,bert-base-uncased,FoodBERT
CookBERT,1.0,0.491926,1e-06
bert-base-uncased,0.491926,1.0,0.000259
FoodBERT,1e-06,0.000259,1.0




Summary: 




Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CookBERT,50,0.8033,0.0885,0.0125,0.7782,0.8285
FoodBERT,50,0.6787,0.1344,0.019,0.6405,0.7169
bert-base-uncased,50,0.7765,0.1022,0.0144,0.7475,0.8055


### All NER-Tasks
Overall performance of the three models over all NER tasks, food-classification excluded

In [None]:
# Statistical evaluation over every task except "food-classification".
evaluate_for_task(result_df, task="ner-only")

        model      task  fold        f1  precision    recall  accuracy
10   CookBERT    foodon   0.0  0.729167   0.690141  0.772871  0.974456
11   CookBERT    foodon   1.0  0.742456   0.710067  0.777941  0.976014
12   CookBERT    foodon   2.0  0.740437   0.705729  0.778736  0.976394
13   CookBERT    foodon   3.0  0.734177   0.700671  0.771049  0.977993
14   CookBERT    foodon   4.0  0.691843   0.653352  0.735152  0.976065
..        ...       ...   ...       ...        ...       ...       ...
145  FoodBERT  snomedct   5.0  0.510009   0.538603  0.484298  0.956021
146  FoodBERT  snomedct   6.0  0.540670   0.526398  0.555738  0.960843
147  FoodBERT  snomedct   7.0  0.542158   0.546618  0.537770  0.961407
148  FoodBERT  snomedct   8.0  0.538653   0.537313  0.540000  0.960850
149  FoodBERT  snomedct   9.0  0.471136   0.506000  0.440767  0.955222

[120 rows x 7 columns]
Anova: 


F_onewayResult(statistic=4.294333786785217, pvalue=0.04154372718078663)



Posthoc: 


Unnamed: 0,CookBERT,bert-base-uncased,FoodBERT
CookBERT,1.0,0.1246312,1.76942e-12
bert-base-uncased,0.1246312,1.0,5.224562e-08
FoodBERT,1.76942e-12,5.224562e-08,1.0




Summary: 




Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CookBERT,40,0.768,0.0586,0.0093,0.7492,0.7867
FoodBERT,40,0.6245,0.0874,0.0138,0.5966,0.6525
bert-base-uncased,40,0.7374,0.0726,0.0115,0.7142,0.7606


## Other Metrics

Macro average performance for each model for each task

In [None]:
# Mean of each metric per (model, task) pair.
# Accuracy is not really meaningful, since most tags are 'O'.
display(
    result_df
    .groupby(['model', 'task'])
    .agg({column: ['mean'] for column in ('precision', 'recall', 'f1', 'accuracy')})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1,accuracy
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
model,task,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
CookBERT,food-classification,0.925051,0.965155,0.944658,0.987812
CookBERT,foodon,0.697472,0.775149,0.734187,0.977088
CookBERT,hansard-closest,0.722139,0.804076,0.760797,0.971416
CookBERT,hansard-parent,0.827239,0.89177,0.858264,0.980141
CookBERT,snomedct,0.685762,0.755097,0.718701,0.977392
FoodBERT,food-classification,0.852819,0.942361,0.89534,0.979157
FoodBERT,foodon,0.587341,0.610286,0.59849,0.961503
FoodBERT,hansard-closest,0.595473,0.675194,0.632803,0.952843
FoodBERT,hansard-parent,0.684143,0.80622,0.740142,0.963621
FoodBERT,snomedct,0.536348,0.518386,0.526688,0.958369


Macro average performance for each model over all tasks

In [None]:
# Mean of each metric per model, taken over the rows of all tasks.
# Accuracy is not really meaningful, since most tags are 'O'.
(
    result_df
    .groupby(['model'])
    .agg({column: ['mean'] for column in ('precision', 'recall', 'f1', 'accuracy')})
)

Unnamed: 0_level_0,precision,recall,f1,accuracy
Unnamed: 0_level_1,mean,mean,mean,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
CookBERT,0.771533,0.838249,0.803321,0.97877
FoodBERT,0.651225,0.710489,0.678693,0.963099
bert-base-uncased,0.740186,0.816943,0.776514,0.975685
