In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive 
drive.mount('/content/drive')
# Change the working directory to the project folder; the relative paths used
# further down (e.g. 'datasets/doqa/...') are resolved against it.
%cd /content/drive/MyDrive/BachelorThesis/

Mounted at /content/drive
/content/drive/MyDrive/BachelorThesis


In [None]:
# Install the packages this notebook relies on. `datasets` is imported in the
# preprocessing cell; `transformers` and `torch` are presumably required by the
# fine-tuning script -- verify against run_qa.py.
# NOTE(review): versions are unpinned; consider pinning them for reproducibility.
!pip install datasets transformers torch

# DoQA Dataset Overview
For finetuning our BERT models for the task of question answering, the DoQA dataset is used. This section gives a brief overview of the data and its structure.

The DoQA dataset contains 2,437 information-seeking question/answer dialogues (10,917 questions in total) from three different domains: cooking, travel and movies. Since 'travel' and 'movies' lie outside the cooking domain targeted here, only the cooking data is of interest.

The dataset consists of 4612 train, 911 validation and 1797 test samples. All three splits contain the following relevant information:
> ***context*** (string): The context from which the answer to a question has to be extracted. Note that the string "CANNOTANSWER" is appended to the end of every context; this is what the system should predict whenever the context contains no answer to the question. <br>
> ***question*** (string): Question to a specific context <br>
> ***answers*** (dict): includes 'answer_start' (int), the index at which the answer starts in the context, and 'text' (string), the passage extracted from the context that answers the question.<br>
> 





First, the data is preprocessed and converted to the format expected by the finetuning script used below.

In [None]:
import json
from datasets import load_dataset, DatasetDict, Dataset
# Load the three DoQA cooking splits and write one JSON file with all of them
# concatenated (train, then dev, then test). `field="data"` tells the JSON
# loader to read the records stored under the top-level "data" key.
train = load_dataset('json', data_files='datasets/doqa/doqa-cooking-train.json', field="data")
valid = load_dataset('json', data_files='datasets/doqa/doqa-cooking-dev.json', field="data")
test = load_dataset('json', data_files='datasets/doqa/doqa-cooking-test.json', field="data")
datasets = [train, valid, test]


def _flatten_doqa(paragraph_groups):
    """Flatten nested DoQA records into parallel column lists.

    Args:
        paragraph_groups: iterable whose items are lists of paragraph dicts.
            Each paragraph dict provides a 'context' and a 'qas' list; each
            entry of 'qas' provides 'id', 'question' and a non-empty 'answers'
            list of dicts with 'text' and 'answer_start'.

    Returns:
        dict with the keys 'id', 'context', 'question' and 'answers'. All four
        lists have one entry per question; every 'answers' entry has the form
        {'text': [str], 'answer_start': [int]} and holds only the first
        answer listed for that question.

    Raises:
        IndexError: if a question has an empty 'answers' list.
    """
    flat = {
        'id': [],
        'context': [],
        'question': [],
        'answers': []
    }
    for paragraphs in paragraph_groups:
        # Visit every paragraph of an entry, not just the first one, so that
        # nothing is dropped silently if an entry holds more than one.
        for paragraph in paragraphs:
            context = paragraph['context']
            for qa_pair in paragraph['qas']:
                first_answer = qa_pair['answers'][0]
                flat['id'].append(qa_pair['id'])
                flat['context'].append(context)
                flat['question'].append(qa_pair['question'])
                # Wrap the single answer in one-element lists.
                flat['answers'].append({
                    'text': [first_answer['text']],
                    'answer_start': [first_answer['answer_start']],
                })
    return flat


# Each file was loaded without a split name, so its records are accessed under
# the 'train' key regardless of which DoQA split the file contains.
doqa = _flatten_doqa(
    paragraphs
    for split in datasets
    for paragraphs in split['train']['paragraphs']
)

# save the preprocessed data to file
with open('datasets/doqa/doqa_all.json', 'w', encoding='utf-8') as f:
    json.dump(doqa, f)

Using custom data configuration default-b6eafaf9c57b4386
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-b6eafaf9c57b4386/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-117a9f1774627c2d
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-117a9f1774627c2d/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-d390de6fc2b93603
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-d390de6fc2b93603/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/1 [00:00<?, ?it/s]

# Run Finetuning script

To finetune our BERT models for question answering, the [run_qa.py](https://github.com/huggingface/transformers/blob/master/examples/pytorch/question-answering/run_qa.py) script from the 🤗 Hugging Face Transformers library is used in a slightly adjusted form, so that the dataset is preprocessed as mentioned above.

Note: **model_name_or_path** and **output_dir** have to be modified depending on the model to be used.

In [None]:
# When using CookBERT
# --model_name_or_path=CookBERT/further_pretraining/model_output/checkpoint-final
# --output_dir=CookBERT/finetuning_for_downstream_tasks/question_answering/model_output/CookBERT

# When using FoodBERT
# --model_name_or_path=otherModels/checkpoint-final
# --output_dir=CookBERT/finetuning_for_downstream_tasks/question_answering/model_output/FoodBERT

# When using BERT base uncased
# --model_name_or_path=bert-base-uncased
# --output_dir=CookBERT/finetuning_for_downstream_tasks/question_answering/model_output/bert-base-uncased

num_folds = 10

for fold in range(num_folds):
  !python CookBERT/finetuning_for_downstream_tasks/question_answering/run_qa.py \
  --model_name_or_path=CookBERT/further_pretraining/model_output/checkpoint-final \
  --train_file=datasets/doqa/doqa_all.json \
  --output_dir=CookBERT/finetuning_for_downstream_tasks/question_answering/model_output/CookBERT \
  --overwrite_output_dir=True \
  --fold=$fold \
  --total_folds=$num_folds \
  --do_train \
  --do_predict \
  --per_device_train_batch_size=16 \
  --gradient_accumulation_steps=2 \
  --learning_rate=5e-5 \
  --num_train_epochs=3 \
  --seed=42 \