# Connect to Google Drive and change directory

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive 
drive.mount('/content/drive')
# Make the thesis project folder the working directory; the relative paths
# used by later cells (script location, data_path, output_dir) start from here.
%cd /content/drive/MyDrive/BachelorThesis/

# Prepare data for finetuning
To successfully run the finetuning script on the custom 'cookversational_search' data, the data must be in a specific format:

*   The column that contains the ground truth (the correct class label) is called "label"
*   The column that contains the text to be classified is called "sentence"

Therefore, the data will have the following format:
```
id  |  label  |  sentence
```

Additionally, the training and validation files containing the data should be in CSV or JSON format.

Also make sure that the columns in the CSV file are separated by ','. Other separators like '\t' or ';' might not be processed correctly.

The data is processed according to these requirements in another Jupyter notebook (see `prepare_for_cross_validation.ipynb`).

# Perform required installations 

In [None]:
# Install the `transformers` and `datasets` packages with pip (shell escape).
# NOTE(review): no versions are pinned, so the installed releases depend on
# when this cell is run — consider pinning them for reproducible results.
!pip install transformers datasets

# Run finetuning script
For finetuning BERT on text classification, the [run_glue.py](https://github.com/huggingface/transformers/blob/master/examples/pytorch/text-classification/run_glue.py) script (retrieved on 10.01.2022) from the 🤗 [Hugging Face Transformers library](https://huggingface.co/transformers/) was used in a slightly modified form.

In [None]:
# Number of folds for cross validation.
# The training cell below iterates over range(total_folds) and forwards this
# value to the finetuning script as --total_folds.
total_folds = 10

In [None]:
# For CookBERT use
# --model_name_or_path=CookBERT/further_pretraining/model_output/checkpoint-final
# --output_dir=CookBERT/finetuning_for_downstream_tasks/text_classification/model_output/CookBERT/  + no_context OR 1_prev_utterance

# For FoodBERT use
# --model_name_or_path=otherModels/checkpoint-final
# --output_dir=CookBERT/finetuning_for_downstream_tasks/text_classification/model_output/FoodBERT/  + no_context OR 1_prev_utterance

# For BERT base uncased use
# --model_name_or_path=bert-base-uncased
# --output_dir=CookBERT/finetuning_for_downstream_tasks/text_classification/model_output/bert-base-uncased/  + no_context OR 1_prev_utterance

# Data to use:
# --data_path=datasets/cookversational_search/cookversational_search_no_context.csv     for no context
# --data_path=datasets/cookversational_search/cookversational_search_1_prev_utterance.csv     for context


# TODO:
# - train for several epochs and check whether the class weights need to be adjusted
# - add a function for loading FoodBERT
# - save the results in a separate folder
for fold in range(total_folds):
  !python CookBERT/finetuning_for_downstream_tasks/text_classification/run_classification.py \
  --model_name_or_path=otherModels/checkpoint-final \
  --do_train \
  --do_predict \
  --total_folds=$total_folds \
  --fold=$fold \
  --output_dir=CookBERT/finetuning_for_downstream_tasks/text_classification/model_output/FoodBERT/no_context \
  --overwrite_output_dir=True \
  --data_path=datasets/cookversational_search/cookversational_search_no_context.csv \
  --num_train_epochs=4 \
  --save_strategy=no \
  --learning_rate=2e-5 \
  --per_device_train_batch_size=16 \
  --gradient_accumulation_steps=2 \
  --seed=42 \