# BERT based COVID-19 Chatbot

In [None]:
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, util, evaluation
from torch.utils.data import DataLoader
import pandas as pd
import datetime

## 1. Load Model

In [None]:
# The pre-trained sentence-transformers checkpoint must already be present
# under ./model/; checkpoints can be downloaded from:
# https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/

# Name of the checkpoint directory to start fine-tuning from
base_model = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(f'./model/{base_model}')

In [None]:
# CSV files read for training and for evaluation
train_data_path = "data/training_data.csv"
eval_data_path = "data/evaluation_data.csv"

# --- Tunable hyperparameters (candidate values listed in brackets) ---

# number of training epochs: [1, 2, 3, 5, 10, 15, 20]
epochs = 3

# training batch size: [32, 64, 128]
# NOTE(review): the current value (16) is not one of the listed candidates
batch_size = 16

# learning rate handed to the optimizer: [2e-01, 2e-03, 2e-05, 2e-07]
learning_rate = 2e-05

# --- Settings kept fixed ---
optimizer_eps = 2e-05
warmup_steps = 100
evaluation_steps = 500


# Output directory for the fine-tuned model; the directory name records the
# base checkpoint plus the epochs, batch size and learning rate of this run.
new_model = f'{base_model}-eps{epochs}-batch{batch_size}-lr{learning_rate}'
model_save_path = f'model/{new_model}'

## 2. Training Data
To represent our training data, we use the InputExample class to store training examples. As parameters, it accepts texts, which is a list of strings representing our pairs (or triplets). Further, we can also pass a label (either float or int). The following shows a simple example, where we pass text pairs to InputExample together with a label indicating the semantic similarity.

In [None]:
# Load the labelled question pairs, keeping only the three columns we need;
# rows with a missing value in any of them are dropped.
train_df = pd.read_csv(train_data_path).loc[:, ['question1', 'question2', 'is_duplicate']].dropna()

# Build a plain Python list of InputExample objects.
# - itertuples() yields the values in column order, so no positional indexing
#   into a string-labelled Series (row[0], row[1], ...) is needed; that form
#   is deprecated in recent pandas releases.
# - A list is indexed by position, whereas a Series produced by
#   DataFrame.apply keeps the DataFrame index, which has gaps after dropna()
#   and is looked up by label when indexed with an integer.
train = [
    InputExample(texts=[question1, question2], label=float(is_duplicate))
    for question1, question2, is_duplicate in train_df.itertuples(index=False, name=None)
]

# Wrap the examples for the model and serve them in shuffled mini-batches.
train_dataset = SentencesDataset(train, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

## 3. Loss Functions
The loss function plays a critical role when fine-tuning the model. It determines how well our embedding model will work for the specific downstream task.

Sadly there is no “one size fits all” loss function. Which loss function is suitable depends on the available training data and on the target task.

To fine-tune our network, we need to tell it which sentence pairs are similar, and should therefore be close in vector space, and which pairs are dissimilar, and should therefore be far apart in vector space.

The simplest way is to have sentence pairs annotated with a score indicating their similarity, e.g. on a scale from 0 to 1. We can then train the network with a Siamese Network Architecture.

For each sentence pair, we pass sentence A and sentence B through our network, which yields the embeddings u and v. The similarity of these embeddings is computed using cosine similarity, and the result is compared to the gold similarity score. This allows our network to be fine-tuned to recognize the similarity of sentences.

We tune the model by calling model.fit(). We pass a list of train_objectives, which consists of tuples (dataloader, loss_function). We can pass more than one tuple in order to perform multi-task learning on several datasets with different loss functions.

In [None]:
# Training objective: cosine-similarity loss over the sentence pairs,
# bound to the model being fine-tuned.
train_loss = losses.CosineSimilarityLoss(model=model)

## 4. Evaluators
During training, we usually want to measure performance to see whether it improves. For this, the sentence_transformers.evaluation package exists. It contains various evaluators which we can pass to the fit method. These evaluators are run periodically during training. Further, they return a score, and only the model with the highest score will be stored on disk.

In [None]:
# Read the evaluation pairs, keeping only the needed columns and discarding
# rows with a missing value in any of them.
eval_columns = ['question1', 'question2', 'is_duplicate']
eval_df = pd.read_csv(eval_data_path)[eval_columns].dropna()

# Parallel lists: first sentence, second sentence, gold label as a float.
sentences1 = list(eval_df['question1'])
sentences2 = list(eval_df['question2'])
scores = list(map(float, eval_df['is_duplicate']))

# Evaluator built from the sentence pairs and their gold scores; it is passed
# to model.fit() so it can be run during training.
evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

# Fine-tune the model on the single (dataloader, loss) objective, running the
# evaluator every `evaluation_steps` steps and writing the model to
# `model_save_path`.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    optimizer_params=dict(correct_bias=False, eps=optimizer_eps, lr=learning_rate),
    evaluator=evaluator,
    evaluation_steps=evaluation_steps,
    output_path=model_save_path,  # save model
)