In [None]:
# Reference: https://towardsdatascience.com/masked-language-modelling-with-bert-7d49793e5d2c
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Tokenizer and masked-LM model are both loaded from the same pretrained checkpoint.
CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(CHECKPOINT)
model = BertForMaskedLM.from_pretrained(CHECKPOINT)

In [None]:
import pandas as pd
import re
import tensorflow as tf

# Path to the processed MedQuAD CSV file.
path = "../input/medquad-dataset/ProcessedData.csv"
df = pd.read_csv(path)

# Keep only usable answer texts: empty CSV cells are read as NaN (a float), which
# would break the string handling further down (`answer.lower()`, tokenization),
# so drop them and make sure every remaining entry is a `str`.
answers = df['Answers'].dropna().astype(str).tolist()

In [None]:
# Build the set of distinct lower-cased, punctuation-free words found in the answers.

# Punctuation to strip, compiled once instead of on every iteration.
# Raw string: the previous non-raw literal contained the invalid escape sequence
# "\^"; the set of characters removed is exactly the same.
PUNCTUATION_RE = re.compile("[%s]" % re.escape(r"!#$%&'()*+,-./:;<=>?@\^_`{|}~"))

unique_words = set()
for answer in answers:
    answer = PUNCTUATION_RE.sub("", answer.lower())
    # Tabs are deleted (not treated as separators), as the previous regex did.
    answer = answer.replace("\t", "")
    # split() without an argument splits on any run of whitespace, so repeated
    # spaces and newlines no longer produce "" or multi-line "words".
    words = answer.split()
    unique_words.update(words)

print("Number of unique words", len(unique_words))

In [None]:
# Peek at 10 of the collected words (unique_words is a set, so which 10 is arbitrary).
list(unique_words)[:10]

In [None]:
# Register the corpus words as extra vocabulary entries.
# `add_tokens` expects a string or a list of strings, so the set is converted to a
# list first. Blank entries are skipped, and the list is sorted so that the order
# in which tokens are added does not depend on set iteration order.
new_tokens = sorted(word for word in unique_words if word.strip())
num_added_toks = tokenizer.add_tokens(new_tokens)
print('We have added', num_added_toks, 'tokens')

In [None]:
# resize_token_embeddings expects the full size of the new vocabulary,
# i.e. the length of the tokenizer.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

In [None]:
# Encode every answer as PyTorch tensors, truncated / padded to max_length=512.
inputs = tokenizer(
    answers,
    max_length=512,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)

In [None]:
# Inspect the tokenizer output.
inputs

In [None]:
# Keep an independent copy of the token ids to serve as the training labels.
inputs['labels'] = inputs['input_ids'].clone().detach()

In [None]:
# List the fields now held in the encoding (a 'labels' entry was added above).
inputs.keys()

In [None]:
# Probability with which each eligible token is selected for masking.
MASK_PROB = 0.15

# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)

# create mask array: a position is selected when its random draw falls below
# MASK_PROB and it does not hold a special token. The [CLS] / [SEP] / [PAD] ids
# are looked up on the tokenizer instead of being hard-coded as 101 / 102 / 0,
# so the mask stays correct if a different checkpoint is loaded.
mask_arr = (
    (rand < MASK_PROB)
    & (inputs.input_ids != tokenizer.cls_token_id)
    & (inputs.input_ids != tokenizer.sep_token_id)
    & (inputs.input_ids != tokenizer.pad_token_id)
)

In [None]:
# Inspect the boolean mask (True marks a position selected for masking).
mask_arr

In [None]:
# For each individual sequence, collect the indices of the True values in mask_arr.
selection = [
    torch.flatten(mask_arr[i].nonzero()).tolist()
    for i in range(inputs.input_ids.shape[0])
]

In [None]:
# Positions selected for masking in the first sequence.
selection[0]

In [None]:
# Replace every selected position in input_ids with the [MASK] token id. The id is
# taken from the tokenizer rather than hard-coded as 103, and the boolean mask is
# applied in one vectorised assignment instead of a per-row loop.
inputs['input_ids'][mask_arr] = tokenizer.mask_token_id

# Only the masked positions should contribute to the masked-LM loss. The model's
# loss ignores positions whose label is -100, so every position that was NOT
# selected for masking (padding and special tokens included) is set to -100.
inputs['labels'][~mask_arr] = -100

In [None]:
# Inspect the token ids after masking.
inputs.input_ids

In [None]:
'''
    The value 103 (the id of the MASK token) now appears at every position where the
    mask_arr tensor is True.
    The inputs tensors are now ready — and we can begin setting them up to be fed into our model
    during training.
    During training, we’ll be using a PyTorch DataLoader to load our data. To use this, we’ll 
    need to format our data into a PyTorch Dataset object.
'''

In [None]:
class MeditationsDataset(torch.utils.data.Dataset):
    """Map-style dataset over a tokenizer encoding.

    `encodings` is a mapping from field name (e.g. 'input_ids') to a container
    indexable by sample (a tensor or a nested list). Item `idx` is a dict with
    the same keys, each holding the `idx`-th row as a tensor.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, one entry per field."""
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # Copy-constructing with torch.tensor(tensor) emits a UserWarning;
                # clone().detach() is the recommended way to copy a tensor.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        """Return the number of samples, i.e. the number of rows in 'input_ids'."""
        # Key lookup (rather than attribute access) also works for a plain dict.
        return len(self.encodings['input_ids'])

In [None]:
# Wrap the prepared encodings in the Dataset class so a DataLoader can consume them.
dataset = MeditationsDataset(inputs)

In [None]:
# Batches of 10: a batch size of 16 ran into "CUDA out of memory". Other possible ways to handle that:
# https://stackoverflow.com/questions/59129812/how-to-avoid-cuda-out-of-memory-in-pytorch
loader = torch.utils.data.DataLoader(
    dataset,
    shuffle=True,
    batch_size=10,
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# move the model onto the chosen device
model.to(device)
# switch the model to training mode
model.train()

In [None]:
# `transformers.AdamW` is deprecated in favour of PyTorch's own implementation
# (and has been dropped from newer transformers releases), so import it from torch.
from torch.optim import AdamW

# initialize optimizer
# eps and weight_decay are pinned to the defaults of the former transformers
# implementation, because the torch.optim.AdamW defaults differ
# (eps=1e-8, weight_decay=0.01).
optim = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Now we’re finally set up — we can begin training! We format this as a typical training loop in PyTorch.

from tqdm import tqdm  # for our progress bar

epochs = 3

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # reset the gradients left over from the previous step
        optim.zero_grad()

        # move the tensors needed for this step onto the training device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # forward pass; labels are passed in so the output carries the loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)

        # extract loss
        loss = outputs.loss

        # backpropagate: compute the gradients of the loss w.r.t. the parameters
        loss.backward()

        # update parameters
        optim.step()

        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

In [None]:
# Save the tokenizer under "tokenizer_saved".
#
# Author's note from an earlier experiment with "distilbert-base-multilingual-cased":
# after tokenizer.save_pretrained("./models/tokenizer/"), reloading with
# AutoTokenizer.from_pretrained("./models/tokenizer/") did not work, whereas
# DistilBertTokenizer.from_pretrained("./models/tokenizer/") did.
# NOTE(review): presumably the tokenizer saved here should likewise be reloaded with
# its concrete class (BertTokenizer) — confirm when loading.
tokenizer.save_pretrained("tokenizer_saved")

In [None]:
# Save the trained model under "model_medquad".
model.save_pretrained("model_medquad")