In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Mount Google Drive at /content/gdrive (Colab-only API). Later cells read the
# training CSV from, and save model checkpoints to, paths under this mount.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Install the third-party packages imported in the next cell.
# NOTE(review): versions are not pinned, so a fresh runtime may pull releases
# with different APIs than the ones this notebook was written against.
!pip3 install datasets
!pip3 install transformers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm.auto import tqdm  # for showing progress bar
from datasets import load_dataset

import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizerFast

# Pick the compute device in order of preference: CUDA GPU, Apple MPS, CPU.
# `torch.backends.mps` does not exist on older torch builds, so it is looked up
# with getattr to fall through to CPU instead of raising AttributeError.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device('cpu')


In [None]:
# Load the SQuAD training CSV from the mounted Drive folder.
# dtype=str reads every column as a string, and keep_default_na=False keeps
# empty / "NA"-like cells as literal strings instead of converting them to NaN.
# NOTE(review): the file name suggests SQuAD v2.0 with the impossible questions
# removed — confirm against how the CSV was produced.

SQuAD = pd.read_csv("/content/gdrive/MyDrive/bertqa/train-v2.0-no-imposs-q.csv", dtype=str, keep_default_na = False)
print(f'length of dataset: {len(SQuAD)}')

In [None]:
# Replace any NaN in the 'text' column with the literal string 'null'.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is a chained in-place update, which pandas warns about and
# which does not modify the frame at all when Copy-on-Write is enabled.
SQuAD['text'] = SQuAD['text'].fillna('null')

# convert 'answer_start' column to integer type (the CSV was read with dtype=str)
SQuAD['answer_start'] = SQuAD['answer_start'].astype(int)

# create new column 'answers' holding one {'text', 'answer_start'} dict per row
SQuAD['answers'] = SQuAD.apply(lambda row: {'text': row['text'], 'answer_start': row['answer_start']}, axis=1)

# remove the original 'text' and 'answer_start' columns now that they live in 'answers'.
# NOTE: this makes the cell non-idempotent — re-running it without reloading
# the CSV fails because 'text' no longer exists.
SQuAD.drop(['text', 'answer_start'], axis=1, inplace=True)

# Display one sample answer dict as a sanity check (index label 71651).
SQuAD['answers'][71651]

In [None]:
def add_end_idx(answers, contexts):
    """Add an 'answer_end' character index to every answer dict.

    For each (answer, context) pair the end index is computed as
    ``answer_start + len(text)``. If the context slice at that span is not the
    answer text, the span is shifted left by one, then two, characters and the
    first shift that matches is used to correct both 'answer_start' and
    'answer_end'.

    Args:
        answers: iterable of dicts with keys 'text' and 'answer_start'
            (a character offset into the paired context).
        contexts: iterable of context strings, aligned with ``answers``.

    Returns:
        A list containing the same dict objects, each updated in place with
        'answer_end' (exclusive character offset). 'answer_end' is always
        present: when no alignment can be verified the unshifted span is kept,
        so downstream code never hits a missing key.
    """
    # Give tqdm a total when the input is sized; zip() alone has no length.
    total = len(answers) if hasattr(answers, '__len__') else None
    new_answers = []
    # loop through each answer-context pair
    for answer, context in tqdm(zip(answers, contexts), total=total):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = str(answer['text'])
        # we already know the start index
        start_idx = answer['answer_start']
        # and ideally this would be the end index...
        end_idx = start_idx + len(gold_text)

        # Default to the annotated span so 'answer_end' exists even when
        # neither the exact nor a shifted slice matches.
        answer['answer_end'] = end_idx

        # ...however, sometimes squad answers are off by a character or two
        if context[start_idx:end_idx] != gold_text:
            for n in (1, 2):
                shifted_start = start_idx - n
                if shifted_start < 0:
                    # a negative start would wrap around to the end of context
                    break
                if context[shifted_start:end_idx - n] == gold_text:
                    answer['answer_start'] = shifted_start
                    answer['answer_end'] = end_idx - n
                    # keep the smallest correction; do not let n=2 override n=1
                    break
        new_answers.append(answer)
    return new_answers

In [None]:
def prep_data(dataset):
    """Collect questions, contexts and end-indexed answers from ``dataset``.

    ``dataset`` is indexed by the keys 'question', 'context' and 'answers'.
    The answers are passed through ``add_end_idx`` together with the contexts.
    Returns a dict with the keys 'question', 'context' and 'answers'.
    """
    context_col = dataset['context']
    return dict(
        question=dataset['question'],
        context=context_col,
        answers=add_end_idx(dataset['answers'], context_col),
    )

In [None]:
# Build the training dict (question / context / answers with end indices) from
# the whole frame. NOTE: no train/validation split happens here — the
# validation call below is commented out.
dataset = prep_data(SQuAD)
#dataset_validation = prep_data(SQuAD['validation'])
print('{:>5,} training samples'.format(len(dataset['question'])))

In [None]:
# import pyarrow as pa
# import pyarrow.dataset as ds

# # convert DataFrame to Arrow table
# table = pa.Table.from_pandas(SQuAD)

# # create Arrow dataset
# dataset = ds.dataset(table)

In [None]:
#prepare BERT model and tokenizer
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

#print(dataset['answers'][:5])

# tokenize
# The context is passed as the first sequence and the question as the second,
# so character offsets into the context refer to sequence 0 of each encoding.
# padding='max_length' pads every example to the tokenizer's maximum length.
train = tokenizer([str(i) for i in dataset['context'].values],
                  [str(i) for i in dataset['question'].values],
                  add_special_tokens=True,
                  truncation=True,
                  return_attention_mask=True,  # Construct attn. masks.
                  padding='max_length',
                  return_tensors='pt')

#print(tokenizer.decode(train['input_ids'][0])[:855])



In [None]:
def add_token_positions(encodings, answers):
    """Convert character-level answer spans to token positions.

    For every example ``i`` the token index of the first and last answer
    character is looked up with ``encodings.char_to_token`` and stored on
    ``encodings`` under 'start_positions' / 'end_positions'.

    Args:
        encodings: tokenizer output providing ``char_to_token(batch_index,
            char_index)`` and ``update(dict)``; modified in place.
        answers: indexable sequence of dicts with 'answer_start' (inclusive)
            and 'answer_end' (exclusive) character offsets.

    Notes:
        * 'answer_end' is exclusive, so the last answer character is at
          ``answer_end - 1``; looking up ``answer_end`` itself would return the
          token *after* the answer when it is directly followed by punctuation.
        * When the start character has no token (``char_to_token`` returned
          None, e.g. the answer was truncated away), both positions are set to
          ``tokenizer.model_max_length`` so the example carries one consistent
          out-of-range label instead of a real end token paired with an
          out-of-range start.
        * Reads the module-level ``tokenizer`` for ``model_max_length``.
    """
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    out_of_range = tokenizer.model_max_length
    for i in tqdm(range(len(answers))):
        char_start = answers[i]['answer_start']
        char_end = answers[i]['answer_end']

        start_token = encodings.char_to_token(i, char_start)
        if start_token is None:
            # answer start is not covered by any token: mark the whole span
            start_positions.append(out_of_range)
            end_positions.append(out_of_range)
            continue

        # Walk back from the last answer character until one maps to a token.
        # The walk is bounded by char_start, which is known to map to a token,
        # so it can neither run past the answer nor reach a negative index.
        end_token = None
        for char_idx in range(char_end - 1, char_start - 1, -1):
            end_token = encodings.char_to_token(i, char_idx)
            if end_token is not None:
                break
        if end_token is None:
            # empty span (answer_end <= answer_start): collapse onto the start
            end_token = start_token

        start_positions.append(start_token)
        end_positions.append(end_token)
    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})


# Attach token-level start/end positions to the training encodings
# (`train` is updated in place; nothing is returned).
add_token_positions(train, dataset['answers'])

In [None]:
#training
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings.

    ``encodings`` is a mapping from field name (e.g. 'input_ids',
    'start_positions') to a per-example indexable: either a tensor with the
    example on the first axis or a plain Python list. It must contain the key
    'input_ids', which defines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a dict of tensors holding example ``idx`` of every field."""
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning on
                # every item; clone().detach() is the equivalent explicit copy.
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        return item

    def __len__(self):
        # Key lookup works for both BatchEncoding and plain dict encodings.
        return len(self.encodings['input_ids'])

# build the dataset and loader for our training data
train_dataset = SquadDataset(train)

batch_size = 32

loader = torch.utils.data.DataLoader(train_dataset,
                                     batch_size=batch_size,
                                     shuffle=True)

model.to(device)
model.train()
# torch.optim.AdamW is used instead of transformers.AdamW, which is deprecated
# and no longer importable from recent transformers releases. eps and
# weight_decay are set to the values transformers.AdamW used by default
# (torch's own defaults are eps=1e-8, weight_decay=0.01).
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

for epoch in range(4):
    loop = tqdm(loader)
    total_loss = 0.0    # running sum of per-batch losses for this epoch
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)

        # first output is the loss when start/end positions are supplied
        loss = outputs[0]
        loss.backward()
        optim.step()

        # Accumulate a plain float: adding the loss tensor itself would keep
        # every batch's autograd history referenced for the whole epoch and
        # make the printed average a tensor repr instead of a number.
        batch_loss = loss.item()
        total_loss += batch_loss

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=batch_loss)

    avg_train_loss = total_loss / len(loader)
    print("Epochs:" + str(epoch) + ", loss:" + str(avg_train_loss))   #this is average loss
    # save a checkpoint after every epoch, suffixed with the epoch number
    model.save_pretrained('/content/gdrive/MyDrive/bertqa/bert_qa_pt_'+ str(epoch))