# Installing Modules

In [1]:
!pip install -qq pytorch_lightning==1.4.0 tableprint
!pip install -qq datasets transformers[sentencepiece]

[K     |████████████████████████████████| 913 kB 11.2 MB/s 
[K     |████████████████████████████████| 5.6 MB 27.9 MB/s 
[K     |████████████████████████████████| 636 kB 35.7 MB/s 
[K     |████████████████████████████████| 829 kB 43.4 MB/s 
[K     |████████████████████████████████| 272 kB 42.0 MB/s 
[K     |████████████████████████████████| 118 kB 48.1 MB/s 
[K     |████████████████████████████████| 1.3 MB 43.4 MB/s 
[K     |████████████████████████████████| 294 kB 46.6 MB/s 
[K     |████████████████████████████████| 142 kB 50.3 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 264 kB 13.2 MB/s 
[K     |████████████████████████████████| 2.6 MB 44.2 MB/s 
[K     |████████████████████████████████| 43 kB 1.9 MB/s 
[K     |████████████████████████████████| 243 kB 50.7 MB/s 
[K     |████████████████████████████████| 76 kB 4.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 51.2 MB/s 
[K     |██████████████

# Imports

In [2]:
##############
### Basics ###
##############
import numpy as np
import pandas as pd
import random
import math, time

##############
### Torch  ###
##############
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

import pytorch_lightning as pl
import torchmetrics
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint

##############
#### Plots ###
##############
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import matplotlib.ticker as ticker

##################
## Transformers ##
##################
import transformers
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset, load_metric

##############
#### Misc ####
##############
import tableprint as tp


In [3]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
model_checkpoint = "bert-base-uncased"

# Loading the Data

In [6]:
datasets = load_dataset("squad_v2")
datasets

Downloading:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


Downloading:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/801k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [7]:
print("Column names:")
print(datasets.column_names)

Column names:
{'train': ['id', 'title', 'context', 'question', 'answers'], 'validation': ['id', 'title', 'context', 'question', 'answers']}


In [8]:
# Sample from dataset
datasets["train"][0]

{'answers': {'answer_start': [269], 'text': ['in the late 1990s']},
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'id': '56be85543aeaaa14008c9063',
 'question': 'When did Beyonce start becoming popular?',
 'title': 'Beyoncé'}

# Building the Dataset

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

The following code ensures that we are using a Fast Tokenizer.

In [10]:
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [11]:
len(datasets['train'])

130319

In [12]:
type(datasets['train'])

datasets.arrow_dataset.Dataset

In [13]:
train_dataset_filtered = datasets['train'].filter(lambda x: x['answers']['text'] != [])

  0%|          | 0/131 [00:00<?, ?ba/s]

In [14]:
print(len(datasets['train']))
print(len(train_dataset_filtered))

130319
86821


In [15]:
val_dataset_filtered = datasets['validation'].filter(lambda x: x['answers']['text'] != [])

  0%|          | 0/12 [00:00<?, ?ba/s]

In [16]:
print(len(datasets['validation']))
print(len(val_dataset_filtered))

11873
5928


To reduce computational time, we extract a subset of the Dataset

In [17]:
train_size = 8000
train_dataset_red, _ = torch.utils.data.random_split(train_dataset_filtered, [train_size, len(train_dataset_filtered) - train_size])

In [18]:
test_size = 2000
val_dataset_red, _ = torch.utils.data.random_split(val_dataset_filtered, [test_size, len(val_dataset_filtered) - test_size])

In [19]:
len(train_dataset_red), len(val_dataset_red )

(8000, 2000)

In [20]:
class SQUADDataset(Dataset):

  def __init__(self, dataset, tokenizer, max_token_len= 384, doc_stride = 128):

    self.dataset = dataset
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len
    self.doc_stride = doc_stride
    # print(len(self.dataset))

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx: int):

    data_row = self.dataset[idx]
    encoding = self.tokenizer(
        data_row["question"],
        data_row["context"],
        truncation="only_second",
        max_length=self.max_token_len,
        stride=self.doc_stride,
        # return_overflowing_tokens=True,
        # return_offsets_mapping=True,
        return_attention_mask = True,
        padding="max_length",
        return_tensors='pt'
    )
    
    # try:
    start_positions = data_row['answers']['answer_start'][0]
    end_positions = data_row['answers']['answer_start'][0] + len(data_row['answers']['text'][0])
    # except IndexError:
    #     pdb.set_trace()


# ['attention_mask', 'end_positions', 'input_ids', 'start_positions'],
    return dict(
                question = data_row["question"],
                context = data_row["context"],
                input_ids = encoding['input_ids'].flatten(),
                attention_mask = encoding['attention_mask'].flatten(),  
                token_type_ids = encoding['token_type_ids'].flatten(), # not applicable for distillbert
                start_positions = torch.tensor([start_positions]),
                end_positions = torch.tensor([end_positions])
                )

In [21]:
# train_dataset = SQUADDataset(datasets['train'], tokenizer)
# val_dataset = SQUADDataset(datasets['validation'], tokenizer)
train_dataset = SQUADDataset(train_dataset_red, tokenizer)
val_dataset = SQUADDataset(val_dataset_red, tokenizer)

In [22]:
len(train_dataset), len(val_dataset)

(8000, 2000)

In [None]:
# for data in train_dataset:
#     # print(data['question'])
#     # print(data['context'])
#     # print(data['input_ids'].shape)
#     # print(data['attention_mask'].shape)
#     # print(data['start_positions'])
#     # print(data['end_positions'])
#     # break
#     pass

In [None]:
# for data in val_dataset:
#     # print(data['question'])
#     # print(data['context'])
#     # print(data['input_ids'].shape)
#     # print(data['attention_mask'].shape)
#     # print(data['start_positions'])
#     # print(data['end_positions'])
#     # break
#     pass

## Testing our Train Dataset on a sample

In [23]:
sample_item = train_dataset[0]
print(sample_item.keys())
print(len(sample_item['attention_mask']))
print(len(sample_item['input_ids']))
print(sample_item['start_positions'])
print(sample_item['end_positions'])

dict_keys(['question', 'context', 'input_ids', 'attention_mask', 'token_type_ids', 'start_positions', 'end_positions'])
384
384
tensor([176])
tensor([182])


# Building the DataModule

In [25]:
BATCH_SIZE = 16 # bert

In [26]:
train_loader = DataLoader(
                        train_dataset,
                        batch_size = BATCH_SIZE, 
                        shuffle = True,
                        num_workers = 2,
                      )
val_loader = DataLoader(
                        val_dataset,
                        batch_size = BATCH_SIZE, 
                        shuffle = False,
                        num_workers = 2,
                      )

# Building the Model

In [27]:
automodel = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [28]:
class Model(pl.LightningModule):
    def __init__(self, model, steps_per_epoch, n_epochs):
        super(Model, self).__init__()
        self.model = model
        self.train_loss = torch.tensor([])
        self.avg_train_loss = 0.
        self.table_context = None
        self.start_time = 0
        self.end_time = 0
        self.epoch_mins = 0
        self.epoch_secs = 0
        self.avg_train_loss = 0.
        self.table_context = None
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)

        warmup_steps = self.steps_per_epoch // 3
        total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps

        scheduler = get_linear_schedule_with_warmup(
                                                    optimizer,
                                                    warmup_steps,
                                                    total_steps
                                                    ) 
        return [optimizer], [scheduler]

    def forward(self, input_ids, attention_mask, token_type_ids, start_positions, end_positions):
    # def forward(self, input_ids, attention_mask, start_positions, end_positions):
        output = self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, start_positions=start_positions, end_positions=end_positions)
        # output = self.model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        
        loss = output.loss
        start_scores = output.start_logits
        end_scores = output.end_logits
               
        # return loss, start_scores, end_scores
        # return {"loss": loss, 'start_scores': start_scores, 'end_scores':end_scores}
        return output
    
    def on_train_epoch_start(self):
        self.train_loss = torch.tensor([])

    def step(self, batch):

        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        start_positions = batch['start_positions']
        end_positions = batch['end_positions']
        token_type_ids = batch['token_type_ids'] # not applicable for distilbert
        output = self(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, start_positions=start_positions, end_positions=end_positions) 
        # output = self(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions) 

        loss = output.loss
        start_scores = output.start_logits
        end_scores = output.end_logits
 
        return {"loss": loss, 'start_scores': start_scores, 'end_scores':end_scores}


    def training_step(self, batch, batch_idx):

        loss = self.step(batch)['loss']
        self.train_loss =  torch.cat((self.train_loss, torch.tensor([loss])), 0)
        return loss
        # return self.step(batch)
    
    def validation_step(self, batch, batch_idx):
        return self.step(batch)
    
    def on_validation_epoch_start(self):
        self.avg_train_loss = self.train_loss.mean().item()
        self.train_loss = torch.tensor([])

    def validation_epoch_end(self, outputs):
        if self.trainer.sanity_checking:
          return
        self.end_time = time.time()
        self.epoch_mins, self.epoch_secs = self.epoch_time(self.start_time, self.end_time)
        time_int = f'{self.epoch_mins}m {self.epoch_secs}s'
        avg_valid_loss = torch.stack([x['loss'] for x in outputs]).mean().item()
        metrics = {'epoch': self.current_epoch+1, 'Train PPL': math.exp(self.avg_train_loss), 'Train Loss': self.avg_train_loss,  'Valid PPL': math.exp(avg_valid_loss), 'Valid Loss': avg_valid_loss}
        if self.table_context is None:
          self.table_context = tp.TableContext(headers=['epoch', 'Train PPL', 'Train Loss', 'Valid PPL', 'Valid Loss', 'Time'])
          self.table_context.__enter__()
        self.table_context([self.current_epoch+1, math.exp(self.avg_train_loss), self.avg_train_loss, math.exp(avg_valid_loss), avg_valid_loss, time_int])
        self.logger.log_metrics(metrics)
        if self.current_epoch == self.trainer.max_epochs - 1:
          self.table_context.__exit__()
    
    def epoch_time(self, start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs
    
    def on_train_epoch_start(self):
        self.start_time = time.time()

In [29]:
N_EPOCHS = 2

model = Model(
            model = automodel,
            steps_per_epoch=len(train_dataset) // BATCH_SIZE,
            n_epochs = N_EPOCHS
            )

## Testing our Model on a sample

In [30]:
sample_item['start_positions']

tensor([176])

In [31]:
sample_item['end_positions']

tensor([182])

In [32]:
outputs = model(
                    sample_item['input_ids'].unsqueeze(0),
                    sample_item['attention_mask'].unsqueeze(0),
                    sample_item['token_type_ids'].unsqueeze(0),
                    sample_item['start_positions'],
                    sample_item['end_positions']
                )

In [33]:
outputs.keys()

odict_keys(['loss', 'start_logits', 'end_logits'])

# Training the Model

In [34]:
!rm -rf csv_logs
csvlogger = CSVLogger('csv_logs', name='END2_Assign_14_1', version=0)
trainer = pl.Trainer(max_epochs=2, num_sanity_val_steps=1, logger=csvlogger, gpus=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [35]:
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                     | Params
---------------------------------------------------
0 | model | BertForQuestionAnswering | 108 M 
---------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
435.573   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

╭─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────╮
│       epoch │   Train PPL │  Train Loss │   Valid PPL │  Valid Loss │        Time │
├─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┤
│           1 │      389.55 │       5.965 │      391.49 │        5.97 │     19m 20s │


Validating: 0it [00:00, ?it/s]

│           2 │       388.6 │      5.9626 │      390.27 │      5.9668 │     19m 23s │
╰─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────╯
