# **TRAIN MODEL 2**

## Installed Libraries

In [None]:
# Every package is pinned to an exact version so the notebook keeps working with the same APIs.
# transformers: provides the T5 model/tokenizer classes and AdamW imported further down
!pip install --quiet transformers==4.1.1
# tokenizers / sentencepiece: presumably the tokenization backends transformers relies on -- TODO confirm
!pip install --quiet tokenizers==0.9.4 
!pip install --quiet sentencepiece==0.1.94
# tqdm: progress bars (tqdm.notebook is imported further down)
!pip install --quiet tqdm==4.56.0
# pytorch-lightning: provides the LightningModule / Trainer used for fine-tuning
!pip install --quiet pytorch-lightning==1.2.10

[K     |████████████████████████████████| 1.5 MB 5.5 MB/s 
[K     |████████████████████████████████| 2.9 MB 40.2 MB/s 
[K     |████████████████████████████████| 895 kB 49.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 5.5 MB/s 
[K     |████████████████████████████████| 72 kB 788 kB/s 
[K     |████████████████████████████████| 841 kB 5.5 MB/s 
[K     |████████████████████████████████| 829 kB 34.8 MB/s 
[K     |████████████████████████████████| 176 kB 46.3 MB/s 
[K     |████████████████████████████████| 133 kB 49.5 MB/s 
[K     |████████████████████████████████| 596 kB 41.9 MB/s 
[K     |████████████████████████████████| 1.1 MB 41.1 MB/s 
[K     |████████████████████████████████| 94 kB 3.0 MB/s 
[K     |████████████████████████████████| 271 kB 49.2 MB/s 
[K     |████████████████████████████████| 144 kB 49.4 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone


In [None]:
# Mount your personal Google Drive at /content/gdrive.
# The dataset paths and the model/tokenizer save paths used later in this notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# hide warnings
import warnings
warnings.filterwarnings('ignore')

import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from termcolor import colored
import textwrap
from tqdm.notebook import tqdm
import copy

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

# Seed the pseudo-random number generators of pytorch, numpy and python.random so that runs are repeatable.
# The value 42 is arbitrary: it is the answer to the great question of "life, the universe and everything".
pl.seed_everything(42)

Global seed set to 42


42

## Check GPU

In [None]:
# Check that a GPU is attached to this runtime and how much GPU memory it has
# (the Trainer further down is created with gpus=1).
!nvidia-smi

Mon Feb  7 23:58:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    25W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## File paths of our two custom datasets

In [None]:
# CSV files (on the mounted Google Drive) holding the training and validation splits.
train_file_path, validation_file_path = (
    '/content/gdrive/My Drive/DISSERTATION/MODEL 2/t5/dataset/squad_t5_train.csv',
    '/content/gdrive/My Drive/DISSERTATION/MODEL 2/t5/dataset/squad_t5_validation.csv',
)

## Download our pretrained model and tokenizer

In [None]:
# Fetch the tokenizer of the 't5-base' checkpoint, i.e. the vocabulary used during pretraining.
t5_tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path='t5-base')

# Fetch the pretrained weights of the same checkpoint; this is the model that gets fine-tuned below.
t5_model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path='t5-base')

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Class for encoding our custom datasets

In [None]:
class QuestionGenerationDataset(Dataset):
    """Tokenized (context, answer) -> question pairs loaded from a CSV file.

    The CSV must contain the columns ``context``, ``answer`` and ``question``.
    Every row is turned into the source text ``"context: ...  answer: ... </s>"``
    and the target text ``"question: ... </s>"``. Both are tokenized eagerly in
    the constructor and padded to a fixed length.

    Rows whose tokenized source is longer than ``max_len_inp`` are skipped
    (counted in ``self.skippedcount``) rather than truncated.

    Args:
        tokenizer: tokenizer used to encode sources and targets; its
            ``pad_token_id`` decides which label positions are masked.
        filepath: path of the CSV file to read.
        max_len_inp: fixed token length of every encoded source.
        max_len_out: fixed token length of every encoded target.
        nrows: number of leading CSV rows to read. Defaults to 1000 to keep the
            demonstration quick; pass ``None`` to read the whole file.
    """

    def __init__(self, tokenizer, filepath, max_len_inp=512,max_len_out=96, nrows=1000):
        self.path = filepath

        # names of the CSV columns that are read for every example
        self.context_column = "context"
        self.answer = "answer"
        self.question = "question"

        # nrows=None makes pandas read every row of the file
        self.data = pd.read_csv(self.path, nrows=nrows)

        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out

        self.tokenizer = tokenizer

        self.inputs = []  # one encoding per kept example (source side)
        self.targets = []  # one encoding per kept example (target side)
        self.skippedcount = 0  # number of rows dropped because their source was too long

        self._build()  # tokenize everything up front

    def __len__(self):
        """Return the number of examples that were kept (skipped rows are not counted)."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the tensors of one example.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids``,
            ``target_mask`` and ``labels``. ``labels`` is a copy of
            ``target_ids`` in which every padding position is set to -100.
        """
        source = self.inputs[index]
        target = self.targets[index]

        # Each encoding was produced from a one-element batch, so only the leading
        # batch dimension is dropped. A bare squeeze() would also collapse a
        # sequence dimension of size 1.
        source_ids = source["input_ids"].squeeze(0)
        target_ids = target["input_ids"].squeeze(0)

        src_mask = source["attention_mask"].squeeze(0)
        target_mask = target["attention_mask"].squeeze(0)

        # Labels are what the T5 model uses to compute the loss. Padding positions
        # are replaced by -100; clone() keeps target_ids itself untouched.
        labels = target_ids.clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask,"labels":labels}

    def _build(self):
        """Tokenize every row of ``self.data`` into ``self.inputs`` / ``self.targets``."""
        for idx in tqdm(range(len(self.data))):
            context = self.data.loc[idx, self.context_column]
            answer = self.data.loc[idx, self.answer]
            question = self.data.loc[idx, self.question]

            input_ = "context: %s  answer: %s </s>" % (context, answer)
            target = "question: %s </s>" % (str(question))

            # Encode once without truncation, only to measure the real token length.
            untruncated = self.tokenizer.encode_plus(input_,
                                        truncation=False,
                                        return_tensors="pt")
            length_of_input_encoding = len(untruncated['input_ids'][0])

            # A source that does not fit would lose text if truncated, so skip the row.
            if length_of_input_encoding > self.max_len_input:
                self.skippedcount += 1
                continue

            # Pad to a fixed length. padding/truncation are passed explicitly instead of
            # the deprecated pad_to_max_length flag, which also removes the
            # "Truncation was not explicitly activated" warning.
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len_input, padding="max_length", truncation=True, return_tensors="pt"
            )
            # Targets longer than max_len_output are cut to that length.
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len_output, padding="max_length", truncation=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

## Encode the training dataset (by default only the first 1000 rows of the CSV are read)

In [None]:
# Tokenize the training CSV up front; the result is read by T5FineTuner.train_dataloader.
train_dataset = QuestionGenerationDataset(
    tokenizer=t5_tokenizer,
    filepath=train_file_path,
)

  0%|          | 0/1000 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors


## Encode the validation dataset (by default only the first 1000 rows of the CSV are read)

In [None]:
# Tokenize the validation CSV up front; the result is read by T5FineTuner.val_dataloader.
validation_dataset = QuestionGenerationDataset(
    tokenizer=t5_tokenizer,
    filepath=validation_file_path,
)

  0%|          | 0/1000 [00:00<?, ?it/s]

## T5 Finetuner

In [None]:
class T5FineTuner(pl.LightningModule):
    """Lightning module that fine-tunes a T5 model for question generation.

    The dataloaders read the notebook-level variables ``train_dataset`` and
    ``validation_dataset``, so both must be defined before training starts.

    Args:
        hparams: namespace of hyper-parameters. ``batch_size`` is required;
            ``learning_rate`` (default 3e-4) and ``num_workers`` (default 4)
            are optional.
        t5model: the T5 model to fine-tune.
        t5tokenizer: the tokenizer belonging to the model (stored on the module;
            the training code of this class does not call it).
    """

    def __init__(self,hparams, t5model, t5tokenizer):
        super().__init__()
        self.hparams = hparams
        self.model = t5model
        self.tokenizer = t5tokenizer

    def forward( self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None):
        """Run the wrapped T5 model.

        Only ``input_ids`` is mandatory; every other argument defaults to None.

        ``decoder_input_ids`` is accepted for interface compatibility but is
        deliberately not forwarded: when labels are given and no decoder inputs
        are passed, the T5 model derives the decoder inputs from the labels.

        Args:
            input_ids: token ids of the source text.
            attention_mask: mask of the source text.
            decoder_input_ids: unused (see above).
            decoder_attention_mask: mask of the target text.
            lm_labels: target token ids with padding replaced by -100.

        Returns:
            The output of the wrapped model; its first element is the loss when
            labels are given.
        """
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,  # padding positions are -100 instead of the pad id
        )

    def _compute_loss(self, batch):
        """Forward one batch produced by the dataloaders and return its loss."""
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_input_ids=batch["target_ids"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the loss of one training batch."""
        loss = self._compute_loss(batch)
        self.log('train_loss',loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log (as ``val_loss``) and return the loss of one validation batch."""
        loss = self._compute_loss(batch)
        self.log("val_loss",loss)
        return loss

    def train_dataloader(self):
        """Return the DataLoader that feeds ``train_dataset`` in batches.

        The examples are reshuffled every epoch so that consecutive rows of the
        CSV do not always end up in the same batch.
        """
        return DataLoader(
            train_dataset,
            batch_size=self.hparams.batch_size,
            shuffle=True,
            num_workers=getattr(self.hparams, "num_workers", 4),
        )

    def val_dataloader(self):
        """Return the DataLoader that feeds ``validation_dataset`` in batches (no shuffling)."""
        return DataLoader(
            validation_dataset,
            batch_size=self.hparams.batch_size,
            num_workers=getattr(self.hparams, "num_workers", 4),
        )

    def configure_optimizers(self):
        """Return the optimizer that updates the model weights.

        After the forward pass the gradients are computed by back-propagation;
        the optimizer defines how the weights are adjusted from them.
        """
        learning_rate = getattr(self.hparams, "learning_rate", 3e-4)
        optimizer = AdamW(self.parameters(), lr=learning_rate, eps=1e-8)
        return optimizer

In [None]:
# Hyper-parameters; T5FineTuner reads the batch size from them as hparams.batch_size.
args_dict = {"batch_size": 4}

# Wrap the dictionary in a Namespace so that its entries are reachable as attributes.
args = argparse.Namespace(**args_dict)

# Lightning module built from the hyper-parameters, the pretrained model and its tokenizer.
model = T5FineTuner(args, t5_model, t5_tokenizer)

# Lightning trainer that drives the training loop:
#   max_epochs=1                  -> a single epoch, to demonstrate it quicker
#   gpus=1                        -> train on one GPU
#   progress_bar_refresh_rate=30  -> refresh the progress bar every 30 steps
trainer = pl.Trainer(
    max_epochs=1,
    gpus=1,
    progress_bar_refresh_rate=30,
)

trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

## Save model and tokenizer

In [None]:
# Google Drive folders that receive the fine-tuned model and the tokenizer.
save_path_model = '/content/gdrive/My Drive/DISSERTATION/MODEL 2/t5/model/'
save_path_tokenizer = '/content/gdrive/My Drive/DISSERTATION/MODEL 2/t5/tokenizer/'

print("Saving model")

# model.model is the T5 network held by the Lightning module; only that network is written out.
model.model.save_pretrained(save_path_model)

# Write the tokenizer files; the call is the cell's last expression, so the saved file paths are displayed.
t5_tokenizer.save_pretrained(save_path_tokenizer)

Saving model


('/content/gdrive/My Drive/DISSERTATION/MODEL 2/t5/tokenizer/tokenizer_config.json',
 '/content/gdrive/My Drive/DISSERTATION/MODEL 2/t5/tokenizer/special_tokens_map.json',
 '/content/gdrive/My Drive/DISSERTATION/MODEL 2/t5/tokenizer/spiece.model',
 '/content/gdrive/My Drive/DISSERTATION/MODEL 2/t5/tokenizer/added_tokens.json')