In [1]:
import numpy as np
import pandas as pd
import transformers
import torch

from torch.utils.data import Dataset, DataLoader
from sklearn import metrics
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch import cuda

device = "cuda:0" if cuda.is_available() else "cpu"
device

'cuda:0'

In [17]:
# @title Customize your key variables here
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 512  # @param {type:"integer"}
TRAIN_BATCH_SIZE = 64  # @param {type:"integer"}
VALID_BATCH_SIZE = 32  # @param {type:"integer"}
EPOCHS = 1  # @param {type:"integer"}
LEARNING_RATE = 1e-5  # @param {type:"number"}
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [6]:
text = "AHAH Some text"
tokenizer.encode_plus(
    text,
    padding="max_length",
    max_length=MAX_LEN,
    add_special_tokens=True,
    truncation=True,
    return_attention_mask=True,
)

{'input_ids': [101, 6289, 4430, 2070, 3793, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

#### First coding exercise
Define the key CustomDataset `__getitem__` as follows:

*   Define the tokenizer call using `self.tokenizer.encode_plus`
*   Assign the `ids`, `mask`, `token_type_ids`
*   Return the given dictionary, transforming the inputs to long torch tensors



In [18]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = self.data.comment_text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())
        ## Tokenize your texts here and return the dictionary!
        tokenizer_output = tokenizer.encode_plus(
            comment_text,
            max_length=self.max_len,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            return_attention_mask=True,
        )

        return {
            "ids": tokenizer_output["input_ids"],
            "mask": tokenizer_output["attention_mask"],
            "token_type_ids": tokenizer_output["token_type_ids"],
            "targets": self.targets[index],
        }

Now sample, and split training and test into the train_dataset and test_dataset variables. Use `.sample`, `.drop` and `reset_index` functions.

When the datasets are split, declare the `training_set` and `testing_set` variables with your `CustomDataset` data class.

In [19]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset = new_df.sample(frac=train_size, random_state=123)
test_dataset = new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset.reset_index(drop=True, inplace=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

train_params = {"batch_size": TRAIN_BATCH_SIZE, "shuffle": True, "num_workers": 0}

test_params = {"batch_size": VALID_BATCH_SIZE, "shuffle": True, "num_workers": 0}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

FULL Dataset: (159571, 2)
TRAIN Dataset: (127657, 2)
TEST Dataset: (31914, 2)


<a id='section04'></a>
### Creating the Neural Network for Fine Tuning

#### Neural Network
 - We will be creating a neural network with the `BERTClass`.
 - This network will have the `Bert` model.  Follwed by a `Droput` and `Linear Layer`. They are added for the purpose of **Regulariaztion** and **Classification** respectively.
 - In the forward loop, there are 2 output from the `BertModel` layer.
 - The second output `output_1` or called the `pooled output` is passed to the `Drop Out layer` and the subsequent output is given to the `Linear layer`.
 - Keep note the number of dimensions for `Linear Layer` is **6** because that is the total number of categories in which we are looking to classify our model.
 - The data will be fed to the `BertClass` as defined in the dataset.
 - Final layer outputs is what will be used to calcuate the loss and to determine the accuracy of models prediction.
 - We will initiate an instance of the network called `model`. This instance will be used for training and then to save the final trained model for future inference.

#### Loss Function and Optimizer
 - The Loss is defined in the next cell as `loss_fn`.
 - As defined above, the loss function used will be a combination of Binary Cross Entropy which is implemented as [BCELogits Loss](https://pytorch.org/docs/stable/nn.html#bcewithlogitsloss) in PyTorch
 - `Optimizer` is defined in the next cell.
 - `Optimizer` is used to update the weights of the neural network to improve its performance.

#### Further Reading
- You can refer to my [Pytorch Tutorials](https://github.com/abhimishra91/pytorch-tutorials) to get an intuition of Loss Function and Optimizer.
- [Pytorch Documentation for Loss Function](https://pytorch.org/docs/stable/nn.html#loss-functions)
- [Pytorch Documentation for Optimizer](https://pytorch.org/docs/stable/optim.html)
- Refer to the links provided on the top of the notebook to read more about `BertModel`.

#### Second coding exercise
Initialize the BERTClass with three layers, the bert transformer (`BertModel.from_pretrained('bert-base-uncased')`) a dropout (`nn.Dropout`) and a dense layer (`nn.Linear`). When the model is declared, you have to code the forward pass, detailing the relationship between neural modules.

In [9]:
model = BertModel.from_pretrained("bert-base-uncased")
model.to(device)

tokens = tokenizer.encode_plus(
    text="Some text",
    max_length=512,
    add_special_tokens=True,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
)

output = model.forward(
    input_ids=torch.tensor([tokens["input_ids"]]).to(device),
    attention_mask=torch.tensor([tokens["attention_mask"]]).to(device),
    token_type_ids=torch.tensor([tokens["token_type_ids"]]).to(device),
)

In [10]:
output[1].shape

torch.Size([1, 768])

In [11]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.


class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.dropout = 0.3
        self.hidden_embd = 768
        self.output_layer = 6

        # Declare the layers here
        self.l1 = BertModel.from_pretrained("bert-base-uncased")
        self.l2 = torch.nn.Dropout(p=self.dropout)
        self.l3 = torch.nn.Linear(self.hidden_embd, self.output_layer)

        # Freeze weights
        for param in self.l1.parameters():
            param.requires_grad = False

    def forward(self, ids, mask, token_type_ids):
        # Use the transformer, then the dropout and the linear in that order.
        output_1 = self.l1.forward(  # (BATCH, HIDDEN_EMBD)
            input_ids=ids.to(device),
            attention_mask=mask.to(device),
            token_type_ids=token_type_ids.to(device),
        )[1]
        output_2 = self.l2.forward(output_1)  # (BATCH, HIDDEN_EMBD)
        output = self.l3.forward(output_2)  # (BATCH, OUTPUT_LAYER)
        return output


model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [12]:
def loss_fn(outputs, targets):
    return torch.nn.functional.cross_entropy(outputs, targets)

In [13]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

<a id='section05'></a>
### Fine Tuning the Model

After all the effort of loading and preparing the data and datasets, creating the model and defining its loss and optimizer. This is probably the easier steps in the process.

Here we define a training function that trains the model on the training dataset created above, specified number of times (EPOCH), An epoch defines how many times the complete data will be passed through the network.

Following events happen in this function to fine tune the neural network:
- The dataloader passes data to the model based on the batch size.
- Subsequent output from the model and the actual category are compared to calculate the loss.
- Loss value is used to optimize the weights of the neurons in the network.
- After every 5000 steps the loss value is printed in the console.

As you can see just in 1 epoch by the final step the model was working with a miniscule loss of 0.022 i.e. the network output is extremely close to the actual output.

#### Last coding exercise
Now you have to code the training setup:

1.   Zero-out the gradients with `optimizer.zero_grad()`
2.   Get a batch of data (ids, mask, token_type_ids and labels) and move it to gpu with `.to`
3.   Compute outputs
4.   Compute the loss using `loss_fn` earlier declared.
5.   Make a backward pass with `loss.backward()`
6.   Make the optimizer move forward with `optimizer.step()`



In [22]:
def train(epoch):
    model.train()
    for _, data in enumerate(training_loader, 0):
        optimizer.zero_grad()
        ids = torch.stack(data["ids"], dim=1)
        mask = torch.stack(data["mask"], dim=1)
        token_type_ids = torch.stack(data["token_type_ids"], dim=1)
        targets = torch.stack(data["targets"], dim=1).float().to(device)
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = loss_fn(outputs, targets)
        if _ % 1000 == 0:
            print(f"Epoch: {epoch}, Loss:  {loss.item()}")

In [23]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.4365181028842926
Epoch: 0, Loss:  0.21131421625614166


<a id='section06'></a>
### Validating the Model

During the validation stage we pass the unseen data(Testing Dataset) to the model. This step determines how good the model performs on the unseen data.

This unseen data is the 20% of `train.csv` which was seperated during the Dataset creation stage.
During the validation stage the weights of the model are not updated. Only the final output is compared to the actual value. This comparison is then used to calcuate the accuracy of the model.

As defined above to get a measure of our models performance we are using the following metrics.
- Accuracy Score
- F1 Micro
- F1 Macro

We are getting amazing results for all these 3 categories just by training the model for 1 Epoch.

**Extract the data and compute the outputs as you did on the training step!**

In [None]:
def validation(epoch):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = None
            mask = None
            token_type_ids = None
            targets = None
            outputs = None
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average="micro")
    f1_score_macro = metrics.f1_score(targets, outputs, average="macro")
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.9354828601867519
F1 Score (Micro) = 0.8104458787743897
F1 Score (Macro) = 0.6943681099377335


<a id='section07'></a>
### Saving the Trained Model Artifacts for inference

This is the final step in the process of fine tuning the model.

The model and its vocabulary are saved locally. These files are then used in the future to make inference on new inputs of news headlines.

Please remember that a trained neural network is only useful when used in actual inference after its training.

In the lifecycle of an ML projects this is only half the job done. We will leave the inference of these models for some other day.