# Fine Tuning Transformer for MultiClass Text Classification

In [1]:
%%capture
!pip3 install transformers

In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [4]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

<a id='section02'></a>
### Importing and Pre-Processing the domain data

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# model_df = pd.read_csv("/content/drive/MyDrive/RupolSir/13lac.csv")
# model_df2 = pd.read_csv("/content/drive/MyDrive/RupolSir/p6.csv")

In [7]:
# Import the csv into pandas dataframe and add the headers
#df = pd.read_csv('drive/My Drive/3A2M/BERT/berttrain25k.csv')
df = pd.read_csv('drive/My Drive/Project/machinedata/machinesample416k_extNER.csv')
# # Removing unwanted columns and only leaving title of news and the category which will be the target
#df = df[['title','NER','label']]
df = df[['title','Extended_NER','label']]
df.head()

Unnamed: 0,title,Extended_NER,label
0,Matzo Candy,"['Easy', 'Brown Sugar', '5', '6', '9', 'the He...",1
1,"Mexican Wedding Cookies, With a Twist!","['butter', 'salt', 'pecans', 'salt & pecans', ...",1
2,Meringue,"['Dip', '200', 'Serve', 'a few minutes', 'heav...",1
3,Fudgy Brownies,"['butter', 'salt', 'sugar', 'eggs', '350', 'ba...",1
4,Ranger Cookies,"['375\\u00b0', 'baking powder', 'salt', 'cocon...",1


In [8]:
df.label = df.label - 1
df.head()

Unnamed: 0,title,Extended_NER,label
0,Matzo Candy,"['Easy', 'Brown Sugar', '5', '6', '9', 'the He...",0
1,"Mexican Wedding Cookies, With a Twist!","['butter', 'salt', 'pecans', 'salt & pecans', ...",0
2,Meringue,"['Dip', '200', 'Serve', 'a few minutes', 'heav...",0
3,Fudgy Brownies,"['butter', 'salt', 'sugar', 'eggs', '350', 'ba...",0
4,Ranger Cookies,"['375\\u00b0', 'baking powder', 'salt', 'cocon...",0


In [9]:
import re, string
def cleanNER(model):
  ner_list = []
  for ing in model:
    s = ing.strip("[]")
    s = s.split(",")
    strng = ""
    for i in range(len(s)):
      strng = strng + " " + (s[i].translate(str.maketrans('', '', string.punctuation)))
    ner_list.append(strng.lstrip())
  return ner_list

In [10]:
#ner_list_train = cleanNER(df.NER)
ner_list_train = cleanNER(df.Extended_NER)

In [11]:
df["NER_cleaned"] = ner_list_train

In [12]:
#df = df.drop('NER', axis=1)
df = df.drop('Extended_NER', axis=1)

In [13]:
df['data'] = df['title'] + ", " + df['NER_cleaned']

In [14]:
df.head()

Unnamed: 0,title,label,NER_cleaned,data
0,Matzo Candy,0,Easy Brown Sugar 5 6 9 the HeathSkor Cho...,"Matzo Candy, Easy Brown Sugar 5 6 9 the H..."
1,"Mexican Wedding Cookies, With a Twist!",0,butter salt pecans salt pecans icing suga...,"Mexican Wedding Cookies, With a Twist!, butter..."
2,Meringue,0,Dip 200 Serve a few minutes heavy cream s...,"Meringue, Dip 200 Serve a few minutes heav..."
3,Fudgy Brownies,0,butter salt sugar eggs 350 baking soda 2...,"Fudgy Brownies, butter salt sugar eggs 350..."
4,Ranger Cookies,0,375u00b0 baking powder salt coconut sugar ...,"Ranger Cookies, 375u00b0 baking powder salt ..."


In [15]:
df.isnull().sum().sum()

0

In [16]:
df.title.isnull().sum().sum()

0

In [17]:
x=df[df.label.isnull()]

In [18]:
x

Unnamed: 0,title,label,NER_cleaned,data


<a id='section03'></a>
### Preparing the Dataset and Dataloader

We will start with defining few key variables that will be used later during the training/fine tuning stage.
Followed by creation of Dataset class - This defines how the text is pre-processed before sending it to the neural network. We will also define the Dataloader that will feed  the data in batches to the neural network for suitable training and processing. 
Dataset and Dataloader are constructs of the PyTorch library for defining and controlling the data pre-processing and its passage to neural network. For further reading into Dataset and Dataloader read the [docs at PyTorch](https://pytorch.org/docs/stable/data.html)

#### *Triage* Dataset Class
- This class is defined to accept the Dataframe as input and generate tokenized output that is used by the DistilBERT model for training. 
- We are using the DistilBERT tokenizer to tokenize the data in the `TITLE` column of the dataframe. 
- The tokenizer uses the `encode_plus` method to perform tokenization and generate the necessary outputs, namely: `ids`, `attention_mask`
- To read further into the tokenizer, [refer to this document](https://huggingface.co/transformers/model_doc/distilbert.html#distilberttokenizer)
- `target` is the encoded category on the news headline. 
- The *Triage* class is used to create 2 datasets, for training and for validation.
- *Training Dataset* is used to fine tune the model: **80% of the original data**
- *Validation Dataset* is used to evaluate the performance of the model. The model has not seen this data during training. 

#### Dataloader
- Dataloader is used to for creating training and validation dataloader that load data to the neural network in a defined manner. This is needed because all the data from the dataset cannot be loaded to the memory at once, hence the amount of dataloaded to the memory and then passed to the neural network needs to be controlled.
- This control is achieved using the parameters such as `batch_size` and `max_len`.
- Training and Validation dataloaders are used in the training and validation part of the flow respectively

In [19]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 100
TRAIN_BATCH_SIZE = 128
VALID_BATCH_SIZE = 128
EPOCHS = 5
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [20]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.data[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.label[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [21]:
# Creating the dataset and dataloader for the neural network

train_size = 0.9
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (416232, 4)
TRAIN Dataset: (374609, 4)
TEST Dataset: (41623, 4)


In [22]:
training_set[0]



{'ids': tensor([  101,  2586, 10559,  1161, 18613,  3202,   117,   123,  2005, 18669,
         12362, 18700,  1299,  2758,  4489, 10194, 11478,  7136,  1894,  7315,
         18700,  1894,  1113,  1988,  4489,   172,  8009,  2227,  2180, 17087,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0

In [23]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

<a id='section04'></a>
### Creating the Neural Network for Fine Tuning

#### Neural Network
 - We will be creating a neural network with the `DistillBERTClass`. 
 - This network will have the DistilBERT Language model followed by a `dropout` and finally a `Linear` layer to obtain the final outputs. 
 - The data will be fed to the DistilBERT Language model as defined in the dataset. 
 - Final layer outputs is what will be compared to the `encoded category` to determine the accuracy of models prediction. 
 - We will initiate an instance of the network called `model`. This instance will be used for training and then to save the final trained model for future inference. 
 
#### Loss Function and Optimizer
 - `Loss Function` and `Optimizer` and defined in the next cell.
 - The `Loss Function` is used the calculate the difference in the output created by the model and the actual output. 
 - `Optimizer` is used to update the weights of the neural network to improve its performance.
 
#### Further Reading
- You can refer to my [Pytorch Tutorials](https://github.com/abhimishra91/pytorch-tutorials) to get an intuition of Loss Function and Optimizer.
- [Pytorch Documentation for Loss Function](https://pytorch.org/docs/stable/nn.html#loss-functions)
- [Pytorch Documentation for Optimizer](https://pytorch.org/docs/stable/optim.html)
- Refer to the links provided on the top of the notebook to read more about DistiBERT. 

In [24]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 9)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [25]:
model = DistillBERTClass()
model.to(device)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [26]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

<a id='section05'></a>
### Fine Tuning the Model

After all the effort of loading and preparing the data and datasets, creating the model and defining its loss and optimizer. This is probably the easier steps in the process. 

Here we define a training function that trains the model on the training dataset created above, specified number of times (EPOCH), An epoch defines how many times the complete data will be passed through the network. 

Following events happen in this function to fine tune the neural network:
- The dataloader passes data to the model based on the batch size. 
- Subsequent output from the model and the actual category are compared to calculate the loss. 
- Loss value is used to optimize the weights of the neurons in the network.
- After every 5000 steps the loss value is printed in the console.

As you can see just in 1 epoch by the final step the model was working with a miniscule loss of 0.0002485 i.e. the output is extremely close to the actual output.

In [27]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [28]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if nb_tr_steps%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    #print(f"Training Loss Epoch: {epoch_loss}")
    #print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [29]:
for epoch in range(EPOCHS):
    train(epoch)

The Total Accuracy for Epoch 0: 80.10859322653754
The Total Accuracy for Epoch 1: 94.71315424883011
The Total Accuracy for Epoch 2: 96.36981492703059
The Total Accuracy for Epoch 3: 97.08896476059037
The Total Accuracy for Epoch 4: 97.57907578301642


<a id='section06'></a>
### Validating the Model

During the validation stage we pass the unseen data(Testing Dataset) to the model. This step determines how good the model performs on the unseen data. 


In [30]:
def valid(model, testing_loader):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if nb_tr_steps%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    #print(f"Validation Loss Epoch: {epoch_loss}")
    #print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu


In [31]:
print('This is the validation section to print the accuracy and see how it performs')
acc = valid(model, testing_loader)
print("Accuracy on validation data = %0.2f%%" % acc)

This is the validation section to print the accuracy and see how it performs
Accuracy on validation data = 97.28%


<a id='section07'></a>
### Saving the Trained Model Artifacts for inference

In [32]:
# Saving the files for re-use

#output_model_file = 'pytorch_distilbert_recipe.h5'
#output_vocab_file = './models/vocab_distilbert_news.bin'

#model_to_save = model
#torch.save(model_to_save, output_model_file)
#tokenizer.save_vocabulary(output_vocab_file)

#print('All files saved')
#print('This tutorial is completed')

### **Prediction**

In [33]:
def recipe_prediction(text):
    title = text
    inputs = tokenizer.encode_plus(
        title,
        None,
        add_special_tokens=True,
        max_length=32,
        pad_to_max_length=True,
        return_token_type_ids=True,
        truncation=True,
        return_tensors="pt"
    )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    return ids, mask

In [34]:
# 3 lac human annotation data input
#unlabeled_df = pd.read_csv("drive/My Drive/3A2M/BERT/berttest33.csv")
unlabeled_df = pd.read_csv("drive/My Drive/Project/humandata/humansample27k_extNER.csv")
#unlabeled_df = unlabeled_df[['title', 'NER', 'label']]
unlabeled_df = unlabeled_df[['title', 'Extended_NER', 'label']]
unlabeled_df.head()

Unnamed: 0,title,Extended_NER,label
0,Isaac'S Homemade Powdered Donut Holes,"['powdered sugar', 'four', 'Cut', 'Donut', 'ci...",1
1,Harris Cake,"['3', '300\\u00b0', 'sugar', 'eggs', '9-inch',...",1
2,Aunt Pearl'S Blackberry Jam Cake,"['allspice', 'two minutes', 'sugar', 'butter',...",1
3,2-Layer Fudge,"['1 teaspoon', 'Marshmallow Creme', 'milk', '4...",1
4,Jello Cake,"['oil', '1 cup', '350\\u00b0', 'eggs', '1', 'J...",1


In [35]:
ner_list_predict = cleanNER(unlabeled_df.Extended_NER)
unlabeled_df["NER_cleaned"] = ner_list_predict
#unlabeled_df = unlabeled_df.drop('NER', axis=1)
unlabeled_df = unlabeled_df.drop('Extended_NER', axis=1)
unlabeled_df['data'] = unlabeled_df['title'] + ", " + unlabeled_df['NER_cleaned']

In [36]:
prediction_list = []

for title in unlabeled_df.data:
  title = str(title)
  ids, mask = recipe_prediction(title)
  ids = ids.to(device, dtype = torch.long)
  mask = mask.to(device, dtype = torch.long)
  outputs = model(ids, mask)
  _, genre = torch.max(outputs.data, dim=1)
  prediction_list.append(genre[0].item())



In [37]:
len(prediction_list)

27000

In [38]:
unlabeled_df['predicted_label'] = prediction_list
unlabeled_df.predicted_label += 1 
unlabeled_df.head()

Unnamed: 0,title,label,NER_cleaned,data,predicted_label
0,Isaac'S Homemade Powdered Donut Holes,1,powdered sugar four Cut Donut cinnamon Voila,"Isaac'S Homemade Powdered Donut Holes, powdere...",1
1,Harris Cake,1,3 300u00b0 sugar eggs 9inch baking powder...,"Harris Cake, 3 300u00b0 sugar eggs 9inch ...",1
2,Aunt Pearl'S Blackberry Jam Cake,1,allspice two minutes sugar butter flour m...,"Aunt Pearl'S Blackberry Jam Cake, allspice tw...",1
3,2-Layer Fudge,1,1 teaspoon Marshmallow Creme milk 4 minutes...,"2-Layer Fudge, 1 teaspoon Marshmallow Creme ...",1
4,Jello Cake,1,oil 1 cup 350u00b0 eggs 1 JellO water C...,"Jello Cake, oil 1 cup 350u00b0 eggs 1 Jel...",1


In [39]:
unlabeled_df.predicted_label.value_counts()

9    3046
1    3032
5    3018
4    3012
2    2995
3    2994
8    2984
6    2981
7    2938
Name: predicted_label, dtype: int64

In [40]:
# predition file save
#unlabeled_df.to_csv("drive/My Drive/3A2M/AllData/Human/3lac-human_pred_bert.csv", index=False)

In [41]:
cnt = 0
for i in range(len(unlabeled_df)):
  if unlabeled_df["label"][i] == unlabeled_df["predicted_label"][i]:
    cnt = cnt+1

In [42]:
y_true = []
y_pred = []

for i in range(len(unlabeled_df)):
  y_true.append(unlabeled_df["label"][i])
  y_pred.append(unlabeled_df["predicted_label"][i])

In [43]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           1       0.97      0.99      0.98      3000
           2       0.98      0.98      0.98      3000
           3       0.99      0.99      0.99      3000
           4       0.99      0.99      0.99      3000
           5       0.99      0.99      0.99      3000
           6       1.00      0.99      0.99      3000
           7       1.00      0.98      0.99      3000
           8       1.00      0.99      1.00      3000
           9       0.97      0.99      0.98      3000

    accuracy                           0.99     27000
   macro avg       0.99      0.99      0.99     27000
weighted avg       0.99      0.99      0.99     27000



In [44]:
accuracy = cnt/len(unlabeled_df)
print(accuracy)


0.9873333333333333
