https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb

In [1]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer



In [2]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
device

'cuda'

## formating data

In [4]:
def encode_cat(string, ref):
     return ref.index(string)

In [5]:
# Import data from csv
df = pd.read_csv('../datasets/cleaned/combined_text.csv')

# drop duplicates
df.drop_duplicates(subset=['combined_text'], keep='first', ignore_index=True, inplace=True)

# encode category
category_list = list(df.category.unique())
df['label'] = df['category'].map(lambda x: encode_cat(x, category_list))

df.drop(columns=['asin'], inplace=True)
df.rename(columns={'combined_text': 'text'}, inplace=True)

## Preparing dataset and dataloader

In [6]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')


In [7]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        text = str(self.data.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.label[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [8]:
# Creating the dataset and dataloader for the neural network

train_size = 0.9
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (1271009, 3)
TRAIN Dataset: (1143908, 3)
TEST Dataset: (127101, 3)


In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Creating the Neural Network for Fine Tuning

In [10]:
# Creating the customized model, 
# by adding a drop out and a dense layer on top of distil bert 
# to get the final output for the model. 

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 21)
    
        

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [11]:
model = DistillBERTClass()
model.to(device)

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [12]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

## Fine Tuning the Model

In [13]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [14]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch, loader = training_loader):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [15]:
%%time
for epoch in range(EPOCHS):
    train(epoch)



Training Loss per 5000 steps: 3.032930850982666
Training Accuracy per 5000 steps: 0.0
Training Loss per 5000 steps: 2.448926548163096
Training Accuracy per 5000 steps: 28.144371125774846
Training Loss per 5000 steps: 2.3054151399223675
Training Accuracy per 5000 steps: 32.62423757624238
Training Loss per 5000 steps: 2.1996813833417517
Training Accuracy per 5000 steps: 35.880941270581964
Training Loss per 5000 steps: 2.122731206913387
Training Accuracy per 5000 steps: 38.169341532923355
Training Loss per 5000 steps: 2.056580785453179
Training Accuracy per 5000 steps: 40.22439102435902
Training Loss per 5000 steps: 1.9986112759335313
Training Accuracy per 5000 steps: 41.927769074364186
Training Loss per 5000 steps: 1.9539980673242652
Training Accuracy per 5000 steps: 43.23019342304506
Training Loss per 5000 steps: 1.9125094847636626
Training Accuracy per 5000 steps: 44.43326416839579
Training Loss per 5000 steps: 1.875731753826748
Training Accuracy per 5000 steps: 45.49621119530677
Train

In [16]:
# Saving fully trained model

output_model_file = '../models/pytorch_distilbert_amazon_imbalanced_eval.bin'
output_vocab_file = '../models/vocab_distilbert_amazon_imbalanced_eval.bin'

model_to_save = model.module if hasattr(model, 'module') else model
torch.save(model_to_save.state_dict(), output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

All files saved


## validating model

In [17]:
def valid(model, testing_loader):
    tr_loss=0
    nb_tr_steps=0
    nb_tr_examples=0
    
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            
#             outputs = model(ids, mask).squeeze()
            outputs = model(ids, mask)
            
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu

In [18]:
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

acc = valid(model, testing_loader)
print(f"Accuracy on test data = {acc}")

This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
Validation Loss per 100 steps: 1.089339256286621
Validation Accuracy per 100 steps: 50.0
Validation Loss per 100 steps: 1.170224219082764
Validation Accuracy per 100 steps: 65.06698660267946
Validation Loss per 100 steps: 1.1618767426983483
Validation Accuracy per 100 steps: 65.1934806519348
Validation Loss per 100 steps: 1.1616851459648434
Validation Accuracy per 100 steps: 65.21898540097327
Validation Loss per 100 steps: 1.1670875136965082
Validation Accuracy per 100 steps: 64.95175241237938
Validation Loss per 100 steps: 1.1664611940916378
Validation Accuracy per 100 steps: 65.00139994400224
Validation Loss per 100 steps: 1.1682161804553919
Validation Accuracy per 100 steps: 64.96450118329389
Validation Loss per 100 steps: 1.1694670404466903
Validation Accuracy per 100 steps: 65.01814233879033

## Tuning with rest of dataset

In [19]:
%%time
for epoch in range(EPOCHS):
    train(epoch, loader=testing_loader)

Training Loss per 5000 steps: 2.310896873474121
Training Accuracy per 5000 steps: 50.0
Training Loss per 5000 steps: 1.2152239886251996
Training Accuracy per 5000 steps: 63.6372725454909
Training Loss per 5000 steps: 1.2118654553385957
Training Accuracy per 5000 steps: 63.81861813818618
Training Loss per 5000 steps: 1.2042404591154863
Training Accuracy per 5000 steps: 64.12905806279582
Training Loss per 5000 steps: 1.207519852453522
Training Accuracy per 5000 steps: 63.974301284935756
Training Loss per 5000 steps: 1.210596567041873
Training Accuracy per 5000 steps: 63.821447142114316
Training Loss per 5000 steps: 1.2124395968648225
Training Accuracy per 5000 steps: 63.74787507083097
Training Loss per 5000 steps: 1.2099000658072925
Training Accuracy per 5000 steps: 63.86674666438102
Training Loss per 5000 steps: 1.2115825196247887
Training Accuracy per 5000 steps: 63.8321541961451
Training Loss per 5000 steps: 1.2127715741149132
Training Accuracy per 5000 steps: 63.724139463567475
Train

## Saving the Trained Model Artifacts for inference

In [20]:
# Saving fully trained model

output_model_file = '../models/pytorch_distilbert_amazon_imbalanced_full.bin'
output_vocab_file = '../models/vocab_distilbert_amazon_imbalanced_full.bin'

model_to_save = model.module if hasattr(model, 'module') else model
torch.save(model_to_save.state_dict(), output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

All files saved
