https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb

In [1]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer



In [2]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
device

'cuda'

## formating data

In [4]:
def encode_cat(string, ref):
     return ref.index(string)

In [5]:
# Import data from csv
df = pd.read_csv('../datasets/cleaned/combined_text.csv')

# drop duplicates
df.drop_duplicates(subset=['combined_text'], keep='first', ignore_index=True, inplace=True)

# encode category
category_list = list(df.category.unique())
df['label'] = df['category'].map(lambda x: encode_cat(x, category_list))

df.drop(columns=['asin'], inplace=True)
df.rename(columns={'combined_text': 'text'}, inplace=True)

## Preparing dataset and dataloader

In [6]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')


In [7]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        text = str(self.data.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.label[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [8]:
# Creating the dataset and dataloader for the neural network

train_size = 0.9
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (1271009, 3)
TRAIN Dataset: (1143908, 3)
TEST Dataset: (127101, 3)


In [10]:
test_dataset.to_csv('../datasets/cleaned/test_set_for_evaluating_bert.csv')

In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Creating the Neural Network for Fine Tuning

In [10]:
# Creating the customized model, 
# by adding a drop out and a dense layer on top of distil bert 
# to get the final output for the model. 

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        
        # set the shape = (768, number_of_classes)
        self.classifier = torch.nn.Linear(768, 21)

    
        

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [11]:
model = DistillBERTClass()
model.to(device)

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [12]:
# Creating the loss function and optimizer

class_values = df['label'].value_counts(sort=False)
weights = class_values.sum() / class_values
normalized_weights = list(weights/weights.sum())
class_weights = torch.FloatTensor(normalized_weights).cuda()

loss_function = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

## Fine Tuning the Model

In [13]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [14]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch, loader = training_loader):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [15]:
%%time
for epoch in range(EPOCHS):
    train(epoch)



Training Loss per 5000 steps: 3.047241687774658
Training Accuracy per 5000 steps: 0.0
Training Loss per 5000 steps: 2.914866801620221
Training Accuracy per 5000 steps: 17.98640271945611
Training Loss per 5000 steps: 2.813370509822778
Training Accuracy per 5000 steps: 22.835216478352166
Training Loss per 5000 steps: 2.7174937577893212
Training Accuracy per 5000 steps: 26.441570561962536
Training Loss per 5000 steps: 2.61451882978188
Training Accuracy per 5000 steps: 29.357282135893204
Training Loss per 5000 steps: 2.531974638333526
Training Accuracy per 5000 steps: 31.606735730570776
Training Loss per 5000 steps: 2.4622544852379047
Training Accuracy per 5000 steps: 33.393886870437655
Training Loss per 5000 steps: 2.4014447268517625
Training Accuracy per 5000 steps: 34.904717008085484
Training Loss per 5000 steps: 2.3518925433491042
Training Accuracy per 5000 steps: 36.12659683507912
Training Loss per 5000 steps: 2.3065135209470355
Training Accuracy per 5000 steps: 37.3597253394369
Train

In [16]:
# Saving evaluation model

output_model_file = '../models/pytorch_distilbert_amazon_weighted_eval.bin'
output_vocab_file = '../models/vocab_distilbert_amazon_weighted_eval.bin'

model_to_save = model.module if hasattr(model, 'module') else model
torch.save(model_to_save.state_dict(), output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

All files saved


## validating model

In [17]:
def valid(model, testing_loader):
    tr_loss=0
    nb_tr_steps=0
    nb_tr_examples=0
    
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            
#             outputs = model(ids, mask).squeeze()
            outputs = model(ids, mask)
            
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu

In [18]:
acc = valid(model, testing_loader)
print(f"Accuracy on test data = {acc}")

Validation Loss per 100 steps: 0.10859662294387817
Validation Accuracy per 100 steps: 100.0
Validation Loss per 100 steps: 1.3412874694455508
Validation Accuracy per 100 steps: 61.67766446710658
Validation Loss per 100 steps: 1.347678253743347
Validation Accuracy per 100 steps: 61.8988101189881
Validation Loss per 100 steps: 1.352023130318458
Validation Accuracy per 100 steps: 61.289247383507764
Validation Loss per 100 steps: 1.3493625808715903
Validation Accuracy per 100 steps: 61.314434278286086
Validation Loss per 100 steps: 1.3417594904380987
Validation Accuracy per 100 steps: 61.47954081836726
Validation Loss per 100 steps: 1.3404679150292775
Validation Accuracy per 100 steps: 61.519616012799574
Validation Loss per 100 steps: 1.3355486017672558
Validation Accuracy per 100 steps: 61.57395502985629
Validation Loss per 100 steps: 1.3321532088558696
Validation Accuracy per 100 steps: 61.69720756981076
Validation Loss per 100 steps: 1.3376154334080244
Validation Accuracy per 100 steps:

## Tuning with rest of dataset

In [19]:
%%time
for epoch in range(EPOCHS):
    train(epoch, loader=testing_loader)

Training Loss per 5000 steps: 0.0659838318824768
Training Accuracy per 5000 steps: 100.0
Training Loss per 5000 steps: 1.3968120152724788
Training Accuracy per 5000 steps: 60.80783843231354
Training Loss per 5000 steps: 1.3896202186294628
Training Accuracy per 5000 steps: 61.48885111488851
Training Loss per 5000 steps: 1.3861922403933098
Training Accuracy per 5000 steps: 61.49256716218919
Training Loss per 5000 steps: 1.389867887752088
Training Accuracy per 5000 steps: 61.116944152792364
Training Loss per 5000 steps: 1.3882942344849152
Training Accuracy per 5000 steps: 61.23355065797368
Training Loss per 5000 steps: 1.391144970667556
Training Accuracy per 5000 steps: 61.26295790140329
Training Loss per 5000 steps: 1.392937280102996
Training Accuracy per 5000 steps: 61.1996800091426
Training Loss per 5000 steps: 1.390451846507463
Training Accuracy per 5000 steps: 61.26721831954201
Training Loss per 5000 steps: 1.3890476740060247
Training Accuracy per 5000 steps: 61.39530232661497
Traini

## Saving the Trained Model Artifacts for inference

In [20]:
# Saving fully trained model

output_model_file = '../models/pytorch_distilbert_amazon_weighted_full.bin'
output_vocab_file = '../models/vocab_distilbert_amazon_weighted_full.bin'

model_to_save = model.module if hasattr(model, 'module') else model
torch.save(model_to_save.state_dict(), output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

All files saved
