https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb

In [1]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer



In [2]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
device

'cuda'

## formating data

In [4]:
def encode_cat(string, ref):
     return ref.index(string)

In [5]:
# Import data from csv
df = pd.read_csv('../datasets/cleaned/combined_text.csv')

# drop duplicates
df.drop_duplicates(subset=['combined_text'], keep='first', ignore_index=True, inplace=True)

# encode category
category_list = list(df.category.unique())
df['label'] = df['category'].map(lambda x: encode_cat(x, category_list))

df.drop(columns=['asin'], inplace=True)
df.rename(columns={'combined_text': 'text'}, inplace=True)

In [6]:
len(category_list)

21

In [7]:
df.head(2)

Unnamed: 0,text,category,label
0,I have a 9 year old Badger 1 that needs replac...,appliances,0
1,model number This may help InSinkErator Model ...,appliances,0


## Preparing dataset and dataloader

In [8]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')


In [9]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        text = str(self.data.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.label[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [10]:
# Creating the dataset and dataloader for the neural network

train_size = 0.9
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (1271009, 3)
TRAIN Dataset: (1143908, 3)
TEST Dataset: (127101, 3)


In [11]:
train_dataset

Unnamed: 0,text,category,label
0,Does this have a 240V Twist-lock socket? It lo...,patio_lawn_and_garden,14
1,i don't see a plug underneath the unit where t...,home_and_kitchen,10
2,Will these fit an Artic Cat Wilcat 2014 as lon...,automotive,2
3,I will have no problem when I get the phone tu...,cell_phones_and_accessories,5
4,jWhat is the size of each stamp? I need someth...,arts_crafts_and_sewing,1
...,...,...,...
1143903,"On this product description, it doesn't say if...",beauty,4
1143904,What are the actual measurements of this item?...,tools_and_home_improvement,18
1143905,can i use this in my 2014 Forrester (non turbo...,automotive,2
1143906,"DRM? Hi, Does anyone know what DRM this compil...",video_games,20


In [12]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Creating the Neural Network for Fine Tuning

In [13]:
# Creating the customized model, 
# by adding a drop out and a dense layer on top of distil bert 
# to get the final output for the model. 

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
#         self.classifier = torch.nn.Linear(768, 4)
        self.classifier = torch.nn.Linear(768, 21)
        

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [14]:
model = DistillBERTClass()
model.to(device)

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [15]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

## Fine Tuning the Model

In [16]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [17]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [18]:
%%time
for epoch in range(EPOCHS):
    train(epoch)



Training Loss per 5000 steps: 2.988100528717041
Training Accuracy per 5000 steps: 0.0
Training Loss per 5000 steps: 2.46383612243754
Training Accuracy per 5000 steps: 28.159368126374726
Training Loss per 5000 steps: 2.325095835154223
Training Accuracy per 5000 steps: 32.206779322067796
Training Loss per 5000 steps: 2.232645457535118
Training Accuracy per 5000 steps: 34.90100659956003
Training Loss per 5000 steps: 2.1474510374649465
Training Accuracy per 5000 steps: 37.36563171841408
Training Loss per 5000 steps: 2.0800394696304907
Training Accuracy per 5000 steps: 39.34842606295748
Training Loss per 5000 steps: 2.024806825998221
Training Accuracy per 5000 steps: 41.00529982333922
Training Loss per 5000 steps: 1.9775006108969209
Training Accuracy per 5000 steps: 42.41878803462758
Training Loss per 5000 steps: 1.9349074564314088
Training Accuracy per 5000 steps: 43.66890827729307
Training Loss per 5000 steps: 1.8984823686601107
Training Accuracy per 5000 steps: 44.78233817026288
Training

## validating model

In [27]:
def valid(model, testing_loader):
    tr_loss=0
    nb_tr_steps=0
    nb_tr_examples=0
    
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu

In [29]:
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

acc = valid(model, testing_loader)
print(f"Accuracy on test data = {acc}")

This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
Validation Loss per 100 steps: 0.6827276349067688
Validation Accuracy per 100 steps: 50.0
Validation Loss per 100 steps: 1.153610627326896
Validation Accuracy per 100 steps: 65.56688662267547
Validation Loss per 100 steps: 1.1717454403008956
Validation Accuracy per 100 steps: 65.003499650035
Validation Loss per 100 steps: 1.1732460206224469
Validation Accuracy per 100 steps: 64.98566762215852
Validation Loss per 100 steps: 1.1695380986668962
Validation Accuracy per 100 steps: 64.91925403729813
Validation Loss per 100 steps: 1.1730626189064572
Validation Accuracy per 100 steps: 64.83940642374306
Validation Loss per 100 steps: 1.1722533300618856
Validation Accuracy per 100 steps: 64.90783640545315
Validation Loss per 100 steps: 1.1709802085025252
Validation Accuracy per 100 steps: 64.95957258364047

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

## Saving the Trained Model Artifacts for inference

In [None]:
# Saving the files for re-use

output_model_file = '../models/pytorch_distilbert_amazon.bin'
output_vocab_file = '../models/vocab_distilbert_amazon.bin'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')