In [1]:
import numpy as np
from utils import *
from networks.classification import *
from data.categories import cat_map
from transformers import DistilBertTokenizer
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import hamming_loss

In [2]:
# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
df = data_to_df(10000)

In [4]:
df.head()

Unnamed: 0,id,title,abstract,category
0,704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph
1,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG
2,704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph
3,704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,math.CA math.FA


In [5]:
labels = list(cat_map.values())
num_classes = len(labels)
print(f'{num_classes} Unique Classes')

153 Unique Classes


### Process Data

In [6]:
MAX_LEN = 512
TRAINING_SPLIT = 0.8
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased');

In [7]:
dataset = Data_Processor(df, tokenizer, MAX_LEN)

train_size = int(len(dataset) * TRAINING_SPLIT)
val_size = len(dataset) - train_size

train_set, val_set = random_split(dataset, [train_size, val_size])

In [8]:
# Store label encoding dict 
class_array = dataset.classes_()

In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(train_set, **train_params)
testing_loader = DataLoader(val_set, **test_params)

In [10]:
print(f'Batches in training set {len(training_loader)}')
print(f'Batches in test set {len(testing_loader)}')

Batches in training set 2000
Batches in test set 1000


### Fine-Tune Classification Model

In [11]:
model = DistillBertClass(len(class_array))
model.to(device)

DistillBertClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [12]:
# Creating the loss function and optimizer

loss_function = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [60]:
x = next(iter(training_loader))
x['ids'].shape

torch.Size([4, 512])

In [14]:
def train(epoch):
    """
    Training function for topic classification
    nn model
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()

    for _, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float32)

        outputs = model(ids, mask)
        top_val, top_idx = torch.topk(outputs, 3, dim=1)
        
        # create one-hot encoding of top 3 predictions
        outputs_oh = torch.zeros(outputs.shape)
        outputs_oh[torch.arange(outputs_oh.size(0)).unsqueeze(1), top_idx] = 1.
        
        loss = loss_function(outputs_oh, targets)
        loss.requires_grad = True
        tr_loss += loss.item()
        n_correct += int(torch.eq(outputs_oh, targets).sum())

        nb_tr_steps += 1
        nb_tr_examples+=targets.numel()
        
        if _%500==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 500 steps: {loss_step}")
            print(f"Training Accuracy per 500 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return epoch_loss, epoch_accu

In [16]:
for epoch in range(EPOCHS):
    train(epoch)

Training Loss per 500 steps: 0.7062482237815857
Training Accuracy per 500 steps: 97.00704225352112


KeyboardInterrupt: 

In [36]:
def valid(model, testing_loader):
    """
    Validation function to assess performance
    on validation set of data
    """

    model.eval()
    n_correct = 0; n_wrong = 0; total = 0
    val_loss = 0;
    nb_val_steps = 0;
    nb_val_examples = 0;
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float32)
            
            outputs = model(ids, mask).squeeze()
            top_val, top_idx = torch.topk(outputs, 3, dim=1)
            
            outputs_oh = torch.zeros(outputs.shape)
            outputs_oh[torch.arange(outputs_oh.size(0)).unsqueeze(1), top_idx] = 1.
            
            loss = loss_function(outputs_oh, targets)
            val_loss += loss.item()
            n_correct += int(torch.eq(outputs_oh, targets).sum())

            nb_val_steps += 1
            nb_val_examples+=targets.numel()
            
            if _%5000==0:
                loss_step = val_loss/nb_val_steps
                accu_step = (n_correct*100)/nb_val_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = val_loss/nb_val_steps
    epoch_accu = (n_correct*100)/nb_val_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu

In [37]:
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

Validation Loss per 100 steps: 0.7062480449676514
Validation Accuracy per 100 steps: 96.47887323943662


KeyboardInterrupt: 

### Test on Sample input

In [None]:
# Saving the files for re-use
output_model_file = './models/pytorch_distilbert_arxiv.bin'
output_vocab_file = './models/vocab_distilbert_arxiv.bin'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

In [110]:
text = 'Materials'
data = tokenizer.encode_plus(text, 
                             add_special_tokens=True, 
                             max_length=MAX_LEN,
                             padding='max_length',
                             return_token_type_ids=True)

ids = torch.tensor(data['input_ids']).unsqueeze(0)

mask = torch.tensor(data['attention_mask']).unsqueeze(0)

output = model(ids, mask)

top_val, top_idx = torch.topk(output, 3, dim=1)
preds = class_array[top_idx].tolist()
[cat_map[cat] for cat in preds[0]]

['Computational Geometry',
 'Other Computer Science',
 'Computer Vision and Pattern Recognition']