### Exploratory data analysis and Preprocessing

In [2]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoModel, BertTokenizerFast

# Hugging Face checkpoint used for both the encoder and the tokenizer
PRE_TRAINED_MODEL_NAME = 'bert-large-uncased'

# Seed every random number generator the notebook imports so that a
# Restart & Run All reproduces the same split and the same batch order.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)      # Python's built-in RNG (was imported but never seeded)
np.random.seed(RANDOM_SEED)   # NumPy's legacy global RNG
torch.manual_seed(RANDOM_SEED)

# Use the first GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# Training data: one search term and its category per row.
# Column names are supplied here, so the first line of the file is read as data.
df = pd.read_csv('data/trainSet.csv', header=None, names=['search_term', 'category'])

In [4]:
# Distinct category values that occur in the training data
possible_labels = df['category'].unique()
len(possible_labels)

1419

In [5]:
# Unlabelled candidate set: a single column of search terms, tab-delimited reader
df_test = pd.read_csv('data/candidateTestSet.txt', sep='\t', names=['search_term'])

### Training/Validation Split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 10% of the rows for validation. Stratifying on the category keeps
# the label distribution the same in both parts.
search_terms = df['search_term']
categories = df['category']

train_text, val_text, train_labels, val_labels = train_test_split(
    search_terms, categories,
    test_size=0.1, random_state=RANDOM_SEED, stratify=categories,
)

In [8]:
len(train_text)

546140

In [9]:
len(val_text)

60683

### Import BERT Model and BERT Tokenizer

In [10]:
# Load the tokenizer that matches the checkpoint named in PRE_TRAINED_MODEL_NAME
tokenizer = BertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Load the pre-trained encoder weights for the same checkpoint
# (PRE_TRAINED_MODEL_NAME, i.e. the BERT-large uncased model, not BERT-base)
bert = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [11]:
# Upper bound on tokens per search term (max length of search terms in train and test sets)
MAX_SEQ_LEN = 12


def _encode_texts(texts):
    """Tokenize a list of strings with the notebook's BERT tokenizer.

    Sequences longer than MAX_SEQ_LEN tokens are truncated. ``padding=True``
    pads to the longest sequence in *this* call, so the width of the returned
    ids may differ between calls (it never exceeds MAX_SEQ_LEN).

    Parameters
    ----------
    texts : list of str
        Raw search terms.

    Returns
    -------
    The tokenizer's batch encoding; the notebook reads its 'input_ids' and
    'attention_mask' entries.
    """
    return tokenizer.batch_encode_plus(
        texts,
        max_length=MAX_SEQ_LEN,
        padding=True,
        truncation=True
    )


# tokenize and encode sequences in the training set
tokens_train = _encode_texts(train_text.tolist())

# tokenize and encode sequences in the validation set
tokens_val = _encode_texts(val_text.tolist())

# tokenize and encode sequences in the test set - df_test
tokens_test = _encode_texts(df_test['search_term'].tolist())

In [12]:
## Turn the tokenizer output (Python lists) and the labels into tensors

# Training split
train_seq, train_mask = (
    torch.tensor(tokens_train[field]) for field in ('input_ids', 'attention_mask')
)
# NOTE(review): labels are passed to NLLLoss later, which presumably requires
# integer class ids in [0, number of classes) - confirm against the CSV.
train_y = torch.tensor(train_labels.tolist())

# Validation split
val_seq, val_mask = (
    torch.tensor(tokens_val[field]) for field in ('input_ids', 'attention_mask')
)
val_y = torch.tensor(val_labels.tolist())

# Test split (no labels available)
test_seq, test_mask = (
    torch.tensor(tokens_test[field]) for field in ('input_ids', 'attention_mask')
)

### Creating Data Loaders
Now we will create dataloaders for both the training and the validation sets. These dataloaders feed batches of training and validation data to the model during the training phase.

In [13]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [14]:
# Number of examples per mini-batch
BATCH_SIZE = 32

# Bundle ids, masks and labels so that one index yields one complete example
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)

# Training batches are drawn in random order; validation keeps the stored order
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE, sampler=val_sampler)

### Define Model Architecture
Freeze the entire architecture

Freeze all the layers of the pre-trained BERT model, attach a few neural-network layers of our own on top, and train only those new layers.

In [15]:
# Turn off gradient tracking for every weight of the pre-trained encoder,
# so only layers added later can be updated by the optimizer
for frozen_weight in bert.parameters():
    frozen_weight.requires_grad_(False)

In [16]:
class BERT_Arch(nn.Module):
    """Classification head on top of a BERT encoder.

    The encoder's pooled output is passed through
    Linear -> ReLU -> Dropout -> Linear -> LogSoftmax, so ``forward`` returns
    log-probabilities (suitable for ``nn.NLLLoss``).

    Parameters
    ----------
    bert : nn.Module
        Encoder called as ``bert(sent_id, attention_mask=mask)``. Its result
        must be indexable, with the pooled sentence representation at index 1.
    num_classes : int, optional
        Size of the output layer. Defaults to ``len(possible_labels)``, read
        from the notebook's global at construction time (the original behaviour).
    """

    def __init__(self, bert, num_classes=None):

        super(BERT_Arch, self).__init__()

        self.bert = bert

        # Width of the encoder's pooled output. Read it from the encoder's
        # config when available so the head also fits checkpoints other than
        # the 1024-wide one; fall back to 1024 otherwise.
        hidden_size = getattr(getattr(bert, "config", None), "hidden_size", 1024)

        if num_classes is None:
            num_classes = len(possible_labels)

        # dropout layer
        self.dropout = nn.Dropout(0.1)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(hidden_size, 512)

        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512, num_classes)

        # log-softmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    # define the forward pass
    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Parameters
        ----------
        sent_id : tensor of token ids, passed to the encoder as-is.
        mask : attention mask, passed to the encoder as ``attention_mask``.

        Returns
        -------
        Tensor whose dim 1 holds ``num_classes`` log-probabilities.
        """
        # index 0 is the per-token output (unused here), index 1 the pooled one
        encoder_out = self.bert(sent_id, attention_mask=mask)
        cls_hs = encoder_out[1]

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # convert scores to log-probabilities
        x = self.softmax(x)

        return x

In [17]:
# Wrap the pre-trained encoder in the classification architecture and
# move the resulting module onto the selected device (GPU when available)
model = BERT_Arch(bert).to(device)

In [18]:
# Materialise the (name, tensor) pairs so they can be counted and sliced
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Each entry pairs a heading with the slice of parameters listed under it:
# the first five entries, the following sixteen, and the last four.
sections = [
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
]

for heading, entries in sections:
    print(heading)
    for param_name, weight in entries:
        # name left-aligned in 55 columns, shape right-aligned in 12
        print("{:<55} {:>12}".format(param_name, str(tuple(weight.size()))))

The BERT model has 395 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 1024)
bert.embeddings.position_embeddings.weight               (512, 1024)
bert.embeddings.token_type_embeddings.weight               (2, 1024)
bert.embeddings.LayerNorm.weight                             (1024,)
bert.embeddings.LayerNorm.bias                               (1024,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight        (1024, 1024)
bert.encoder.layer.0.attention.self.query.bias               (1024,)
bert.encoder.layer.0.attention.self.key.weight          (1024, 1024)
bert.encoder.layer.0.attention.self.key.bias                 (1024,)
bert.encoder.layer.0.attention.self.value.weight        (1024, 1024)
bert.encoder.layer.0.attention.self.value.bias               (1024,)
bert.encoder.layer.0.attention.output.dense.weight      (1024, 1024)
bert.encoder.layer.0.attention.output.dense.bias             (

### Setting Up Optimizer and Scheduler

In [19]:
# linear-decay schedule helper from Hugging Face transformers (used when the scheduler is built)
from transformers import get_linear_schedule_with_warmup

# Step size for AdamW.
# NOTE(review): the encoder is frozen, so only the small head is trained. The
# recorded losses (about 7.25 -> 7.14 over five epochs, versus roughly
# ln(1419) = 7.26 for a uniform guess) suggest this rate is too low for a
# freshly initialised head - consider something in the 1e-3 range.
LEARNING_RATE = 1e-5

# define the optimizer
# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation. eps and weight_decay are set explicitly to the
# values the transformers class used by default, because torch.optim.AdamW
# has different defaults.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0)

In [20]:
# Number of full passes over the training data
EPOCHS = 100

# The scheduler is stepped once per batch, so its horizon is batches x epochs
steps_per_epoch = len(train_dataloader)
total_steps = steps_per_epoch * EPOCHS

# Learning rate decays linearly over total_steps, with no warm-up phase
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps)

There is a class imbalance in our dataset. So, we will first compute class weights for the labels in the train set and then pass these weights to the loss function so that it takes care of the class imbalance.

In [21]:
from sklearn.utils.class_weight import compute_class_weight

# compute the class weights
# Arguments are passed by keyword: current scikit-learn releases reject the
# positional form, while the keyword form also works on older releases.
# The weights come back in the order of np.unique(train_labels).
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
print("Class Weights:", class_weights)

Class Weights: [0.94332518 1.0573535  0.73449747 ... 0.85338509 0.82063257 0.94332518]


In [22]:
# Class weights as a float tensor on the same device as the model
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Weighted negative log-likelihood loss; it is applied to the model's
# log-softmax outputs
cross_entropy = nn.NLLLoss(weight=weights)

### Fine-Tune BERT

In [23]:
# function to train the model
def train(collect_preds=True):
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level ``model``, ``cross_entropy``, ``optimizer``,
    ``scheduler`` and ``device``. The scheduler is stepped once per batch.

    Parameters
    ----------
    collect_preds : bool, default True
        When True (the original behaviour) the model outputs of every batch
        are copied to the CPU and returned as one array. That array has one
        row per training example and one column per class (roughly 3 GB of
        float32 for the 546,140 x 1,419 case in this notebook), so pass False
        when only the loss is needed.

    Returns
    -------
    avg_loss : float
        Mean of the per-batch losses.
    total_preds : numpy.ndarray or None
        Concatenated model outputs, or None when ``collect_preds`` is False.
    """
    model.train()

    total_loss = 0

    # per-batch model outputs, only kept when requested
    total_preds = [] if collect_preds else None

    # iterate over batches
    for step, batch in enumerate(train_dataloader):

        # progress update after every 5000 batches.
        if step % 5000 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # push the batch to the selected device
        batch = [r.to(device) for r in batch]

        sent_id, mask, labels = batch

        # clear previously calculated gradients
        model.zero_grad()

        # get model predictions for the current batch
        preds = model(sent_id, mask)

        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        total_loss += loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the gradient norm to 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

        if collect_preds:
            # detach from the graph and move to host memory before storing
            total_preds.append(preds.detach().cpu().numpy())

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    if collect_preds:
        # stack the per-batch arrays along the example axis
        total_preds = np.concatenate(total_preds, axis=0)

    # returns the loss and predictions
    return avg_loss, total_preds

In [24]:
# function for evaluating the model
def evaluate(collect_preds=True):
    """Compute the loss of ``model`` over ``val_dataloader`` without updating it.

    Uses the notebook-level ``model``, ``cross_entropy`` and ``device``. The
    model is switched to eval mode and autograd is disabled for the forward
    passes.

    Parameters
    ----------
    collect_preds : bool, default True
        When True (the original behaviour) the model outputs of every batch
        are copied to the CPU and returned as one array with one row per
        validation example. Pass False when only the loss is needed.

    Returns
    -------
    avg_loss : float
        Mean of the per-batch losses.
    total_preds : numpy.ndarray or None
        Concatenated model outputs, or None when ``collect_preds`` is False.
    """
    print("\nEvaluating...")

    # deactivate dropout layers
    model.eval()

    total_loss = 0

    # per-batch model outputs, only kept when requested
    total_preds = [] if collect_preds else None

    # iterate over batches
    for step, batch in enumerate(val_dataloader):

        # Progress update every 1000 batches.
        if step % 1000 == 0 and not step == 0:

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # push the batch to the selected device
        batch = [t.to(device) for t in batch]

        sent_id, mask, labels = batch

        # deactivate autograd
        with torch.no_grad():

            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()

            if collect_preds:
                # no graph is recorded under no_grad, so no detach is needed
                total_preds.append(preds.cpu().numpy())

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader)

    if collect_preds:
        # reshape the predictions in form of (number of samples, no. of classes)
        total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

#### Start fine-tuning of the model

In [25]:
# Lowest validation loss seen so far; +inf guarantees the first epoch is saved
best_valid_loss = float('inf')

# File that receives the weights of the best epoch
checkpoint_path = 'model/bert_version_1_saved_weights.pt'
# torch.save does not create missing directories. Create the target folder
# before training starts, so a missing folder cannot abort the run after a
# full epoch has already been spent.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# empty lists to store training and validation loss of each epoch
train_losses = []
valid_losses = []
# same losses keyed by zero-based epoch index
history = {'train': {}, 'val': {}}

# for each epoch
for epoch in range(EPOCHS):

    print('\n Epoch {:} / {:}'.format(epoch + 1, EPOCHS))

    # train model (the returned predictions are not used here)
    train_loss, _ = train()

    # evaluate model (the returned predictions are not used here)
    valid_loss, _ = evaluate()

    # save the best model: overwrite the checkpoint only on improvement
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), checkpoint_path)

    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    history['train'][epoch] = train_loss
    history['val'][epoch] = valid_loss

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 100
  Batch 5,000  of  17,067.
  Batch 10,000  of  17,067.
  Batch 15,000  of  17,067.

Evaluating...
  Batch 1,000  of  1,897.

Training Loss: 7.252
Validation Loss: 7.235

 Epoch 2 / 100
  Batch 5,000  of  17,067.
  Batch 10,000  of  17,067.
  Batch 15,000  of  17,067.

Evaluating...
  Batch 1,000  of  1,897.

Training Loss: 7.226
Validation Loss: 7.204

 Epoch 3 / 100
  Batch 10,000  of  17,067.
  Batch 15,000  of  17,067.

Evaluating...
  Batch 1,000  of  1,897.

Training Loss: 7.199
Validation Loss: 7.173

 Epoch 4 / 100
  Batch 5,000  of  17,067.
  Batch 10,000  of  17,067.
  Batch 15,000  of  17,067.

Evaluating...
  Batch 1,000  of  1,897.

Training Loss: 7.170
Validation Loss: 7.139

 Epoch 5 / 100
  Batch 5,000  of  17,067.
  Batch 10,000  of  17,067.
  Batch 15,000  of  17,067.

Evaluating...
  Batch 1,000  of  1,897.

Training Loss: 7.137
Validation Loss: 7.105

 Epoch 6 / 100
  Batch 5,000  of  17,067.
  Batch 10,000  of  17,067.
  Batch 15,000  of  17,067.

Ev