In [None]:
import os
import re
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
import io

In [None]:
# Read the CSV, then keep only the columns whose name does not start with
# "Unnamed"; the resulting frame is displayed as the cell output.
df = pd.read_csv("/content/mod_data.csv")
unnamed_cols = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~unnamed_cols]
df

Unnamed: 0,category,value,ques,ans
0,offici state seal,$200,it seal show a map of the state with georgia o...,alabama
1,up on mount rushmor,$1000,appoint oliv ellsworth chief justic,washington
2,let the game begin,$1000,a href http www j archiv com medium two thousa...,grace
3,thoma pain,$200,with letter of introduct from ben franklin tho...,philadelphia
4,the new testament,$200,chapter six of thi new testament book describ ...,revel
...,...,...,...,...
3995,literari movement,$200,thi movement that includ shelley byron wa base...,romantic
3996,cocoa channel,$200,there are fruiti cocoa varieti of thi cereal p...,pebbl
3997,travel tourism,$1000,about half of the visitor to thi former portug...,macao
3998,desert,$200,roald dahl 's augustu gloop get hi just de s e...,chocol


In [None]:
# Append the category text and then the answer text to each question
# (space-separated), and drop the two source columns afterwards.
df['ques'] = df['ques'].str.cat([df['category'], df['ans']], sep=" ")
df = df.drop(columns=['category', 'ans'])


In [None]:
# Inspect the frame after `category` and `ans` were folded into `ques`.
df

Unnamed: 0,value,ques
0,$200,it seal show a map of the state with georgia o...
1,$1000,appoint oliv ellsworth chief justic up on moun...
2,$1000,a href http www j archiv com medium two thousa...
3,$200,with letter of introduct from ben franklin tho...
4,$200,chapter six of thi new testament book describ ...
...,...,...
3995,$200,thi movement that includ shelley byron wa base...
3996,$200,there are fruiti cocoa varieti of thi cereal p...
3997,$1000,about half of the visitor to thi former portug...
3998,$200,roald dahl 's augustu gloop get hi just de s e...


In [None]:
def label_encode(df):
    """Encode the ``value`` column of ``df`` as class labels.

    "$200" becomes 0 and "$1000" becomes 1; every other entry becomes the
    string 'None'. The column is replaced on ``df`` itself and the same
    DataFrame object is returned.

    Args:
        df: DataFrame with a ``value`` column. Any index is accepted.

    Returns:
        The same DataFrame, with ``value`` re-encoded.
    """
    d = {"$200": 0, "$1000": 1}

    # One column-level assignment instead of a positional loop with chained
    # indexing (df['value'][i] = ...): that form depends on the index being
    # exactly 0..n-1 and can end up writing to a temporary copy.
    df['value'] = df['value'].map(lambda v: d.get(v, 'None'))
    return df
# Re-encode the `value` column in place ("$200" -> 0, "$1000" -> 1).
df = label_encode(df)
# String view of the encoded labels.
# NOTE(review): `labels` is not referenced again in the visible notebook.
labels = df["value"].astype(str).values

In [None]:
# Inspect the frame after the `value` column was label-encoded.
df

Unnamed: 0,value,ques
0,0,it seal show a map of the state with georgia o...
1,1,appoint oliv ellsworth chief justic up on moun...
2,1,a href http www j archiv com medium two thousa...
3,0,with letter of introduct from ben franklin tho...
4,0,chapter six of thi new testament book describ ...
...,...,...
3995,0,thi movement that includ shelley byron wa base...
3996,0,there are fruiti cocoa varieti of thi cereal p...
3997,1,about half of the visitor to thi former portug...
3998,0,roald dahl 's augustu gloop get hi just de s e...


In [None]:
from sklearn.model_selection import train_test_split

# Hold out half of the rows as the final test set. train_test_split already
# shuffles the rows, so no extra shuffle is needed; the fixed random_state
# makes the partition (and every number reported below) reproducible after a
# kernel restart.
train, test = train_test_split(df, test_size=0.5, random_state=42)

# Give both halves a clean 0..n-1 index.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Inspect the held-out half of the split.
test

Unnamed: 0,value,ques
0,0,in two thousand and six it wa downgrad to a dw...
1,0,matthew wa a publican mean he collect these th...
2,0,the finest type of thi liquor are certifi one ...
3,0,on dec twenty-three two thousand and nine the ...
4,1,perfect colorado citi for a valuabl opportun c...
...,...,...
1995,1,oscar wild windermer 's fan titl contain titl ...
1996,1,the top letter row of a standard comput keyboa...
1997,0,ye there realli is a name for the fear of thi ...
1998,1,the remak had ben stiller the kid movi remak h...


In [None]:
# `data` is the training half of the split and `test_data` the held-out half;
# both names are plain aliases of the existing DataFrames (nothing is copied).
data, test_data = train, test


In [None]:
# Peek at the first rows of the training frame.
data.head()

Unnamed: 0,value,ques
0,0,rub shoulder with nebraska state legisl at bil...
1,1,do n't be the person who toppl the tower that ...
2,0,when nanci pelosi maxin water go home from the...
3,0,web address that end with au are locat on host...
4,1,it 's not a felin it 's not a reptil it 's the...


In [None]:
# Count how many training rows fall into each encoded label.
data['value'].value_counts()

0    1005
1     995
Name: value, dtype: int64

In [None]:
# Peek at the first rows of the held-out test frame.
test_data.head()

Unnamed: 0,value,ques
0,0,in two thousand and six it wa downgrad to a dw...
1,0,matthew wa a publican mean he collect these th...
2,0,the finest type of thi liquor are certifi one ...
3,0,on dec twenty-three two thousand and nine the ...
4,1,perfect colorado citi for a valuabl opportun c...


We will randomly split the training data into two sets: a train set with 80% of the data and a validation set with 20% of the data. The model is fitted on the train set, and the validation set is used to monitor and compare models.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# partition the same from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    data['ques'],
    data['value'],
    test_size=0.2,
    random_state=123,
)

The Hugging Face `transformers` library contains PyTorch implementations of state-of-the-art NLP models, including BERT (from Google) and GPT (from OpenAI), together with their pre-trained weights.

In [None]:
#!pip install transformers

Much less preprocessing is needed here than in previous approaches, because BERT was trained on entire sentences.

In [None]:
def text_preprocessing(text):
    """Lightly clean a raw sentence before it is handed to the BERT tokenizer.

    Steps, in order:
      1. remove every '@name' token, including one at the very end of the text,
      2. turn the HTML entity '&amp;' back into '&',
      3. collapse runs of whitespace into single spaces and strip both ends.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    # Remove '@name'. The token ends at the next whitespace character OR at the
    # end of the string, so a trailing mention is dropped as well.
    text = re.sub(r'@\S*(?:\s|$)', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Collapse repeated whitespace and trim leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In order to apply the pre-trained BERT, we must use the tokenizer provided by the library. This is because (1) the model has a specific, fixed vocabulary and (2) the BERT tokenizer has a particular way of handling out-of-vocabulary words.

In addition, we are required to add special tokens to the start and end of each sentence, pad & truncate all sentences to a single constant length, and explicitly specify what are padding tokens with the "attention mask".

The encode_plus method of BERT tokenizer will:

(1) split our text into tokens,

(2) add the special [CLS] and [SEP] tokens, and

(3) convert these tokens into indexes of the tokenizer vocabulary,

(4) pad or truncate sentences to max length, and

(5) create attention mask.

In [None]:
from transformers import BertTokenizer

# Instantiate the tokenizer that belongs to the 'bert-base-uncased'
# checkpoint, with lower-casing of the input switched on.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [None]:
def bert_preprocessing(data, max_len=None):
  """Tokenize an iterable of sentences into fixed-length BERT inputs.

  Each sentence is cleaned with ``text_preprocessing`` and encoded with the
  notebook-level ``tokenizer``: special tokens are added and the sequence is
  padded or truncated to ``max_len`` tokens.

  Args:
    data: Iterable of raw sentence strings.
    max_len: Target sequence length. When omitted, the notebook-level
      ``MAX_LEN`` is used, so ``MAX_LEN`` must exist by the time of the call.

  Returns:
    Tuple ``(input_ids, attention_masks)`` of tensors with one row per
    sentence.
  """
  if max_len is None:
    max_len = MAX_LEN

  input_ids = []
  attention_masks = []
  for sent in data:
    encoded_sent = tokenizer.encode_plus(
        text = text_preprocessing(sent),
        add_special_tokens = True,
        max_length = max_len,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding = 'max_length',
        truncation = True,
        return_attention_mask = True)
    input_ids.append(encoded_sent.get('input_ids'))
    attention_masks.append(encoded_sent.get('attention_mask'))

  # Convert the per-sentence lists into tensors
  input_ids = torch.tensor(input_ids)
  attention_masks = torch.tensor(attention_masks)
  return input_ids, attention_masks

Before tokenizing, we need to specify the maximum length of our sentences.

In [None]:
# Every sequence is padded or truncated to this many tokens.
MAX_LEN = 128
print("Tokenizing the data")
# Encode the training split first, then the validation split.
(train_inputs, train_masks), (val_inputs, val_masks) = map(
    bert_preprocessing, (X_train, X_val)
)

Tokenizing the data




In [None]:

# Cast both label Series to 64-bit integers.
y_train, y_val = (labels.astype(np.int64) for labels in (y_train, y_val))


## Creating PyTorch DataLoader

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Number of examples per mini-batch (recommended batch size = 16 or 32).
batch_size = 16

# Turn the label Series into tensors so they can be stored in a TensorDataset
# next to the token ids and attention masks.
train_labels = torch.tensor(y_train.to_numpy())
val_labels = torch.tensor(y_val.to_numpy())

## Creating DataLoader for Training set

In [None]:
# Bundle ids, masks and labels row-wise; batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

## Creating DataLoader for Val set

In [None]:
val_data = TensorDataset(val_inputs, val_masks, val_labels)
# Validation batches must come out in dataset order: predictions made from this
# loader are later compared position-by-position with y_val, and a random
# sampler would scramble that alignment. Shuffling has no benefit for
# evaluation anyway.
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size = batch_size)

## Training the Model

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class BertClassifier(nn.Module):
  """Pre-trained BERT encoder followed by a small feed-forward classifier head."""

  def __init__(self, freeze_bert = False, num_labels = 2):
    """Build the encoder and the classification head.

    Args:
      freeze_bert: When True, the BERT weights are excluded from gradient
        updates and only the classifier head is trained.
      num_labels: Number of output classes. Defaults to 2, matching the
        notebook's labels 0 ("$200") and 1 ("$1000"); the head previously
        emitted a third logit that no label ever used.
    """
    super(BertClassifier, self).__init__()

    # Bert layer
    self.bert = BertModel.from_pretrained('bert-base-uncased')

    # Hidden size of BERT (768 for bert-base), hidden size of our classifier,
    # and number of labels
    D_in, H, D_out = self.bert.config.hidden_size, 50, num_labels

    # Linear layer with ReLU
    self.classifier = nn.Sequential(
        nn.Linear(D_in, H),
        nn.ReLU(),
        nn.Linear(H, D_out)
        )

    # Freeze BERT MODEL
    if freeze_bert:
      for param in self.bert.parameters():
        param.requires_grad = False

  def forward(self, input_ids, attention_mask):
    """Feed input to BERT and the classifier to compute logits.

    Args:
      input_ids: Token-id tensor produced by the tokenizer.
      attention_mask: Mask tensor produced by the tokenizer alongside the ids.

    Returns:
      logits (torch.Tensor): output tensor with shape (batch_size, num_labels).
    """
    outputs = self.bert(input_ids = input_ids, attention_mask = attention_mask)
    # outputs[0] holds the per-token hidden states; position 0 is the [CLS]
    # token, whose vector is used as the sentence representation.
    first_hidden_state_cls = outputs[0][:, 0, :]
    logits = self.classifier(first_hidden_state_cls)
    return logits

## Optimizer & Learning Rate Scheduler

To fine-tune our Bert Classifier, we need to create an optimizer. The BERT authors recommend the following hyper-parameters:

Batch size: 16 or 32
Learning rate (Adam): 5e-5, 3e-5 or 2e-5
Number of epochs: 2, 3, 4

In [None]:
from transformers import get_linear_schedule_with_warmup

def initialize_model(epochs=4, dataloader=None):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.

    Args:
        epochs: Number of epochs the model will be trained for; used to size
            the learning-rate schedule.
        dataloader: The DataLoader that will be used for training. Its length
            (batches per epoch) determines the schedule length. When omitted,
            the notebook-level ``train_dataloader`` is used.

    Returns:
        Tuple ``(bert_classifier, optimizer, scheduler)``.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on the selected device
    bert_classifier.to(device)

    # Create the optimizer. torch.optim.AdamW replaces transformers.AdamW,
    # which is deprecated and no longer importable from recent transformers
    # releases. weight_decay=0.0 keeps the old class's default (PyTorch's own
    # default is 0.01).
    optimizer = torch.optim.AdamW(bert_classifier.parameters(),
                                  lr=2e-5,
                                  eps=1e-8,
                                  weight_decay=0.0)

    # Total number of training steps, derived from the loader actually used
    if dataloader is None:
        dataloader = train_dataloader
    total_steps = len(dataloader) * epochs

    # Set up the learning rate scheduler: linear decay to zero, no warm-up
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

## Training Loop

We will train our Bert Classifier for 8 epochs. In each epoch, we will train our model and evaluate its performance on the validation set. In more detail, we will:

Training:

Unpack our data from the dataloader and load the data onto the GPU
Zero out gradients calculated in the previous pass
Perform a forward pass to compute logits and loss
Perform a backward pass to compute gradients (loss.backward())
Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
Update the model's parameters (optimizer.step())
Update the learning rate (scheduler.step())
Evaluation:

Unpack our data and load onto the GPU
Forward pass
Compute loss and accuracy rate over the validation set

In [None]:
import random
import time

# Specify loss function: cross-entropy computed on the classifier's logits.
# Both train() and evaluate() read this notebook-level object.
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random generators.

    Args:
        seed_value: Integer seed handed to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_generator in seeders:
        seed_generator(seed_value)


def evaluate(model, val_dataloader):
    """Measure the model's loss and accuracy on a validation DataLoader.

    Relies on the notebook-level ``device`` and ``loss_fn``.

    Args:
        model: The classifier; called as ``model(input_ids, attention_mask)``.
        val_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        Tuple ``(val_loss, val_accuracy)``: the mean loss per example and the
        accuracy in percent. Both are NaN when the loader yields no examples.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled
    # during the test time.
    model.eval()

    # Running totals over individual examples. Averaging per-batch means
    # instead would give a short final batch the same weight as a full one.
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for batch in val_dataloader:
        # Load batch onto the selected device
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)

        # loss_fn returns the batch mean, so scale it back to a batch sum
        n_batch = b_labels.size(0)
        loss_sum += loss.item() * n_batch

        preds = torch.argmax(logits, dim=1)
        n_correct += (preds == b_labels).sum().item()
        n_seen += n_batch

    if n_seen == 0:
        return float('nan'), float('nan')

    val_loss = loss_sum / n_seen
    val_accuracy = 100.0 * n_correct / n_seen

    return val_loss, val_accuracy

In [None]:
def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.

    Relies on the notebook-level ``optimizer``, ``scheduler``, ``loss_fn`` and
    ``device``; ``optimizer`` and ``scheduler`` must have been created for
    ``model`` before this function is called.

    Args:
        model: The classifier to train.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        val_dataloader: Validation batches; required when ``evaluation`` is set.
        epochs: Number of passes over ``train_dataloader``.
        evaluation: When truthy, run ``evaluate`` on ``val_dataloader`` after
            every epoch and print the validation loss and accuracy.

    Raises:
        ValueError: If ``evaluation`` is requested without a ``val_dataloader``.
    """
    # Fail before any training happens rather than after the first full epoch.
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    last_step = len(train_dataloader) - 1

    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            # Load batch onto the selected device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed every 20 batches and on
            # the final batch of the epoch
            if (step % 20 == 0 and step != 0) or (step == last_step):
                # Time elapsed since the previous report
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # After the completion of each training epoch, measure the model's
            # performance on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print the epoch summary: mean training loss plus validation metrics
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")

In [None]:
# Seed the random generators, build a fresh model/optimizer/scheduler, then
# fine-tune while reporting validation metrics after every epoch.
n_epochs = 8
set_seed(42)
bert_classifier, optimizer, scheduler = initialize_model(epochs=n_epochs)
train(
    bert_classifier,
    train_dataloader,
    val_dataloader,
    epochs=n_epochs,
    evaluation=True,
)


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.975078   |     -      |     -     |   4.60   
   1    |   40    |   0.757726   |     -      |     -     |   4.36   
   1    |   60    |   0.734348   |     -      |     -     |   4.36   
   1    |   80    |   0.705319   |     -      |     -     |   4.39   
   1    |   99    |   0.718095   |     -      |     -     |   4.15   
----------------------------------------------------------------------
   1    |    -    |   0.780683   |  0.698377  |   58.25   |   23.44  
----------------------------------------------------------------------


 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   2    |   20    |   0.687370   |     -      |     -     |   4.58   
   2    |   40    |   0.676298   |     -      |     -     |   4.3

## Evaluation on Validation Set

The prediction step is similar to the evaluation step that we did in the training loop, but simpler. We will perform a forward pass to compute logits and apply softmax to calculate probabilities

In [None]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    """Run the trained model over a DataLoader and return class probabilities.

    Only the first two tensors of every batch (input ids and attention mask)
    are fed to the model, so loaders with or without labels both work.

    Returns:
        NumPy array of softmax probabilities, one row per example, in the
        order the loader yields them.
    """
    # Evaluation mode: the dropout layers are disabled during test time.
    model.eval()

    batch_logits = []
    with torch.no_grad():
        for batch in test_dataloader:
            on_device = [tensor.to(device) for tensor in batch]
            ids, mask = on_device[:2]
            batch_logits.append(model(ids, mask))

    # Stitch the per-batch logits together, then normalise across classes.
    stacked = torch.cat(batch_logits, dim=0)
    return F.softmax(stacked, dim=1).cpu().numpy()

In [None]:
# Compute predicted probabilities on the validation set.
# The predictions are compared position-by-position with y_val, so batches
# must be produced in dataset order. A dedicated sequential loader guarantees
# that regardless of how val_dataloader samples its batches.
val_eval_dataloader = DataLoader(val_data,
                                 sampler=SequentialSampler(val_data),
                                 batch_size=batch_size)
probs = bert_predict(bert_classifier, val_eval_dataloader)

# Evaluate the Bert classifier
from sklearn.metrics import accuracy_score, roc_curve, auc
# accuracy_score(y_true, y_pred, normalize=False)
# Predicted class = index of the largest probability in each row
y_pred = np.argmax(probs, axis=1)
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy*100:.2f}%')

  


Accuracy: 49.75%


## Train Model on the entire training data

In [None]:
# Concatenate the train set and the validation set
full_train_data = torch.utils.data.ConcatDataset([train_data, val_data])
full_train_sampler = RandomSampler(full_train_data)
full_train_dataloader = DataLoader(full_train_data, sampler=full_train_sampler, batch_size=32)

# Train the Bert Classifier on the entire training data
n_full_epochs = 8
set_seed(42)
bert_classifier, optimizer, scheduler = initialize_model(epochs=n_full_epochs)

# By default initialize_model sizes the learning-rate schedule from
# train_dataloader. This run uses a different loader (more data, batch size
# 32), so rebuild the schedule with the real number of steps; otherwise the
# learning rate would not decay to zero by the end of training.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(full_train_dataloader) * n_full_epochs,
)
train(bert_classifier, full_train_dataloader, epochs=n_full_epochs)

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.948781   |     -      |     -     |   8.22   
   1    |   40    |   0.750142   |     -      |     -     |   7.80   
   1    |   60    |   0.727144   |     -      |     -     |   7.80   
   1    |   62    |   0.706987   |     -      |     -     |   0.61   
----------------------------------------------------------------------


 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   2    |   20    |   0.704959   |     -      |     -     |   8.17   
   2    |   40    |   0.675953   |     -      |     -     |   7.80   
   2    |   60    |   0.673205   |     -      |     -     |   7.84   
   2    |   62    |   0.656136   |     -      |     -     |   0.61   
------------------------------------------------------------------

##  Predictions on Test Set

Before making predictions on the test set, we need to redo the processing and encoding steps applied to the training data. Fortunately, we have already written the `bert_preprocessing` function to do that for us.

In [None]:
# Encode the held-out questions exactly as the training text was encoded.
test_inputs, test_masks = bert_preprocessing(test_data['ques'])

# Wrap ids and masks (no labels) in a loader that walks the rows in order,
# so predictions line up with the rows of test_data.
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=16,
    sampler=test_sampler,
)



In [None]:
# Class probabilities for every test row, in test_dataloader order.
probs = bert_predict(bert_classifier, test_dataloader)

In [None]:
# Evaluate the Bert classifier
from sklearn.metrics import accuracy_score
# accuracy_score(y_true, y_pred, normalize=False)
# The predicted label of each row is the index of its largest probability.
y_pred_test = np.argmax(probs, axis=1).astype(np.int64)

In [None]:
# Show the container type of the predictions, then store the true test labels
# as 64-bit integers so both sides of the comparison share a dtype.
print(type(y_pred_test))
test_data['value'] = test_data['value'].astype(np.int64)

<class 'numpy.ndarray'>


In [None]:
# Share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=test_data['value'], y_pred=y_pred_test)
print(f'Accuracy: {accuracy*100:.2f}%')

Accuracy: 57.90%
