In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install xformers
!pip install transformers[torch]
!pip install datasets
!pip install  torchtext



In [None]:

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
import time
import copy

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import math
import random
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
import gc
import torchtext
from tqdm import tqdm

# Misc.
import warnings
warnings.filterwarnings('ignore')



In [None]:
# Global configuration shared by the BERT and Transformer experiments below.
batch_size = 16                      # examples per mini-batch
MAX_LEN = 160                        # maximum token length for BERT inputs
RANDOM_SEED = 16                     # used for the train/validation split and RNG seeding
MODEL_NAME = 'bert-base-cased'       # pretrained checkpoint for tokenizer and encoder
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 20

# Seed every RNG the notebook relies on so that shuffling, dropout and weight
# initialisation are repeatable from a fresh kernel.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)


In [None]:

device

device(type='cuda')

# Bert

In [None]:
df = pd.read_csv("/content/drive/MyDrive/llm/data/Expanded_PROMISE.csv")

In [None]:
df.head(10)

Unnamed: 0,Id,RequirementText,Class,Binary_Label
0,1,The product shall be available during normal b...,A,1
1,2,The product shall be available for use 24 hour...,A,1
2,2,Out of 1000 accesses to the system the system ...,A,1
3,3,The system shall be available for use between ...,A,1
4,3,The system shall achieve 95% up time.,A,1
5,5,The product shall adhere to the corporate onli...,A,1
6,5,The product shall achieve a 98% uptime. The pr...,A,1
7,6,Aside from server failure the software produc...,A,1
8,8,The website shall be available for use 24 hour...,A,1
9,8,The website shall achieve 99.5% up time.,A,1


In [None]:


df.isnull().sum()

Id                 0
RequirementText    0
Class              0
Binary_Label       0
dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 969 entries, 0 to 968
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Id               969 non-null    int64 
 1   RequirementText  969 non-null    object
 2   Class            969 non-null    object
 3   Binary_Label     969 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 30.4+ KB


In [None]:

df['Binary_Label'].value_counts()

1    525
0    444
Name: Binary_Label, dtype: int64

In [None]:


# Build a BERT based tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Encode every requirement once to find the longest token sequence in the data.
tokens = [np.array(tokenizer.encode(txt)) for txt in df["RequirementText"]]

# Longest encoded requirement; stays at -inf when there are no rows
max_ = max((len(t) for t in tokens), default=float("-inf"))

# The sequences have different lengths, so they can only be held in an object
# array; without dtype=object recent NumPy versions refuse ragged input.
tokens = np.array(tokens, dtype=object)

print(max_)

110


In [None]:
print(tokens.shape)

(969,)


In [None]:
class GPReviewDataset(Dataset):
    """Torch dataset that tokenizes texts on access and pairs them with labels.

    Args:
        reviews: Indexable collection of raw texts.
        targets: Indexable collection of integer class labels aligned with ``reviews``.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in this notebook).
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, reviews, targets, tokenizer, max_len = 160):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the raw text, its encoded tensors and its label for one example."""
        review = str(self.reviews[item])
        target = self.targets[item]

        # Pad AND truncate to max_len explicitly so every example has the same
        # length; relying on `max_length` alone triggers the tokenizer's
        # "truncation was not explicitly activated" warning.
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # The tokenizer output is flattened to 1-D so the DataLoader can stack examples
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [None]:
df_train, df_val= train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)


In [None]:
df_train['Binary_Label'].value_counts() , df_val['Binary_Label'].value_counts()

(1    414
 0    361
 Name: Binary_Label, dtype: int64,
 1    111
 0     83
 Name: Binary_Label, dtype: int64)

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap a requirements DataFrame in a ``DataLoader`` of tokenized batches.

    Args:
        df: DataFrame with ``RequirementText`` and ``Binary_Label`` columns.
        tokenizer: Tokenizer handed to ``GPReviewDataset``.
        max_len: Length every encoded sequence is padded/truncated to.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the data each epoch
            (defaults to False, the previous behaviour).

    Returns:
        A ``DataLoader`` over a ``GPReviewDataset``.
    """
    ds = GPReviewDataset(
        reviews=df["RequirementText"].to_numpy(),
        targets=df["Binary_Label"].to_numpy(),
        tokenizer=tokenizer,
        # Forward the caller's limit instead of silently using the dataset default
        max_len=max_len,
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
    )





In [None]:

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, batch_size)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, batch_size)


In [None]:
data = next(iter(train_data_loader))

print(data)



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'review_text': ['The system shall allow user to choose their preferable topping.', 'The product shall display each type of ship in a grid using an image of the specific type of ship.', 'The Disputes applications shall interface with the Merchant Information Database. The Merchant Information Database provides detailed information with regard to the merchant. All merchant detail information shall be obtained from the Merchant Information Database.', 'The system shall display only 10 matching result on the current screen.', 'Can give feedback to the 24X7 Customer Care Service center about their impression for the site and services.', 'The users should be able to easily use the system to successfully complete their budgets  within the stipulated time for completion. With a week of training prior to product implementation  98% of the users shall complete their budgets within the stipulated time.', 'For each turn  the product shall allow the offensive player to define a shot.', 'The websit

In [None]:
data['input_ids']

tensor([[  101,  1109,  1449,  ...,     0,     0,     0],
        [  101,  1109,  3317,  ...,     0,     0,     0],
        [  101,  1109, 12120,  ...,     0,     0,     0],
        ...,
        [  101,  1109, 14274,  ...,     0,     0,     0],
        [  101,  1109,  1440,  ...,     0,     0,     0],
        [  101,  1109,  2593,  ...,     0,     0,     0]])

In [None]:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

torch.Size([16, 160])
torch.Size([16, 160])
torch.Size([16])


In [None]:
# bert_model = BertModel.from_pretrained(MODEL_NAME)

# Build the Sentiment Classifier class
class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder topped with dropout and a linear classification head."""

    def __init__(self, n_classes = 2):
        """Load the ``MODEL_NAME`` checkpoint and create a head with ``n_classes`` outputs."""
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the head's logits for a batch of encoded inputs."""
        # With return_dict=False the encoder returns a tuple; only the second
        # element (the pooled output) feeds the classifier.
        _sequence_output, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        return self.out(self.drop(pooled_output))

In [None]:
sentiment_int = {'functional' : 0, 'non-functional': 1}

In [None]:
model = SentimentClassifier(len(sentiment_int))
model = model.to(device)

In [None]:
model

SentimentClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [None]:
device

device(type='cuda')

In [None]:
# Function for a single training iteration
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples,bert = True):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dicts with ``input_ids``, ``attention_mask`` and ``targets``.
        loss_fn: Loss applied to ``(logits, targets)``.
        optimizer: Optimizer stepped after every batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped after every batch.
        n_examples: Denominator used for the accuracy.
        bert: Unused.

    Returns:
        ``(accuracy, mean_loss)``: accuracy as a double tensor, mean of the per-batch losses.
    """
    model = model.train()
    batch_losses = []
    n_correct = 0

    for batch in tqdm(data_loader):
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["targets"].to(device)

        logits = model(input_ids=ids, attention_mask=mask)

        predicted = logits.max(dim=1).indices
        loss = loss_fn(logits, labels)
        n_correct += torch.sum(predicted == labels)
        batch_losses.append(loss.item())

        loss.backward()

        # Clip before stepping, then clear gradients for the next batch
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)


def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dicts with ``input_ids``, ``attention_mask`` and ``targets``.
        loss_fn: Loss applied to ``(logits, targets)``.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy as a double tensor, mean of the per-batch losses.
    """
    model = model.eval()
    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in tqdm(data_loader):
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["targets"].to(device)

            logits = model(input_ids=ids, attention_mask=mask)

            predicted = logits.max(dim=1).indices
            loss = loss_fn(logits, labels)

            n_correct += torch.sum(predicted == labels)
            batch_losses.append(loss.item())

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [None]:



def train(model,EPOCHS = 10):
    """Fine-tune ``model`` and print train/validation metrics after every epoch.

    Reads the notebook-level ``train_data_loader``, ``val_data_loader``,
    ``df_train``, ``df_val`` and ``device``.

    Args:
        model: Classifier accepted by ``train_epoch`` / ``eval_model``.
        EPOCHS: Number of passes over the training loader.
    """
    # Show the architecture, framed by three blank lines on each side
    print("\n" * 2)
    print(model)
    print("\n" * 2)

    # NOTE(review): this AdamW is the one imported from `transformers`;
    # confirm it is still provided by the installed version.
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

    # The scheduler is stepped once per batch, so it needs the total batch count
    total_steps = len(train_data_loader) * EPOCHS

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    loss_fn = nn.CrossEntropyLoss().to(device)

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 20)

        train_acc, train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(df_train)
        )

        val_acc, val_loss = eval_model(
            model,
            val_data_loader,
            loss_fn,
            device,
            len(df_val)
        )

        print(f' train_loss {train_loss:.5f}  train_acc {train_acc:.5f} <-> Val_loss {val_loss:.5f} val_accuracy {val_acc:.5f}')
        print()



# Bert Training

In [None]:
train(model,EPOCHS = 20)




SentimentClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

100%|██████████| 49/49 [00:20<00:00,  2.43it/s]
100%|██████████| 13/13 [00:01<00:00,  7.33it/s]


 train_loss 0.51357  train_acc 0.74065 <-> Val_loss 0.29664 val_accuracy 0.90206

Epoch 2/20
--------------------


100%|██████████| 49/49 [00:19<00:00,  2.45it/s]
100%|██████████| 13/13 [00:01<00:00,  7.43it/s]


 train_loss 0.20213  train_acc 0.93419 <-> Val_loss 0.66899 val_accuracy 0.87629

Epoch 3/20
--------------------


100%|██████████| 49/49 [00:19<00:00,  2.50it/s]
100%|██████████| 13/13 [00:01<00:00,  7.70it/s]


 train_loss 0.13135  train_acc 0.96774 <-> Val_loss 0.94997 val_accuracy 0.84536

Epoch 4/20
--------------------


100%|██████████| 49/49 [00:19<00:00,  2.49it/s]
100%|██████████| 13/13 [00:01<00:00,  6.99it/s]


 train_loss 0.05256  train_acc 0.98581 <-> Val_loss 0.61122 val_accuracy 0.92268

Epoch 5/20
--------------------


100%|██████████| 49/49 [00:20<00:00,  2.41it/s]
100%|██████████| 13/13 [00:01<00:00,  7.54it/s]


 train_loss 0.06800  train_acc 0.98581 <-> Val_loss 0.54900 val_accuracy 0.93299

Epoch 6/20
--------------------


100%|██████████| 49/49 [00:19<00:00,  2.48it/s]
100%|██████████| 13/13 [00:01<00:00,  7.64it/s]


 train_loss 0.04155  train_acc 0.99097 <-> Val_loss 0.56699 val_accuracy 0.93299

Epoch 7/20
--------------------


100%|██████████| 49/49 [00:20<00:00,  2.42it/s]
100%|██████████| 13/13 [00:01<00:00,  6.57it/s]


 train_loss 0.02492  train_acc 0.99484 <-> Val_loss 0.56818 val_accuracy 0.93814

Epoch 8/20
--------------------


100%|██████████| 49/49 [00:20<00:00,  2.41it/s]
100%|██████████| 13/13 [00:01<00:00,  7.45it/s]


 train_loss 0.00864  train_acc 0.99742 <-> Val_loss 0.59904 val_accuracy 0.92784

Epoch 9/20
--------------------


100%|██████████| 49/49 [00:20<00:00,  2.39it/s]
100%|██████████| 13/13 [00:01<00:00,  6.52it/s]


 train_loss 0.00053  train_acc 1.00000 <-> Val_loss 0.65773 val_accuracy 0.93299

Epoch 10/20
--------------------


100%|██████████| 49/49 [00:21<00:00,  2.33it/s]
100%|██████████| 13/13 [00:01<00:00,  7.37it/s]


 train_loss 0.00030  train_acc 1.00000 <-> Val_loss 0.67716 val_accuracy 0.93299

Epoch 11/20
--------------------


100%|██████████| 49/49 [00:19<00:00,  2.49it/s]
100%|██████████| 13/13 [00:01<00:00,  7.68it/s]


 train_loss 0.00025  train_acc 1.00000 <-> Val_loss 0.69549 val_accuracy 0.93299

Epoch 12/20
--------------------


100%|██████████| 49/49 [00:19<00:00,  2.50it/s]
100%|██████████| 13/13 [00:01<00:00,  7.67it/s]


 train_loss 0.00020  train_acc 1.00000 <-> Val_loss 0.70873 val_accuracy 0.93299

Epoch 13/20
--------------------


100%|██████████| 49/49 [00:19<00:00,  2.49it/s]
100%|██████████| 13/13 [00:01<00:00,  7.44it/s]


 train_loss 0.00018  train_acc 1.00000 <-> Val_loss 0.72103 val_accuracy 0.93299

Epoch 14/20
--------------------


100%|██████████| 49/49 [00:19<00:00,  2.48it/s]
100%|██████████| 13/13 [00:01<00:00,  7.68it/s]


 train_loss 0.00016  train_acc 1.00000 <-> Val_loss 0.72925 val_accuracy 0.93299

Epoch 15/20
--------------------


100%|██████████| 49/49 [00:19<00:00,  2.50it/s]
100%|██████████| 13/13 [00:01<00:00,  7.36it/s]


 train_loss 0.00014  train_acc 1.00000 <-> Val_loss 0.73818 val_accuracy 0.93299

Epoch 16/20
--------------------


100%|██████████| 49/49 [00:20<00:00,  2.40it/s]
100%|██████████| 13/13 [00:01<00:00,  7.68it/s]


 train_loss 0.00014  train_acc 1.00000 <-> Val_loss 0.74330 val_accuracy 0.93814

Epoch 17/20
--------------------


100%|██████████| 49/49 [00:19<00:00,  2.50it/s]
100%|██████████| 13/13 [00:01<00:00,  7.74it/s]


 train_loss 0.00013  train_acc 1.00000 <-> Val_loss 0.74768 val_accuracy 0.93814

Epoch 18/20
--------------------


100%|██████████| 49/49 [00:19<00:00,  2.50it/s]
100%|██████████| 13/13 [00:01<00:00,  7.36it/s]


 train_loss 0.00013  train_acc 1.00000 <-> Val_loss 0.75087 val_accuracy 0.93814

Epoch 19/20
--------------------


100%|██████████| 49/49 [00:19<00:00,  2.48it/s]
100%|██████████| 13/13 [00:01<00:00,  7.63it/s]


 train_loss 0.00012  train_acc 1.00000 <-> Val_loss 0.75277 val_accuracy 0.93814

Epoch 20/20
--------------------


100%|██████████| 49/49 [00:19<00:00,  2.49it/s]
100%|██████████| 13/13 [00:01<00:00,  7.49it/s]

 train_loss 0.00012  train_acc 1.00000 <-> Val_loss 0.75344 val_accuracy 0.93814






# Transformer

In [None]:
df.head()

Unnamed: 0,Id,RequirementText,Class,Binary_Label
0,1,The product shall be available during normal b...,A,1
1,2,The product shall be available for use 24 hour...,A,1
2,2,Out of 1000 accesses to the system the system ...,A,1
3,3,The system shall be available for use between ...,A,1
4,3,The system shall achieve 95% up time.,A,1


In [None]:
df_train, df_val

(     Id                                    RequirementText Class  Binary_Label
 337  33  The system shall allow user to choose their pr...     F             0
 528  10  The product shall display each type of ship in...    LF             1
 591   4  The Disputes applications shall interface with...     O             1
 427  46  The system shall display only 10 matching resu...     F             0
 351  38  Can give feedback to the 24X7 Customer Care Se...     F             0
 ..   ..                                                ...   ...           ...
 321  30  Enable the admin to generate reports which con...     F             0
 581   3  The system shall able to operate within a busi...     O             1
 121   4  The Disputes System must provide a confirmatio...     F             0
 238   9  The leads washing functionality will compile b...     F             0
 681   8  System shall let existing customers log into t...    PE             1
 
 [775 rows x 4 columns],
      Id     

In [None]:
df2 = df.copy()

In [None]:
def map_label(label):
    """Translate a binary label into its category name.

    Returns 'function' for 0, 'non-function' for 1 and 'unknown' otherwise.
    """
    if label == 0:
        return 'function'
    return 'non-function' if label == 1 else 'unknown'

# Apply the mapping function to create a new column
df2['category'] = df2["Binary_Label"].apply(map_label)



In [None]:
df2.head()

Unnamed: 0,Id,RequirementText,Class,Binary_Label,category
0,1,The product shall be available during normal b...,A,1,non-function
1,2,The product shall be available for use 24 hour...,A,1,non-function
2,2,Out of 1000 accesses to the system the system ...,A,1,non-function
3,3,The system shall be available for use between ...,A,1,non-function
4,3,The system shall achieve 95% up time.,A,1,non-function


In [None]:
df_train2, df_val2= train_test_split(df2, test_size=0.2, random_state=RANDOM_SEED)

In [None]:
# trg_langs = [str(i) for i in df_train["Binary_Label"].unique().tolist()]
# trg_langs


trg_langs = df_train2["category"].unique().tolist()
trg_langs


['function', 'non-function']

In [None]:
df_train

Unnamed: 0,Id,RequirementText,Class,Binary_Label
337,33,The system shall allow user to choose their pr...,F,0
528,10,The product shall display each type of ship in...,LF,1
591,4,The Disputes applications shall interface with...,O,1
427,46,The system shall display only 10 matching resu...,F,0
351,38,Can give feedback to the 24X7 Customer Care Se...,F,0
...,...,...,...,...
321,30,Enable the admin to generate reports which con...,F,0
581,3,The system shall able to operate within a busi...,O,1
121,4,The Disputes System must provide a confirmatio...,F,0
238,9,The leads washing functionality will compile b...,F,0


In [None]:
class LangDataset(Dataset):
    """Dataset yielding vocabulary indices for a requirement text and its category.

    Args:
        ds: DataFrame with ``RequirementText`` and ``category`` columns.
        trg_langs: Category names used to build the target vocabulary.
        train_vocab: Optional ``(src_vocab, trg_vocab)`` pair to reuse; when
            omitted (or falsy) both vocabularies are built from ``ds``.
    """

    def __init__(self, ds, trg_langs, train_vocab=None):
        self.corpus = ds
        # Keep the categories on the instance so vocabulary building does not
        # depend on a notebook-level variable of the same name.
        self.trg_langs = list(trg_langs)

        if not train_vocab:
            self.src_vocab, self.trg_vocab = self._build_vocab()
        else:
            self.src_vocab, self.trg_vocab = train_vocab

    def __len__(self):
        """Number of rows in the corpus."""
        return len(self.corpus)

    def __getitem__(self, item):
        """Return ``{'src': token indices, 'trg': [category index]}`` for row ``item``."""
        row = self.corpus.iloc[item]

        return {
            'src': self.src_vocab.lookup_indices(row.RequirementText.lower().split()),
            'trg': self.trg_vocab.lookup_indices([row.category])
        }

    def _source_tokens(self):
        """Lower-cased whitespace tokens of every requirement text, in row order."""
        # Join with a space: concatenating without a separator would glue the
        # last word of one requirement to the first word of the next.
        return self.corpus.RequirementText.str.cat(sep=" ").lower().split()

    def _build_vocab(self):
        """Build the source and target vocabularies from the corpus and categories."""
        # Here one could remove stopwords and use word lemmatisation.
        # Both techniques will reduce the vocab size and hence model size
        # and could also enhance the model's performance
        src_vocab = build_vocab_from_iterator([self._source_tokens()], specials=["<unk>", "<pad>"])
        # Tokens missing from the vocabulary map to <unk>
        src_vocab.set_default_index(src_vocab['<unk>'])

        trg_vocab = build_vocab_from_iterator([self.trg_langs])

        return src_vocab, trg_vocab

In [None]:
def collate_fn(batch, pad_value, device):
    """Merge dataset rows into one padded mini-batch.

    Args:
        batch: List of dicts with ``'src'`` (list of token indices) and
            ``'trg'`` (list holding the single label index).
        pad_value: Index used to fill sequences shorter than the longest one.
        device: Device the resulting tensors are placed on.

    Returns:
        Dict with ``'src'`` of shape [longest seq len, batch size] and
        ``'trg'`` of shape [1, batch size].
    """
    # Assemble the batch on the CPU and transfer each tensor once; building a
    # tensor out of per-row device tensors costs one sync per element.
    srcs = [torch.tensor(row["src"], dtype=torch.long) for row in batch]
    padded_srcs = pad_sequence(srcs, padding_value=pad_value)

    # One label per row, laid out as a single row of the batch
    trgs = torch.tensor([row["trg"] for row in batch], dtype=torch.long).reshape(1, -1)

    return {"src": padded_srcs.to(device), "trg": trgs.to(device)}

# The training split builds the vocabularies; the validation split reuses them
# so token and label indices agree between the two.
train_langds = LangDataset(df_train2, trg_langs)
test_langds = LangDataset(df_val2, trg_langs, (train_langds.src_vocab, train_langds.trg_vocab))

SRC_PAD_IDX = train_langds.src_vocab["<pad>"]


def _collate_rows(rows):
    """Pad one mini-batch with the source padding index and move it to `device`."""
    return collate_fn(rows, SRC_PAD_IDX, device)


train_dt = DataLoader(train_langds, batch_size=batch_size, shuffle=True, collate_fn=_collate_rows)

# Keep the evaluation order fixed: batch composition determines how much
# padding each sequence receives, so a reshuffled loader would not be repeatable.
test_dt = DataLoader(test_langds, batch_size=batch_size, shuffle=False, collate_fn=_collate_rows)


# hyp_params = {
#     "batch_size": batch_size,
#     "lr": 0.0005,
#     "num_epochs": EPOCHS,
#     "d_model": 768, # Input embedding dimension
#     "n_head": 12, # No. of multi-head attention block (aka paralle self-attention layers)
#     "n_layers": 12,
#     "feedforward_dim": 128,
#     "dropout": 0.1
# }


hyp_params = {
    "batch_size": batch_size,
    "lr": 0.0005,
    "num_epochs": EPOCHS,
    "d_model": 512, # Input embedding dimension
    "n_head": 8, # No. of multi-head attention block (aka paralle self-attention layers)
    "n_layers": 3,
    "feedforward_dim": 128,
    "dropout": 0.1
}


hyp_params["src_vocab_size"] = len(train_langds.src_vocab)
hyp_params["trg_vocab_size"] = len(trg_langs)

In [None]:
next(iter(train_dt))

{'src': tensor([[   2,    2,   11,    2,  854,    7,    2,    2,   38,    2,    2,    8,
             2,    2,    2,   17],
         [ 304,    7,   50,   37,  478,    3,    7,  350, 1668,  128,    7,  537,
             7,    7,   10,    8],
         [   3,    3,  679,    7,  426,    6,    3,  262, 1041,  118,    3, 1434,
             3,    3,    3, 1144],
         [   6,   29,    2,    3,    3,   16,   21,   26,    4,  134,   29,    5,
           932,  320,  164,  533],
         [ 606,    2,   10,  253,    6,    4,   12,   29,    2,  198, 1419,  685,
             8,  202,   19,   28],
         [   5,  162,    3,    8,  891,  344,    4, 1291,   78,    3,  459,    9,
            29,  110,  103, 1664],
         [  57,  677,   21,   12,   13,   24,  133,   13,   17,    6, 1777,  195,
             5,  675,   65,    2],
         [ 331,   25,    2,   61,  950,    5,    2,   24,   43,  647,   13,   22,
            24,    1,   53,    7],
         [ 508,  157,  138,   19,  154,    2,   68, 1009,

In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to token embeddings.

    For position ``pos`` and pair index ``i``:
        PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

    Args:
        d_model: Embedding dimension.
        dropout: Dropout probability applied after adding the encoding.
        maxlen: Number of positions precomputed; longer inputs are not supported.
    """

    def __init__(self, d_model, dropout, maxlen = 5000):
        super().__init__()

        # Shape (pos) --> [max len, 1]
        pos = torch.arange(0, maxlen).unsqueeze(1)
        pos_encoding = torch.zeros((maxlen, d_model))

        # One denominator per (sin, cos) pair: both members of a pair share
        # the same frequency.
        den = 10000 ** (torch.arange(0, d_model, 2) / d_model)

        pos_encoding[:, 0::2] = torch.sin(pos / den)
        # For an odd d_model the last sine column has no cosine partner
        pos_encoding[:, 1::2] = torch.cos(pos / den[: d_model // 2])

        # Shape --> [max len, 1, d_model] so it broadcasts over the batch dimension
        pos_encoding = pos_encoding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)

        # Registered as a buffer: stored in the `state_dict` and moved with the
        # module, but not a parameter, so the optimizer never trains it.
        self.register_buffer('pos_encoding', pos_encoding)

    def forward(self, token_embedding):
        """Add the encoding of the first ``sentence len`` positions, then apply dropout.

        Args:
            token_embedding: Tensor of shape [sentence len, batch size, d_model].
        """
        seq_len = token_embedding.size(0)
        return self.dropout(token_embedding + self.pos_encoding[:seq_len, :])


class InputEmbedding(nn.Module):
    """Token embedding lookup whose output is scaled by sqrt(d_model)."""

    def __init__(self, vocab_size, d_model):
        super().__init__()

        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, tokens):
        """Embed ``tokens`` ([sentence len, batch size]) into [sentence len, batch size, d_model]."""
        # Scaling by sqrt(d_model) follows the Transformer paper
        scale = math.sqrt(self.d_model)
        return self.embedding(tokens.long()) * scale


class TransformerClassifier(nn.Module):
    """Encoder-only Transformer that mean-pools token states into class logits.

    Args:
        src_vocab_size: Size of the input token vocabulary.
        trg_vocab_size: Number of output classes.
        d_model: Embedding / hidden dimension.
        dropout: Dropout probability for positional encoding and encoder layers.
        n_head: Number of attention heads per encoder layer.
        dim_feedforward: Width of each encoder layer's feed-forward block.
        n_layers: Number of stacked encoder layers.
    """

    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 d_model,
                 dropout,
                 n_head,
                 dim_feedforward,
                 n_layers,
                 ):
        super().__init__()

        self.src_inp_emb = InputEmbedding(src_vocab_size, d_model)
        # Not used by forward(); kept so the module's attributes and
        # state_dict keys stay as they were.
        self.trg_inp_emb = InputEmbedding(trg_vocab_size, d_model)

        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout)

        # Only using Encoder of Transformer model
        encoder_layers = nn.TransformerEncoderLayer(d_model, n_head, dim_feedforward, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers)

        self.d_model = d_model
        self.decoder = nn.Linear(d_model, trg_vocab_size)

    def forward(self, x, src_key_padding_mask=None):
        """Return class logits of shape (batch size, trg_vocab_size).

        Args:
            x: Token indices of shape (sequence length, batch size).
            src_key_padding_mask: Optional bool tensor of shape
                (batch size, sequence length), True at padding positions.
                When given, padding is excluded from attention and from the
                pooled average; when omitted every position is averaged.
        """
        x_emb = self.positional_encoding(self.src_inp_emb(x))
        # Shape (output) -> (Sequence length, batch size, d_model)
        output = self.transformer_encoder(x_emb, src_key_padding_mask=src_key_padding_mask)

        # Collapse the sequence dimension so the result fits CrossEntropyLoss
        # Shape (pooled) -> (batch size, d_model)
        if src_key_padding_mask is None:
            pooled = output.mean(0)
        else:
            # 1.0 for real tokens, 0.0 for padding; shape (Sequence length, batch size, 1)
            keep = (~src_key_padding_mask).transpose(0, 1).unsqueeze(-1).to(output.dtype)
            # clamp guards against dividing by zero for an all-padding row
            pooled = (output * keep).sum(0) / keep.sum(0).clamp(min=1.0)

        return self.decoder(pooled)

In [None]:
def train_model(model, train_dataloader, criterion, optimizer , n_examples ):
    """Train ``model`` for one pass over ``train_dataloader``.

    Args:
        model: Module called as ``model(src)`` and returning [batch size, n classes] scores.
        train_dataloader: Iterable of dicts with ``'src'`` and ``'trg'`` tensors.
        criterion: Loss applied to ``(scores, trg.squeeze(0))``.
        optimizer: Optimizer stepped after every batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy as a double tensor, loss averaged over batches.
    """
    model.train()
    running_loss = 0
    n_correct = 0

    for batch in tqdm(train_dataloader):
        # Gradients accumulate by default, so reset them for each batch
        optimizer.zero_grad()

        src, trg = batch["src"], batch["trg"]

        scores = model(src)
        predicted = scores.max(dim=1).indices
        n_correct += torch.sum(predicted == trg)
        loss = criterion(scores, trg.squeeze(0))

        loss.backward()
        optimizer.step()

        running_loss += loss.detach().cpu()

    accuracy = n_correct.double() / n_examples
    return accuracy, running_loss/len(train_dataloader)


def evaluate_model(model, valid_dataloader, criterion , n_examples):
    """Score ``model`` on ``valid_dataloader`` without computing gradients.

    Args:
        model: Module called as ``model(src)`` and returning [batch size, n classes] scores.
        valid_dataloader: Sized iterable of dicts with ``'src'`` and ``'trg'`` tensors.
        criterion: Loss applied to ``(scores, trg.squeeze(0))``.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy as a double tensor, loss averaged over batches.
    """
    model.eval()
    running_loss = 0
    n_correct = 0

    with torch.no_grad():
        for batch in valid_dataloader:
            src, trg = batch["src"], batch["trg"]

            scores = model(src)
            predicted = scores.max(dim=1).indices
            n_correct += torch.sum(predicted == trg)

            running_loss += criterion(scores, trg.squeeze(0)).detach().cpu()

    accuracy = n_correct.double() / n_examples
    return accuracy, running_loss/len(valid_dataloader)




In [None]:
# Encoder-only Transformer classifier sized by `hyp_params`
model2 = TransformerClassifier(
    hyp_params["src_vocab_size"],
    hyp_params["trg_vocab_size"],
    hyp_params["d_model"],
    hyp_params["dropout"],
    hyp_params["n_head"],
    hyp_params["feedforward_dim"],
    hyp_params["n_layers"],
).to(device)


criterion = nn.CrossEntropyLoss().to(device)

# The optimizer must hold model2's own parameters; bound to any other model's
# parameters it would leave the Transformer's weights untouched during training.
optimizer = optim.Adam(model2.parameters(), lr=hyp_params["lr"])

print(model2)

TransformerClassifier(
  (src_inp_emb): InputEmbedding(
    (embedding): Embedding(2602, 512)
  )
  (trg_inp_emb): InputEmbedding(
    (embedding): Embedding(2, 512)
  )
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=128, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): Linear

In [None]:
hyp_params

{'batch_size': 16,
 'lr': 0.0005,
 'num_epochs': 20,
 'd_model': 512,
 'n_head': 8,
 'n_layers': 3,
 'feedforward_dim': 128,
 'dropout': 0.1,
 'src_vocab_size': 2602,
 'trg_vocab_size': 2}

In [None]:


def train2(model, EPOCHS=EPOCHS):
  """Train `model` for `EPOCHS` epochs and print train/validation metrics after each one.

  Besides its arguments, this function reads the following notebook-level
  names: `train_dt`, `test_dt`, `criterion`, `optimizer`, `df_train2`,
  `df_val2`, and the helpers `train_model` / `evaluate_model`.

  NOTE: the global `optimizer` is what gets passed to `train_model`, so it has
  to be constructed from the parameters of the same `model` passed in here;
  otherwise the weights of `model` are never updated.

  Args:
    model: network forwarded to `train_model` and `evaluate_model`.
    EPOCHS: number of epochs to run; defaults to the notebook-level `EPOCHS`
      value at the time this function is defined.

  Returns:
    None. Metrics are only printed.
  """
  for epoch in range(EPOCHS):
    # Release unreferenced Python objects and cached CUDA blocks before each epoch.
    gc.collect()
    torch.cuda.empty_cache()

    # The dataset sizes are passed as the example counts used by the helpers
    # to turn correct-prediction counts into accuracies.
    train_acc, train_loss = train_model(model, train_dt, criterion, optimizer, len(df_train2))
    val_acc, val_loss = evaluate_model(model, test_dt, criterion, len(df_val2))

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    print(f' train_loss {train_loss:.5f}  train_acc {train_acc:.5f} <-> Val_loss {val_loss:.5f} val_accuracy {val_acc:.5f}')



# Transformer Training

In [None]:
# Train the Transformer classifier for 20 epochs.
train2(model2, 20)


# Hyperparameters recorded for this run.
# NOTE(review): these values (d_model 768, 12 heads, 12 layers) differ from the
# hyp_params dict displayed earlier in the notebook (512 / 8 / 3) -- presumably
# hyp_params was edited between runs; confirm before comparing results.
# {'batch_size': 16,
#  'lr': 0.0005,
#  'num_epochs': 20,
#  'd_model': 768,
#  'n_head': 12,
#  'n_layers': 12,
#  'feedforward_dim': 128,
#  'dropout': 0.1,
#  'src_vocab_size': 2602,
#  'trg_vocab_size': 2}

100%|██████████| 49/49 [00:02<00:00, 17.73it/s]


Epoch 1/20
----------
 train_loss 0.86220  train_acc 0.46581 <-> Val_loss 0.87704 val_accuracy 0.42784


100%|██████████| 49/49 [00:02<00:00, 18.79it/s]


Epoch 2/20
----------
 train_loss 0.85912  train_acc 0.46581 <-> Val_loss 0.91398 val_accuracy 0.42784


100%|██████████| 49/49 [00:02<00:00, 17.95it/s]


Epoch 3/20
----------
 train_loss 0.85262  train_acc 0.46581 <-> Val_loss 0.89972 val_accuracy 0.42784


100%|██████████| 49/49 [00:03<00:00, 15.70it/s]


Epoch 4/20
----------
 train_loss 0.85759  train_acc 0.46581 <-> Val_loss 0.81039 val_accuracy 0.42784


100%|██████████| 49/49 [00:02<00:00, 16.52it/s]


Epoch 5/20
----------
 train_loss 0.85460  train_acc 0.46581 <-> Val_loss 0.85845 val_accuracy 0.42784


100%|██████████| 49/49 [00:02<00:00, 17.81it/s]


Epoch 6/20
----------
 train_loss 0.84704  train_acc 0.46581 <-> Val_loss 0.85798 val_accuracy 0.43299


100%|██████████| 49/49 [00:02<00:00, 18.04it/s]


Epoch 7/20
----------
 train_loss 0.85282  train_acc 0.46581 <-> Val_loss 0.92332 val_accuracy 0.42784


100%|██████████| 49/49 [00:03<00:00, 12.61it/s]


Epoch 8/20
----------
 train_loss 0.85372  train_acc 0.46581 <-> Val_loss 0.90421 val_accuracy 0.42784


100%|██████████| 49/49 [00:03<00:00, 15.11it/s]


Epoch 9/20
----------
 train_loss 0.85572  train_acc 0.46581 <-> Val_loss 0.85028 val_accuracy 0.42784


100%|██████████| 49/49 [00:02<00:00, 17.69it/s]


Epoch 10/20
----------
 train_loss 0.85061  train_acc 0.46581 <-> Val_loss 0.94596 val_accuracy 0.42784


100%|██████████| 49/49 [00:02<00:00, 17.80it/s]


Epoch 11/20
----------
 train_loss 0.85877  train_acc 0.46581 <-> Val_loss 0.89875 val_accuracy 0.42784


100%|██████████| 49/49 [00:02<00:00, 18.05it/s]


Epoch 12/20
----------
 train_loss 0.85611  train_acc 0.46581 <-> Val_loss 0.88542 val_accuracy 0.43299


100%|██████████| 49/49 [00:03<00:00, 15.70it/s]


Epoch 13/20
----------
 train_loss 0.85544  train_acc 0.46581 <-> Val_loss 0.83980 val_accuracy 0.42784


100%|██████████| 49/49 [00:03<00:00, 12.50it/s]


Epoch 14/20
----------
 train_loss 0.85244  train_acc 0.46581 <-> Val_loss 0.83087 val_accuracy 0.42784


100%|██████████| 49/49 [00:03<00:00, 15.62it/s]


Epoch 15/20
----------
 train_loss 0.85501  train_acc 0.46581 <-> Val_loss 0.87271 val_accuracy 0.42784


100%|██████████| 49/49 [00:02<00:00, 17.76it/s]


Epoch 16/20
----------
 train_loss 0.85184  train_acc 0.46581 <-> Val_loss 0.87348 val_accuracy 0.43299


100%|██████████| 49/49 [00:02<00:00, 16.81it/s]


Epoch 17/20
----------
 train_loss 0.84535  train_acc 0.46581 <-> Val_loss 0.89827 val_accuracy 0.42784


100%|██████████| 49/49 [00:03<00:00, 15.09it/s]


Epoch 18/20
----------
 train_loss 0.85350  train_acc 0.46581 <-> Val_loss 0.89387 val_accuracy 0.42784


100%|██████████| 49/49 [00:02<00:00, 17.70it/s]


Epoch 19/20
----------
 train_loss 0.84866  train_acc 0.46581 <-> Val_loss 0.88803 val_accuracy 0.42784


100%|██████████| 49/49 [00:02<00:00, 17.62it/s]


Epoch 20/20
----------
 train_loss 0.85107  train_acc 0.46581 <-> Val_loss 0.87656 val_accuracy 0.42784


In [None]:
# Train the Transformer classifier for 40 epochs.
train2(model2, 40)


# Hyperparameters recorded for this run.
# NOTE(review): 'num_epochs' below says 20, but the call above passes 40
# explicitly, so 40 epochs are what actually run.
# {'batch_size': 16,
#  'lr': 0.0005,
#  'num_epochs': 20,
#  'd_model': 512,
#  'n_head': 8,
#  'n_layers': 3,
#  'feedforward_dim': 128,
#  'dropout': 0.1,
#  'src_vocab_size': 2602,
#  'trg_vocab_size': 2}

100%|██████████| 49/49 [00:00<00:00, 69.06it/s]


Epoch 1/40
----------
 train_loss 0.71055  train_acc 0.48129 <-> Val_loss 0.68619 val_accuracy 0.53608


100%|██████████| 49/49 [00:00<00:00, 74.51it/s]


Epoch 2/40
----------
 train_loss 0.70949  train_acc 0.46065 <-> Val_loss 0.69335 val_accuracy 0.54124


100%|██████████| 49/49 [00:00<00:00, 72.51it/s]


Epoch 3/40
----------
 train_loss 0.71526  train_acc 0.46839 <-> Val_loss 0.69335 val_accuracy 0.51546


100%|██████████| 49/49 [00:00<00:00, 54.88it/s]


Epoch 4/40
----------
 train_loss 0.70939  train_acc 0.48387 <-> Val_loss 0.69559 val_accuracy 0.51546


100%|██████████| 49/49 [00:00<00:00, 51.54it/s]


Epoch 5/40
----------
 train_loss 0.71294  train_acc 0.47355 <-> Val_loss 0.70540 val_accuracy 0.51546


100%|██████████| 49/49 [00:00<00:00, 50.12it/s]


Epoch 6/40
----------
 train_loss 0.71075  train_acc 0.47742 <-> Val_loss 0.71640 val_accuracy 0.53608


100%|██████████| 49/49 [00:01<00:00, 46.47it/s]


Epoch 7/40
----------
 train_loss 0.71202  train_acc 0.45290 <-> Val_loss 0.71024 val_accuracy 0.53608


100%|██████████| 49/49 [00:00<00:00, 70.22it/s]


Epoch 8/40
----------
 train_loss 0.71496  train_acc 0.48516 <-> Val_loss 0.69081 val_accuracy 0.53608


100%|██████████| 49/49 [00:00<00:00, 72.43it/s]


Epoch 9/40
----------
 train_loss 0.71537  train_acc 0.45161 <-> Val_loss 0.71283 val_accuracy 0.53093


100%|██████████| 49/49 [00:00<00:00, 73.10it/s]


Epoch 10/40
----------
 train_loss 0.71441  train_acc 0.47742 <-> Val_loss 0.69893 val_accuracy 0.51031


100%|██████████| 49/49 [00:00<00:00, 73.44it/s]


Epoch 11/40
----------
 train_loss 0.71347  train_acc 0.48774 <-> Val_loss 0.68918 val_accuracy 0.54639


100%|██████████| 49/49 [00:00<00:00, 71.29it/s]


Epoch 12/40
----------
 train_loss 0.71081  train_acc 0.47355 <-> Val_loss 0.69963 val_accuracy 0.52062


100%|██████████| 49/49 [00:00<00:00, 72.85it/s]


Epoch 13/40
----------
 train_loss 0.71073  train_acc 0.47613 <-> Val_loss 0.68871 val_accuracy 0.52577


100%|██████████| 49/49 [00:00<00:00, 73.22it/s]


Epoch 14/40
----------
 train_loss 0.70993  train_acc 0.47613 <-> Val_loss 0.69620 val_accuracy 0.53608


100%|██████████| 49/49 [00:00<00:00, 72.43it/s]


Epoch 15/40
----------
 train_loss 0.71159  train_acc 0.47097 <-> Val_loss 0.70425 val_accuracy 0.53093


100%|██████████| 49/49 [00:00<00:00, 73.58it/s]


Epoch 16/40
----------
 train_loss 0.71246  train_acc 0.46968 <-> Val_loss 0.69685 val_accuracy 0.54639


100%|██████████| 49/49 [00:00<00:00, 63.16it/s]


Epoch 17/40
----------
 train_loss 0.71361  train_acc 0.47613 <-> Val_loss 0.69384 val_accuracy 0.53608


100%|██████████| 49/49 [00:00<00:00, 51.25it/s]


Epoch 18/40
----------
 train_loss 0.71459  train_acc 0.48000 <-> Val_loss 0.70820 val_accuracy 0.51546


100%|██████████| 49/49 [00:00<00:00, 50.29it/s]


Epoch 19/40
----------
 train_loss 0.70990  train_acc 0.48387 <-> Val_loss 0.70020 val_accuracy 0.52577


100%|██████████| 49/49 [00:01<00:00, 44.67it/s]


Epoch 20/40
----------
 train_loss 0.71040  train_acc 0.48258 <-> Val_loss 0.68743 val_accuracy 0.51546


100%|██████████| 49/49 [00:00<00:00, 59.38it/s]


Epoch 21/40
----------
 train_loss 0.71481  train_acc 0.45935 <-> Val_loss 0.71289 val_accuracy 0.54124


100%|██████████| 49/49 [00:00<00:00, 74.91it/s]


Epoch 22/40
----------
 train_loss 0.71987  train_acc 0.46452 <-> Val_loss 0.70960 val_accuracy 0.53608


100%|██████████| 49/49 [00:00<00:00, 74.84it/s]


Epoch 23/40
----------
 train_loss 0.71473  train_acc 0.46968 <-> Val_loss 0.70345 val_accuracy 0.50000


100%|██████████| 49/49 [00:00<00:00, 74.26it/s]


Epoch 24/40
----------
 train_loss 0.71429  train_acc 0.46968 <-> Val_loss 0.69940 val_accuracy 0.54124


100%|██████████| 49/49 [00:00<00:00, 75.02it/s]


Epoch 25/40
----------
 train_loss 0.71008  train_acc 0.47226 <-> Val_loss 0.69800 val_accuracy 0.52577


100%|██████████| 49/49 [00:00<00:00, 70.89it/s]


Epoch 26/40
----------
 train_loss 0.71350  train_acc 0.46194 <-> Val_loss 0.70441 val_accuracy 0.53093


100%|██████████| 49/49 [00:00<00:00, 73.00it/s]


Epoch 27/40
----------
 train_loss 0.71616  train_acc 0.46968 <-> Val_loss 0.68119 val_accuracy 0.52577


100%|██████████| 49/49 [00:00<00:00, 72.11it/s]


Epoch 28/40
----------
 train_loss 0.71234  train_acc 0.46452 <-> Val_loss 0.69071 val_accuracy 0.52577


100%|██████████| 49/49 [00:00<00:00, 71.52it/s]


Epoch 29/40
----------
 train_loss 0.71311  train_acc 0.47355 <-> Val_loss 0.69222 val_accuracy 0.55155


100%|██████████| 49/49 [00:00<00:00, 71.79it/s]


Epoch 30/40
----------
 train_loss 0.71187  train_acc 0.47097 <-> Val_loss 0.70902 val_accuracy 0.54124


100%|██████████| 49/49 [00:00<00:00, 51.83it/s]


Epoch 31/40
----------
 train_loss 0.71531  train_acc 0.48000 <-> Val_loss 0.70087 val_accuracy 0.51546


100%|██████████| 49/49 [00:00<00:00, 51.64it/s]


Epoch 32/40
----------
 train_loss 0.71220  train_acc 0.47613 <-> Val_loss 0.69548 val_accuracy 0.51546


100%|██████████| 49/49 [00:01<00:00, 47.47it/s]


Epoch 33/40
----------
 train_loss 0.71283  train_acc 0.46194 <-> Val_loss 0.69513 val_accuracy 0.53093


100%|██████████| 49/49 [00:01<00:00, 29.78it/s]


Epoch 34/40
----------
 train_loss 0.71006  train_acc 0.48129 <-> Val_loss 0.68716 val_accuracy 0.53093


100%|██████████| 49/49 [00:01<00:00, 47.52it/s]


Epoch 35/40
----------
 train_loss 0.70862  train_acc 0.48903 <-> Val_loss 0.71089 val_accuracy 0.52062


100%|██████████| 49/49 [00:01<00:00, 46.46it/s]


Epoch 36/40
----------
 train_loss 0.71587  train_acc 0.46194 <-> Val_loss 0.69556 val_accuracy 0.52577


100%|██████████| 49/49 [00:01<00:00, 47.02it/s]


Epoch 37/40
----------
 train_loss 0.71093  train_acc 0.46581 <-> Val_loss 0.69125 val_accuracy 0.52577


100%|██████████| 49/49 [00:00<00:00, 57.46it/s]


Epoch 38/40
----------
 train_loss 0.71250  train_acc 0.47355 <-> Val_loss 0.69570 val_accuracy 0.52062


100%|██████████| 49/49 [00:00<00:00, 73.39it/s]


Epoch 39/40
----------
 train_loss 0.71549  train_acc 0.45935 <-> Val_loss 0.70002 val_accuracy 0.51031


100%|██████████| 49/49 [00:00<00:00, 70.19it/s]


Epoch 40/40
----------
 train_loss 0.71353  train_acc 0.47355 <-> Val_loss 0.69067 val_accuracy 0.53093
