# Importing Libraries

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m83.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [2]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, TFDistilBertModel, AutoTokenizer, TrainingArguments, Trainer, DistilBertModel
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch.nn as nn
import pdb
import json
import ast

# Importing and Tokenizing train-validation-test

In [None]:
# Read every column as pandas "string", coerce Definition to plain Python str
# (the tokenizer is later fed a list of these), and keep only the two columns used.
path = "toyset10000.csv"
df = (
    pd.read_csv(path, dtype="string")
    .assign(Definition=lambda frame: frame['Definition'].astype(str))
    .loc[:, ['Word', 'Definition']]
)

In [None]:
# 80% train; the remaining 20% is halved into test and validation (10% each).
df_train, df_test = train_test_split(df[['Definition','Word']], test_size=0.2, random_state=45)
df_test, df_val = train_test_split(df_test, test_size=0.5, random_state=45)

# Later cells add a 'labels' column to these frames. Take explicit copies so
# those assignments act on independent frames instead of slices of `df`
# (which would raise pandas' SettingWithCopyWarning).
df_train, df_test, df_val = df_train.copy(), df_test.copy(), df_val.copy()

In [None]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Every split is tokenised with the same settings.
encode_kwargs = dict(padding=True, truncation=True, max_length=128)
train_enc = tokenizer(df_train['Definition'].to_list(), **encode_kwargs)
test_enc = tokenizer(df_test['Definition'].to_list(), **encode_kwargs)
val_enc = tokenizer(df_val['Definition'].to_list(), **encode_kwargs)

In [None]:
# One-hot encoding of classes.
# Every distinct training word gets a class index starting at 1; index 0 is
# reserved for "<unk>", i.e. words that do not occur in the training split.
train_label_enum = {word: position for position, word in enumerate(df_train['Word'].unique(), start=1)}
train_label_enum["<unk>"] = 0
train_num_labels = len(train_label_enum)
inv_train_label_enum = dict(zip(train_label_enum.values(), train_label_enum.keys()))


def _one_hot(class_index):
    """Return a list of ``train_num_labels`` floats that is 1.0 only at ``class_index``."""
    return [float(slot == class_index) for slot in range(train_num_labels)]


df_train['labels'] = df_train['Word'].apply(lambda word: _one_hot(train_label_enum[word]))
# Validation words missing from the training vocabulary fall back to class 0 ("<unk>").
df_val['labels'] = df_val['Word'].apply(lambda word: _one_hot(train_label_enum.get(word, 0)))

In [None]:
# Persist the word -> class-index mapping as JSON text so later cells can reload it.
serialized_labels = json.dumps(train_label_enum)
with open('train_label_enum.txt', 'w') as label_file:
    label_file.write(serialized_labels)

# Dataset and DataLoader functions for Train, Validation and Test

In [3]:
class RevDictDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence (for example the tokenizer output used in this notebook).
        labels: Per-example labels. May be a pandas Series (anything exposing
            ``to_list()``) or any other iterable such as a list.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialise to a plain list so that ``self.labels[idx]`` is always a
        # positional lookup; indexing a Series directly would go through its
        # index labels, which are shuffled after a train/test split.
        self.labels = labels.to_list() if hasattr(labels, "to_list") else list(labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``: every encoding field plus ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

In [None]:
# Pair each split's tokenizer output with its one-hot 'labels' column.
train_dataset = RevDictDataset(encodings=train_enc, labels=df_train['labels'])
val_dataset = RevDictDataset(encodings=val_enc, labels=df_val['labels'])

In [None]:
# Batched iterators over the training and validation datasets.
BATCH_SIZE = 8
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Validation keeps its original order (no shuffling), so predictions can be
# matched back to df_val rows by position.
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Test labels are stored as integer class indices (not one-hot vectors);
# words that never appeared in the training split map to the "<unk>" class.
unk_index = train_label_enum["<unk>"]
labels = [train_label_enum.get(word, unk_index) for word in df_test['Word']]
# .assign builds a new frame instead of writing into a slice produced by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
df_test = df_test.assign(labels=labels)
test_dataset = RevDictDataset(test_enc, df_test['labels'])
# Order is preserved so predictions line up with df_test rows by position.
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Defining the model and the training loop (with validation)

In [4]:
import torch
import torch.nn as nn
from transformers import DistilBertModel, DistilBertTokenizer

class BLmodel(nn.Module):
    """DistilBERT encoder followed by four stacked LSTMs and a linear output layer.

    Args:
        vocab_size: Number of output classes (width of the final linear layer).
        embedding_dim: Kept for backward compatibility. The first LSTM's input
            width is read from the encoder's own config, so this value is not
            used to size any layer.
        hidden_dim1: Hidden size of the first LSTM.
        hidden_dim2: Hidden size of the second LSTM.
        hidden_dim3: Hidden size of the third LSTM.
        hidden_dim4: Hidden size of the fourth LSTM (input width of the output layer).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4):
        super().__init__()
        self.bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        # Take the encoder width from its config instead of hard-coding 768, so
        # the first LSTM always matches the hidden states it is fed.
        encoder_dim = self.bert_model.config.dim
        self.lstm_layer_1 = nn.LSTM(input_size=encoder_dim, hidden_size=hidden_dim1, num_layers=1, batch_first=True)
        self.lstm_layer_2 = nn.LSTM(input_size=hidden_dim1, hidden_size=hidden_dim2, num_layers=1, batch_first=True)
        self.lstm_layer_3 = nn.LSTM(input_size=hidden_dim2, hidden_size=hidden_dim3, num_layers=1, batch_first=True)
        self.lstm_layer_4 = nn.LSTM(input_size=hidden_dim3, hidden_size=hidden_dim4, num_layers=1, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim4, vocab_size)

    def forward(self, input_ids, attention_mask=None):
        """Return one score per class for each sequence in the batch.

        Args:
            input_ids: Token ids, passed straight to the DistilBERT encoder.
            attention_mask: Optional mask, passed straight to the encoder.

        Returns:
            Output of the linear layer applied to the last time step of the
            fourth LSTM.
        """
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output: the per-token hidden states.
        bert_embedding = outputs[0]
        lstm_output_1, _ = self.lstm_layer_1(bert_embedding)
        lstm_output_2, _ = self.lstm_layer_2(lstm_output_1)
        lstm_output_3, _ = self.lstm_layer_3(lstm_output_2)
        lstm_output_4, _ = self.lstm_layer_4(lstm_output_3)
        # NOTE(review): index -1 is the last *padded* position. The notebook
        # tokenises with padding=True, so for shorter definitions this reads
        # the LSTM state after running over padding. Selecting the last
        # unmasked step via attention_mask is likely better, but existing
        # checkpoints were trained with this pooling, so it is left unchanged.
        output = self.output_layer(lstm_output_4[:, -1, :])
        return output


In [5]:
from tqdm import tqdm
def train(model, train_loader, val_loader, optimizer, num_epochs, checkpoint_path='model.pt'):
    """Train ``model`` with an MSE objective and checkpoint the best epoch.

    Each epoch runs one pass over ``train_loader`` followed by a gradient-free
    pass over ``val_loader``. Whenever the mean validation loss is less than or
    equal to the best value seen so far, ``model.state_dict()`` is saved to
    ``checkpoint_path``. A one-line summary is printed per epoch. The model is
    left in training mode when the function returns.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        val_loader: Same batch format as ``train_loader``.
        optimizer: Optimizer used to update the model.
        num_epochs: Number of epochs to run.
        checkpoint_path: Destination of the best weights (default ``'model.pt'``).

    Raises:
        ValueError: If either loader reports zero batches.
    """
    # Fail up front instead of dividing by zero after a full epoch of work.
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch")

    criterion = nn.MSELoss()

    # Run batches on whatever device the model already lives on (CPU if it has
    # no parameters); on CPU the .to(device) calls below are no-ops.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    def _batch_loss(batch):
        """Forward one batch and return its MSE loss against the batch labels."""
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask)
        # Reshape to the label width rather than reading a notebook-level
        # global, so the function depends only on its arguments.
        return criterion(outputs.view(-1, labels.size(-1)), labels)

    model.train()
    min_val_loss = float('inf')
    for epoch in range(num_epochs):
        # Training
        total_loss = 0.0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            loss = _batch_loss(batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation
        model.eval()
        total_val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                total_val_loss += _batch_loss(batch).item()

        avg_loss = total_loss / len(train_loader)
        avg_val_loss = total_val_loss / len(val_loader)

        # "<=" means a tie with the best loss also refreshes the checkpoint.
        if avg_val_loss <= min_val_loss:
            min_val_loss = avg_val_loss
            torch.save(model.state_dict(), checkpoint_path)

        print(f"Epoch [{epoch+1}/{num_epochs}]"
              f"\tTrain Loss: {avg_loss:.4f}"
              f"\tVal Loss: {avg_val_loss:.4f}"
              f"\tMin Val Loss: {min_val_loss:.4f}")

        # Back to training mode for the next epoch.
        model.train()


# Training the model

In [None]:
# Hyper-parameters: one output per label (including "<unk>"), then four
# progressively narrower LSTM layers.
vocab_size = len(train_label_enum)
embedding_dim = 768
hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4 = 256, 128, 64, 32
num_epochs = 10

model = BLmodel(
    vocab_size,
    embedding_dim,
    hidden_dim1,
    hidden_dim2,
    hidden_dim3,
    hidden_dim4,
)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Fit the model, validating after every epoch.
train(model, train_loader, val_loader, optimizer, num_epochs)

# Testing the model

In [None]:
vocab_size = len(train_label_enum)
embedding_dim = 768
hidden_dim1 = 256
hidden_dim2 = 128
hidden_dim3 = 64
hidden_dim4 = 32
model = BLmodel(vocab_size, embedding_dim, hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4)
# map_location='cpu' keeps the load working on a CPU-only runtime even when the
# checkpoint was written from a GPU session.
model.load_state_dict(torch.load('./model.pt', map_location='cpu'))


def _predict_topk_words(model, loader, index_to_word, k=100):
    """Return the top-``k`` predicted words for every example in ``loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batches with ``'input_ids'`` and ``'attention_mask'``.
        index_to_word: Mapping from class index to word.
        k: Number of candidates per example; capped at the number of classes
            because ``torch.topk`` raises when ``k`` exceeds the dimension size.

    Returns:
        One list of words per example, in loader order, best score first.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in loader:
            outputs = model(batch['input_ids'], batch['attention_mask'])
            _, topk_indices = torch.topk(outputs, k=min(k, outputs.size(1)), dim=1)
            for idx_row in topk_indices.tolist():
                predictions.append([index_to_word[idx] for idx in idx_row])
    return predictions


#On Validation
val_final_output = _predict_topk_words(model, val_loader, inv_train_label_enum)

#On Test
test_final_output = _predict_topk_words(model, test_loader, inv_train_label_enum)

In [None]:
# One line per example: "<target word> : <list of predicted words>".
with open('validation_output.txt', 'w') as val_file:
    val_file.writelines(
        f"{target} : {val_final_output[row]}\n"
        for row, target in enumerate(df_val['Word'])
    )

with open('test_output.txt', 'w') as test_file:
    test_file.writelines(
        f"{target} : {test_final_output[row]}\n"
        for row, target in enumerate(df_test['Word'])
    )

In [None]:
# Top-k accuracy on the validation split: an example is a hit at cutoff k when
# its target word is among the first k predicted words.
cutoffs = (1, 10, 100)
hit_counts = [0] * len(cutoffs)
total_words = len(df_val['Word'])
for row, target in enumerate(df_val['Word']):
    ranked = val_final_output[row]
    for slot, cutoff in enumerate(cutoffs):
        if target in ranked[:cutoff]:
            hit_counts[slot] += 1
top_one, top_ten, top_hundred = hit_counts

print("Accuracy for Validation Dataset:")
print("Top-1 Accuracy: {:.2f}".format(top_one/total_words))
print("Top-10 Accuracy: {:.2f}".format(top_ten/total_words))
print("Top-100 Accuracy: {:.2f}".format(top_hundred/total_words))

In [None]:
# Top-k accuracy on the test split: an example is a hit at cutoff k when its
# target word is among the first k predicted words.
cutoffs = (1, 10, 100)
hit_counts = [0] * len(cutoffs)
total_words = len(df_test['Word'])
for row, target in enumerate(df_test['Word']):
    ranked = test_final_output[row]
    for slot, cutoff in enumerate(cutoffs):
        if target in ranked[:cutoff]:
            hit_counts[slot] += 1
top_one, top_ten, top_hundred = hit_counts

print("Accuracy for Test Dataset:")
print("Top-1 Accuracy: {:.2f}".format(top_one/total_words))
print("Top-10 Accuracy: {:.2f}".format(top_ten/total_words))
print("Top-100 Accuracy: {:.2f}".format(top_hundred/total_words))

# Testing the model on human coded test set

In [6]:
# Using the human coded test set
path = "./sample_data/testset.csv"
human_test_df = pd.read_csv(path, dtype="string")
human_test_df['Definition'] = human_test_df['Definition'].astype(str)
human_test_df = human_test_df[['Word', 'Definition']]
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
human_test_enc = tokenizer(human_test_df['Definition'].to_list(), padding=True, truncation=True, max_length=128)

# This notebook writes the label map with json.dump, so read it back with the
# JSON parser. ast.literal_eval only accepts Python literals and decodes some
# JSON escapes (e.g. surrogate pairs) differently.
with open('./sample_data/train_label_enum.txt', 'r') as file:
    train_label_enum = json.load(file)
inv_train_label_enum= {v: k for k, v in train_label_enum.items()}

# Integer class index per word; words outside the training vocabulary map to "<unk>".
unk_index = train_label_enum["<unk>"]
labels = [train_label_enum.get(word, unk_index) for word in human_test_df['Word']]
# .assign returns a new frame, avoiding an in-place write on a column selection.
human_test_df = human_test_df.assign(labels=labels)
human_test_dataset = RevDictDataset(human_test_enc, human_test_df['labels'])
human_test_loader = DataLoader(human_test_dataset, batch_size=8, shuffle=False)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
vocab_size = len(train_label_enum)
embedding_dim = 768
hidden_dim1 = 256
hidden_dim2 = 128
hidden_dim3 = 64
hidden_dim4 = 32
model = BLmodel(vocab_size, embedding_dim, hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4)
# map_location='cpu' lets a checkpoint saved from a GPU session load on a
# CPU-only runtime.
model.load_state_dict(torch.load('./sample_data/model.pt', map_location='cpu'))

#On Test
model.eval()
human_test_final_output = []
with torch.no_grad():
    for batch in human_test_loader:
        outputs = model(batch['input_ids'], batch['attention_mask'])

        # Generate top-k words for each instance in the batch. k is capped at
        # the number of classes because torch.topk raises when k exceeds it.
        top_k = min(100, outputs.size(1))
        _, topk_indices = torch.topk(outputs, k=top_k, dim=1)
        for idx_row in topk_indices.tolist():
            human_test_final_output.append([inv_train_label_enum[idx] for idx in idx_row])

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Top-k accuracy on the human-coded test set: an example is a hit at cutoff k
# when its target word is among the first k predicted words.
cutoffs = (1, 10, 100)
hit_counts = [0] * len(cutoffs)
total_words = len(human_test_df['Word'])
for row, target in enumerate(human_test_df['Word']):
    ranked = human_test_final_output[row]
    for slot, cutoff in enumerate(cutoffs):
        if target in ranked[:cutoff]:
            hit_counts[slot] += 1
top_one, top_ten, top_hundred = hit_counts

print("Accuracy for Human Test Dataset:")
print("Top-1 Accuracy: {:.2f}".format(top_one/total_words))
print("Top-10 Accuracy: {:.2f}".format(top_ten/total_words))
print("Top-100 Accuracy: {:.2f}".format(top_hundred/total_words))

Accuracy for Human Test Dataset:
Top-1 Accuracy: 0.00
Top-10 Accuracy: 0.00
Top-100 Accuracy: 0.06


In [10]:
# One line per example: "<target word> : <list of predicted words>".
with open('human_test_output.txt', 'w') as human_file:
    human_file.writelines(
        f"{target} : {human_test_final_output[row]}\n"
        for row, target in enumerate(human_test_df['Word'])
    )

# Testing for a sample word

In [11]:
# Using a sample word to test
# Build the one-row frame directly: DataFrame.append is deprecated and was
# removed in pandas 2.0.
# NOTE(review): "builting" looks like a typo for "building". It is kept as-is
# because this exact text is the model input behind the recorded output.
temp_df = pd.DataFrame(
    [{'Word': 'roof', 'Definition': 'structure forming the upper covering of a builting'}],
    columns=['Word', 'Definition'],
)
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
temp_enc = tokenizer(temp_df['Definition'].to_list(), padding=True, truncation=True, max_length=128)

# This notebook writes the label map with json.dump, so parse it as JSON
# rather than as a Python literal.
with open('./sample_data/train_label_enum.txt', 'r') as file:
    train_label_enum = json.load(file)
inv_train_label_enum= {v: k for k, v in train_label_enum.items()}

# Integer class index per word; words outside the training vocabulary map to "<unk>".
unk_index = train_label_enum["<unk>"]
labels = [train_label_enum.get(word, unk_index) for word in temp_df['Word']]
temp_df['labels']=labels
temp_dataset = RevDictDataset(temp_enc, temp_df['labels'])
temp_loader = DataLoader(temp_dataset, batch_size=8, shuffle=False)




vocab_size = len(train_label_enum)
embedding_dim = 768
hidden_dim1 = 256
hidden_dim2 = 128
hidden_dim3 = 64
hidden_dim4 = 32
model = BLmodel(vocab_size, embedding_dim, hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4)
# map_location='cpu' lets a checkpoint saved from a GPU session load on a
# CPU-only runtime.
model.load_state_dict(torch.load('./sample_data/model.pt', map_location='cpu'))


model.eval()
temp_final_output = []
with torch.no_grad():
    for batch in temp_loader:
        outputs = model(batch['input_ids'], batch['attention_mask'])

        # Generate top-k words for each instance in the batch. k is capped at
        # the number of classes because torch.topk raises when k exceeds it.
        top_k = min(100, outputs.size(1))
        _, topk_indices = torch.topk(outputs, k=top_k, dim=1)
        for idx_row in topk_indices.tolist():
            temp_final_output.append([inv_train_label_enum[idx] for idx in idx_row])

# Save the sample predictions, one "<word> : <predictions>" line per example.
with open('temp_output.txt', 'w') as sample_file:
    sample_file.writelines(
        f"{target} : {temp_final_output[row]}\n"
        for row, target in enumerate(temp_df['Word'])
    )

# Echo the same predictions to the notebook, followed by a blank line each.
for row, target in enumerate(temp_df['Word']):
    print(f"Top-100 words generated for {target} are: {temp_final_output[row]}")
    print()

# Top-k accuracy for the sample word(s): an example is a hit at cutoff k when
# its target word is among the first k predicted words.
cutoffs = (1, 10, 100)
hit_counts = [0] * len(cutoffs)
total_words = len(temp_df['Word'])
for row, target in enumerate(temp_df['Word']):
    ranked = temp_final_output[row]
    for slot, cutoff in enumerate(cutoffs):
        if target in ranked[:cutoff]:
            hit_counts[slot] += 1
top_one, top_ten, top_hundred = hit_counts

print("Accuracy for a Sample Word:")
print("Top-1 Accuracy: {:.2f}".format(top_one/total_words))
print("Top-10 Accuracy: {:.2f}".format(top_ten/total_words))
print("Top-100 Accuracy: {:.2f}".format(top_hundred/total_words))

  temp_df = temp_df.append({'Word': 'roof', 'Definition': 'structure forming the upper covering of a builting'}, ignore_index=True)
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Top-100 words generated for roof are: ['Frolicful', 'back', 'Ulcerous', 'California_wine', 'Cheyne-Stokes_respiration', 'Tripartient', 'busted', 'Parasceve', 'Dram', 'Pongo_pygmaeus', 'charge', 'Complutensian', 'Lepas', 'Chokecherry', 'pothook', 'dateable', 'dispose', 'webber', 'trolling', 'Abstracting', 'bleaching', 'roof', 'Rafter', 'PCP', 'Fluid', 'riddle', 'viniculture', 'Perceptibility', 'Lopper', 'appropriateness', 'shorts', 'privatizing', 'playlist', 'Float', 'Tulsa', 'daisy', 'chad', 'photometer', 'presentation', 'Scenery', 'ornithopod', 'forward', 'trailblazer', 'bologna', 'blue_vitriol', 'decoration', 'impatience', 'Exclusivism', 'clot', 'Dimmy', 'couture', 'reparative', 'May_Day', 'ethiopian', 'Codifier', 'stew', 'amphitheater', 'Pirrie', 'Tumefy', 'short', 'christianity', 'Beat', 'kidnap', 'reprogram', 'despatches', 'Outward', 'being', 'Privative', 'colobus_monkey', 'bunt', 'Distillatory', 'syrian', 'Ichthyocolla', 'conditions', 'messiah', 'intruding', 'unbowed', 'saved', '