In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [2]:
import pandas as pd
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')
import numpy as np
from transformers import DistilBertTokenizer, TFDistilBertModel, AutoTokenizer, TrainingArguments, Trainer, DistilBertModel
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch.nn as nn
import pdb

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
path = "sample_data/toyset.csv"
df = pd.read_csv(path, dtype="string")
df['Definition'] = df['Definition'].astype(str)
df = df[['Word', 'Definition']]
df

Unnamed: 0,Word,Definition
0,Geographical,"""Of or pertaining to geography."""
1,Inextricableness,"""The state of being inextricable."""
2,Papuars,"""The native black race of Papua or New Guinea ..."
3,dark-coated,covered with dark hair
4,Cesura,"""See Caesura."""
...,...,...
233,olive,a tree of some other species of olea or of som...
234,olive,evergreen tree cultivated in the mediterranean...
235,olive,an evergreen tree olea europaea cultivated sin...
236,olive,the tree has been cultivated for its fruit for...


In [5]:
'''Convert classes to numbers'''
word_dict = {} 
i = 0
for w in df['Word'].unique():
    word_dict[w] = i
    i += 1

'''Convert numbers back to words'''
idx2word = {v:k for k,v in word_dict.items()}

In [6]:
df_train, df_test = train_test_split(df[['Definition','Word']], test_size=0.2)
df_test, df_val = train_test_split(df_test[['Definition','Word']], test_size=0.5)

In [7]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

train_enc = tokenizer(df_train['Definition'].to_list(), padding=True, truncation=True, max_length=128)
test_enc = tokenizer(df_test['Definition'].to_list(), padding=True, truncation=True, max_length=128)
val_enc = tokenizer(df_val['Definition'].to_list(), padding=True, truncation=True, max_length=128)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
class RevDictDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        self.labels = self.labels.to_list()
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

In [9]:
'''One hot encoding of classes'''
train_label_enum = {k:j+1 for j, k in enumerate(df_train['Word'].unique())}
train_label_enum["<unk>"] = 0
train_num_labels = len(train_label_enum)
idx2token = {idx: token for token, idx in train_label_enum.items()}
df_train['labels'] = df_train['Word'].apply(lambda x: [1.0 if train_label_enum[x]==i else 0.0 for i in range(train_num_labels)])
# labels = []
# for word in list(df_val['Word']):
#   if word in train_label_enum:
#     labels.append(train_label_enum[word])
#   else:
#     labels.append(train_label_enum["<unk>"])

df_val['labels'] = df_val['Word'].apply(lambda x: [1.0 if train_label_enum[x]==i else 0.0 for i in range(train_num_labels)] if x in train_label_enum else [1.0] + [0.0]*(train_num_labels-1))

In [10]:
inv_train_label_enum= {v: k for k, v in train_label_enum.items()}

In [11]:
train_dataset = RevDictDataset(train_enc, df_train['labels'])
val_dataset = RevDictDataset(val_enc, df_val['labels'])

In [12]:
import torch
import torch.nn as nn
from transformers import DistilBertModel, DistilBertTokenizer

class BLmodel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4):
        super(BLmodel, self).__init__()
        self.bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.lstm_layer_1 = nn.LSTM(input_size=768, hidden_size=hidden_dim1, num_layers=1, batch_first=True)
        self.lstm_layer_2 = nn.LSTM(input_size=hidden_dim1, hidden_size=hidden_dim2, num_layers=1, batch_first=True)
        self.lstm_layer_3 = nn.LSTM(input_size=hidden_dim2, hidden_size=hidden_dim3, num_layers=1, batch_first=True)
        self.lstm_layer_4 = nn.LSTM(input_size=hidden_dim3, hidden_size=hidden_dim4, num_layers=1, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim4, vocab_size)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        bert_embedding = outputs[0]
        lstm_output_1, _ = self.lstm_layer_1(bert_embedding)
        lstm_output_2, _ = self.lstm_layer_2(lstm_output_1)
        lstm_output_3, _ = self.lstm_layer_3(lstm_output_2)
        lstm_output_4, _ = self.lstm_layer_4(lstm_output_3)
        output = self.output_layer(lstm_output_4[:, -1, :])
        return output


In [13]:
def train(model, train_loader, val_loader, optimizer, num_epochs):
    model.train()
    min_val_loss = float('inf')
    for epoch in range(num_epochs):
        total_loss = 0.0
        # Training
        for batch in train_loader:
            input_ids = batch['input_ids']
            labels = batch['labels']
            attention_mask= batch['attention_mask']
            optimizer.zero_grad()
            outputs = model(input_ids,attention_mask)
            # pdb.set_trace()
            loss = nn.CrossEntropyLoss()(outputs.view(-1, len(train_label_enum)), labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validation
        model.eval()
        with torch.no_grad():
            total_val_loss = 0.0
            for batch in val_loader:
                input_ids = batch['input_ids']
                labels = batch['labels']
                attention_mask= batch['attention_mask']
                outputs = model(input_ids,attention_mask)

                # pdb.set_trace()
                
                # Generate top-k words for validation
                # _, topk_indices = torch.topk(outputs, k=10, dim=1)
                # for i,idx_row in enumerate(topk_indices):
                #   row_words = [inv_train_label_enum[idx.item()] for idx in idx_row]
                #   print(f"Top-10 words for {inv_train_label_enum[labels[i].item()]} generated are: {row_words}")
                
                val_loss = nn.CrossEntropyLoss()(outputs.view(-1, len(train_label_enum)), labels)
                total_val_loss += val_loss.item()

        avg_loss = total_loss / len(train_loader)
        avg_val_loss = total_val_loss / len(val_loader)

        if avg_val_loss < min_val_loss:
          min_val_loss = avg_val_loss
          torch.save(model.state_dict(), 'model.pt')

        print(f"Epoch [{epoch+1}/{num_epochs}]"
              f"\tTrain Loss: {avg_loss:.4f}"
              f"\tVal Loss: {avg_val_loss:.4f}"
              f"\tMin Val Loss: {min_val_loss:.4f}")

        model.train()


In [27]:
# Instantiate the DataLoader for train and validation datasets
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)  # No need to shuffle for validation

# Define the training parameters
vocab_size = len(train_label_enum)
embedding_dim = 768
hidden_dim1 = 256
hidden_dim2 = 128
hidden_dim3 = 64
hidden_dim4 = 32
model = BLmodel(vocab_size, embedding_dim, hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
num_epochs = 10

# Train and validate the model
train(model, train_loader, val_loader, optimizer, num_epochs)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch [1/10]	Train Loss: 4.4668	Val Loss: 4.4584	Min Val Loss: 4.4584
Epoch [2/10]	Train Loss: 4.3996	Val Loss: 4.4139	Min Val Loss: 4.4139
Epoch [3/10]	Train Loss: 4.3010	Val Loss: 4.3949	Min Val Loss: 4.3949
Epoch [4/10]	Train Loss: 4.1574	Val Loss: 4.3119	Min Val Loss: 4.3119
Epoch [5/10]	Train Loss: 4.0133	Val Loss: 4.3308	Min Val Loss: 4.3119
Epoch [6/10]	Train Loss: 3.8709	Val Loss: 4.2290	Min Val Loss: 4.2290
Epoch [7/10]	Train Loss: 3.7336	Val Loss: 4.1342	Min Val Loss: 4.1342
Epoch [8/10]	Train Loss: 3.5889	Val Loss: 4.2213	Min Val Loss: 4.1342
Epoch [9/10]	Train Loss: 3.4674	Val Loss: 4.1138	Min Val Loss: 4.1138
Epoch [10/10]	Train Loss: 3.4218	Val Loss: 4.0937	Min Val Loss: 4.0937


In [28]:
labels = []
for word in list(df_test['Word']):
  if word in train_label_enum:
    labels.append(train_label_enum[word])
  else:
    labels.append(train_label_enum["<unk>"])
df_test['labels']=labels
test_dataset = RevDictDataset(test_enc, df_test['labels'])
test_loader = DataLoader(test_dataset, batch_size=8,shuffle=False) 

In [29]:
vocab_size = len(train_label_enum)
embedding_dim = 768
hidden_dim1 = 256
hidden_dim2 = 128
hidden_dim3 = 64
hidden_dim4 = 32
model = BLmodel(vocab_size, embedding_dim, hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4)
model.load_state_dict(torch.load('./model.pt'))

model.eval()
with torch.no_grad():
  final_output=[]
  for batch in test_loader:
      input_ids = batch['input_ids']
      attention_mask= batch['attention_mask']
      outputs = model(input_ids,attention_mask)

      # Generate top-k words for each instance in the batch
      _, topk_indices = torch.topk(outputs, k=10, dim=1)
      for i, idx_row in enumerate(topk_indices):
          row_words = [inv_train_label_enum[idx.item()] for idx in idx_row]
          final_output.append(row_words)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
for i,word in enumerate(df_test['Word']):
  print(f"Top-10 words generated for {word} are: {final_output[i]}")
  print()

Top-10 words generated for irritating are: ['irritating', 'Orb', 'Crapulous', 'Collyrium', 'hazily', 'dark-coated', 'Circumjacence', 'overhaul', 'Inboard', 'making_love']

Top-10 words generated for damning are: ['Supplyant', 'burn', 'Hans_Eysenck', 'mirroring', 'affordable', 'damning', 'nightshade', 'Quinible', 'hazily', 'alkaline-loving']

Top-10 words generated for damning are: ['damning', 'Supplyant', 'Hans_Eysenck', 'affordable', 'Quinible', 'betel', 'out_of_print', 'mirroring', 'alkaline-loving', 'hazily']

Top-10 words generated for nightshade are: ['olive', 'affordable', 'out_of_print', 'hazily', 'kirtle', 'sayonara', 'mirroring', 'minoxidil', 'Hans_Eysenck', 'glomerular_capsule']

Top-10 words generated for Tap are: ['Tap', 'oppressive', 'Orb', 'affordable', 'overhaul', 'unmechanical', 'carpet_sweeper', 'Quinible', 'Cortef', 'Inboard']

Top-10 words generated for Overseas are: ['overhaul', 'affordable', 'Tap', 'mirroring', 'oppressive', 'Staple', 'post-free', 'Orb', 'olive', '