NLP 243 Language Modeling - Parsa Mazaheri

In [None]:
# Uncomment if running code on Colab
! pip install datasets torchtext
! mkdir output 

In [None]:
# Imports
import os
from collections import Counter

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchtext.vocab as vocab

from datasets import load_dataset

from tqdm import tqdm

In [None]:
# Prefer the GPU whenever PyTorch reports one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device: ', device)

In [None]:
# Fetch the 'ptb_text_only' corpus through HuggingFace `load_dataset`
# NOTE(review): this looks like a script-based dataset; newer `datasets`
# releases may need extra options to load it — confirm against the installed version.
dataset = load_dataset(path='ptb_text_only')

#### Preprocess and Embeddings

In [None]:
# Show the dataset object (lists its splits)
print('Dataset Split:', dataset)

# Pull the 'sentence' column out of each split, in train / validation / test order
train_data, val_data, test_data = (
    dataset[split_name]['sentence']
    for split_name in ('train', 'validation', 'test')
)

In [None]:

# Build the vocabulary lookup tables from a list of sentences
def preprocess(data):
    """Create word<->index lookup tables from whitespace-tokenized sentences.

    Each sentence is prefixed with a '<start>' token before its words are
    collected. Indices are assigned in order of first appearance.

    Args:
        data: iterable of sentence strings.

    Returns:
        A tuple (word2idx, idx2word, vocab_size): the word -> index dict, its
        inverse, and the number of distinct tokens.
    """
    print("> Tokenizing the data...")
    corpus_tokens = []
    for line in tqdm(data):
        corpus_tokens.append('<start>')
        corpus_tokens.extend(line.split())

    # Counter keeps first-seen order, so iterating it yields each distinct
    # token once, in corpus order
    token_counts = Counter(corpus_tokens)
    vocab_size = len(token_counts)

    print()
    print("> Creating Word2Idx and Idx2Word...")
    idx2word = dict(enumerate(token_counts))
    word2idx = {token: position for position, token in idx2word.items()}

    print("> Done")
    return word2idx, idx2word, vocab_size


# Build the lookup tables from the training split
word2idx, idx2word, VOCAB_SIZE = preprocess(data=train_data)
# Report the number of distinct tokens
print('\nVocab Size: ', VOCAB_SIZE)


In [None]:
# Load the pre-trained '6B' GloVe vectors with GLOVE_DIM components per word
GLOVE_DIM = 300
glove = vocab.GloVe(dim = GLOVE_DIM, name = '6B')

glove_vocab_size = len(glove.itos)
print('Loaded {} words in GloVe vocab'.format(glove_vocab_size))

# Look up the GloVe vector of a single word
def get_word_embedding(word):
    """Return the row of `glove.vectors` stored for `word`.

    The word is resolved through `glove.stoi`; the lookup fails for words
    that mapping does not contain, so callers should check membership first.
    """
    row = glove.stoi[word]
    return glove.vectors[row]

In [None]:
# Special-token vectors: all zeros for <start>, one shared random vector for
# <unk> and every other word that GloVe does not know.
start_tensor = torch.zeros(1, GLOVE_DIM, device = device) # Word Embedding Tensor for <start>
unk_tensor = torch.rand(1, GLOVE_DIM, device = device) # Word Embedding Tensor for <unk>
# NOTE(review): unk_tensor is drawn without a fixed seed, so it differs between runs.

# Create Embedding Matrix for Vocab: one row per word, in word2idx order so
# that row i belongs to the word with index i
embeddings = []
for word in word2idx:
    if word in glove.stoi: # If word present in GloVe
        embeddings.append(get_word_embedding(word))
    elif word == '<start>': # If word is <start>
        embeddings.append(start_tensor)
    else: # If word is <unk> or not present in GloVe
        embeddings.append(unk_tensor)

# Tensor of Word Embeddings for each word in vocab.
# Rows are flattened to 1-D on the CPU and stacked in a single call; this
# replaces a per-row tensor -> numpy -> Python list -> tensor round trip.
if embeddings:
    embeddings_tensor = torch.stack(
        [emb.detach().reshape(-1).cpu() for emb in embeddings]).to(device)
else:
    # Empty vocabulary: keep a well-formed (0, GLOVE_DIM) matrix
    embeddings_tensor = torch.empty(0, GLOVE_DIM, device = device)

#### Dataset Loader

In [None]:
# Language-modeling dataset of fixed-length token windows (for use with DataLoader)
class DatasetLM(Dataset):
    """Sliding-window next-token dataset built from a list of sentences.

    Sentences are prefixed with '<start>', split on whitespace and concatenated
    into one token stream. Item `idx` is the pair (x, y) where x holds `N`
    consecutive token indices starting at `idx` and y is the same window
    shifted one position to the right.

    Reads the notebook-level `word2idx` (vocabulary) and `device` (where the
    returned tensors are created).

    Args:
        data: list of sentence strings.
        seq_len: window length N; defaults to 30.

    Raises:
        ValueError: if `seq_len` is smaller than 1.
    """

    # Constructor
    def __init__(self, data: list, seq_len: int = 30):
        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")
        self.data = data
        # Sequence Length
        self.N = seq_len
        self.words = self.load_words()
        # Out-of-vocabulary words are mapped to '<unk>'. Index 1 (the value
        # that used to be hard-coded) is only a fallback for vocabularies
        # without an '<unk>' entry.
        unk_idx = word2idx.get('<unk>', 1)
        # Create List of tokens in the Corpus
        self.token_list = [word2idx.get(word, unk_idx) for word in self.words]

    # Number of Sequences for a Dataset split
    def __len__(self):
        # Each window needs N inputs plus one more token for the shifted
        # target; clamp so that a short corpus reports 0 instead of a negative
        # length (which makes len() raise).
        return max(0, len(self.token_list) - self.N)

    def __getitem__(self, idx: int):
        window_end = idx + self.N
        # Input Sequence Tensor
        x = torch.tensor(self.token_list[idx : window_end], device = device)
        # Target Sequence Tensor: the input window shifted by one token
        y = torch.tensor(self.token_list[idx + 1 : window_end + 1], device = device)
        return x, y

    # List of Tokenized Words in the Corpus
    def load_words(self):
        """Return all words of `self.data` as one list, with '<start>' before each sentence."""
        tokenized_data = []
        for sentence in self.data:
            tokenized_data.append('<start>')
            tokenized_data.extend(sentence.split())
        return tokenized_data


In [None]:
print("> Loading datasets ...")

# One DatasetLM per split, built in train -> validation -> test order
train_ds, val_ds, test_ds = (
    DatasetLM(split_sentences)
    for split_sentences in (train_data, val_data, test_data)
)
print("> Done")

#### Hyperparams

In [None]:
# Model Hyper-Parameters

# --- Architecture ---
MODEL_TYPE = 'lstm'          # which recurrent model get_model() builds
EMBEDDING_DIM = GLOVE_DIM    # input width = size of the GloVe vectors
HIDDEN_DIM = 128
NUM_LAYERS = 2
BIDIRECTION = False
DROPOUT = 0.2
OUTPUT_DIM = VOCAB_SIZE      # one output score per vocabulary entry

# --- Optimisation ---
BATCH_SIZE = 64
LEARNING_RATE = 0.001
N_EPOCHS = 10


In [None]:
# Train Data Loader: windows are reshuffled every epoch
train_loader = DataLoader(
    train_ds, batch_size = BATCH_SIZE, shuffle = True)
# Val Data Loader: fixed order, so the validation loss is repeatable between
# runs (shuffling changes which windows land in the final, smaller batch)
val_loader = DataLoader(
    val_ds, batch_size = BATCH_SIZE, shuffle = False)
# Test Data Loader: one window per batch, fixed order
test_loader = DataLoader(
    test_ds, batch_size = 1, shuffle = False)

#### Model Architecture

In [None]:
# LSTM Model
class LSTM(nn.Module):
    """LSTM language model: embedding -> dropout -> LSTM -> fc1 -> dropout -> fc2.

    Args:
        embedding_dim: input size of the LSTM (width of the embedding vectors).
        hidden_dim: LSTM hidden size, also the width of the first dense layer.
        output_dim: number of output scores per position.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability (embeddings, between LSTM layers, before fc2).
        pretrained_embeddings: optional embedding matrix; when omitted the
            notebook-level `embeddings_tensor` is used.

    NOTE(review): with bidirectional=True every position also sees later
    tokens, which leaks the next-token target — confirm before enabling.
    """

    # Constructor
    def __init__(self, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pretrained_embeddings = None):
        super().__init__()

        if pretrained_embeddings is None:
            pretrained_embeddings = embeddings_tensor
        # Initialize Pre-Trained GloVe Embeddings (from_pretrained freezes them by default)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings)
        # LSTM layer. Inter-layer dropout only exists with more than one
        # layer; passing a non-zero value for a single layer triggers a
        # PyTorch warning, so it is zeroed in that case.
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim,
            num_layers = n_layers,
            bidirectional = bidirectional,
            dropout = dropout if n_layers > 1 else 0.0,
            batch_first = True
        )
        num_directions = 2 if bidirectional else 1
        # Dense layers for predicting
        self.fc1 = nn.Linear(hidden_dim * num_directions, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # dropout
        self.dropout = nn.Dropout(dropout)

    # Forward Pass of Model
    def forward(self, x):
        """Return one score vector of size `output_dim` per input position.

        `x` holds token indices; assumes shape (batch, seq_len) because the
        LSTM is built with batch_first=True — TODO confirm against callers.
        """
        # Embedding Layer followed by Dropout before the Seq Layer
        embedded = self.dropout(self.embedding(x))
        # LSTM Layer (final hidden/cell states are not needed)
        output, _ = self.lstm(embedded)
        # 1st Fully Connected Layer, then Dropout before Output
        out_dp = self.dropout(self.fc1(output))
        # 2nd Fully Connected Layer
        return self.fc2(out_dp)


# GRU Model
class GRU(nn.Module):
    """GRU language model: embedding -> dropout -> GRU -> fc1 -> dropout -> fc2.

    Args:
        embedding_dim: input size of the GRU (width of the embedding vectors).
        hidden_dim: GRU hidden size, also the width of the first dense layer.
        output_dim: number of output scores per position.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability (embeddings, between GRU layers, before fc2).
        pretrained_embeddings: optional embedding matrix; when omitted the
            notebook-level `embeddings_tensor` is used.

    NOTE(review): with bidirectional=True every position also sees later
    tokens, which leaks the next-token target — confirm before enabling.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim,
                n_layers, bidirectional, dropout, pretrained_embeddings = None):
        super().__init__()

        if pretrained_embeddings is None:
            pretrained_embeddings = embeddings_tensor
        # Initialize Pre-Trained GloVe Embeddings (from_pretrained freezes them by default)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings)
        # GRU layer. Inter-layer dropout only exists with more than one layer;
        # passing a non-zero value for a single layer triggers a PyTorch
        # warning, so it is zeroed in that case.
        self.gru = nn.GRU(
            embedding_dim, hidden_dim,
            num_layers = n_layers,
            bidirectional = bidirectional,
            dropout = dropout if n_layers > 1 else 0.0,
            batch_first = True
        )
        num_directions = 2 if bidirectional else 1
        # Dense layers for predicting
        self.fc1 = nn.Linear(hidden_dim * num_directions, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # dropout
        self.dropout = nn.Dropout(dropout)

    # Forward Pass of Model
    def forward(self, x):
        """Return one score vector of size `output_dim` per input position.

        `x` holds token indices; assumes shape (batch, seq_len) because the
        GRU is built with batch_first=True — TODO confirm against callers.
        """
        # Embedding Layer followed by Dropout before the Seq Layer
        embedded = self.dropout(self.embedding(x))
        # GRU Layer (the final hidden state is not needed)
        output, _ = self.gru(embedded)
        # 1st Fully Connected Layer, then Dropout before Output
        out_dp = self.dropout(self.fc1(output))
        # 2nd Fully Connected Layer
        return self.fc2(out_dp)


# RNN
class RNN(nn.Module):
    """Plain RNN language model: embedding -> dropout -> RNN -> fc1 -> dropout -> fc2.

    Args:
        embedding_dim: input size of the RNN (width of the embedding vectors).
        hidden_dim: RNN hidden size, also the width of the first dense layer.
        output_dim: number of output scores per position.
        n_layers: number of stacked RNN layers.
        bidirectional: whether the RNN runs in both directions.
        dropout: dropout probability (embeddings, between RNN layers, before fc2).
        pretrained_embeddings: optional embedding matrix; when omitted the
            notebook-level `embeddings_tensor` is used.

    NOTE(review): with bidirectional=True every position also sees later
    tokens, which leaks the next-token target — confirm before enabling.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim,
                n_layers, bidirectional, dropout, pretrained_embeddings = None):
        super().__init__()

        if pretrained_embeddings is None:
            pretrained_embeddings = embeddings_tensor
        # Initialize Pre-Trained GloVe Embeddings (from_pretrained freezes them by default)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings)
        # RNN layer. Inter-layer dropout only exists with more than one layer;
        # passing a non-zero value for a single layer triggers a PyTorch
        # warning, so it is zeroed in that case.
        self.rnn = nn.RNN(
            embedding_dim, hidden_dim,
            num_layers = n_layers,
            bidirectional = bidirectional,
            dropout = dropout if n_layers > 1 else 0.0,
            batch_first = True
        )
        num_directions = 2 if bidirectional else 1
        # Dense layers for predicting
        self.fc1 = nn.Linear(hidden_dim * num_directions, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # dropout
        self.dropout = nn.Dropout(dropout)

    # Forward Pass of Model
    def forward(self, x):
        """Return one score vector of size `output_dim` per input position.

        `x` holds token indices; assumes shape (batch, seq_len) because the
        RNN is built with batch_first=True — TODO confirm against callers.
        """
        # Embedding Layer followed by Dropout before the Seq Layer
        embedded = self.dropout(self.embedding(x))
        # RNN Layer (the final hidden state is not needed)
        output, _ = self.rnn(embedded)
        # 1st Fully Connected Layer, then Dropout before Output
        out_dp = self.dropout(self.fc1(output))
        # 2nd Fully Connected Layer
        return self.fc2(out_dp)

In [None]:

def get_model(model) -> nn.Module:
    """Build the network named by `model` from the notebook hyper-parameters.

    Args:
        model: one of 'rnn', 'gru' or 'lstm'.

    Returns:
        A new RNN / GRU / LSTM instance configured from EMBEDDING_DIM,
        HIDDEN_DIM, OUTPUT_DIM, NUM_LAYERS, BIDIRECTION and DROPOUT, moved
        to `device`.

    Raises:
        ValueError: if `model` is not a known model type (previously the
            function silently returned None).
    """
    # The class is resolved first so that an unknown name is reported before
    # anything is constructed.
    if model == 'rnn':
        model_cls = RNN
    elif model == 'gru':
        model_cls = GRU
    elif model == 'lstm':
        model_cls = LSTM
    else:
        raise ValueError(
            f"Unknown model type {model!r}; expected one of 'rnn', 'gru', 'lstm'")

    # All three classes share the same constructor signature
    return model_cls(
        EMBEDDING_DIM,
        HIDDEN_DIM,
        OUTPUT_DIM,
        NUM_LAYERS,
        BIDIRECTION,
        DROPOUT).to(device)

# Build the network selected by MODEL_TYPE
model = get_model(MODEL_TYPE)

# Show its layers
print(model)

#### Training and Evaluation Functions

In [None]:
# Model Train Function
def train(loader, model, optimizer, loss_function):
    """Run one optimisation pass over `loader` and return the mean batch loss.

    For every (x, y) batch the model output is reshaped to
    (-1, last_dimension) and the targets are flattened before being handed to
    `loss_function`; the gradients are then back-propagated and the optimizer
    takes one step.

    Returns:
        The unweighted average of the per-batch losses, rounded to 4 decimals.
    """
    model.train()
    batch_losses = []
    for x, y in tqdm(loader, desc='Training'):
        optimizer.zero_grad()
        logits = model(x)

        # One row per prediction, one entry per target
        flat_logits = logits.view(-1, logits.size(-1))
        flat_targets = y.flatten()

        batch_loss = loss_function(flat_logits, flat_targets)
        batch_losses.append(batch_loss.item())

        # Back-propagate, then update the weights
        batch_loss.backward()
        optimizer.step()

    mean_loss = sum(batch_losses) / len(batch_losses)
    return round(mean_loss, 4)


# Model Evaluate Function
def evaluate(loader, model, loss_function):
    """Score `model` on `loader` without updating it and return the mean batch loss.

    The model is switched to eval mode and every batch is processed under
    `torch.no_grad()`, so no autograd graph is built while scoring.

    Returns:
        The unweighted average of the per-batch losses, rounded to 4 decimals.
    """
    model.eval()
    losses = []
    # Gradients are never used here; disabling them avoids building a graph
    # for every batch.
    with torch.no_grad():
        for x, y in tqdm(loader, desc='Evaluate'):
            y_pred = model(x)

            # Convert y_pred -> 2D Tensor and y-> 1D Tensor
            y_pred = y_pred.view(-1, y_pred.shape[-1])
            y = torch.flatten(y)

            # Loss
            loss = loss_function(y_pred, y)
            losses.append(loss.item())

    return round((sum(losses) / len(losses)), 4) # Return Average Loss

#### Model Training

In [None]:
# Model Training on Train dataset and Evaluation on Validation dataset

# Optimizer
optimizer = torch.optim.AdamW(
    model.parameters(), lr = LEARNING_RATE)

# Loss Function -> Cross-Entropy Loss
loss_func = nn.CrossEntropyLoss().to(device)

train_losses, val_losses = [], []
train_perplexities, val_perplexities = [], []


# Path to Save Best Model
PATH = 'output/best-model.pt'
# Make sure the target directory exists before an epoch of training is spent
os.makedirs(os.path.dirname(PATH), exist_ok=True)

# Lowest validation loss seen so far (None until the first epoch finishes)
best_val_loss = None

for epoch in range(N_EPOCHS):
    print("EPOCH: ", epoch+1)

    # Avg Training Loss
    train_loss = train(
        loader=train_loader,
        model=model,
        optimizer=optimizer,
        loss_function=loss_func
    )
    train_losses.append(train_loss)

    # Train Perplexity = exp(average cross-entropy loss)
    train_ppl = float(np.exp(train_loss))
    train_perplexities.append(train_ppl)

    # Avg Val Loss
    val_loss = evaluate(
        loader=val_loader,
        model=model,
        loss_function=loss_func
    )
    val_losses.append(val_loss)
    # Val Perplexity = exp(average cross-entropy loss)
    val_ppl = float(np.exp(val_loss))
    val_perplexities.append(val_ppl)

    print(f"Train Loss: {train_loss:.4f} | Train Perplexity: {train_ppl:.4f} | Val Loss: {val_loss:.4f} | Val Perplexity: {val_ppl:.4f} \n")

    # Save model only when the validation loss improves, so PATH really holds
    # the best checkpoint rather than the one from the last epoch. The first
    # epoch always saves, which guarantees that a checkpoint exists.
    if best_val_loss is None or val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), PATH)

In [None]:
# Line Plot Comparison
# x-axis values: the epoch numbers 1 .. N_EPOCHS
epochs = np.arange(1, N_EPOCHS + 1)

def draw_plot(line_A_vals, line_B_vals, line_A_label, line_B_label, 
              xlabel, ylabel, title, colors):
    """Draw two per-epoch curves on the current matplotlib axes.

    Args:
        line_A_vals, line_B_vals: one value per epoch for each curve.
        line_A_label, line_B_label: legend labels of the two curves.
        xlabel, ylabel, title: axis labels and plot title.
        colors: sequence whose first two entries colour curve A and curve B.
    """
    # The x-axis is derived from the data itself (1, 2, ..., number of points)
    # so the plot still works when fewer epochs were recorded than configured.
    n_points = max(len(line_A_vals), len(line_B_vals))
    epoch_ticks = np.arange(1, n_points + 1)

    plt.style.use('ggplot')
    plt.plot(
        epoch_ticks[:len(line_A_vals)], line_A_vals,
        label=line_A_label, color=colors[0], linewidth=3)
    plt.plot(
        epoch_ticks[:len(line_B_vals)], line_B_vals,
        label=line_B_label, color=colors[1], linewidth=3)
    plt.xlabel(xlabel)
    plt.xticks(epoch_ticks)
    plt.ylabel(ylabel)
    plt.title(title, color = 'black')
    plt.legend(shadow = True)
    
# Training vs Validation Loss Plot
draw_plot(
    line_A_vals=train_losses,
    line_B_vals=val_losses,
    line_A_label='Training Loss',
    line_B_label='Validation Loss',
    xlabel='Epoch',
    ylabel='Loss',
    title='Training Loss vs Validation Loss',
    colors=['green', 'red'],
)

In [None]:
# Training vs Validation Perplexity Plot
draw_plot(
    line_A_vals=train_perplexities,
    line_B_vals=val_perplexities,
    line_A_label='Training Perplexity',
    line_B_label='Validation Perplexity',
    xlabel='Epoch',
    ylabel='Perplexity',
    title='Training Perplexity vs Validation Perplexity',
    colors=['green', 'red'],
)

In [None]:
# Load the saved model into a freshly built network of the same type
saved_model = get_model(model=MODEL_TYPE)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can still be restored on a CPU-only machine.
saved_model.load_state_dict(torch.load(PATH, map_location=device))
saved_model.eval()

In [None]:
# Model Test Function
def test(loader, model, loss_function):
    # Set model to eval mode
    model.eval()
    losses = []
    for batch in tqdm(loader, desc = 'Test: '):
        x, y = batch
        with torch.no_grad():
            
            y_pred = model.forward(x)
            # Convert (y_pred -> 2D Tensor) and (y -> 1D Tensor)
            y_pred = y_pred.view(-1, y_pred.shape[-1]) 
            y = torch.flatten(y)
            
            # Loss
            loss = loss_function(y_pred, y)
            losses.append(loss.item())
    
    return round((sum(losses) / len(losses)), 4) # Return Average Loss

# Avg Test Loss of the restored checkpoint on the test split
test_loss = test(test_loader, saved_model, loss_func)
# Test Perplexity = exp(average loss)
test_perplexity = torch.tensor(test_loss, device = device).exp()

print()
print(f"Test Loss: {test_loss} | Test Perplexity: {test_perplexity}")