## HW3

In [1]:
# Run this line if you do not have this package in your environment
!pip install portalocker>=2.0.0



### Data Preparation

In [2]:
import torch
import torchtext
import numpy as np
from torch import nn, optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

In [3]:
tokenizer = get_tokenizer('basic_english')

# Build the vocabulary (dictionary of words) from text dataset
def build_vocab(data_iter):
    vocab = build_vocab_from_iterator(map(tokenizer, data_iter), specials=["<pad>", "<unk>", "<SOS>", "<EOS>"])
    vocab.set_default_index(vocab["<pad>"])
    return vocab

train_data, _, _ = torchtext.datasets.PennTreebank(root='data', split=('train', 'valid', 'test'))
vocab = build_vocab(train_data)

# Process raw text data. Convert each sentence into a tensor of integers
def process_raw_text(raw_text_iter, vocab):
    sos_index = vocab['<SOS>']
    eos_index = vocab['<EOS>']
    # Adding <SOS> at the start and <EOS> at the end of each sequence
    data = [torch.tensor([sos_index] + vocab(tokenizer(item)) + [eos_index], dtype=torch.long) for item in raw_text_iter]
    return data

train_data, valid_data, test_data = torchtext.datasets.PennTreebank(root='data', split=('train', 'valid', 'test'))

train_text = process_raw_text(train_data,vocab)
valid_text = process_raw_text(valid_data,vocab)
test_text = process_raw_text(test_data,vocab)

# limit the train text data to 20000 sentences
train_text = train_text[:20000]

#this code prepares text data for training an NLP model by tokenizing, building a vocabulary, and converting text into tensors.
#The Penn Treebank dataset is commonly used for language modeling tasks."""

In [4]:
# You can look up the representation of each token using lookup_tokens()
vocab.lookup_tokens([0, 1, 2,3,4,5,6])

['<pad>', '<unk>', '<SOS>', '<EOS>', 'the', 'n', 'of']

In [5]:
# Take a look at test dataset
count = 0
for text in test_data:
  print(text)
  print('\n')
  count += 1
  if count > 20:
    break

no it was n't black monday


but while the new york stock exchange did n't fall apart friday as the dow jones industrial average plunged N points most of it in the final hour it barely managed to stay this side of chaos


some circuit breakers installed after the october N crash failed their first test traders say unable to cool the selling panic in both stocks and futures


the N stock specialist firms on the big board floor the buyers and sellers of last resort who were criticized after the N crash once again could n't handle the selling pressure


big investment banks refused to step up to the plate to support the beleaguered floor traders by buying big blocks of stock traders say


heavy selling of standard & poor 's 500-stock index futures in chicago <unk> beat stocks downward


seven big board stocks ual amr bankamerica walt disney capital cities\/abc philip morris and pacific telesis group stopped trading and never resumed


the <unk> has already begun


the equity market was <u

In [6]:

# Assuming vocab is your vocabulary object
vocab_size = len(vocab)
print(f"Number of words in the vocabulary: {vocab_size}")

Number of words in the vocabulary: 9925


In [7]:
text_length= [len(txt) for txt in train_text]
print(train_text)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Question 1. Prepare data for the RNN/LSTM models (fill in the blanks)

In [8]:
# Fill in the blanks
train_input = [sequence[:-1] for sequence in train_text] # Input sequence is a sentence without the last word. train_input should be a list of texts from train_text.
train_length = [len(sequence) for sequence in train_input] # Length is later used for packing (optional) and calculating accuracy for non-padded sequence
train_target = [sequence[1:] for sequence in train_text] # Target is a sentence without the first word. train_target should be a list of texts from train_text

# Do the same for the validation and test set

valid_input = [sequence[:-1] for sequence in valid_text]
valid_target = [sequence[1:] for sequence in valid_text]
valid_length = [len(sequence) for sequence in valid_input]

test_input = [sequence[:-1] for sequence in test_text]
test_target = [sequence[1:] for sequence in test_text]
test_length = [len(sequence) for sequence in test_input]


In [9]:
# Fill in the blanks
# Pad the input and target sequences using pad_sequence(). Please read the documentation of the function carefully before filling below.
# We need to pass the batch_first argument
train_input_padded = pad_sequence(train_input, batch_first=True).numpy()
train_target_padded = pad_sequence(train_target, batch_first=True).numpy()

valid_input_padded = pad_sequence(valid_input, batch_first=True).numpy()
valid_target_padded = pad_sequence(valid_target, batch_first=True).numpy()

test_input_padded = pad_sequence(test_input, batch_first=True).numpy()
test_target_padded = pad_sequence(test_target, batch_first=True).numpy()

#sequences are being padded to the maximum length found in each respective dataset.


In [10]:
print(train_input_padded.shape)
print(valid_input_padded.shape)
print(test_input_padded.shape)

(20000, 84)
(3370, 77)
(3761, 86)


In [11]:
# Define a dataset and dataloader
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        text = self.X[idx][0]
        length = self.X[idx][1]
        return torch.tensor(text,dtype=torch.long), torch.tensor(length,dtype=torch.long), torch.tensor(self.y[idx],dtype=torch.long)

In [12]:
train_dataset = TextDataset(list(zip(train_input_padded, train_length)), train_target_padded)
valid_dataset = TextDataset(list(zip(valid_input_padded, valid_length)), valid_target_padded)
test_dataset = TextDataset(list(zip(test_input_padded, test_length)), test_target_padded)

# Define a dataloader
trainloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
validloader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
testloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## Question 2

### Build Naive model

In [13]:
from collections import Counter
#function that gets the most frequent word in a text dataset

def get_most_frequent_word(text_data):
    word_counter = Counter()

    for words in text_data:
        word_counter.update(words)
#most_common(n)returns a list with a tuple with the n most common element and its count.
    most_frequent_word = word_counter.most_common(1)[0][0]  # Get the most common word
    return most_frequent_word


In [14]:
def get_text_data(X, y):
    text_data = []
    for inp, out in zip(X, y):
        inp_filtered = inp[inp > 0]  # Assuming zero is used for padding
        out_filtered = out[out > 0]
        full_sequence = np.concatenate((inp_filtered, out_filtered))
        text_data.append(full_sequence)
    return text_data


In [None]:
#calculates accuracy of predicting the most frequent word in a dataset
def evaluate_naive_model(text_data, most_frequent_word):
    correct = 0
    total = 0
    for sequence in text_data:
        for word in sequence:
            if word == most_frequent_word:
                correct += 1
            total += 1
    accuracy = (100 * correct / total) if total > 0 else 0
    return accuracy

# Assuming train_input, train_target, test_input, test_target are arrays loaded and prepared
train_text = get_text_data(train_input, train_target)
test_text = get_text_data(test_input, test_target)

# Get the most frequent word from the training text
most_frequent_word = get_most_frequent_word(train_text)

# Evaluate the model on the test text
test_accuracy = evaluate_naive_model(test_text, most_frequent_word)
print(f"Accuracy of the naive model on the test set: {test_accuracy:.2f}%")
print(f"Most frequent word predicted: {most_frequent_word}")


Accuracy of the naive model on the test set: 5.27%
Most frequent word predicted: 4


In [15]:
def get_most_frequent_word(text_data):
    """Finds the most frequently occurring word in the text data."""
    word_counter = Counter()
    for sequence in text_data:
        word_counter.update(sequence)
    return word_counter.most_common(1)[0][0]

def predict_with_fallback(model, key, fallback):
    """Predicts using the model or falls back to the most frequent word if the key is not found."""
# model.get(key, default) is a method to retrieve value associated with a key in a dictionary-like object. It returns the value matching the key if the key is in the dictionary
# otherwise,it returns the default value.
    return model.get(key, fallback)


#fucntion to evaluate the model being used
def nw_accuracy(text_data, model, fallback, n_gram=1):
    """Evaluates the model accuracy, reverting to fallback for unseen tokens."""
    correct = 0
    total = 0
    for sequence in text_data:
        for i in range(len(sequence) - n_gram):
            if n_gram == 1:
                key = sequence[i]
            else:  # Assuming n_gram == 2 for bigram
                key = (sequence[i], sequence[i+1])

            if i + n_gram < len(sequence):
                actual_next = sequence[i + n_gram]
                predicted_next = model.get(key, fallback)
                if predicted_next == actual_next:
                    correct += 1
            total += 1

    return 100 * correct / total if total else 0

  # Assuming train_input, train_target, test_input, test_target are loaded and prepared
train_text = get_text_data(train_input, train_target)
test_text = get_text_data(test_input, test_target)
#most freq word for baseline fallback
most_frequent_word = get_most_frequent_word(train_text)


### Build one-gram model

In [16]:
import numpy as np
from collections import Counter, defaultdict

def get_text_data(X, y):
    """Extracts sequences from the input and target arrays, filtering out padding."""
    text_data = []
    for inp, out in zip(X, y):
        inp_filtered = inp[inp > 0]  # Assuming padding is represented by zeros
        out_filtered = out[out > 0]
        full_sequence = np.concatenate((inp_filtered, out_filtered))
        text_data.append(full_sequence)
    return text_data

def build_onegram_model(text_data):
    """Builds a onegram model from the text data."""
    onegram_counts = defaultdict(Counter)
    for sequence in text_data:
        for i in range(len(sequence) - 1):
            current_token = sequence[i]
            next_token = sequence[i+1]
            onegram_counts[current_token][next_token] += 1
    # Finding the most common next token for each token
    onegram_most_common = {token: counts.most_common(1)[0][0] for token, counts in onegram_counts.items()}
    return onegram_most_common


# Build model
onegram_model = build_onegram_model(train_text)
#find most common word to evaluate
most_frequent_word_onegram = get_most_frequent_word(test_text)
# Evaluate model
onegram_accuracy = nw_accuracy(test_text, onegram_model, most_frequent_word_onegram, n_gram=1)

print(f'Accuracy of the one-gram model on the test set: {onegram_accuracy:.2f}%')


Accuracy of the one-gram model on the test set: 19.32%


### Build bi-gram model

In [17]:
#predicts next word based on the two words before it
def build_bigram_model(text_data):
    """Builds a bigram model from the text data."""
    bigram_counts = defaultdict(Counter)
    for sequence in text_data:
      #iterates till third last word bc its the last word in the sequence that will have the word, word+1 to predict word+2
        for i in range(len(sequence) - 2):
            bigram = (sequence[i], sequence[i+1])
            next_word = sequence[i+2]
            bigram_counts[bigram][next_word] += 1
    # Finding the most common next word for each bi-gram
    bigram_most_common = {bigram: counts.most_common(1)[0][0] for bigram, counts in bigram_counts.items()}
    return bigram_most_common

# Build model
bigram_model = build_bigram_model(train_text)
#find most common word to evaluate
most_frequent_word_bigram = get_most_frequent_word(test_text)
#Evaluate model accuracy
bigram_accuracy = nw_accuracy(test_text, bigram_model, most_frequent_word, n_gram=2)

print(f'Accuracy of the bi-gram model on the test set: {bigram_accuracy:.2f}%')


Accuracy of the bi-gram model on the test set: 19.37%


### Helper Functions

In [18]:
import time

# Redefine the training function and accuracy calculator

# Function to calculate accuracy for non-padded sequence and calculate loss, perplexity for padded sequence.
def calculate_metrics(loader, model,loss_func=nn.CrossEntropyLoss()):
    model.eval()  # Set the model to evaluation mode

    # Calculate Accuracy and Loss
    correct = 0
    total = 0
    test_loss = 0
    perplexity = 0
    with torch.no_grad():  # No need to track gradients
        for data, l, target in loader:
            # check if cuda is available
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()
            for i in range(len(l)):
              output = model(data[i])  # Forward pass
              output_no_padding = output[:l[i],:]
              target_no_padding = target[i][:l[i]]
              _, predicted = torch.max(output_no_padding.data, 1)  # Get the predicted classes
              total +=target_no_padding.view(-1).size(0)
              correct += (predicted == target_no_padding.view(-1)).sum().item()
              # calculate loss (calculated on padded sequence)
              loss = loss_func(output.view(-1,9925),target[i].view(-1))
              test_loss += loss.item()
              # calculate perplexity
              perplexity += 2**loss.item()

    accuracy = 100 * correct / total
    test_loss /= len(loader.dataset)
    perplexity /= len(loader.dataset)
    model.train()
    # round accuracy, test_loss, perplexity to 2 decimal places
    accuracy = round(accuracy,2)
    test_loss = round(test_loss,2)
    perplexity = round(perplexity,2)
    return accuracy, test_loss, perplexity

def train_model(model, loss_func, num_epochs, optimizer, train_loader, test_loader):

  train_loss_log = []
  train_acc_log = []
  test_acc_log = []

  # Move model to GPU if CUDA is available
  if torch.cuda.is_available():
      model = model.cuda()
  tic = time.time()
  for epoch in range(1,num_epochs+1):
    train_loss = 0
    for i, data in enumerate(train_loader):
      x, l, y = data
      # check if cuda is available
      if torch.cuda.is_available():
        x, y = x.cuda(), y.cuda()
      # get predicted y value from our current model
      pred_y = model(x)
      # calculate the loss
      loss = loss_func(pred_y.view(-1,9925),y.view(-1))
      # Zero the gradient of the optimizer
      optimizer.zero_grad()
      # Backward pass: Compute gradient of the loss with respect to model parameters
      loss.backward()
      # update weights ar
      optimizer.step()
      train_loss += loss.item() * len(x)
    # change the model to evaluation mode to calculate the test loss; We will come back to this later after learning Dropout and Batch Normalization
    train_loss/=len(train_loader.dataset)
    train_loss_log.append(train_loss)
    train_acc, _, _ = calculate_metrics(train_loader, model)
    test_acc, _, _ = calculate_metrics(test_loader, model)
    train_acc_log.append(train_acc)
    test_acc_log.append(test_acc)
    print("Epoch {:2},  Training Loss: {:9.4f}, Training Accuracy: {:9.2f},  Valid Accuracy: {:7.2f}".format(epoch, train_loss, train_acc, test_acc))
  toc = time.time()
  print("Elapsed Time : {:7.2f}".format(toc-tic))
  return train_loss_log, train_acc_log, test_acc_log

## Question 3. RNN without hidden state

In [19]:

class MultinomialLogisticRegression(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(MultinomialLogisticRegression, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        # x: LongTensor of size (batch_size, sequence_length)
        embedded = self.embedding(x)
        logits = self.linear(embedded)
        return logits

class SingleLayerNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(SingleLayerNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        # x: LongTensor of size (batch_size, sequence_length)
        embedded = self.embedding(x)
        hidden = self.relu(self.linear1(embedded))
        logits = self.linear2(hidden)
        return logits

# Parameters
vocab_size = 9925  # from your dataset info
embedding_dim = 100  # size of the embedding vectors
hidden_dim = 128  # size of the hidden layer for the single layer NN

# Initialize models
model_a = MultinomialLogisticRegression(vocab_size, embedding_dim)
model_b = SingleLayerNN(vocab_size, embedding_dim, hidden_dim)

# Define loss function and optimizer
loss_func = nn.CrossEntropyLoss()
optimizer_a = optim.Adam(model_a.parameters(), lr=0.001)
optimizer_b = optim.Adam(model_b.parameters(), lr=0.001)

# Assuming `trainloader` and `testloader` are defined as per your data preparation code
num_epochs = 10  # or however many you deem appropriate

# Train model a
print("Training Multinomial Logistic Regression Model:")
train_loss_log_a, train_acc_log_a, test_acc_log_a = train_model(model_a, loss_func, num_epochs, optimizer_a, trainloader, validloader)

# Train model b
print("\nTraining Single Layer Neural Network Model:")
train_loss_log_b, train_acc_log_b, test_acc_log_b = train_model(model_b, loss_func, num_epochs, optimizer_b, trainloader, validloader)


Training Multinomial Logistic Regression Model:
Epoch  1,  Training Loss:    2.3640, Training Accuracy:     14.60,  Valid Accuracy:   14.30
Epoch  2,  Training Loss:    1.6440, Training Accuracy:     16.68,  Valid Accuracy:   16.20
Epoch  3,  Training Loss:    1.5101, Training Accuracy:     17.83,  Valid Accuracy:   17.07
Epoch  4,  Training Loss:    1.4439, Training Accuracy:     18.36,  Valid Accuracy:   17.36
Epoch  5,  Training Loss:    1.4022, Training Accuracy:     19.15,  Valid Accuracy:   18.01
Epoch  6,  Training Loss:    1.3724, Training Accuracy:     19.61,  Valid Accuracy:   18.31
Epoch  7,  Training Loss:    1.3494, Training Accuracy:     19.97,  Valid Accuracy:   18.52
Epoch  8,  Training Loss:    1.3308, Training Accuracy:     20.30,  Valid Accuracy:   18.66
Epoch  9,  Training Loss:    1.3154, Training Accuracy:     20.51,  Valid Accuracy:   18.83
Epoch 10,  Training Loss:    1.3024, Training Accuracy:     20.48,  Valid Accuracy:   18.74
Elapsed Time :  154.39

Training

In [20]:
calculate_metrics(testloader, model_a,loss_func=nn.CrossEntropyLoss())

(19.06, 1.43, 3.06)

In [21]:
calculate_metrics(testloader, model_b,loss_func=nn.CrossEntropyLoss())

(19.31, 1.49, 3.26)

## Question 4.

### RNN with single hidden state layer

In [23]:
class NextWordRNN(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1): #single hidden layer
        super(NextWordRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) #embeddings convert discrete word indices into dense vector reps (embeddings) that capture semantic information
        self.rnn = nn.RNN(embedding_dim, hidden_dim,num_layers=num_layers, nonlinearity='relu', batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        all_h, last_h = self.rnn(x)
        out = self.fc(all_h) # Apply Linear layer to outputs from all the hidden state.
        return out

In [24]:
# Hyperparameters
VOCAB_SIZE = 9925
EMBEDDING_DIM = 128
HIDDEN_DIM = 128
learning_rate = 1e-3
epoch = 5
num_layers = 1

# Instantiate the model
next_word_rnn = NextWordRNN(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM,num_layers=num_layers)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(next_word_rnn.parameters(), lr=learning_rate)

train_loss_next_word_rnn, train_acc_log_next_word_rnn, test_acc_log_next_word_rnn = train_model(next_word_rnn, loss_func, epoch, optimizer, trainloader, testloader)

Epoch  1,  Training Loss:    1.7349, Training Accuracy:     18.20,  Valid Accuracy:   18.29
Epoch  2,  Training Loss:    1.4606, Training Accuracy:     20.86,  Valid Accuracy:   20.61
Epoch  3,  Training Loss:    1.3752, Training Accuracy:     22.32,  Valid Accuracy:   21.52
Epoch  4,  Training Loss:    1.3128, Training Accuracy:     23.74,  Valid Accuracy:   22.49
Epoch  5,  Training Loss:    1.2630, Training Accuracy:     24.77,  Valid Accuracy:   22.86
Elapsed Time :  121.44


In [26]:
accuracy, test_loss, perplexity= calculate_metrics(testloader, next_word_rnn,loss_func=nn.CrossEntropyLoss())

### LSTM with single hiddden layer

In [27]:
# Redefine the training function and accuracy calculator

# Function to calculate accuracy for non-padded sequence
def calculate_accuracy_LSTM(loader, model):
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # No need to track gradients
        for data, l, target in loader:
            # check if cuda is available
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()
            for i in range(len(l)):
              output = model(data[i])  # Forward pass
              _, predicted = torch.max(output[:l[i],:].data, 1)  # Get the predicted classes
              total += target[i][:l[i]].view(-1).size(0)
              correct += (predicted == target[i][:l[i]].view(-1)).sum().item()

    accuracy = 100 * correct / total
    model.train()
    return accuracy

def train_model(model, loss_func, num_epochs, optimizer, train_loader, test_loader):

  train_loss_log = []
  train_acc_log = []
  test_acc_log = []

  # Move model to GPU if CUDA is available
  if torch.cuda.is_available():
      model = model.cuda()
  tic = time.time()
  for epoch in range(1,num_epochs+1):
    train_loss = 0
    for i, data in enumerate(train_loader):
      x, l, y = data
      # check if cuda is available
      if torch.cuda.is_available():
        x, y = x.cuda(), y.cuda()
      # get predicted y value from our current model
      pred_y = model(x)
      # calculate the loss
      loss = loss_func(pred_y.view(-1,9925),y.view(-1))
      # Zero the gradient of the optimizer
      optimizer.zero_grad()
      # Backward pass: Compute gradient of the loss with respect to model parameters
      loss.backward()
      # update weights ar
      optimizer.step()
      train_loss += loss.item() * len(x)
    # change the model to evaluation mode to calculate the test loss; We will come back to this later after learning Dropout and Batch Normalization
    train_loss/=len(train_loader)
    train_loss_log.append(train_loss)
    train_acc = calculate_accuracy_LSTM(train_loader, model)
    test_acc = calculate_accuracy_LSTM(test_loader, model)
    train_acc_log.append(train_acc)
    test_acc_log.append(test_acc)
    print("Epoch {:2},  Training Loss: {:9.4f}, Training Accuracy: {:9.2f},  Test Accuracy: {:7.2f}".format(epoch, train_loss, train_acc, test_acc))
  toc = time.time()
  print("Elapsed Time : {:7.2f}".format(toc-tic))
  return train_loss_log, train_acc_log, test_acc_log

In [28]:
# Define the model

class NextWordLSTM(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        super(NextWordLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        all_h, (h, c) = self.lstm(x)
        out = self.fc(all_h) # Apply Linear layer to outputs from all the hidden state.
        return out

In [29]:
# Hyperparameters
VOCAB_SIZE = 9925
EMBEDDING_DIM = 128
HIDDEN_DIM = 128
learning_rate = 1e-3
epoch = 5
num_layers = 1

# Instantiate the model
next_word_lstm = NextWordLSTM(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM,num_layers=num_layers)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(next_word_lstm.parameters(), lr=learning_rate)

train_loss_next_word_lstm, train_acc_log_next_word_lstm, test_acc_log_next_word_lstm = train_model(next_word_lstm, loss_func, epoch, optimizer, trainloader, testloader)

Epoch  1,  Training Loss:   60.7425, Training Accuracy:     16.63,  Test Accuracy:   16.69
Epoch  2,  Training Loss:   49.0273, Training Accuracy:     19.98,  Test Accuracy:   19.82
Epoch  3,  Training Loss:   46.2464, Training Accuracy:     21.83,  Test Accuracy:   21.41
Epoch  4,  Training Loss:   44.4125, Training Accuracy:     23.16,  Test Accuracy:   22.32
Epoch  5,  Training Loss:   43.0051, Training Accuracy:     24.10,  Test Accuracy:   22.96
Elapsed Time :  130.24


In [30]:
calculate_accuracy_LSTM(testloader, next_word_lstm)

22.960116448326055

## Question 5.

### RNN/LSTM with Two Hidden Layers

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim

class NextWordLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2):  # Updated num_layers to 2 hidden layers
        super(NextWordLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        all_h, (h, c) = self.lstm(x)
        out = self.fc(all_h)  # Apply Linear layer to outputs from all the hidden states.
        return out

# Hyperparameters
VOCAB_SIZE = 9925  # 10000 words in the dictionary + <START>, <UNK>, <UNUSED>
EMBEDDING_DIM = 128
HIDDEN_DIM = 128
LEARNING_RATE = 1e-3
EPOCHS = 5
NUM_LAYERS = 2  # Updated this to reflect the change in the model constructor

# Instantiate the model
next_word_lstm2 = NextWordLSTM(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS)
loss_func = nn.CrossEntropyLoss()
# Correct instantiation of the optimizer for the 2HL model
optimizer = optim.Adam(next_word_lstm2.parameters(), lr=LEARNING_RATE)

# Assuming trainloader and testloader are properly defined and available
train_loss_next_word_lstm, train_acc_log_next_word_lstm, test_acc_log_next_word_lstm = train_model(next_word_lstm2, loss_func, EPOCHS, optimizer, trainloader, testloader)


Epoch  1,  Training Loss:    1.9537, Training Accuracy:     15.23,  Valid Accuracy:   15.35
Epoch  2,  Training Loss:    1.5786, Training Accuracy:     19.22,  Valid Accuracy:   19.04
Epoch  3,  Training Loss:    1.4803, Training Accuracy:     21.34,  Valid Accuracy:   20.82
Epoch  4,  Training Loss:    1.4177, Training Accuracy:     22.79,  Valid Accuracy:   21.92
Epoch  5,  Training Loss:    1.3713, Training Accuracy:     24.04,  Valid Accuracy:   22.67
Elapsed Time :  177.66


In [None]:
calculate_accuracy_LSTM(testloader, next_word_lstm2)

## Question 6. Generate Text with Prompts

In [31]:
def generate_text(prompt,model,max_length=10):
  text = process_raw_text(prompt,vocab)[0][:-1] # Remove the last <EOS> token from our text
  if torch.cuda.is_available():
    text = text.cuda()
  max_length = 10
  for i in range(max_length):
    # get the next word prediction and add it to text
    output = model(text).view(-1,9925)
    _, predicted = torch.max(output,1)
    new_word = predicted[-1].unsqueeze(0)
    text = torch.cat((text,new_word),dim=0)
    text = text.to(torch.long)
    if vocab.lookup_tokens([new_word.item()])[0] == '<EOS>':
      break
  return text


In [32]:
# Prompt to tokenized tensor
prompt=[''] # You do not need to prepend <SOS> token to your prompt. generate_text() function will prepend <SOS> token for you.
generated_text = generate_text(prompt,next_word_lstm2) # put your model here.
generated_text_lookup = vocab.lookup_tokens(generated_text.tolist())
' '.join(generated_text_lookup)

'<SOS> the <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>'

In [33]:
prompt=['Some Traders'] # You do not need to prepend <SOS> token to your prompt. generate_text() function will prepend <SOS> token for you.
generated_text = generate_text(prompt,next_word_lstm2) # put your model here.
generated_text_lookup = vocab.lookup_tokens(generated_text.tolist())
' '.join(generated_text_lookup)

'<SOS> some traders are <unk> by the <unk> of the <unk> of the'

In [34]:
prompt=['stocks'] # You do not need to prepend <SOS> token to your prompt. generate_text() function will prepend <SOS> token for you.
generated_text = generate_text(prompt,next_word_lstm2) # put your model here.
generated_text_lookup = vocab.lookup_tokens(generated_text.tolist())
' '.join(generated_text_lookup)

'<SOS> stocks the <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>'

In [35]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Count parameters for each model
num_params_multinomial_logistic_regression = count_parameters(model_a)
num_params_single_layer_hidden_network = count_parameters(model_b)
num_params_next_word_rnn = count_parameters(next_word_rnn)
num_params_next_word_lstm = count_parameters(next_word_lstm)
num_params_next_word_lstm2 = count_parameters(next_word_lstm2)

print(f"Multinomial Logistic Regression Model Parameters: {num_params_multinomial_logistic_regression}")
print(f"Single Layer Hidden Network Model Parameters: {num_params_single_layer_hidden_network}")
print(f"Next Word RNN Model Parameters: {num_params_next_word_rnn}")
print(f"Next Word LSTM Model Parameters: {num_params_next_word_lstm}")
print(f"Next Word LSTM with 2 layers Model Parameters: {num_params_next_word_lstm2}")


Multinomial Logistic Regression Model Parameters: 1994925
Single Layer Hidden Network Model Parameters: 2285753
Next Word RNN Model Parameters: 2583749
Next Word LSTM Model Parameters: 2682821
Next Word LSTM with 2 layers Model Parameters: 2814917


Examining the performance of all the models:


Multinomial Logistic Regression:
Training accuracy reached a maximum of 20.32%, and the validation Accuracy reached up to 18.68%. This basic model is not as computationally demanding but also cannot capture complex patterns.

Unigram Model: 19.32% test accuracy
Bigram Model: 19.37% test accuracy

Single Layer Neural Network:
Training accuracy reached a maximum of 19.23%, and the validation Accuracy reached up to 18.97%.
The nonlinearity in the activation function allows it to capture more complex patterns.

RNN with single hidden layer Model:
Training accuracy reached a maximum of 24.77%, and the validation Accuracy reached up to 22.86%.

LSTM with single hidden layer model:
Training accuracy reached a maximum of 24.10%, and the validation Accuracy reached up to 22.96%. Advantage of the LSTM is ability to capture dependencies and relationship among words

LSTM with 2 hidden layers model:
Training accuracy reached a maximum of 24.04%, and the validation Accuracy reached up to 22.67%. Similar benefits as above with added ability to capture more complex patterns with 2 hidden layers.

After examining the next word accuracy on each model and taking into account factors like computational costs, interpretability, and run time, I chose to go with the LSTM neural net that used 2 hidden layers.
Since we were tasked with predicting the next word for the Penn Treebank dataset, which has many complex patterns and dependencies in the text data,I I concluded that an LSTM model was best suited because it can capture long-term dependencies in sequential data.
Within the two LSTM models, I decided to g with the two hidden layer LSTM even thouhg the single layer one had a similar valiadation accuracy because the model with 2 hidde layers will be able to capture more complex patterns because of the additional layer and could have had better performance given more tests/ training epochs.
Among the LSTM models, both the single hidden layer LSTM and the two hidden layers LSTM performed similarly in terms of validation accuracy. However, the two hidden layers LSTM might have slightly more capacity to capture complex patterns due to its additional hidden layer.


