**Libraries**

In [None]:
import nltk
import numpy as np
import string
import pandas as pd
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext, math
from tqdm import tqdm
from nltk import sent_tokenize
import pickle

Downloading the punkt for the tokenization support

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Defing the device

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [None]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

**Loading the dataset**

In [None]:
path = '/content/drive/MyDrive/NLP_A2/game_of_thrones.txt'

# Open the file in read mode ('r')
with open(path, 'r') as file:
    # Read the entire content of the file
    file_content = file.read()

# Print or manipulate the content as needed
print(file_content[:100])

A Song of Ice and Fire

A Game of Thrones

PROLOGUE

We should start back, Gared urged as the woods 


**Preprocessing**

1. Remove the tabs and Newlines

In [None]:
def remove_tabs_newlines(text):
  text = text.replace('\n','').replace('\t','')
  return text

In [None]:
removed_TabsNewlines = remove_tabs_newlines(file_content)
removed_TabsNewlines[:100]

'A Song of Ice and FireA Game of ThronesPROLOGUEWe should start back, Gared urged as the woods began '

2. Tokenizing into sentences

In [None]:
def tokenize_sentence(text):
  sentences = sent_tokenize(text)
  return sentences

In [None]:
sentences = tokenize_sentence(removed_TabsNewlines)
sentences[:5]

['A Song of Ice and FireA Game of ThronesPROLOGUEWe should start back, Gared urged as the woods began to grow dark around them.',
 'The wildlings are dead.Do the dead frighten you?',
 'Ser Waymar Royce asked with just the hint of a smile.Gared did not rise to the bait.',
 'He was an old man, past fifty, and he had seen the lordlings come and go.',
 'Dead is dead, he said.']

In [None]:
# def remove_punctuation(text):
#     # Create a translation table to map each punctuation character to None
#     translator = str.maketrans('', '', string.punctuation)

#     # Use translate to remove punctuations from the text
#     text_without_punctuations = text.translate(translator)
#     return text_without_punctuations

In [None]:
# removed_punctuations_sentences = []
# for sentence in sentences:
#   removed_punctuations_sentences.append(remove_punctuation(sentence))
# removed_punctuations_sentences[:5]

3. Tokenizing the text

In [None]:
tokeninzed_text = []
for sentence in sentences:
  tokeninzed_text.append(word_tokenize(sentence))
tokeninzed_text[0]

['A',
 'Song',
 'of',
 'Ice',
 'and',
 'FireA',
 'Game',
 'of',
 'ThronesPROLOGUEWe',
 'should',
 'start',
 'back',
 ',',
 'Gared',
 'urged',
 'as',
 'the',
 'woods',
 'began',
 'to',
 'grow',
 'dark',
 'around',
 'them',
 '.']

3. Numericalizing

In [None]:
vocab = torchtext.vocab.build_vocab_from_iterator(tokeninzed_text, min_freq=3)
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)
vocab.set_default_index(vocab['<unk>'])

In [None]:
# Checking the length
print(len(vocab))

6354


In [None]:
# Print some samples
print(vocab.get_itos()[:10])

['<unk>', '<eos>', ',', '.', 'the', 'and', 'to', 'of', 'a', 'his']


**Prepare the batch loader**

In [None]:
def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        tokens = example.append('<eos>')
        tokens = [vocab[token] for token in example]
        data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches) #view vs. reshape (whether data is contiguous)
    return data #[batch size, seq len]

    return data

In [None]:
# Testing the function
batch_size = 2
text_data = [word_tokenize('A Song of Ice and FireA Game of ThronesPROLOGUE.'),word_tokenize(' We should start back Gared urged as the woods began.')]
get_data(text_data, vocab, batch_size)

tensor([[  84,    0,    7, 1539,    5,    0,    0,    7,    0,    3,    1],
        [ 163,  189, 1524,   61, 1201, 1337,   19,    4,  832,  271,    3]])

In [None]:
# Checking the shape
get_data(text_data, vocab, batch_size).shape

torch.Size([2, 12])

**Model**

In [None]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim    = hid_dim
        self.emb_dim    = emb_dim

        self.embedding  = nn.Embedding(vocab_size, emb_dim)
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout    = nn.Dropout(dropout_rate)
        self.fc         = nn.Linear(hid_dim, vocab_size)

        self.init_weights()

    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_other)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #We
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #Wh

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell

    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #not to be used for gradient computation
        cell   = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        #src: [batch_size, seq len]
        embedding = self.dropout(self.embedding(src)) #harry potter is
        #embedding: [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden)
        #ouput: [batch size, seq len, hid dim]
        #hidden: [num_layers * direction, seq len, hid_dim]
        output = self.dropout(output)
        prediction =self.fc(output)
        #prediction: [batch_size, seq_len, vocab_size]
        return prediction, hidden

**Training**

Initializing the parameters


1. The variables you provided are related to configuring a neural network model, and they are commonly used in the context of natural language processing (NLP) tasks, such as language modeling or machine translation. Let's go through each of them:

2. vocab_size: This represents the size of the vocabulary, which is the total number of unique words in your dataset. In NLP tasks, each word is typically represented as a unique index, and the vocabulary size is the total number of unique indices.

3. emb_dim: Short for embedding dimension, this parameter determines the size of the word embeddings. Word embeddings are vector representations of words in a continuous vector space. Each word in the vocabulary is mapped to a high-dimensional vector of emb_dim size. The embedding layer is used to convert discrete word indices into continuous vector representations.

4. hid_dim: Hidden dimension is the size of the hidden state in recurrent neural networks (RNNs) or the size of the output dimension in feedforward neural networks. In the context of an RNN, it represents the number of hidden units in each recurrent layer. Larger hidden dimensions can capture more complex patterns but may also require more computational resources.

5. num_layers: This parameter indicates the number of layers in the neural network. In the context of recurrent neural networks (RNNs), it represents the number of recurrent layers. For feedforward neural networks, it represents the number of hidden layers.

6. dropout_rate: Dropout is a regularization technique used to prevent overfitting in neural networks. During training, a random fraction of the neurons is dropped out (i.e., their outputs are set to zero) at each update, which helps prevent co-adaptation of hidden units. The dropout_rate is the probability of dropping out a neuron during training.

7. lr: Learning rate is a hyperparameter that determines the step size at each iteration during the optimization process. It controls how much the model's weights should be updated during training. A smaller learning rate may lead to more stable convergence, but training might be slower, while a larger learning rate may speed up training but risk overshooting the optimal weights.

In [None]:
vocab_size = len(vocab)
emb_dim = 1024
hid_dim = 1024
num_layers = 2
dropout_rate = 0.65
lr = 1e-3

In [None]:
model      = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer  = optim.Adam(model.parameters(), lr=lr)
criterion  = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 29,812,946 trainable parameters


In [None]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1
    return src, target

In [None]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):

    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, seq len]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]

    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)

    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()

        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]
        target = target.reshape(-1)
        loss = criterion(prediction, target)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [None]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [None]:
size_of_training_data = int(0.8 * len(tokeninzed_text))
train_data = tokeninzed_text[:size_of_training_data]
valid_data = tokeninzed_text[size_of_training_data:]

In [None]:
batch_size = 128
train_data = get_data(train_data, vocab, batch_size)
valid_data = get_data(valid_data, vocab, batch_size)
#test_data  = get_data(tokenized_dataset['test'],  vocab, batch_size)

In [None]:
n_epochs = 20
seq_len  = 50 #<----decoding length
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion,
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size,
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/content/drive/MyDrive/NLP_A2/best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')



	Train Perplexity: 580.741
	Valid Perplexity: 446.872




	Train Perplexity: 342.097
	Valid Perplexity: 260.573




	Train Perplexity: 210.871
	Valid Perplexity: 202.781




	Train Perplexity: 163.193
	Valid Perplexity: 166.514




	Train Perplexity: 137.254
	Valid Perplexity: 149.642




	Train Perplexity: 121.336
	Valid Perplexity: 137.163




	Train Perplexity: 109.599
	Valid Perplexity: 128.855




	Train Perplexity: 100.258
	Valid Perplexity: 121.452




	Train Perplexity: 92.585
	Valid Perplexity: 116.136




	Train Perplexity: 86.683
	Valid Perplexity: 112.125




	Train Perplexity: 81.605
	Valid Perplexity: 108.380




	Train Perplexity: 76.992
	Valid Perplexity: 106.348




	Train Perplexity: 73.050
	Valid Perplexity: 104.200




	Train Perplexity: 69.501
	Valid Perplexity: 102.455




	Train Perplexity: 66.640
	Valid Perplexity: 101.330




	Train Perplexity: 63.871
	Valid Perplexity: 100.422




	Train Perplexity: 61.577
	Valid Perplexity: 100.193




	Train Perplexity: 59.559
	Valid Perplexity: 100.015




	Train Perplexity: 57.225
	Valid Perplexity: 99.358




	Train Perplexity: 55.270
	Valid Perplexity: 99.041


In [None]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)

            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab

            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)
            prediction = torch.multinomial(probs, num_samples=1).item()

            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [None]:
prompt = 'Royce nodded'
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes
#with a tradeoff of less-make-sense sentence
temperatures = [1, 1, 0.5, 1.0, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, word_tokenize,
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

1
Royce nodded , a heavy doublet red and swaying .

1
Royce nodded , a heavy doublet red and swaying .

0.5
Royce nodded , a heavy voice of the Eyrie .

1.0
Royce nodded , a heavy doublet red and swaying .

1.0
Royce nodded , a heavy doublet red and swaying .



**Save the files**

Vocab

In [None]:
with open('/content/drive/MyDrive/NLP_A2/vocabs.pkl', 'wb') as f:
    pickle.dump(vocab, f)