In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext, datasets, math
from tqdm import tqdm

c:\Users\PREDATOR\anaconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\PREDATOR\anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [8]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

### Load Data

Load Harry Potter Corpus
Source: https://github.com/ErikaJacobs/Harry-Potter-Text-Mining/tree/master

##### About Dataset

The dataset is taken from the github repi of **ErikaJacobs**. In this repo ErikaJacobs has performed a **Text Analysis of the Harry Potter Book Series**. 
 - "This project features sentiment analysis conducted on the text of the Harry Potter book series by JK Rowling."

This repository contains the Harry Potter dataset under the folder **Book Text**. The folder contains 7 txt files each containing the text from single chapter or books. The files are:
   * HPBook1.txt
   * HPBook2.txt
   * HPBook3.txt
   * HPBook4.txt
   * HPBook5.txt
   * HPBook6.txt
   * HPBook7.txt

These text files were last committed four years ago. The size of the files are varying from 450kb to 1.4mb depending on the content of each book.

According to the repo, the source for this file is another github repo **bradleyboehmke** : https://github.com/bradleyboehmke/harrypotter

The details for the data in this repo is mentioned as:
------------------------------------------------------

An R Package for J.K. Rowling's Harry Potter Series

This package provides access to the full texts of the first seven Harry Potter books. The UTF-8 plain text for each novel was sourced from [Read Vampire Books](www.readbooksvampire.com) **however the website is not currently accessible**, processed a bit, and is ready for text analysis. Each text is in a character vector with each element representing a single chapter. The package contains:

-   `philosophers_stone`: Harry Potter and the Philosophers Stone, published in 1997
-   `chamber_of_secrets`: Harry Potter and the Chamber of Secrets, published in 1998
-   `prisoner_of_azkaban`: Harry Potter and the Prisoner of Azkaban, published in 1999
-   `goblet_of_fire`: Harry Potter and the Goblet of Fire, published in 2000
-   `order_of_the_phoenix`: Harry Potter and the Order of the Phoenix, published in 2003
-   `half_blood_prince`: Harry Potter and the Half-Blood Prince, published in 2005
-   `deathly_hallows`: Harry Potter and the Deathly Hallows, published in 2007

* For this assignment we will use 
   - (HPBook1.txt HPBook2.txt HPBook3.txt HPBook4.txt) as training data
   - (HPBook5.txt HPBook6.txt) as validation data
   - (HPBook7.txt) as test data


In [37]:
dataset = datasets.load_dataset("text", data_files={"train": [f'corpus-dataset/HPBook{i}.txt' for i in range(1,5)], "test":  [f'corpus-dataset/HPBook{i}.txt' for i in range(5,7)], "validation": f"corpus-dataset/HPBook{7}.txt"})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [40]:
print(dataset['train'].shape)

(95, 1)


In [38]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 95
    })
    test: Dataset({
        features: ['text'],
        num_rows: 68
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 37
    })
})


### Text Preprocessing

In [42]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}

tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], fn_kwargs={'tokenizer': tokenizer})

Map:   0%|          | 0/95 [00:00<?, ? examples/s]

Map:   0%|          | 0/68 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

In [43]:
print(tokenized_dataset['train'][23]['tokens'])

['harry', 'looked', 'bemusedly', 'at', 'the', 'photograph', 'colin', 'was', 'brandishing', 'under', 'his', 'nose', '.', 'a', 'moving', ',', 'black-and-white', 'lockhart', 'was', 'tugging', 'hard', 'on', 'an', 'arm', 'harry', 'recognized', 'as', 'his', 'own', '.', 'he', 'was', 'pleased', 'to', 'see', 'that', 'his', 'photographic', 'self', 'was', 'putting', 'up', 'a', 'good', 'fight', 'and', 'refusing', 'to', 'be', 'dragged', 'into', 'view', '.', 'as', 'harry', 'watched', ',', 'lockhart', 'gave', 'up', 'and', 'slumped', ',', 'panting', ',', 'against', 'the', 'white', 'edge', 'of', 'the', 'picture', '.', '\\will', 'you', 'sign', 'it', '?', '\\', 'said', 'colin', 'eagerly', '.', '\\no', ',', '\\', 'said', 'harry', 'flatly', ',', 'glancing', 'around', 'to', 'check', 'that', 'the', 'room', 'was', 'really', 'deserted', '.', '\\sorry', ',', 'colin', ',', 'i', "'", 'm', 'in', 'a', 'hurry', '-', 'quidditch', 'practice', '-\\', 'he', 'climbed', 'through', 'the', 'portrait', 'hole', '.', '\\oh', '

### Numericalizing

In [44]:
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3)
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)
vocab.set_default_index(vocab['<unk>'])

In [45]:
print(len(vocab))

7761


In [46]:
print(vocab.get_itos()[:10])

['<unk>', '<eos>', '.', ',', 'the', "'", '\\', 'and', 'to', 'a']


In [62]:
#Save vocab
def save_vocab(vocab, path):
    import pickle
    output = open(path, 'wb')
    pickle.dump(vocab, output)
    output.close()

save_vocab(vocab, './models/vocab.pkl')

### . Prepare the batch loader

##### Prepare data

In [47]:
def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example['tokens']:
            tokens = example['tokens'].append('<eos>')
            tokens = [vocab[token] for token in example['tokens']]
            data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches) #view vs. reshape (whether data is contiguous)
    return data #[batch size, seq len]

In [48]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'],  vocab, batch_size)

In [49]:
train_data.shape

torch.Size([128, 4603])

### Preprocessing Steps

##### **Tokenization**: 
* The **get_tokenizer** function from torchtext is used to create a basic English tokenizer. This tokenizer breaks down text into individual words.
* *lambda* function is defined to apply tokenization to each example in the dataset. The resulting tokens are stored in a new field named **tokens**.
* The *map* function is used to apply this tokenization function to each example in the dataset, and the original text column is removed.

##### **Numericalizing**:
* The vocabulary (vocab) is built from the tokenized training dataset. It assigns a unique numerical index to each token that appears at least three times (min_freq=3).
* Two special tokens, <unk> (unknown) and <eos> (end of sequence), are inserted into the vocabulary at indices 0 and 1, respectively.
* The default index for the vocabulary is set to the index of <unk>.

##### **get_data** method:
* This method converts the tokenized dataset into a format suitable for training a language model.
* For each example in the dataset, the tokens are retrieved, and <eos> is appended to represent the end of the sequence.
* The tokens are then numericalized using the vocabulary **vocab**, and the resulting indices are added to the **data** list.
* The list of indices is converted to a PyTorch LongTensor **torch.LongTensor**.
* The data is reshaped into batches of size **batch_size**, and the function returns the processed data.

**get_data** function is used to preprocess the tokenized datasets for training, validation, and testing.
The resulting train_data, valid_data, and test_data are batches of numericalized sequences ready for input to the language model.

### Modeling

In [50]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim    = hid_dim
        self.emb_dim    = emb_dim
        
        self.embedding  = nn.Embedding(vocab_size, emb_dim)
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout    = nn.Dropout(dropout_rate)
        self.fc         = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
    
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_other)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #We
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim,   
                self.hid_dim).uniform_(-init_range_other, init_range_other) #Wh
    
    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
        
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #not to be used for gradient computation
        cell   = cell.detach()
        return hidden, cell
        
    def forward(self, src, hidden):
        #src: [batch_size, seq len]
        embedding = self.dropout(self.embedding(src)) #harry potter is
        #embedding: [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden)
        #ouput: [batch size, seq len, hid dim]
        #hidden: [num_layers * direction, seq len, hid_dim]
        output = self.dropout(output)
        prediction =self.fc(output)
        #prediction: [batch_size, seq_len, vocab_size]
        return prediction, hidden


### Training

In [52]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3 

In [53]:
model      = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer  = optim.Adam(model.parameters(), lr=lr)
criterion  = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 32,695,889 trainable parameters


### Model Architecture

**LSTMLanguageModel**, is an LSTM-based language model implemented using PyTorch.
The LSTM model contains following layers:
* **Embedding Layer:** The input of this layer is -- and the output is embedding vector of dimension **emb_dim** which in our case is 1024
* **LSTM Layer:** This layer takes emb_dim(1024) as input and outputs Hidden states with a dimension of hid_dim for each time step in the sequence.

    Parameters for this layer are: 
    * Number of layers: **num_layers**(2)
    * Hidden state dimension: **hid_dim**(1024)
    * Dropout rate: **dropout_rate**(0.65)

    Weights for this layer are initialized uniformly within the range [-init_range_other, init_range_other]
* **Dropout Layer:** This layer is applied to discard some of the output from LSTM layer by randomly setting some of the LSTM output values to 0. Main purpose of this layer is to intoduce regularization. The rate of the outputs to be set as 0 is determined by **droupout_rate**(0.65).
* **Linear (Fully Connected) Layer:** Finally in this layer, output from LSTM layer is fitted which has dimension **hiden_dim**(1024) and this give the score to each word in vocabulary. Here weights are initialized uniformly within the range [-init_range_other, init_range_other] and the bias is set to zero.

The model has a total of 32,695,889 trainable parameters, including the weights and biases in the embedding layer, LSTM layer, linear layer, and other parameters.
       

In [54]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [55]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, seq len]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches


In [56]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

### Training Process

##### LSTM model's major methods during training

* **Hidden State Initialization:**
The **init_hidden** method is resposible for initialising the hidden state. This method sets hidden state and cell state fro LSTM layer to 0.

* **Forward Method:**
Forward method takes batch of token sequences  and an initial hidden state (hidden) as input. It then embeds the input tokens using the embedding layer and applies dropout to the embedded sequence. Atter that it passes the sequence through the LSTM layer to obtain hidden states.
It then applies dropout to the LSTM output and finally feeds the output through a linear layer to obtain predictions for the next tokens. This method returns the predictions as well as the hidden state.

* **Detaching Hidden State:**
The **detach_hidden** method is used to detach the hidden states from the computation graph for purposes of gradient computation during training.


##### Train Method
* **Loading Data:** Data is loaded to the model in batch with **num_batches**. The batches that are not in multiple of **seq_len** are dropped to maintain sequence length.

* **Training Loop:** In the whole traing iteration following steps are carried out..
    * Training data is iterated in the batch of sequence length
    * Gradient of model parameters is set to 0
    * Hidden state is detached from the computation for better accuracy
    * **get_batch** method is called to get the batch of input and target sequences
    * Data is transferred to specified device (GPU or CPU) for processing
    * Forward pass is made to get the prediction and update the hidden state
    * Loss of prediction is calculated given target
    * Backpropogation is performed
    * **clip_grad_norm** method from torch.nn.utils is used to prevent exploding gradient
    * Model parameters are updated 
    * Loss for the epoch is accumulated

* **Validation**: Similary **evaluate** method is used to get validation loss

* Learning rate is updated using learning rate scheduler
* Finally the model state where the validation loss is best is saved.

In [57]:
n_epochs = 50
seq_len  = 50 #<----decoding length
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                         

	Train Perplexity: 555.328
	Valid Perplexity: 372.969


                                                         

	Train Perplexity: 313.069
	Valid Perplexity: 199.907


                                                         

	Train Perplexity: 170.831
	Valid Perplexity: 134.611


                                                         

	Train Perplexity: 123.905
	Valid Perplexity: 116.125


                                                         

	Train Perplexity: 103.301
	Valid Perplexity: 106.414


                                                         

	Train Perplexity: 91.384
	Valid Perplexity: 100.587


                                                         

	Train Perplexity: 82.858
	Valid Perplexity: 98.139


                                                         

	Train Perplexity: 76.439
	Valid Perplexity: 94.981


                                                         

	Train Perplexity: 71.287
	Valid Perplexity: 90.966


                                                         

	Train Perplexity: 67.042
	Valid Perplexity: 91.343


                                                         

	Train Perplexity: 62.420
	Valid Perplexity: 87.720


                                                         

	Train Perplexity: 60.094
	Valid Perplexity: 86.525


                                                         

	Train Perplexity: 58.228
	Valid Perplexity: 86.225


                                                         

	Train Perplexity: 56.545
	Valid Perplexity: 85.366


                                                         

	Train Perplexity: 54.964
	Valid Perplexity: 84.586


                                                         

	Train Perplexity: 53.393
	Valid Perplexity: 84.209


                                                         

	Train Perplexity: 52.089
	Valid Perplexity: 83.983


                                                         

	Train Perplexity: 50.777
	Valid Perplexity: 83.703


                                                         

	Train Perplexity: 49.515
	Valid Perplexity: 83.619


                                                         

	Train Perplexity: 48.338
	Valid Perplexity: 83.242


                                                         

	Train Perplexity: 47.213
	Valid Perplexity: 82.738


                                                         

	Train Perplexity: 46.143
	Valid Perplexity: 82.101


                                                         

	Train Perplexity: 45.221
	Valid Perplexity: 82.255


                                                         

	Train Perplexity: 43.692
	Valid Perplexity: 81.926


                                                         

	Train Perplexity: 43.047
	Valid Perplexity: 81.990


                                                         

	Train Perplexity: 42.278
	Valid Perplexity: 81.495


                                                         

	Train Perplexity: 41.905
	Valid Perplexity: 81.575


                                                         

	Train Perplexity: 41.425
	Valid Perplexity: 81.793


                                                         

	Train Perplexity: 41.265
	Valid Perplexity: 81.883


                                                         

	Train Perplexity: 41.032
	Valid Perplexity: 81.940


                                                         

	Train Perplexity: 41.006
	Valid Perplexity: 81.890


                                                         

	Train Perplexity: 40.952
	Valid Perplexity: 81.900


                                                         

	Train Perplexity: 40.974
	Valid Perplexity: 81.889


                                                         

	Train Perplexity: 40.965
	Valid Perplexity: 81.894


                                                         

	Train Perplexity: 40.923
	Valid Perplexity: 81.893


                                                         

	Train Perplexity: 40.960
	Valid Perplexity: 81.893


                                                         

	Train Perplexity: 40.991
	Valid Perplexity: 81.893


                                                         

	Train Perplexity: 40.969
	Valid Perplexity: 81.893


                                                         

	Train Perplexity: 41.019
	Valid Perplexity: 81.893


                                                         

	Train Perplexity: 40.951
	Valid Perplexity: 81.893


                                                         

	Train Perplexity: 40.939
	Valid Perplexity: 81.893


                                                         

	Train Perplexity: 40.997
	Valid Perplexity: 81.893


                                                         

	Train Perplexity: 40.999
	Valid Perplexity: 81.893


                                                         

	Train Perplexity: 40.905
	Valid Perplexity: 81.893


                                                         

	Train Perplexity: 40.904
	Valid Perplexity: 81.892


                                                         

	Train Perplexity: 40.956
	Valid Perplexity: 81.892


                                                         

	Train Perplexity: 40.906
	Valid Perplexity: 81.892


                                                         

	Train Perplexity: 41.002
	Valid Perplexity: 81.892


                                                         

	Train Perplexity: 40.945
	Valid Perplexity: 81.892


                                                         

	Train Perplexity: 40.952
	Valid Perplexity: 81.892


In [64]:
#Save whole model
torch.save(model, 'best-val-lstm_lm.pt')

### Testing

In [58]:
model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 79.847


### Inference

In [59]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [60]:
prompt = 'Harry Potter is '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
harry potter is going to get past fluffy . \ \maybe the dementors are back to the quidditch world cup , \ said hagrid . \i ' m sure he ' s a

0.7
harry potter is . he was still getting over on the grounds , and they were looking after a long walk against the dark arts . ron had no impression that he was

0.75
harry potter is . he was still getting over on the grounds , and they were looking after a long walk against the dark arts . ron had no impression that he was

0.8
harry potter is . he was still getting over on the grounds , and they were looking after a long walk against the dark arts . ron had no impression that he was

1.0
harry potter is already to get past anything like strange , , \ he added , after a quarter of terror . hagrid thrust his bed , which he looked down at the

