In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import nltk
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so genuinely useful warnings are hidden too.
warnings.filterwarnings(action = 'ignore')
# Ask PyTorch to release cached GPU memory it is not using.
# Presumably harmless when no GPU is in use — TODO confirm.
torch.cuda.empty_cache()

In [2]:
# Tokenise the corpus with NLTK's word_tokenize, which we use as it is more
# efficient and powerful than a plain whitespace split.
from nltk import word_tokenize

# word_tokenize relies on the 'punkt' tokenizer data, so fetch it first.
nltk.download('punkt')

# Read anna.txt line by line, lower-case each line and collect every token
# into one flat list.
with open('anna.txt', 'r') as corpus_file:
    anna_words = [
        token
        for raw_line in corpus_file
        for token in word_tokenize(raw_line.lower())
    ]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Microsoft\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Sanity check: the first ten tokens, then the total number of tokens.
print(anna_words[:10], len(anna_words), sep='\n')

['chapter', '1', 'happy', 'families', 'are', 'all', 'alike', ';', 'every', 'unhappy']
431119


In [4]:
from collections import Counter

# Build the two lookup tables used to move between words and integer ids.
# Every distinct word gets a unique id; ids are handed out in order of
# decreasing frequency, so id 0 is the most common word.
# The tables are needed to encode the text before it is cut into batches.
count = Counter(anna_words)  # word -> number of occurrences
n_unique = sorted(count, key=count.get, reverse=True)  # words, most frequent first

word_to_id = {token: position for position, token in enumerate(n_unique)}
id_to_word = dict(enumerate(n_unique))

In [5]:
# Encode the corpus: replace every word by its integer id (a plain list for now).
anna_ids = list(map(word_to_id.__getitem__, anna_words))

# Vocabulary size.
# It sizes the embedding layer and the final output layer (one class per word).
n_vocab = len(word_to_id)
print(f'Length of vocabulary is {n_vocab}')

# Preview of the data after the words have been converted to ids.
print(anna_ids[:10])

# Quick-experiment switch: the line below truncates the corpus.
# Keep it commented out during actual training.
# anna_ids = anna_ids[:20000]

# From here on anna_ids is a numpy array rather than a list.
anna_ids = np.array(anna_ids)

Length of vocabulary is 13758
[210, 2564, 283, 2970, 76, 31, 2408, 35, 203, 681]


In [6]:
# Device on which the hidden-state tensors, the mini-batches and the model are placed.
# GPU alternative (currently disabled):
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device('cpu')

Creating batches for an RNN is tricky. 

**batch_size** = The number of rows into which the entire data set is divided.
Ex: if anna.txt contains 100 words in total and batch_size = 5, the data becomes a 2-D array (a list of lists) with 5 rows, and each row contains 20 words.

**sequence length** = The length of the sentence fed to the network from each row during one iteration.
Ex: In our example we have 5 rows and 20 columns. Let's select a seq_len of 4.
For each mini-batch we create x and y, where x and y are the features and labels respectively. For a seq_len of 4 and a batch_size of 5, x will have 5 rows and 4 columns. The label y has the same dimensions as x but is shifted by one position. In effect, we create 5 (batch_size) copies of the same RNN so they can be trained in parallel (e.g. on a GPU), and to each of those copies we pass 4 words. These 5 RNNs produce 5 hidden states from the 4 words passed to them (seq_len), and in the next mini-batch we pass the next 4 words. In our dummy scenario we create 20/4 = 5 mini-batches, and the hidden state is carried across the columns from one mini-batch to the next.

Read [this notebook](https://github.com/udacity/deep-learning-v2-pytorch/blob/master/recurrent-neural-networks/char-rnn/Character_Level_RNN_Solution.ipynb) for more details.




In [7]:
def get_batches(arr, batch_size, seq_length):
    '''Generate (features, targets) mini-batches of shape
       batch_size x seq_length from a 1-D array of token ids.

       The array is trimmed to a whole number of batches, laid out as
       batch_size rows, and then walked left to right in windows of
       seq_length columns.

       Arguments
       ---------
       arr: 1-D numpy array to cut into batches, i.e. anna_ids
       batch_size: Number of sequences (rows) in every batch
       seq_length: Number of encoded tokens in each sequence

       Yields
       ------
       x: window of arr with shape (batch_size, seq_length); a view, not a copy
       y: array of the same shape holding x shifted left by one token
    '''

    tokens_per_batch = batch_size * seq_length
    # How many complete batches fit into the data
    full_batches = len(arr) // tokens_per_batch

    # Discard the tail that cannot fill a batch, then fold into batch_size rows
    grid = arr[:full_batches * tokens_per_batch].reshape((batch_size, -1))
    width = grid.shape[1]

    # Slide over the columns one sequence at a time
    for start in range(0, width, seq_length):
        stop = start + seq_length

        # Features: the current window
        x = grid[:, start:stop]

        # Targets: the same window moved one step to the left
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]
        # The last target is the token just past the window; the final window
        # has none, so it wraps around to the first token of its own row.
        y[:, -1] = grid[:, stop] if stop < width else grid[:, 0]

        yield x, y

In [8]:
# Demo: a generator of small batches (6 sequences of 5 tokens each) for inspection.
anna_ids_np = np.array(anna_ids)
batches = get_batches(anna_ids_np, batch_size=6, seq_length=5)

In [9]:
# Pull the first (features, targets) pair out of the generator.
x, y = next(batches)

Read [this](https://github.com/chrisvdweth/ml-toolkit/blob/master/pytorch/notebooks/minimal-example-lstm-input.ipynb) for better clarity about dimensions. 

In [10]:
# defining the RNN

class wordRNN(nn.Module):
    '''Word-level RNN language model: embedding -> single-layer LSTM -> dropout -> linear.

       Arguments
       ---------
       input_size: Size of the word embeddings fed to the LSTM
       hidden_size: Number of features in the LSTM hidden state (hyperparameter)
       batch_size: Default number of sequences per batch, used by init_hidden
       n_vocab: Vocabulary size; number of embedding rows and of output classes
       drop_prob: Dropout probability applied to the LSTM outputs
    '''
    def __init__(self, input_size, hidden_size, batch_size, n_vocab, drop_prob = 0.2):
        super().__init__()

        self.embedding_size = input_size # This is the size of the embedding layer
        self.hidden_size = hidden_size # Can be any value - hyperparameter
        self.batch_size = batch_size
        self.drop_prob = drop_prob

        # Expected no of classes - for LM the entire vocab
        self.n_vocab = n_vocab # Len of the vocab

        # Creating the embedding layer
        self.embedding = nn.Embedding(num_embeddings=self.n_vocab, embedding_dim=self.embedding_size)

        # Creating the LSTM.
        # No `dropout` argument here: nn.LSTM only applies it between stacked
        # layers, so on this single-layer LSTM it did nothing except raise a
        # warning. Regularisation is done by self.dropout below.
        self.lstm = nn.LSTM(input_size=self.embedding_size, hidden_size=self.hidden_size, batch_first=True)

        # Creating the FC layer that maps hidden states to vocabulary logits
        self.fc = nn.Linear(self.hidden_size, self.n_vocab)

        # Creating a dropout layer, applied to the LSTM outputs in forward()
        self.dropout = nn.Dropout(self.drop_prob)

    def init_hidden(self, batch_size=None):
        '''Return a zero-filled (h_0, c_0) tuple for the LSTM.

           batch_size: optional override for the number of sequences;
           defaults to the batch_size given to the constructor.

           Each tensor has shape (1, batch_size, hidden_size), i.e.
           (num_layers * num_directions, batch, hidden_size); batch_first = True
           doesn't affect the initial hidden state dimension. The tensors are
           placed on the notebook-level `device`.
        '''
        if batch_size is None:
            batch_size = self.batch_size
        state_shape = (1, batch_size, self.hidden_size)
        return (torch.zeros(state_shape).to(device), torch.zeros(state_shape).to(device))

    def forward(self, x, hidden=None):
        '''Run a batch of id sequences through the network.

           x: integer tensor of word ids, shape (batch_size, sequence_len)
           hidden: (h, c) tuple as returned by init_hidden or by a previous
           call; when None the LSTM starts from a zero state.

           Returns (output, hidden): output holds the vocabulary logits with
           shape (batch_size * sequence_len, n_vocab) and hidden is the updated
           LSTM state.
        '''
        embed = self.embedding(x)
        # expected input shape (batch_size, sequence_len, input_size)
        lstm_out, hidden = self.lstm(embed, hidden)
        out = self.dropout(lstm_out)
        # Flatten batch and time so every position becomes one row for the FC layer
        out = out.reshape(-1, self.hidden_size)
        output = self.fc(out)
        return output, hidden

Here is some important information you need in order to use nn.CrossEntropyLoss(). It expects class indices rather than **one-hot vectors** as targets. Therefore, your logits and targets will **not have the same dimensions**. The logits must have shape (num_examples, vocab_size), but the labels only need to contain the index of the true class, so they have shape (num_examples) and not (num_examples, vocab_size). That shape would be needed only if you were feeding in one-hot encoded vectors.

In [11]:
# Creating our training loop

def train(model, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1):
    '''Train a word-level language model and print the loss after every epoch.

       Arguments
       ---------
       model: Network to train; must provide init_hidden() and return
              (logits, hidden) when called as model(inputs, hidden)
       data: 1-D numpy array of word ids, i.e. anna_ids
       epochs: Number of passes over the training split
       batch_size: Number of sequences per mini-batch
       seq_length: Number of tokens per sequence
       lr: Learning rate for the Adam optimizer
       clip: Maximum gradient norm used for gradient clipping
       val_frac: Fraction of data (taken from the end) held out for validation

       The function relies on the notebook-level `device` and `get_batches`.
       The printed losses are sums over the mini-batches of the epoch, so the
       training and validation figures are not on the same scale.
    '''

    # Move the model once, before the optimizer is built and before the loops,
    # instead of on every mini-batch.
    model.to(device)

    # Creating the criterion
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)

    # Creating the optimizer; uses the caller's learning rate
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_loss_epoch = []
    val_loss_epoch = []

    # create training and validation data
    val_idx = int(len(data) * (1 - val_frac))
    train_data, val_data = data[:val_idx], data[val_idx:]

    counter = 0
    for epoch in range(epochs):

        train_loss = 0
        val_loss = 0

        # ---- training pass ----
        hidden = model.init_hidden()
        model.train()

        for batch_x, batch_y in get_batches(train_data, batch_size, seq_length):
            counter += 1
            # Cast to int64 explicitly: numpy's default integer may be 32-bit
            inputs = torch.from_numpy(batch_x).long().to(device)
            targets = torch.from_numpy(batch_y).long().to(device)

            # Detach the hidden state, otherwise we'd backprop through
            # the entire training history
            hidden = tuple(state.detach() for state in hidden)

            optimizer.zero_grad()
            logits, hidden = model(inputs, hidden)
            # Flatten the targets to one class index per logit row; the size is
            # taken from the batch itself rather than from a global.
            loss = criterion(logits, targets.reshape(-1))
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            train_loss += loss.item()

        # ---- validation pass ----
        val_h = model.init_hidden()
        model.eval()

        # No gradients are needed for evaluation
        with torch.no_grad():
            for batch_x, batch_y in get_batches(val_data, batch_size, seq_length):
                # Tensor.to returns a new tensor, so the result must be kept
                inputs = torch.from_numpy(batch_x).long().to(device)
                targets = torch.from_numpy(batch_y).long().to(device)

                output, val_h = model(inputs, val_h)
                val_loss_temp = criterion(output, targets.reshape(-1))

                val_loss += val_loss_temp.item()

        train_loss_epoch.append(train_loss)
        val_loss_epoch.append(val_loss)

        print("Epoch: {}/{}...".format(epoch+1, epochs),
            "Step: {}...".format(counter),
            " Train Loss: {:.4f}...".format(train_loss_epoch[-1]),
            " Validation Loss: {:.4f}...".format(val_loss_epoch[-1]))
        
#     print(f'The training loss for epoch {epoch} is{train_loss}')

In [12]:
# Hyperparameters for the training run
embedding_size = 100   # size of each word embedding
hidden_size = 512      # size of the LSTM hidden state
batch_size = 16        # sequences per mini-batch
seq_len = 80           # tokens per sequence
epochs = 10

# Vocabulary size: one embedding row and one output class per word
n_vocab = len(word_to_id)

# creating our RNN model
model = wordRNN(
    input_size=embedding_size,
    hidden_size=hidden_size,
    batch_size=batch_size,
    n_vocab=n_vocab,
)

train(
    model,
    anna_ids,
    epochs=epochs,
    batch_size=batch_size,
    seq_length=seq_len,
    lr=0.001,
    clip=5,
    val_frac=0.1,
)

Epoch: 1/10... Step: 303...  Train Loss: 1748.4401...  Validation Loss: 179.1718...
Epoch: 2/10... Step: 606...  Train Loss: 1501.8193...  Validation Loss: 171.1935...
Epoch: 3/10... Step: 909...  Train Loss: 1417.5236...  Validation Loss: 167.4724...
Epoch: 4/10... Step: 1212...  Train Loss: 1355.0168...  Validation Loss: 165.1995...
Epoch: 5/10... Step: 1515...  Train Loss: 1299.8150...  Validation Loss: 164.0242...
Epoch: 6/10... Step: 1818...  Train Loss: 1248.5582...  Validation Loss: 163.5314...
Epoch: 7/10... Step: 2121...  Train Loss: 1200.8852...  Validation Loss: 164.0882...
Epoch: 8/10... Step: 2424...  Train Loss: 1157.1028...  Validation Loss: 163.9245...
Epoch: 9/10... Step: 2727...  Train Loss: 1117.7539...  Validation Loss: 164.5872...
Epoch: 10/10... Step: 3030...  Train Loss: 1079.1443...  Validation Loss: 165.0321...
