# RNN text embedding 

In [1]:
!wget https://raw.githubusercontent.com/fawazsammani/The-Complete-Neural-Networks-Bootcamp-Theory-Applications/master/alice.txt

--2023-12-29 01:48:52--  https://raw.githubusercontent.com/fawazsammani/The-Complete-Neural-Networks-Bootcamp-Theory-Applications/master/alice.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 147341 (144K) [text/plain]
Saving to: 'alice.txt'


2023-12-29 01:48:52 (4.66 MB/s) - 'alice.txt' saved [147341/147341]



In [2]:
import torch
import os 
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm

In [3]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0
        
    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1
            
    def __len__(self):
        return len(self.word2idx)

In [4]:
class TextProcess(object):
    
    def __init__(self):
        self.dictionary = Dictionary()
        
    def get_data(self, path, batch_size=20):
        with open(path, "r") as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)
        # Create a 1-D tensor that contains the index of all the words in the file 
        rep_tensor = torch.LongTensor(tokens)
        index = 0
        with open(path, "r") as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    rep_tensor[index] = self.dictionary.word2idx[word]
                    index +=1
        # Find out how many batches we need
        num_batches = rep_tensor.shape[0] // batch_size
        # Remove the remainder (Filter out the ones that don't fit)
        rep_tensor = rep_tensor[:num_batches * batch_size]
        # return (batch_size, num_batches)
        rep_tensor = rep_tensor.view(batch_size, -1)
        return rep_tensor

In [5]:
embed_size = 128 # Input features to the LSTM
hidden_size = 1024
num_layers = 1
num_epochs = 20
batch_size = 20
timesteps = 30
learning_rate = 0.002

In [6]:
corpus = TextProcess()

In [7]:
rep_tensor = corpus.get_data("alice.txt", batch_size)

In [8]:
# rep_tensor is the tensor that contains the index of all the words. Each row contains 1659 words by default
print(rep_tensor.shape)

torch.Size([20, 1484])


In [9]:
vocab_size = len(corpus.dictionary)
print(vocab_size)

5290


In [10]:
num_batches = rep_tensor.shape[1] // timesteps
print(num_batches)

49


In [11]:
class TextGenerator(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(TextGenerator, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, x, h):
        # Perform word Embedding
        x = self.embed(x)
        # Reshape the input tensor
        # x = x.view(batch_size, timesteps, embed_size)
        out, (h, c) = self.lstm(x, h)
        # Reshape the output from (samples, timestep, output_features) to a shape appropriate for the FC layer
        # (batch_size * timesteps, hidden_size)
        out = out.reshape(out.size(0)*out.size(1), out.size(2))
        # Decade hidden states of all time steps
        out = self.linear(out)
        return out, (h, c)

In [12]:
model = TextGenerator(vocab_size, embed_size, hidden_size, num_layers)

In [13]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [14]:
for epoch in range(num_epochs):
    # set initial hidden and cell states
    states = (torch.zeros(num_layers, batch_size, hidden_size),
             torch.zeros(num_layers, batch_size, hidden_size))
    
    for i in range(0, rep_tensor.size(1) - timesteps, timesteps):
        # Get mini-batch inputs and targets
        inputs = rep_tensor[:, i:i+timesteps]
        targets = rep_tensor[:, (i+1):(i+1)+timesteps]
        # String: blach horse is here
        # input: black horse   output: lack horse
        outputs, _ = model(inputs, states)
        loss = loss_fn(outputs, targets.reshape(-1))
        
        # Backpropagation and weight update
        model.zero_grad()
        loss.backward()
        # perform gradient clipping. clip_value (float or int) is the maximum allowed value of the gradients
        # The gradients are clipped in the range [-clip_value, clip_value]. This is to prevent the exploding gradient problem
        clip_grad_norm(model.parameters(), 0.5)
        optimizer.step()
        
        step = (i+1) // timesteps
        if step % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

  clip_grad_norm(model.parameters(), 0.5)


Epoch [1/20], Loss: 8.581891059875488
Epoch [2/20], Loss: 5.966008186340332
Epoch [3/20], Loss: 5.230400562286377
Epoch [4/20], Loss: 4.6995086669921875
Epoch [5/20], Loss: 4.20219087600708
Epoch [6/20], Loss: 3.599154472351074
Epoch [7/20], Loss: 3.090261697769165
Epoch [8/20], Loss: 2.715618371963501
Epoch [9/20], Loss: 2.319878578186035
Epoch [10/20], Loss: 1.957794189453125
Epoch [11/20], Loss: 1.660093903541565
Epoch [12/20], Loss: 1.4346832036972046
Epoch [13/20], Loss: 1.1302061080932617
Epoch [14/20], Loss: 0.8875438570976257
Epoch [15/20], Loss: 0.6753695011138916
Epoch [16/20], Loss: 0.5117305517196655
Epoch [17/20], Loss: 0.3650626242160797
Epoch [18/20], Loss: 0.20184478163719177
Epoch [19/20], Loss: 0.14105528593063354
Epoch [20/20], Loss: 0.08908329904079437


In [15]:
# Test the model 
with torch.no_grad():
    with open("results.txt", "w") as f:
        # Set initial hidden one cell states
        state = (torch.zeros(num_layers, 1, hidden_size),
                torch.zeros(num_layers, 1, hidden_size))
        # Select one word id randomly and convert it to shape (1, 1)
        input = torch.randint(0, vocab_size, (1,)).long().unsqueeze(1)
        
        for i in range(500):
            output, _ = model(input, state)
            print(output.shape)
            # sample a word id from the exponential of the output 
            prob = output.exp()
            word_id = torch.multinomial(prob, num_samples=1).item()
            print(word_id)
            # Replace the input with sampled word id for the next time step
            input.fill_(word_id)
            
            # Write the results to file 
            word = corpus.dictionary.idx2word[word_id]
            word = "\n" if word == "<eos>" else word + " "
            f.write(word)
            
            if (i+1) % 100 == 0:
                print(f"Sampled [{i+1}/{500}] words and save to results.txt")

torch.Size([1, 5290])
5
torch.Size([1, 5290])
1625
torch.Size([1, 5290])
41
torch.Size([1, 5290])
285
torch.Size([1, 5290])
1001
torch.Size([1, 5290])
41
torch.Size([1, 5290])
285
torch.Size([1, 5290])
1001
torch.Size([1, 5290])
103
torch.Size([1, 5290])
4924
torch.Size([1, 5290])
103
torch.Size([1, 5290])
4924
torch.Size([1, 5290])
103
torch.Size([1, 5290])
402
torch.Size([1, 5290])
403
torch.Size([1, 5290])
20
torch.Size([1, 5290])
5
torch.Size([1, 5290])
285
torch.Size([1, 5290])
137
torch.Size([1, 5290])
485
torch.Size([1, 5290])
110
torch.Size([1, 5290])
2423
torch.Size([1, 5290])
3
torch.Size([1, 5290])
5239
torch.Size([1, 5290])
114
torch.Size([1, 5290])
1649
torch.Size([1, 5290])
5
torch.Size([1, 5290])
5
torch.Size([1, 5290])
768
torch.Size([1, 5290])
3
torch.Size([1, 5290])
1631
torch.Size([1, 5290])
5
torch.Size([1, 5290])
2974
torch.Size([1, 5290])
73
torch.Size([1, 5290])
20
torch.Size([1, 5290])
320
torch.Size([1, 5290])
74
torch.Size([1, 5290])
3
torch.Size([1, 5290])
15

In [16]:
with open("results.txt", "r") as f:
        for line in f:
            print(line)



'Why, what you know what you know I must, I must, I might catch and 

you never been that all?' the pig-baby at Alice. 



Soon the confused 

jumping up and began picking the insolence on the scream, go round it, down her as she thought, master asked it unfolded 

to the rest times so she could remember 

over their 

'That is it 







sweet-tempered. her usual yet--and are as he went on their heads take 

saying 

the roof 



startled a snail. 

'Yes, or two 

right is to the three herself, 'I suppose 

key in silence. a growl, 

yet.' 





'It is more calmly, 'But 

her saucer at the 



'Perhaps it unfolded 

which was just at Alice. 

'In her usual 'No,' 'Nothing as she had a snail. 



that Cheshire 

'--yes, every 

'You can zigzag, out 

'Why, know what you know what 

Soon 

sweet-tempered. said Alice was opened 

toes.' she had a pie--' 

'Well, I give 



The Mouse was not Ada,' she thought, their slates, and nibbled in silence. 'What was not help said Alice; 'only, Do