# **LSTM Model for Text Generation**

This model has been updated from the one from the exercise notebook in class. We now use the entire Alice in Wonderland corpus for training the LSTM model. However, if you get memory errors you may decrease the amount of sentences used.

In [5]:
import nltk
nltk.download('gutenberg')
import re
from nltk.corpus import gutenberg

# Load text from NLTK Gutenberg corpus
text = gutenberg.raw("carroll-alice.txt")
text = text.replace('\t', ' ').replace('\n', ' ')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [8]:
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Define the corpus 
# YOU CAN TAKE A SMALL PIECE OF THIS LIST IF YOU EXPERIENCE MEMORY ISSUES
corpus = sent_tokenize(text)

# Tokenize the corpus
tokenizer = lambda x: x.split()
tokenized_corpus = [tokenizer(doc) for doc in corpus]

# Create a vocabulary and dictionary of indices
vocab = list(set([word for doc in tokenized_corpus for word in doc]))
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
vocab_size = len(vocab)

# Convert the corpus to indices
corpus_idx = [[word_to_idx[word] for word in doc] for doc in tokenized_corpus]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Add brief comments to the LSTM class to explain the purpose of each line of code.**

In [9]:
import torch
import torch.nn as nn

# DOCUMENT THE CLASS
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, cell):
        output, (hidden, cell) = self.lstm(input.view(1, 1, -1), (hidden, cell))
        output = self.fc(output.view(1, -1))
        output = self.softmax(output)
        return output, hidden, cell

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size), torch.zeros(1, 1, self.hidden_size)


batch_size=32
hidden_size=32
num_epoch=20
model = LSTM(vocab_size, hidden_size, vocab_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train the LSTM model
for epoch in range(num_epoch):
  for batch_start in range(0, len(corpus_idx), batch_size):
      batch = corpus_idx[batch_start:batch_start + batch_size]
      hidden, cell = model.initHidden()
      loss = 0
      for doc in batch:
          for i in range(len(doc)-1):
              input = torch.zeros(1, vocab_size)
              input[0, doc[i]] = 1
              target = torch.tensor([doc[i+1]], dtype=torch.long)
              output, hidden, cell = model(input, hidden, cell)
              loss += nn.functional.nll_loss(output, target)
      loss /= batch_size
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
      optimizer.step()
  print('Epoch {}:, Loss: {:.2f}'.format(epoch+1, loss.item()))

Epoch 1:, Loss: 142.05
Epoch 2:, Loss: 135.98
Epoch 3:, Loss: 132.06
Epoch 4:, Loss: 125.22
Epoch 5:, Loss: 121.83
Epoch 6:, Loss: 116.80
Epoch 7:, Loss: 109.49
Epoch 8:, Loss: 106.17
Epoch 9:, Loss: 101.25
Epoch 10:, Loss: 97.73
Epoch 11:, Loss: 95.16
Epoch 12:, Loss: 89.99
Epoch 13:, Loss: 84.95
Epoch 14:, Loss: 81.45
Epoch 15:, Loss: 77.81
Epoch 16:, Loss: 73.02
Epoch 17:, Loss: 70.26
Epoch 18:, Loss: 66.03
Epoch 19:, Loss: 64.91
Epoch 20:, Loss: 62.91


# Generating Text

In [10]:
import torch.nn.functional as F

def generate_text(model, start_word, length, temperature=1.0):
    with torch.no_grad():
        # Initialize hidden and cell state
        hidden, cell = model.initHidden()

        # Convert the start word to a tensor
        start_tensor = torch.zeros(1, vocab_size)
        start_tensor[0, word_to_idx[start_word]] = 1

        # Generate the initial hidden and cell state using the start word
        output, hidden, cell = model(start_tensor, hidden, cell)

        # Sample the next word based on the output probabilities and temperature
        output = output.squeeze().div(temperature).exp().cpu()
        word_idx = torch.multinomial(output, 1).item()

        # Generate the rest of the text
        output_text = [start_word]
        for i in range(length - 1):
            # Convert the previous predicted word to a tensor
            input_tensor = torch.zeros(1, vocab_size)
            input_tensor[0, word_idx] = 1

            # Generate the next hidden and cell state using the previous predicted word
            output, hidden, cell = model(input_tensor, hidden, cell)

            # Sample the next word based on the output probabilities and temperature
            output = output.squeeze().div(temperature).exp().cpu()
            word_idx = torch.multinomial(output, 1).item()

            # Convert the predicted word index to a string and add it to the generated text
            output_text.append(idx_to_word[word_idx])

        # End it with a period
        output_text += '.'
        return ' '.join(output_text)

**Generate text using different values for temperature.**

In [None]:
# Set the seed text and length of the generated text


# Generate the text with different temperatures


# Print the generated text
