<a href="https://colab.research.google.com/github/marimcmurtrie/NLP/blob/main/Mari_Text_generation_using_RNN_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Character-Level Text Generation Using basic RNN

**Objective:**

Implement a basic RNN model for text generation.

Train the model on a text dataset (e.g., a short novel or song lyrics).

Explore the generated text samples during training and analyze how the quality improves over time.


**Tools:**

Python, PyTorch, Google Colab


**Dataset:**

Use a public domain text (e.g., Shakespeare’s plays, a short novel, or song lyrics).

The text data will be converted into sequences of characters for training.

In [None]:
import torch
import torch.nn as nn
import numpy as np
import random
import time
import requests

# Load a text dataset from URL
url = 'https://www.gutenberg.org/files/1342/1342-0.txt'  # Pride and Prejudice by Jane Austen
# requests has no default timeout; set one so a stalled connection cannot hang the cell.
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of silently training on an error page.
response.raise_for_status()
text = response.text  # Get the text content

# Use a subset of the text for quicker training
text = text[:100000]
print(f'Total characters in text: {len(text)}')

# Create character mappings
chars = sorted(list(set(text)))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
idx_to_char = {idx: ch for idx, ch in enumerate(chars)}
vocab_size = len(chars)
print(f'Number of unique characters: {vocab_size}')

# Convert text into integer sequences
encoded_text = np.array([char_to_idx[ch] for ch in text])

# Hyperparameters
seq_length = 100  # Length of input sequences for training
hidden_size = 128  # Number of hidden units in RNN
batch_size = 64
num_epochs = 20
learning_rate = 0.001

# Create input-output pairs
def create_sequences(data, seq_length):
    """Cut ``data`` into overlapping windows plus the item following each one.

    Window ``i`` is ``data[i:i + seq_length]`` and its target is
    ``data[i + seq_length]``; one pair is produced for every start index in
    ``range(len(data) - seq_length)``.

    Returns:
        ``(inputs, targets)`` as two NumPy arrays.
    """
    starts = range(0, len(data) - seq_length)
    windows = [data[start:start + seq_length] for start in starts]
    next_items = [data[start + seq_length] for start in starts]
    return np.array(windows), np.array(next_items)

inputs, targets = create_sequences(encoded_text, seq_length)
print(f'Number of sequences: {len(inputs)}')

# Define dataset class
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each input window with its target index."""

    def __init__(self, inputs, targets):
        # Both arrays are stored as int64 tensors.
        long_kwargs = {'dtype': torch.long}
        self.inputs = torch.tensor(inputs, **long_kwargs)
        self.targets = torch.tensor(targets, **long_kwargs)

    def __len__(self):
        # One sample per row of the input tensor.
        return len(self.inputs)

    def __getitem__(self, idx):
        window = self.inputs[idx]
        target = self.targets[idx]
        return window, target

# Create data loader
dataset = TextDataset(inputs, targets)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define the RNN model
class RNNTextGenerator(nn.Module):
    """Embedding -> single-layer ``nn.RNN`` -> linear head over the vocabulary."""

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # The embedding width equals the RNN width, so no projection is needed.
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        """Return next-character logits and the updated hidden state.

        ``x`` holds character indices laid out batch-first; only the RNN
        output at the final time step is passed to the linear head.
        """
        embedded = self.embed(x)
        step_outputs, hidden = self.rnn(embedded, hidden)
        last_step = step_outputs[:, -1, :]
        logits = self.fc(last_step)
        return logits, hidden

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape ``(1, batch_size, hidden_size)``."""
        shape = (1, batch_size, self.hidden_size)
        return torch.zeros(shape)

# Instantiate the model
model = RNNTextGenerator(vocab_size, hidden_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training function
# Training function with dynamic hidden state initialization
def train(model, data_loader, criterion, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``data_loader``.

    Every batch starts from a fresh zero hidden state sized to the actual
    batch, so a smaller final batch is handled. After each epoch the
    batch-size-weighted mean of the batch losses is printed (a single batch's
    loss is too noisy to track progress), and every fifth epoch a
    200-character sample is generated from a fixed prompt.

    Relies on the module-level ``device`` and ``generate_text``.
    """
    model.train()
    for epoch in range(num_epochs):
        loss_sum = 0.0
        samples_seen = 0
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            current_batch = inputs.size(0)  # the last batch may be smaller
            hidden = model.init_hidden(current_batch).to(device)

            # Forward pass
            outputs, _ = model(inputs, hidden)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate as a detached tensor; converted to float once per epoch.
            loss_sum = loss_sum + loss.detach() * current_batch
            samples_seen += current_batch

        epoch_loss = float(loss_sum) / max(samples_seen, 1)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}')
        if (epoch + 1) % 5 == 0:
            print("Generated text sample:")
            print(generate_text(model, "It is a truth universally acknowledged", 200))
            # generate_text switches the model to eval mode; resume training mode.
            model.train()

# Function to generate text
def generate_text(model, start_string, length=100):
    """Sample ``length`` characters from ``model``, seeded with ``start_string``.

    The whole prompt is fed first to build up the hidden state; after that,
    each sampled character is fed back as the next single-step input. Each
    character is drawn from the softmax distribution over the model's logits.

    Relies on the module-level ``char_to_idx``, ``idx_to_char`` and ``device``.

    Raises:
        ValueError: if ``start_string`` is empty.
        KeyError: if ``start_string`` contains a character outside the vocabulary.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')

    prompt_ids = [char_to_idx[ch] for ch in start_string]
    input_seq = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
    hidden = model.init_hidden(1).to(device)
    pieces = [start_string]

    was_training = model.training
    model.eval()
    try:
        # Inference only: do not build an autograd graph.
        with torch.no_grad():
            for _ in range(length):
                output, hidden = model(input_seq, hidden)
                output_dist = torch.softmax(output, dim=1)
                next_idx = torch.multinomial(output_dist, 1).item()
                pieces.append(idx_to_char[next_idx])
                input_seq = torch.tensor([[next_idx]], dtype=torch.long, device=device)
    finally:
        # Restore the caller's mode so sampling during training does not
        # leave the model stuck in eval mode.
        model.train(was_training)

    return ''.join(pieces)

# Train the model
train(model, data_loader, criterion, optimizer, num_epochs=num_epochs)

# Testing the model by generating text with a given prompt
start_string = "It is a truth universally acknowledged"  # Starting prompt
generated_length = 500  # Length of text to generate

print("Generated Text:")
print(generate_text(model, start_string, generated_length))



Total characters in text: 100000
Number of unique characters: 90
Number of sequences: 99900
Epoch 1/20, Loss: 1.8689
Epoch 2/20, Loss: 1.5042
Epoch 3/20, Loss: 1.7252
Epoch 4/20, Loss: 1.6972
Epoch 5/20, Loss: 1.4188
Generated text sample:
It is a truth universally acknowledged, and room_.”

“Hot eld
on he
beendriss exceshed his. Mary withde is if gleem” SI dested and a bage-melase wedones; Mrs. PR; youint, and fire, and prainely; but offunions Mr. Dianring
danced schu
Epoch 6/20, Loss: 1.2501
Epoch 7/20, Loss: 1.7192
Epoch 8/20, Loss: 1.0638
Epoch 9/20, Loss: 1.5249
Epoch 10/20, Loss: 1.1964
Generated text sample:
It is a truth universally acknowledged, I with
Mr. Jare any bullso, with pecas was discried been the rust. I do not_.”

[Illustraric her acculical, at any foo not chard: drejure a great is itself in the charmed by Gut that any for anse
Epoch 11/20, Loss: 1.5564
Epoch 12/20, Loss: 1.2456
Epoch 13/20, Loss: 1.1224
Epoch 14/20, Loss: 1.2314
Epoch 15/20, Loss: 1.4638
Generated t

**Notes:**
- In the RNNTextGenerator class, self.fc is a fully connected (linear) layer that maps the hidden state output from the RNN to the vocabulary size, enabling the model to make predictions over the possible output characters.

- Purpose of self.fc:
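    - Input to self.fc: The RNN layer (self.rnn) outputs one hidden state per time step, i.e. a tensor of size [batch_size, seq_length, hidden_size]. The forward pass keeps only the last time step (out[:, -1, :]), a tensor of size [batch_size, hidden_size], which encapsulates the learned information from the input sequence.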

    - Role of self.fc: self.fc is a fully connected layer (nn.Linear) with dimensions [hidden_size, vocab_size]. It maps the RNN’s hidden state to a vector of size vocab_size (one element for each possible character in the vocabulary).

    - Output: The output of self.fc is a logit vector of length vocab_size, which represents the unnormalized scores for each possible character. When a softmax function is applied to this vector, it converts the scores into a probability distribution, where each element represents the likelihood of a specific character being the next in the sequence.



**Questions:**

- Try changing the seq_length and hidden_size. How does it affect the quality of the generated text?


- Train the model for more epochs and observe how the generated text quality improves.


- Replace the RNN layer with an LSTM or GRU layer and compare the difference in generated text quality.

### 2. Character-Level Text Generation with LSTM

 This is another version of the character-level text generation using LSTM instead of a simple RNN.

 LSTMs (Long Short-Term Memory networks) are well-suited for text generation because they can better capture long-term dependencies in sequential data, making them ideal for generating more coherent and context-aware text.

In [None]:
# File: lstm_text_generation.py

import torch
import torch.nn as nn
import numpy as np
import random
import time

# Load a text dataset from URL
import requests  # used below but missing from this cell's imports

url = 'https://www.gutenberg.org/files/1342/1342-0.txt'  # Pride and Prejudice by Jane Austen
# requests has no default timeout; set one so a stalled connection cannot hang the cell.
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of silently training on an error page.
response.raise_for_status()
text = response.text[:20000]  # Get the text content; use a subset to speed up the demo
print(f'Total characters in text: {len(text)}')

# Create character mappings
chars = sorted(list(set(text)))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
idx_to_char = {idx: ch for idx, ch in enumerate(chars)}
vocab_size = len(chars)
print(f'Number of unique characters: {vocab_size}')

# Convert text into integer sequences
encoded_text = np.array([char_to_idx[ch] for ch in text])

# Hyperparameters
seq_length = 30  # 100 Length of input sequences for training
hidden_size = 256  # Number of hidden units in LSTM
batch_size = 16  #batch size: 64
num_epochs = 5
learning_rate = 0.001

# Create input-output pairs
def create_sequences(data, seq_length):
    """Build (window, next item) training pairs from ``data``.

    For each start index ``i`` in ``range(len(data) - seq_length)`` the window
    is ``data[i:i + seq_length]`` and the target is ``data[i + seq_length]``.

    Returns:
        ``(inputs, targets)`` as two NumPy arrays.
    """
    pairs = [
        (data[start:start + seq_length], data[start + seq_length])
        for start in range(0, len(data) - seq_length)
    ]
    windows = [window for window, _ in pairs]
    followers = [follower for _, follower in pairs]
    return np.array(windows), np.array(followers)

inputs, targets = create_sequences(encoded_text, seq_length)
print(f'Number of sequences: {len(inputs)}')


# Define dataset class
class TextDataset(torch.utils.data.Dataset):
    """Dataset of (input window, target index) pairs held as int64 tensors."""

    def __init__(self, inputs, targets):
        self.inputs, self.targets = (
            torch.tensor(values, dtype=torch.long) for values in (inputs, targets)
        )

    def __len__(self):
        sample_count = len(self.inputs)
        return sample_count

    def __getitem__(self, idx):
        return (self.inputs[idx], self.targets[idx])

# Create data loader
dataset = TextDataset(inputs, targets)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define the LSTM-based model for text generation
class LSTMTextGenerator(nn.Module):
    """Embedding -> single-layer ``nn.LSTM`` -> linear head over the vocabulary."""

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        """Return next-character logits and the updated ``(h, c)`` state.

        ``x`` holds character indices laid out batch-first; only the LSTM
        output at the final time step is passed to the linear head.
        """
        embedded = self.embed(x)
        step_outputs, hidden = self.lstm(embedded, hidden)
        logits = self.fc(step_outputs[:, -1, :])
        return logits, hidden

    def init_hidden(self, batch_size):
        """Return zero ``(h, c)`` tensors, each ``(1, batch_size, hidden_size)``."""
        shape = (1, batch_size, self.hidden_size)
        h0 = torch.zeros(shape)
        c0 = torch.zeros(shape)
        return (h0, c0)

# Instantiate the model
model = LSTMTextGenerator(vocab_size, hidden_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


# Training
def train(model, data_loader, criterion, optimizer, num_epochs):
    """Train the LSTM ``model`` for ``num_epochs`` passes over ``data_loader``.

    Every batch starts from a fresh zero ``(h, c)`` state sized to the actual
    batch, so a smaller final batch is handled. After each epoch the
    batch-size-weighted mean of the batch losses is printed (a single batch's
    loss is too noisy to track progress), and every fifth epoch a
    200-character sample is generated from a fixed prompt.

    Relies on the module-level ``device`` and ``generate_text``.
    """
    model.train()
    for epoch in range(num_epochs):
        loss_sum = 0.0
        samples_seen = 0
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            current_batch = inputs.size(0)  # the last batch may be smaller
            hidden = tuple(h.to(device) for h in model.init_hidden(current_batch))

            # Forward pass
            outputs, _ = model(inputs, hidden)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate as a detached tensor; converted to float once per epoch.
            loss_sum = loss_sum + loss.detach() * current_batch
            samples_seen += current_batch

        epoch_loss = float(loss_sum) / max(samples_seen, 1)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}')
        if (epoch + 1) % 5 == 0:
            print("Generated text sample:")
            print(generate_text(model, "It is a truth universally acknowledged", 200))
            # generate_text switches the model to eval mode; resume training mode.
            model.train()


# Function to generate text
def generate_text(model, start_string, length=100):
    """Sample ``length`` characters from the LSTM ``model``, seeded with ``start_string``.

    The whole prompt is fed first to build up the ``(h, c)`` state; after
    that, each sampled character is fed back as the next single-step input.
    Each character is drawn from the softmax distribution over the logits.

    Relies on the module-level ``char_to_idx``, ``idx_to_char`` and ``device``.

    Raises:
        ValueError: if ``start_string`` is empty.
        KeyError: if ``start_string`` contains a character outside the vocabulary.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')

    prompt_ids = [char_to_idx[ch] for ch in start_string]
    input_seq = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
    hidden = tuple(h.to(device) for h in model.init_hidden(1))
    pieces = [start_string]

    was_training = model.training
    model.eval()
    try:
        # Inference only: do not build an autograd graph.
        with torch.no_grad():
            for _ in range(length):
                output, hidden = model(input_seq, hidden)
                output_dist = torch.softmax(output, dim=1)
                next_idx = torch.multinomial(output_dist, 1).item()
                pieces.append(idx_to_char[next_idx])
                input_seq = torch.tensor([[next_idx]], dtype=torch.long, device=device)
    finally:
        # Restore the caller's mode so sampling during training does not
        # leave the model stuck in eval mode.
        model.train(was_training)

    return ''.join(pieces)

# Train the model
train(model, data_loader, criterion, optimizer, num_epochs=num_epochs)

# Testing the model by generating text with a given prompt
start_string = "It is a truth universally acknowledged"  # Starting prompt
generated_length = 500  # Length of text to generate

print("Generated Text:")
print(generate_text(model, start_string, generated_length))


Total characters in text: 20000
Number of unique characters: 77
Number of sequences: 19970
Epoch 1/5, Loss: 1.5721
Epoch 2/5, Loss: 0.1734
Epoch 3/5, Loss: 2.8040
Epoch 4/5, Loss: 3.0650
Epoch 5/5, Loss: 1.2011
Generated text sample:
It is a truth universally acknowledged feacte
ertwily but it sit prelove. Whic which tour those curison about whethir the mer stry of Swift, wife for was the not sto lespined the momore day be benusent was the mith confusion a lading out
Generated Text:
It is a truth universally acknowledged manratiorly _and love.
Froens be, who deling in  o doe been rateritalit in and pishcessens, a dould_ bstifule are of the ficting,
it
withes a fonit
must obily a plateent But though founse
bult ough nouritistings of the bent us it it would in he world,
des
Fround by presher, of he himself. But glenater
uffiely the cold by that is uperfites
like scenie
thigress westen
ally the farimess,
to poyessed pablifes centapher have
as usel’squne natter_ I persce--feen the of it
th

### 3. Character-level text generation using a GRU

GRUs are simpler than LSTMs but can be similarly effective in capturing sequential dependencies in text. This example provides an additional option for students to compare the performance and text quality of GRU versus LSTM and RNN.

**GRU Model Architecture:**

- **Embedding Layer:** Maps each character index to an embedding vector of size hidden_size.

- **GRU Layer:** A single GRU layer processes the embedding sequence. Unlike LSTM, GRU does not have a separate cell state, making it computationally simpler and faster.

- **Fully Connected Layer:** Maps the GRU’s hidden state to the vocabulary size, predicting the next character in the sequence.



In [None]:
# File: gru_text_generation.py

import torch
import torch.nn as nn
import numpy as np
import requests

# Load a text dataset (Pride and Prejudice by Jane Austen)
url = 'https://www.gutenberg.org/files/1342/1342-0.txt'
# requests has no default timeout; set one so a stalled connection cannot hang the cell.
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of silently training on an error page.
response.raise_for_status()
text = response.text[:20000]  # Use a smaller subset of text for quicker training
print(f'Total characters in text: {len(text)}')

# Create character mappings
chars = sorted(list(set(text)))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
idx_to_char = {idx: ch for idx, ch in enumerate(chars)}
vocab_size = len(chars)
print(f'Number of unique characters: {vocab_size}')

# Convert text into integer sequences
encoded_text = np.array([char_to_idx[ch] for ch in text])

# Hyperparameters
seq_length = 30  # Length of input sequences for training
hidden_size = 128  # Number of hidden units in GRU
batch_size = 16  # Smaller batch size
num_epochs = 5  # Fewer epochs for quicker training
learning_rate = 0.001

# Create input-output pairs
def create_sequences(data, seq_length):
    """Slide a ``seq_length`` window over ``data`` and record what follows it.

    For each start index ``i`` in ``range(len(data) - seq_length)`` the window
    is ``data[i:i + seq_length]`` and the target is ``data[i + seq_length]``.

    Returns:
        ``(inputs, targets)`` as two NumPy arrays.
    """
    windows, followers = [], []
    start = 0
    limit = len(data) - seq_length
    while start < limit:
        stop = start + seq_length
        windows.append(data[start:stop])
        followers.append(data[stop])
        start += 1
    return np.array(windows), np.array(followers)

inputs, targets = create_sequences(encoded_text, seq_length)
print(f'Number of sequences: {len(inputs)}')

# Define dataset class
class TextDataset(torch.utils.data.Dataset):
    """Wraps the window and target arrays as int64 tensors for a DataLoader."""

    def __init__(self, inputs, targets):
        input_tensor = torch.tensor(inputs, dtype=torch.long)
        target_tensor = torch.tensor(targets, dtype=torch.long)
        self.inputs = input_tensor
        self.targets = target_tensor

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        # Row ``idx`` of the inputs together with its matching target.
        sample = self.inputs[idx]
        label = self.targets[idx]
        return sample, label

# Create data loader
dataset = TextDataset(inputs, targets)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define the GRU-based model for text generation
class GRUTextGenerator(nn.Module):
    """Embedding -> single-layer ``nn.GRU`` -> linear head over the vocabulary."""

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        """Return next-character logits and the updated hidden state.

        ``x`` holds character indices laid out batch-first; only the GRU
        output at the final time step is passed to the linear head.
        """
        embedded = self.embedding(x)
        step_outputs, hidden = self.gru(embedded, hidden)
        final_step = step_outputs[:, -1, :]
        return self.fc(final_step), hidden

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape ``(1, batch_size, hidden_size)``."""
        return torch.zeros((1, batch_size, self.hidden_size))

# Instantiate the model
model = GRUTextGenerator(vocab_size, hidden_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training function with dynamic hidden state initialization
def train(model, data_loader, criterion, optimizer, num_epochs):
    """Train the GRU ``model`` for ``num_epochs`` passes over ``data_loader``.

    Every batch starts from a fresh zero hidden state sized to the actual
    batch, so a smaller final batch is handled. After each epoch the
    batch-size-weighted mean of the batch losses is printed (a single batch's
    loss is too noisy to track progress), and every fifth epoch a
    200-character sample is generated from a fixed prompt.

    Relies on the module-level ``device`` and ``generate_text``.
    """
    model.train()
    for epoch in range(num_epochs):
        loss_sum = 0.0
        samples_seen = 0
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            current_batch = inputs.size(0)  # the last batch may be smaller
            hidden = model.init_hidden(current_batch).to(device)

            # Forward pass
            outputs, _ = model(inputs, hidden)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate as a detached tensor; converted to float once per epoch.
            loss_sum = loss_sum + loss.detach() * current_batch
            samples_seen += current_batch

        epoch_loss = float(loss_sum) / max(samples_seen, 1)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}')
        if (epoch + 1) % 5 == 0:
            print("Generated text sample:")
            print(generate_text(model, "It is a truth universally acknowledged", 200))
            # generate_text switches the model to eval mode; resume training mode.
            model.train()

# Function to generate text with GRU-based model
def generate_text(model, start_string, length=100):
    """Sample ``length`` characters from the GRU ``model``, seeded with ``start_string``.

    The whole prompt is fed first to build up the hidden state; after that,
    each sampled character is fed back as the next single-step input. Each
    character is drawn from the softmax distribution over the logits.

    Relies on the module-level ``char_to_idx``, ``idx_to_char`` and ``device``.

    Raises:
        ValueError: if ``start_string`` is empty.
        KeyError: if ``start_string`` contains a character outside the vocabulary.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')

    prompt_ids = [char_to_idx[ch] for ch in start_string]
    input_seq = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
    hidden = model.init_hidden(1).to(device)
    pieces = [start_string]

    was_training = model.training
    model.eval()
    try:
        # Inference only: do not build an autograd graph.
        with torch.no_grad():
            for _ in range(length):
                output, hidden = model(input_seq, hidden)
                output_dist = torch.softmax(output, dim=1)
                next_idx = torch.multinomial(output_dist, 1).item()
                pieces.append(idx_to_char[next_idx])
                input_seq = torch.tensor([[next_idx]], dtype=torch.long, device=device)
    finally:
        # Restore the caller's mode so sampling during training does not
        # leave the model stuck in eval mode.
        model.train(was_training)

    return ''.join(pieces)

# Train the model
train(model, data_loader, criterion, optimizer, num_epochs=num_epochs)

# Testing the model by generating text with a given prompt
start_string = "It is a truth universally acknowledged"  # Starting prompt
generated_length = 500  # Length of text to generate

print("Generated Text:")
print(generate_text(model, start_string, generated_length))


Total characters in text: 20000
Number of unique characters: 77
Number of sequences: 19970
Epoch 1/5, Loss: 1.3980
Epoch 2/5, Loss: 0.3977
Epoch 3/5, Loss: 1.5277
Epoch 4/5, Loss: 2.2150
Epoch 5/5, Loss: 2.2596
Generated text sample:
It is a truth universally acknowledged delling and has
doovev. Cos bout doe inserulits gurhesseld, appersons bes not englanor pirifaced be
kinvinits I
tooo frial at and liked what tDarmal, yerough genatten rlain Bense have
olo dould i
Generated Text:
It is a truth universally acknowledged to boudh bifferess--some moraled in and uraloges, to culd ssubsouctions of ruses one tul. And und givinibot who may eatifratious nodel the utoling, his a forles;
workes and not knowonded and to hive as nogman and and gracy alminit. It ive, ame inder be exhacter; greal and in other of _as mout are
dould is thing brop in one of chot poy couth, _all doop;
nears, than and glin coulde author byIt in the charrpio8d,) and propherent, in and in custen putis caled and indeed of pa

### 4. Temperature Sampling for Text Generation

Temperature sampling is a technique used to control the randomness of the predictions during text generation, especially in language models like RNNs, LSTMs, and GRUs. This technique can help adjust the confidence of a model when selecting the next character or word in a sequence.

In [None]:
# File: gru_text_generation_with_temperature.py

import torch
import torch.nn as nn
import numpy as np
import random
import time

# Load a text dataset from URL
import requests  # used below but missing from this cell's imports

url = 'https://www.gutenberg.org/files/1342/1342-0.txt'  # Pride and Prejudice by Jane Austen
# requests has no default timeout; set one so a stalled connection cannot hang the cell.
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of silently training on an error page.
response.raise_for_status()
text = response.text  # Get the text content
print(f'Total characters in text: {len(text)}')

# Create character mappings
chars = sorted(list(set(text)))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
idx_to_char = {idx: ch for idx, ch in enumerate(chars)}
vocab_size = len(chars)
print(f'Number of unique characters: {vocab_size}')

# Convert text into integer sequences
encoded_text = np.array([char_to_idx[ch] for ch in text])

# Hyperparameters
seq_length = 100  # Length of input sequences for training
hidden_size = 256  # Number of hidden units in GRU
batch_size = 64
num_epochs = 20
learning_rate = 0.001

# Create input-output pairs
def create_sequences(data, seq_length):
    """Return every length-``seq_length`` window of ``data`` with its successor.

    For each start index ``k`` in ``range(len(data) - seq_length)`` the window
    is ``data[k:k + seq_length]`` and the target is ``data[k + seq_length]``.

    Returns:
        ``(inputs, targets)`` as two NumPy arrays.
    """
    n_pairs = len(data) - seq_length
    return (
        np.array([data[k:k + seq_length] for k in range(0, n_pairs)]),
        np.array([data[k + seq_length] for k in range(0, n_pairs)]),
    )

inputs, targets = create_sequences(encoded_text, seq_length)
print(f'Number of sequences: {len(inputs)}')

# Define dataset class
class TextDataset(torch.utils.data.Dataset):
    """Indexable collection of (input window, target) int64 tensor pairs."""

    def __init__(self, inputs, targets):
        index_dtype = torch.long
        self.inputs = torch.tensor(inputs, dtype=index_dtype)
        self.targets = torch.tensor(targets, dtype=index_dtype)

    def __len__(self):
        # The dataset length is the number of input windows.
        return len(self.inputs)

    def __getitem__(self, idx):
        pair = (self.inputs[idx], self.targets[idx])
        return pair

# Create data loader
dataset = TextDataset(inputs, targets)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define the GRU-based model for text generation
class GRUTextGenerator(nn.Module):
    """Character model: embedding, one ``nn.GRU`` layer, linear vocabulary head."""

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        """Return next-character logits and the updated hidden state.

        ``x`` holds character indices laid out batch-first; only the GRU
        output at the final time step is passed to the linear head.
        """
        gru_out, hidden = self.gru(self.embed(x), hidden)
        logits = self.fc(gru_out[:, -1, :])
        return logits, hidden

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape ``(1, batch_size, hidden_size)``."""
        layers = 1
        return torch.zeros(layers, batch_size, self.hidden_size)

# Instantiate the model
model = GRUTextGenerator(vocab_size, hidden_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training function
def train(model, data_loader, criterion, optimizer, num_epochs):
    """Train the GRU ``model`` for ``num_epochs`` passes over ``data_loader``.

    Every batch starts from a fresh zero hidden state sized to the actual
    batch. The hidden state must not be created once per epoch with the
    global ``batch_size`` and carried over: the final batch is usually
    smaller, which makes the GRU reject the state with a size mismatch, and
    shuffled batches are unrelated windows, so state from one batch means
    nothing for the next.

    After each epoch the batch-size-weighted mean of the batch losses is
    printed, and every fifth epoch a 200-character sample is generated from a
    fixed prompt at temperature 0.8.

    Relies on the module-level ``device`` and ``generate_text``.
    """
    model.train()
    for epoch in range(num_epochs):
        loss_sum = 0.0
        samples_seen = 0
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            current_batch = inputs.size(0)  # the last batch may be smaller
            hidden = model.init_hidden(current_batch).to(device)

            # Forward pass
            outputs, _ = model(inputs, hidden)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate as a detached tensor; converted to float once per epoch.
            loss_sum = loss_sum + loss.detach() * current_batch
            samples_seen += current_batch

        epoch_loss = float(loss_sum) / max(samples_seen, 1)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}')
        if (epoch + 1) % 5 == 0:
            print("Generated text sample:")
            print(generate_text(model, "It is a truth universally acknowledged", 200, temperature=0.8))
            # generate_text switches the model to eval mode; resume training mode.
            model.train()

# Function to generate text with temperature sampling
def generate_text(model, start_string, length=100, temperature=1.0):
    """Sample ``length`` characters from ``model`` using temperature sampling.

    The whole prompt is fed first to build up the hidden state; after that,
    each sampled character is fed back as the next single-step input. Logits
    are divided by ``temperature`` before the softmax: values below 1 sharpen
    the distribution and values above 1 flatten it.

    Relies on the module-level ``char_to_idx``, ``idx_to_char`` and ``device``.

    Raises:
        ValueError: if ``start_string`` is empty or ``temperature`` is not positive.
        KeyError: if ``start_string`` contains a character outside the vocabulary.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        # Dividing by zero or a negative value would give an invalid or
        # inverted distribution.
        raise ValueError('temperature must be positive')

    prompt_ids = [char_to_idx[ch] for ch in start_string]
    input_seq = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
    hidden = model.init_hidden(1).to(device)
    pieces = [start_string]

    was_training = model.training
    model.eval()
    try:
        # Inference only: do not build an autograd graph.
        with torch.no_grad():
            for _ in range(length):
                output, hidden = model(input_seq, hidden)

                # Adjust output with temperature
                output_dist = torch.softmax(output / temperature, dim=1)
                next_idx = torch.multinomial(output_dist, 1).item()
                pieces.append(idx_to_char[next_idx])
                input_seq = torch.tensor([[next_idx]], dtype=torch.long, device=device)
    finally:
        # Restore the caller's mode so sampling during training does not
        # leave the model stuck in eval mode.
        model.train(was_training)

    return ''.join(pieces)

# Train the model
train(model, data_loader, criterion, optimizer, num_epochs=num_epochs)

# Testing the model by generating text with a given prompt
start_string = "It is a truth universally acknowledged"  # Starting prompt
generated_length = 500  # Length of text to generate

print("Generated Text:")
print(generate_text(model, start_string, generated_length))


Total characters in text: 743375
Number of unique characters: 92
Number of sequences: 743275


**Questions:**


- Try generating text with different temperature values (e.g., 0.5, 1.0, 1.5). How does the diversity of the output change?


- How does a very high temperature (e.g., 2.0) affect the coherence of the text?


- What happens when the temperature is very low (e.g., 0.1)? Is the generated text too repetitive?


**Analysis of Temperature Sampling:**

- **High Temperature (> 1):** The model explores less probable characters, which can be useful for generating more creative and diverse text but might result in less coherent sentences.


- **Low Temperature (< 1):** The model tends to choose the most likely next characters, leading to more predictable and conservative outputs. This can be useful for generating formal or structured text.


- **Applications:** Adjusting the temperature is often used in creative writing, dialogue generation, and poetry generation where balancing between creativity and coherence is important.

### 5. Word-Level Text Generation using an LSTM

For this coding activity, we will focus on a word-level LSTM model to generate text. The model will:

- Process sequences of words rather than characters.
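- Represent words as embedding vectors. The code below imports GloVe but trains its `nn.Embedding` layer from scratch; pre-trained word embeddings (like GloVe) could be loaded into that layer instead.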
- Generate text by predicting the next word in a sequence.

In [None]:
# File: word_level_text_generation.py

import torch
import torch.nn as nn
import numpy as np
import random
import time
from collections import Counter
from torchtext.vocab import GloVe

# Load a text dataset from URL
import requests  # used below but missing from this cell's imports

url = 'https://www.gutenberg.org/files/1342/1342-0.txt'  # Pride and Prejudice by Jane Austen
# requests has no default timeout; set one so a stalled connection cannot hang the cell.
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of silently training on an error page.
response.raise_for_status()
text = response.text  # Get the text content
print(f'Total characters in text: {len(text)}')

# Tokenize text into words
words = text.split()
print(f'Total words in text: {len(words)}')

# Build a vocabulary and create word-to-index mappings
word_counts = Counter(words)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for idx, word in enumerate(vocab)}
vocab_size = len(vocab)
print(f'Vocabulary size: {vocab_size}')

# Convert text into integer sequences
encoded_text = [word_to_idx[word] for word in words]

# Hyperparameters
seq_length = 10  # Length of input sequences (in words)
hidden_size = 256  # Number of hidden units in LSTM
embedding_dim = 100  # Embedding dimension size
batch_size = 64
num_epochs = 20
learning_rate = 0.001

# Create input-output pairs
def create_sequences(data, seq_length):
    """Pair each ``seq_length``-long run of ``data`` with the item after it.

    For each start index ``i`` in ``range(len(data) - seq_length)`` the window
    is ``data[i:i + seq_length]`` and the target is ``data[i + seq_length]``.

    Returns:
        ``(inputs, targets)`` as two NumPy arrays.
    """
    windows = []
    followers = []
    window_starts = range(0, len(data) - seq_length)
    window_stops = range(seq_length, len(data))
    # ``stop`` is both the exclusive end of the window and the target's index.
    for begin, stop in zip(window_starts, window_stops):
        windows.append(data[begin:stop])
        followers.append(data[stop])
    return np.array(windows), np.array(followers)

inputs, targets = create_sequences(encoded_text, seq_length)
print(f'Number of sequences: {len(inputs)}')

# Define dataset class
class TextDataset(torch.utils.data.Dataset):
    """Holds word-index windows and their next-word targets as int64 tensors."""

    def __init__(self, inputs, targets):
        to_long = dict(dtype=torch.long)
        self.inputs = torch.tensor(inputs, **to_long)
        self.targets = torch.tensor(targets, **to_long)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        window, next_word = self.inputs[idx], self.targets[idx]
        return window, next_word

# Create data loader
dataset = TextDataset(inputs, targets)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define the LSTM-based model for word-level text generation
class LSTMTextGenerator(nn.Module):
    """Word model: embedding, one ``nn.LSTM`` layer, linear vocabulary head."""

    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Here the embedding width is independent of the LSTM width.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        """Return next-word logits and the updated ``(h, c)`` state.

        ``x`` holds word indices laid out batch-first; only the LSTM output
        at the final time step is passed to the linear head.
        """
        word_vectors = self.embedding(x)
        step_outputs, hidden = self.lstm(word_vectors, hidden)
        final_step = step_outputs[:, -1, :]
        return self.fc(final_step), hidden

    def init_hidden(self, batch_size):
        """Return zero ``(h, c)`` tensors, each ``(1, batch_size, hidden_size)``."""
        state_shape = (1, batch_size, self.hidden_size)
        return (torch.zeros(state_shape), torch.zeros(state_shape))

# Instantiate the model
model = LSTMTextGenerator(vocab_size, embedding_dim, hidden_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training function
def train(model, data_loader, criterion, optimizer, num_epochs):
    """Train the word-level LSTM for ``num_epochs`` passes over ``data_loader``.

    Every batch starts from a fresh zero ``(h, c)`` state sized to the actual
    batch. The state must not be created once per epoch with the global
    ``batch_size`` and carried over: unless the dataset size is an exact
    multiple of the batch size, the smaller final batch makes the LSTM reject
    the state with a size mismatch, and shuffled batches are unrelated
    windows, so state from one batch means nothing for the next.

    After each epoch the batch-size-weighted mean of the batch losses is
    printed, and every fifth epoch a 20-word sample is generated from a fixed
    prompt.

    Relies on the module-level ``device`` and ``generate_text``.
    """
    model.train()
    for epoch in range(num_epochs):
        loss_sum = 0.0
        samples_seen = 0
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            current_batch = inputs.size(0)  # the last batch may be smaller
            hidden = tuple(h.to(device) for h in model.init_hidden(current_batch))

            # Forward pass
            outputs, _ = model(inputs, hidden)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate as a detached tensor; converted to float once per epoch.
            loss_sum = loss_sum + loss.detach() * current_batch
            samples_seen += current_batch

        epoch_loss = float(loss_sum) / max(samples_seen, 1)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}')
        if (epoch + 1) % 5 == 0:
            print("Generated text sample:")
            print(generate_text(model, ["it", "is", "a", "truth"], 20))
            # generate_text switches the model to eval mode; resume training mode.
            model.train()

# Function to generate text with word-level model
def generate_text(model, start_words, length=10):
    """Sample ``length`` words from ``model``, seeded with ``start_words``.

    ``start_words`` may be a sequence of word tokens or a single string, which
    is split on whitespace the same way the training text was tokenized. The
    caller's sequence is never modified. The whole prompt is fed first to
    build up the ``(h, c)`` state; after that, each sampled word is fed back
    as the next single-step input.

    Relies on the module-level ``word_to_idx``, ``idx_to_word`` and ``device``.

    Returns:
        The prompt followed by the generated words, joined with single spaces.

    Raises:
        ValueError: if the prompt contains no words.
        KeyError: if any prompt word is not in the vocabulary.
    """
    if isinstance(start_words, str):
        # Iterating a string would yield characters, not words.
        start_words = start_words.split()
    words = list(start_words)  # copy so the caller's list is not mutated
    if not words:
        raise ValueError('start_words must contain at least one word')

    unknown = [word for word in words if word not in word_to_idx]
    if unknown:
        raise KeyError(f'Prompt words not in vocabulary: {unknown}')

    prompt_ids = [word_to_idx[word] for word in words]
    input_seq = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
    hidden = tuple(h.to(device) for h in model.init_hidden(1))

    was_training = model.training
    model.eval()
    try:
        # Inference only: do not build an autograd graph.
        with torch.no_grad():
            for _ in range(length):
                output, hidden = model(input_seq, hidden)
                output_dist = torch.softmax(output, dim=1)
                next_idx = torch.multinomial(output_dist, 1).item()
                words.append(idx_to_word[next_idx])
                input_seq = torch.tensor([[next_idx]], dtype=torch.long, device=device)
    finally:
        # Restore the caller's mode so sampling during training does not
        # leave the model stuck in eval mode.
        model.train(was_training)

    return ' '.join(words)

# Train the model
train(model, data_loader, criterion, optimizer, num_epochs=num_epochs)

# Testing the model by generating text with a given prompt
start_string = "It is a truth universally acknowledged"  # Starting prompt
generated_length = 500  # Number of words to generate

print("Generated Text:")
# generate_text looks prompts up word by word, so pass a list of word tokens;
# iterating the raw string would yield single characters instead.
print(generate_text(model, start_string.split(), generated_length))
