# Validation

After training both models, we will perform validation and comparison of the results.
First we start by loading the models and the data.

In [4]:
from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


# Load GPT-2 Model
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2_finetuned_shakespeare')


In [21]:
import torch
import torch.nn as nn

# Define LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input):
        embedded = self.embedding(input)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out)
        return out

# Define necessary parameters
VOCAB_SIZE = len(tokenizer.get_vocab())  # You need to define the tokenizer
EMBEDDING_DIM = 256
HIDDEN_DIM = 512
device = torch.device('cpu')

# Load LSTM Model
lstm_model = LSTMModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM).to(device)
lstm_model.load_state_dict(torch.load("lstm_model.pth", map_location=torch.device('cpu')))



<All keys matched successfully>

In [8]:
import nltk
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer

nltk.download('punkt')

# Load the text
with open('text.txt', 'r') as file:
    text = file.read()

# Split the text into sentences
sentences = nltk.tokenize.sent_tokenize(text)

# Split sentences into training and validation sets
train_sentences, val_sentences = train_test_split(sentences, test_size=0.2, random_state=42)

# Define the model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Create the validation dataset
val_dataset = ShakespeareDataset(val_sentences, tokenizer)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\luisp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [9]:
from torch.utils.data import Dataset, DataLoader

class ShakespeareDataset(Dataset):
    def __init__(self, sentences, tokenizer):
        self.sentences = sentences
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        inputs = self.tokenizer(sentence, return_tensors='pt', truncation=True, padding='max_length', max_length=100)
        input_ids = inputs["input_ids"].squeeze().to(device)
        attention_mask = inputs["attention_mask"].squeeze().to(device)
        return {"input_ids": input_ids, "attention_mask": attention_mask}

# Tokenize the validation sentences and create the validation dataset
val_dataset = ShakespeareDataset(val_sentences, tokenizer)

# Create the DataLoader for the validation set
BATCH_SIZE = 32
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)


Now we will create a function to calculate the validation loss of both models.

In [22]:
from tqdm import tqdm
import torch


from tqdm import tqdm

def evaluate_model(model, dataloader):
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", position=0, leave=True):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["input_ids"].to(device)

            if isinstance(model, LSTMModel):
                outputs = model(input_ids)
                loss = criterion(outputs.view(-1, VOCAB_SIZE), labels.view(-1))
            else:  # For GPT2LMHeadModel
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = criterion(outputs.logits.view(-1, VOCAB_SIZE), labels.view(-1))

            total_loss += loss.item()

    return total_loss / len(dataloader)


criterion = torch.nn.CrossEntropyLoss()




In [23]:
# Calculate the validation loss for each model

gpt2_val_loss = evaluate_model(gpt2_model, val_dataloader)

# Print validation loss

print(f"GPT-2 validation loss: {gpt2_val_loss}")

# Calculate the perplexity for each model
perplexity = exp(gpt2_val_loss)
print(f"GPT-2 perplexity: {perplexity}")


Evaluating: 100%|██████████| 78/78 [07:14<00:00,  5.57s/it]

GPT-2 validation loss: 3.275444278350243





In [19]:
# Calculate the validation loss for each model

lstm_val_loss = evaluate_model(lstm_model, val_dataloader)

# Print validation loss

print(f"LSTM validation loss: {lstm_val_loss}")

Evaluating: 100%|██████████| 78/78 [01:43<00:00,  1.33s/it]

LSTM validation loss: 0.060352194481171094





The loss for the LSTM model is lower than the loss for the GPT-2 Model. This suggests that the LSTM model is better at predicting the next word in the sequence than the GPT-2 model.

In [24]:
import math

# Calculate perplexity
def calculate_perplexity(loss):
    return math.exp(loss)

# Calculate perplexity for each model
gpt2_perplexity = calculate_perplexity(gpt2_val_loss)
lstm_perplexity = calculate_perplexity(lstm_val_loss)

print(f"GPT-2 perplexity: {gpt2_perplexity}")
print(f"LSTM perplexity: {lstm_perplexity}")



GPT-2 perplexity: 26.454976243265854
LSTM perplexity: 1.062210585380299


For the GPT-2 model, we obtained a perplexity of approximately 26.45, indicating that, on average, the model is predicting the next word with a higher level of uncertainty compared to the ground truth.

On the other hand, the LSTM model achieved a perplexity of around 1.06, indicating that it is making more confident predictions and aligning better with the ground truth.

Now let's do some manual testing to see how the models behave.

In [32]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the GPT-2 model and tokenizer
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the model to evaluation mode
gpt2_model.eval()

# Set the prompt text
prompt = "Thou art"

# Tokenize the prompt
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Generate text using the GPT-2 model with temperature
output = gpt2_model.generate(input_ids, max_length=20, do_sample=True, top_k=50, temperature=0.5, pad_token_id=tokenizer.eos_token_id)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


Thou art thou not God's own God? But thou art God's own God.




In [40]:
prompt = "Blue ocean"
# Tokenize the prompt
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Generate text using the GPT-2 model with temperature
output = gpt2_model.generate(input_ids, max_length=20, do_sample=True, top_k=50, temperature=0.5, pad_token_id=tokenizer.eos_token_id)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

Blue ocean, and the ocean itself, the ocean is not the same as the ocean of the Sun


In [41]:
prompt = "What is life?"
# Tokenize the prompt
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Generate text using the GPT-2 model with temperature
output = gpt2_model.generate(input_ids, max_length=20, do_sample=True, top_k=50, temperature=0.5, pad_token_id=tokenizer.eos_token_id)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

What is life? What is the world? What is the Stewart family?

What is life


In [38]:
import torch.nn.functional as F

lstm_model.eval()

# Start prompt
start_prompt = "Thou art"
generated = start_prompt

# Tokenize the start prompt
input_ids = tokenizer.encode(start_prompt, return_tensors='pt').to(device)

# Generate three words
for _ in range(3):
    with torch.no_grad():
        outputs = lstm_model(input_ids)
        predictions = outputs[0, -1, :]

    # Apply softmax to predictions to get probabilities
    probabilities = F.softmax(predictions, dim=-1)

    # Sample from the distribution
    predicted_id = torch.multinomial(probabilities, 1)
    generated_word = tokenizer.decode([predicted_id.item()])
    generated += " " + generated_word

    # Prepare the new input
    input_ids = predicted_id.unsqueeze(0).to(device)

print(generated)


Thou art  art Blocks  inexperienced


In [42]:
# Start prompt
start_prompt = "Blue ocean"
generated = start_prompt

# Tokenize the start prompt
input_ids = tokenizer.encode(start_prompt, return_tensors='pt').to(device)

# Generate three words
for _ in range(3):
    with torch.no_grad():
        outputs = lstm_model(input_ids)
        predictions = outputs[0, -1, :]

    # Apply softmax to predictions to get probabilities
    probabilities = F.softmax(predictions, dim=-1)

    # Sample from the distribution
    predicted_id = torch.multinomial(probabilities, 1)
    generated_word = tokenizer.decode([predicted_id.item()])
    generated += " " + generated_word

    # Prepare the new input
    input_ids = predicted_id.unsqueeze(0).to(device)

print(generated)

Blue ocean ai  Fritz Brain


In [43]:
# Start prompt
start_prompt = "What is life?"
generated = start_prompt

# Tokenize the start prompt
input_ids = tokenizer.encode(start_prompt, return_tensors='pt').to(device)

# Generate three words
for _ in range(3):
    with torch.no_grad():
        outputs = lstm_model(input_ids)
        predictions = outputs[0, -1, :]

    # Apply softmax to predictions to get probabilities
    probabilities = F.softmax(predictions, dim=-1)

    # Sample from the distribution
    predicted_id = torch.multinomial(probabilities, 1)
    generated_word = tokenizer.decode([predicted_id.item()])
    generated += " " + generated_word

    # Prepare the new input
    input_ids = predicted_id.unsqueeze(0).to(device)

print(generated)

What is life? ? ? ?


Manual testing returns much better results for the GPT-2 model. The LSTM model is more likely to generate words or characters that are not in the vocabulary, while the GPT-2 model is more likely to generate words that are in the vocabulary. The LSTM model seems to generate random words.

The GPT-2 model is also better at generating coherent sentences. The LSTM model is more likely to generate sentences that do not make sense.

To improve the LSTM model, we could try to increase the number of epochs and the number of hidden layers. We could also try to use a larger vocabulary or dataset.