# NLP Exercise 2: Neural Network
---

## Going from raw text to tokens

### Loading necessary libraries.

In [None]:
%pip install transformers
%pip install datasets
%pip install tokenizers
%pip install torch
%pip install tqdm

### Create a dataset

Using Datasets from Hugging Face could be useful when we create our own dataset.

In [2]:
import datasets

def create_dataset(raw_sentences: list[str]) -> datasets.Dataset:
    """
    Wrap a list of sentences in a single-column HuggingFace Dataset.

    Parameters:
        raw_sentences: list of sentences; each one becomes a row of the
            "text" column.

    Returns:
        A datasets.Dataset whose only column, "text", is declared as a
        string feature.
    """
    # Declare the schema explicitly instead of letting it be inferred.
    text_schema = datasets.Features({"text": datasets.Value("string")})

    # Build the dataset straight from the column mapping.
    return datasets.Dataset.from_dict({"text": raw_sentences}, features=text_schema)

  from .autonotebook import tqdm as notebook_tqdm


Read and extract sentences from book

In [3]:
import re

# Read the whole book into memory.
with open('alice_in_wonderland.txt', 'r', encoding='utf-8') as file:
    story_text = file.read()

# Keep only the story body: everything from the opening words of the story up
# to the closing "THE END" marker (this spans the whole book, not one chapter).
chapter_start = "Alice was beginning"
chapter_end = "THE END"
start_idx = story_text.find(chapter_start)
if start_idx == -1:
    # str.find returns -1 on a miss, which would silently slice from the last character.
    raise ValueError(f"Start marker {chapter_start!r} not found in the book text")
# Search for the end marker only after the start marker.
end_idx = story_text.find(chapter_end, start_idx)
if end_idx == -1:
    raise ValueError(f"End marker {chapter_end!r} not found after the start marker")
chapter_1_text = story_text[start_idx:end_idx].strip()

# Split into clause-like "sentences" at any of the characters . ! ? * ; ,
split_sentences = re.split(r'[.!?*;,]', chapter_1_text)

# Keep only fragments with more than 2 whitespace-separated words
long_sentences = [sentence.strip() for sentence in split_sentences if len(sentence.split()) > 2]

# The kept fragments are the raw sentences used by the rest of the notebook
raw_sentences = list(long_sentences)

In [4]:
def count_distinct_words(text):
    """
    Count the distinct words in ``text``, ignoring case.

    The text is lower-cased and split on runs of non-word characters, so
    letters, digits and underscores stay inside a word while punctuation and
    whitespace act as separators.

    Parameters:
        text: the string to analyse.

    Returns:
        The number of unique, non-empty words.
    """
    pieces = re.split(r'\W+', text.lower())
    # Splitting can leave empty strings at the edges; the filter discards them.
    return len({piece for piece in pieces if piece})
# Number of distinct (case-insensitive) words in the extracted story text;
# as the last expression of the cell its value is displayed below.
count_distinct_words(chapter_1_text)

2681

In [5]:
# Wrap the extracted sentences in a HuggingFace Dataset (single "text" column).
own_dataset = create_dataset(raw_sentences)

# Print a pandas view of the rows as a quick sanity check.
print(own_dataset.to_pandas())

                                                   text
0     Alice was beginning to get very tired of sitti...
1     and of having nothing to do: once or twice she...
2        but it had no pictures or\nconversations in it
3                        “and what is the use of a book
4     ” thought Alice\n“without pictures or conversa...
...                                                 ...
3568  and make _their_ eyes bright and eager with ma...
3569  perhaps even with the dream of Wonderland of l...
3570      and find a pleasure in all\ntheir simple joys
3571                     remembering her own child-life
3572                         and the happy summer\ndays

[3573 rows x 1 columns]


### Tokenization

This section compares a WordPiece tokenizer with a WordLevel tokenizer trained on the same sentences.

In [6]:
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast

Function to train WordLevel Tokenizer

In [7]:
def train_word_level_tokenizer(
        sentences: list[str],
        unk_token: str = "[UNK]",
        pad_token: str = "[PAD]",
        start_of_seq_token: str = "<s>",
        end_of_seq_token: str = "</s>",
        vocab_size: int = 3000
) -> PreTrainedTokenizerFast:
    """
    Train a WordLevel tokenizer and wrap it for use with transformers.

    Parameters:
        sentences: training corpus, one string per sentence.
        unk_token: token used for out-of-vocabulary words.
        pad_token: token used for padding.
        start_of_seq_token: token placed in front of every encoded sentence.
        end_of_seq_token: token placed after every encoded sentence.
        vocab_size: vocabulary size handed to the trainer.

    Returns:
        A PreTrainedTokenizerFast backed by the trained tokenizer.
    """
    # The position of each special token in this list is used as its id below.
    reserved = [unk_token, pad_token, start_of_seq_token, end_of_seq_token]
    bos_id = reserved.index(start_of_seq_token)
    eos_id = reserved.index(end_of_seq_token)
    pad_id = reserved.index(pad_token)

    word_tokenizer = Tokenizer(models.WordLevel(unk_token=unk_token))

    # NFD unicode normalisation plus whitespace stripping, then a whitespace-based split.
    word_tokenizer.normalizer = normalizers.Sequence(
        [normalizers.NFD(), normalizers.Strip()]
    )
    word_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    # Surround every single sequence with the start / end tokens.
    word_tokenizer.post_processor = TemplateProcessing(
        single=f"{start_of_seq_token} $A {end_of_seq_token}",
        special_tokens=[
            (start_of_seq_token, bos_id),
            (end_of_seq_token, eos_id),
        ],
    )

    # Fit the vocabulary on the corpus.
    word_trainer = trainers.WordLevelTrainer(
        vocab_size=vocab_size,
        special_tokens=reserved,
        show_progress=True,
    )
    word_tokenizer.train_from_iterator(sentences, trainer=word_trainer)

    # Padding uses the pad token and its id.
    word_tokenizer.enable_padding(pad_id=pad_id, pad_token=pad_token)

    # Expose the tokenizer through the transformers fast-tokenizer interface.
    return PreTrainedTokenizerFast(
        bos_token=start_of_seq_token,
        eos_token=end_of_seq_token,
        unk_token=unk_token,
        pad_token=pad_token,
        tokenizer_object=word_tokenizer,
    )

Function to train WordPiece Tokenizer

In [8]:
def train_wordpiece_tokenizer(
    sentences: list[str],
    unk_token: str = "[UNK]",
    pad_token: str = "[PAD]",
    start_of_seq_token: str = "<s>",
    end_of_seq_token: str = "</s>",
    vocab_size: int = 3000  # Set a smaller vocab size to force subword splits
) -> PreTrainedTokenizerFast:
    """
    Train a WordPiece tokenizer and wrap it for use with transformers.

    Parameters:
        sentences: training corpus, one string per sentence.
        unk_token: token used for pieces missing from the vocabulary.
        pad_token: token used for padding.
        start_of_seq_token: token placed in front of every encoded sentence.
        end_of_seq_token: token placed after every encoded sentence.
        vocab_size: vocabulary size handed to the trainer.

    Returns:
        A PreTrainedTokenizerFast backed by the trained tokenizer.
    """
    # The position of each special token in this list is used as its id below.
    reserved = [unk_token, pad_token, start_of_seq_token, end_of_seq_token]
    bos_id = reserved.index(start_of_seq_token)
    eos_id = reserved.index(end_of_seq_token)
    pad_id = reserved.index(pad_token)

    piece_tokenizer = Tokenizer(models.WordPiece(unk_token=unk_token))

    # Same normalisation and pre-tokenisation as the WordLevel tokenizer:
    # NFD + strip, then split on whitespace.
    piece_tokenizer.normalizer = normalizers.Sequence(
        [normalizers.NFD(), normalizers.Strip()]
    )
    piece_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    # Surround every single sequence with the start / end tokens.
    piece_tokenizer.post_processor = TemplateProcessing(
        single=f"{start_of_seq_token} $A {end_of_seq_token}",
        special_tokens=[
            (start_of_seq_token, bos_id),
            (end_of_seq_token, eos_id),
        ],
    )

    # Fit the sub-word vocabulary on the corpus.
    piece_trainer = trainers.WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=reserved,
        show_progress=True,
    )
    piece_tokenizer.train_from_iterator(sentences, trainer=piece_trainer)

    # Padding uses the pad token and its id.
    piece_tokenizer.enable_padding(pad_id=pad_id, pad_token=pad_token)

    # Expose the tokenizer through the transformers fast-tokenizer interface.
    return PreTrainedTokenizerFast(
        bos_token=start_of_seq_token,
        eos_token=end_of_seq_token,
        unk_token=unk_token,
        pad_token=pad_token,
        tokenizer_object=piece_tokenizer,
    )


Train both tokenizers

In [9]:
# Train both tokenizers on the same sentences, using the default special
# tokens and the default vocab_size of each training function.
word_level_tokenizer = train_word_level_tokenizer(raw_sentences)
wordpiece_tokenizer = train_wordpiece_tokenizer(raw_sentences)

Visualize how tokenizers work

In [10]:
import random

# Pick one sentence at random and show how each tokenizer encodes it.
# randrange(len(...)) follows the actual corpus size instead of a hard-coded
# upper bound, so the cell keeps working if the extraction step changes.
index = random.randrange(len(raw_sentences))
sentence = raw_sentences[index]

# encode() returns token ids; convert_ids_to_tokens maps them back to token strings
wordpiece_encoded_example = wordpiece_tokenizer.encode(sentence)
wordpiece_tokens_example = wordpiece_tokenizer.convert_ids_to_tokens(wordpiece_encoded_example)

wordlevel_encoded_example = word_level_tokenizer.encode(sentence)
wordlevel_tokens_example = word_level_tokenizer.convert_ids_to_tokens(wordlevel_encoded_example)

# Using WordPiece Tokenizer
print(f"Sentence: {sentence}")
print(f"WordPiece Tokens: {wordpiece_tokens_example}")
print(f"WordPiece Token IDs: {wordpiece_encoded_example}")
print()

# Using WordLevel Tokenizer
print(f"WordLevel Tokens: {wordlevel_tokens_example}")
print(f"WordLevel Token IDs: {wordlevel_encoded_example}")

Sentence: ” added the Dormouse
WordPiece Tokens: ['<s>', '”', 'added', 'the', 'Dormouse', '</s>']
WordPiece Token IDs: [2, 68, 607, 125, 449, 3]

WordLevel Tokens: ['<s>', '”', 'added', 'the', 'Dormouse', '</s>']
WordLevel Token IDs: [2, 5, 191, 4, 129, 3]


## Build RNN model based on different tokenizations.

In [11]:
import torch.nn as nn
import torch
import torch.optim as optim
import torch.nn.functional as F

Convert to PyTorch tensors

In [12]:
# Define a fixed maximum length for padding
# (every sentence is padded or truncated to exactly this many token ids, so
# the encoded lists all have the same length)
max_length = 50
# Tokenize the whole dataset: one list of token ids per sentence, per tokenizer
wordpiece_encoded = [wordpiece_tokenizer.encode(sentence, max_length=max_length, padding='max_length', truncation=True) for sentence in raw_sentences]
wordlevel_encoded = [word_level_tokenizer.encode(sentence, max_length=max_length, padding='max_length', truncation=True) for sentence in raw_sentences]

In [13]:
# Stack the equal-length id lists into integer (torch.long) tensors, one row per sentence.
wordpiece_data = torch.tensor(wordpiece_encoded, dtype=torch.long)
wordlevel_data = torch.tensor(wordlevel_encoded, dtype=torch.long)

In [14]:
class RNNLanguageModel(nn.Module):
    """
    Next-token language model: token embedding -> GRU -> linear projection
    onto the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """
        Parameters:
            vocab_size: number of token ids (embedding rows and output logits).
            embedding_dim: size of each token embedding.
            hidden_dim: size of the GRU hidden state.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """
        Parameters:
            x: token ids, batch-first (batch, seq).
            hidden: optional GRU state to continue from; None starts fresh.

        Returns:
            (logits, hidden): per-position vocabulary logits of shape
            (batch, seq, vocab_size) and the final GRU state.
        """
        gru_out, next_hidden = self.gru(self.embedding(x), hidden)
        return self.fc(gru_out), next_hidden

In [15]:
from torch.utils.data import DataLoader, TensorDataset

In [16]:
# Example batch size
batch_size = 32

# Create Tensor datasets
# (each dataset item is a 1-tuple holding one row of token ids)
wordpiece_dataset = TensorDataset(wordpiece_data)
wordlevel_dataset = TensorDataset(wordlevel_data)

# Shuffle your data to avoid sequence patterns during training
# (num_workers=2: batches are loaded by two worker processes)
wordpiece_loader = DataLoader(wordpiece_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
wordlevel_loader = DataLoader(wordlevel_dataset, batch_size=batch_size, shuffle=True, num_workers=2)

def train_model(model, data_loader, vocab_size, epochs=10, pad_token_id=None):
    """
    Train a next-token language model with Adam and cross-entropy.

    Every batch of token ids is split into inputs (all but the last token)
    and targets (all but the first), so position i is trained to predict
    token i + 1. The average loss of each epoch is printed.

    Parameters:
        model: module whose forward returns ``(logits, hidden)``, with logits
            of shape (batch, seq, vocab_size).
        data_loader: sized iterable yielding 1-tuples ``(token_ids,)`` where
            token_ids has shape (batch, seq).
        vocab_size: size of the model's output vocabulary.
        epochs: number of passes over ``data_loader``.
        pad_token_id: optional id of the padding token. When given, target
            positions holding this id are excluded from the loss, so padded
            sequences do not dominate it. The default (None) keeps the
            previous behaviour: every position, padding included, counts.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        # Without this guard the per-epoch average divides by zero.
        raise ValueError("data_loader is empty; there is nothing to train on")

    if pad_token_id is None:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for batch in data_loader:
            inputs, = batch
            # Shift by one: the target at position i is the input token at i + 1.
            targets = inputs[:, 1:].contiguous()
            inputs = inputs[:, :-1].contiguous()

            optimizer.zero_grad()
            logits, _ = model(inputs)
            # Flatten (batch, seq, vocab) -> (batch * seq, vocab) for the loss.
            loss = criterion(logits.view(-1, vocab_size), targets.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / num_batches:.4f}")


In [17]:
# Initialize models
# Both models share the same embedding and hidden sizes; only the vocabulary
# size differs, taken from each trained tokenizer's vocabulary.
embedding_dim = 64
hidden_dim = 128
vocab_size_wordpiece = len(wordpiece_tokenizer.get_vocab())
vocab_size_wordlevel = len(word_level_tokenizer.get_vocab())
wordpiece_model = RNNLanguageModel(vocab_size_wordpiece, embedding_dim, hidden_dim)
wordlevel_model = RNNLanguageModel(vocab_size_wordlevel, embedding_dim, hidden_dim)


In [18]:
# Train model by WordPiece Model
# NOTE(review): every sentence was padded to max_length and train_model's
# default CrossEntropyLoss has no ignore_index, so the loss printed below also
# counts the padded target positions.
print("Training WordPiece Model:")
train_model(wordpiece_model, wordpiece_loader, vocab_size_wordpiece, epochs=20)

Training WordPiece Model:
Epoch 1/20, Loss: 2.0067
Epoch 2/20, Loss: 1.1305
Epoch 3/20, Loss: 1.0554
Epoch 4/20, Loss: 1.0057
Epoch 5/20, Loss: 0.9671
Epoch 6/20, Loss: 0.9338
Epoch 7/20, Loss: 0.9033
Epoch 8/20, Loss: 0.8770
Epoch 9/20, Loss: 0.8541
Epoch 10/20, Loss: 0.8323
Epoch 11/20, Loss: 0.8117
Epoch 12/20, Loss: 0.7928
Epoch 13/20, Loss: 0.7743
Epoch 14/20, Loss: 0.7566
Epoch 15/20, Loss: 0.7399
Epoch 16/20, Loss: 0.7226
Epoch 17/20, Loss: 0.7074
Epoch 18/20, Loss: 0.6908
Epoch 19/20, Loss: 0.6754
Epoch 20/20, Loss: 0.6603


In [19]:
# Train model by WordLevel Model
# Same settings as the WordPiece run (20 epochs), so the two loss curves can
# be compared; the padded target positions are counted here as well.
print("Training WordLevel Model:")
train_model(wordlevel_model, wordlevel_loader, vocab_size_wordlevel, epochs=20)

Training WordLevel Model:
Epoch 1/20, Loss: 1.8892
Epoch 2/20, Loss: 1.0364
Epoch 3/20, Loss: 0.9653
Epoch 4/20, Loss: 0.9195
Epoch 5/20, Loss: 0.8816
Epoch 6/20, Loss: 0.8488
Epoch 7/20, Loss: 0.8194
Epoch 8/20, Loss: 0.7933
Epoch 9/20, Loss: 0.7697
Epoch 10/20, Loss: 0.7496
Epoch 11/20, Loss: 0.7301
Epoch 12/20, Loss: 0.7128
Epoch 13/20, Loss: 0.6956
Epoch 14/20, Loss: 0.6795
Epoch 15/20, Loss: 0.6644
Epoch 16/20, Loss: 0.6496
Epoch 17/20, Loss: 0.6343
Epoch 18/20, Loss: 0.6205
Epoch 19/20, Loss: 0.6067
Epoch 20/20, Loss: 0.5927


In [20]:
def clean_wordpiece_output(tokens):
    """
    Join WordPiece tokens back into readable text.

    Tokens starting with '##' are continuation pieces: the marker is removed
    and the rest is glued onto the text built so far (a leading '##' token
    simply starts the text without its marker). Every other token starts a
    new word and is preceded by a single space, except at the very beginning.

    Parameters:
        tokens: iterable of token strings.

    Returns:
        The reconstructed text ("" for an empty input).
    """
    cleaned_text = ""
    for token in tokens:
        if token.startswith("##"):
            # Continuation piece: append without the marker and without a space.
            cleaned_text += token[2:]
        elif cleaned_text:
            cleaned_text += " " + token
        else:
            # First word: no leading space.
            cleaned_text += token
    return cleaned_text


In [157]:
def generate_text(model, tokenizer, start_text, max_length=100, temperature=0.7):
    """
    Sample a continuation of ``start_text`` from a recurrent language model.

    Parameters:
        model: module whose forward takes ``(token_ids, hidden)`` and returns
            ``(logits, hidden)``, with logits of shape (batch, seq, vocab).
        tokenizer: object providing ``encode(text)`` (list of ids),
            ``decode(ids)`` (string) and the attributes ``pad_token_id`` and
            ``eos_token_id``.
        start_text: prompt to continue.
        max_length: maximum number of tokens to sample.
        temperature: divides the logits before sampling; lower values make
            the sampling more deterministic.

    Returns:
        ``start_text`` followed by the sampled tokens. Pieces starting with
        '##' are glued to the previous word, all others are space-separated.
    """
    model.eval()

    prompt_ids = tokenizer.encode(start_text)
    eos_id = tokenizer.eos_token_id
    # encode() may close the prompt with the end-of-sequence token (the
    # tokenizers trained in this notebook do). Drop it so the model continues
    # the sentence instead of predicting what follows its end.
    if eos_id is not None and len(prompt_ids) > 1 and prompt_ids[-1] == eos_id:
        prompt_ids = prompt_ids[:-1]
    input_tensor = torch.tensor(prompt_ids).unsqueeze(0)

    generated_text = start_text
    hidden = None
    pad_id = tokenizer.pad_token_id

    # Inference only: no gradients needed.
    with torch.no_grad():
        for _ in range(max_length):
            logits, hidden = model(input_tensor, hidden)
            next_logits = logits[:, -1, :] / temperature
            if pad_id is not None:
                # Mask PAD before the softmax so it can never be sampled and
                # the remaining probabilities still form a valid distribution.
                next_logits[0, pad_id] = float("-inf")
            probs = F.softmax(next_logits, dim=-1)

            next_token = torch.multinomial(probs, num_samples=1).item()
            if next_token == eos_id:
                break

            piece = tokenizer.decode([next_token])
            if piece.startswith("##"):
                # WordPiece continuation: merge with the previous word.
                generated_text += piece[2:]
            else:
                generated_text += " " + piece

            # The hidden state carries the history, so only the new token is fed next.
            input_tensor = torch.tensor([[next_token]])

    return generated_text


In [174]:
# Sample a continuation of the same prompt from each trained model.
# Sampling is random, so the output changes from run to run.
start_text = "Alice"
print("Generated Text using WordPiece Model:")
print(generate_text(wordpiece_model, wordpiece_tokenizer, start_text))

print("\nGenerated Text using WordLevel Model:")
print(generate_text(wordlevel_model, word_level_tokenizer, start_text))

Generated Text using WordPiece Model:
Alice like at last

Generated Text using WordLevel Model:
Alice Alice were talking


# LSTM Model

## Import Libraries

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import numpy as np

## Load and Preprocess Data

In [33]:
# Reuse the sentence dataset built in the first part of the notebook.
dataset = own_dataset

# Split into training and validation set (80% / 20%).
# A fixed seed makes the shuffle, and therefore the split, reproducible.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Visualize the dataset
dataset_df = dataset.to_pandas()
print(dataset_df.head())

                                                text
0  Alice was beginning to get very tired of sitti...
1  and of having nothing to do: once or twice she...
2     but it had no pictures or\nconversations in it
3                     “and what is the use of a book
4  ” thought Alice\n“without pictures or conversa...


## Initialize a pre-trained tokenizer

In [34]:
# Load the pre-trained "bert-base-uncased" tokenizer; the LSTM part uses it
# instead of the tokenizers trained earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Id of the padding token; it is later passed to the model, whose loss skips
# targets equal to this id.
padding_token_id = tokenizer.pad_token_id

## Tokenize and prepare data for language modeling

- The 'labels' are a shifted version of 'input_ids': each token in 'labels' is the next token in 'input_ids'.
- The model's prediction at position 'i' in 'input_ids' should match the token at position 'i' in 'labels', i.e. the token at position 'i+1' in 'input_ids'.

In [35]:
def tokenize_and_shift(examples):
    """
    Tokenize a batch of texts and attach next-token labels.

    Parameters:
        examples: batch mapping whose "text" entry holds the strings to
            tokenize.

    Returns:
        The tokenizer output (padded / truncated to 128 tokens) with an extra
        "labels" entry: every row is the matching "input_ids" row shifted one
        position to the left, with the padding id appended to keep the length.
    """
    encoded = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

    # Drop the first token of each row and pad the end, so labels[i] is the
    # token that follows input_ids[i].
    encoded["labels"] = [row[1:] + [padding_token_id] for row in encoded["input_ids"]]
    return encoded

## Mapping the function to the Dataset

In [36]:
# Apply the function to both datasets
# (batched=True hands tokenize_and_shift a batch of rows at a time; the raw
# "text" column is dropped from the result)
tokenized_train_dataset = train_dataset.map(tokenize_and_shift, batched=True, remove_columns=["text"])
tokenized_val_dataset = test_dataset.map(tokenize_and_shift, batched=True, remove_columns=["text"])

# Set the format for PyTorch
# (only "input_ids" and "labels" are exposed, as torch tensors)
tokenized_train_dataset.set_format("torch", columns=["input_ids", "labels"])
tokenized_val_dataset.set_format("torch", columns=["input_ids", "labels"])

Map: 100%|██████████| 2858/2858 [00:00<00:00, 10183.39 examples/s]
Map: 100%|██████████| 715/715 [00:00<00:00, 13730.66 examples/s]


## Create DataLoader

In [45]:
# Batches of 32 examples; only the training data is shuffled, the validation
# data keeps its order.
train_loader = DataLoader(tokenized_train_dataset, batch_size=32, shuffle=True, num_workers=2)
val_loader = DataLoader(tokenized_val_dataset, batch_size=32, shuffle=False, num_workers=2)

## Define the Language Model Class

In [38]:
class LanguageModel(nn.Module):
    """
    LSTM next-token language model: embedding -> LSTM -> vocabulary projection.

    ``forward`` returns the training loss together with the logits. The loss
    is computed by shifting the input ids by one position, and targets equal
    to the padding id are ignored.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int, padding_token_id: int):
        """
        Args:
            vocab_size (int): Number of token ids (embedding rows and output logits).
            embedding_dim (int): Size of each token embedding.
            hidden_dim (int): Size of the LSTM hidden state.
            padding_token_id (int): Id of the padding token; ignored by the loss
                and used as the stop token during generation.
        """
        super().__init__()
        self.padding_token_id = padding_token_id

        # Model components
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.projection = nn.Linear(hidden_dim, vocab_size)

        # Loss function, ignoring padding tokens
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=padding_token_id)

    def forward(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            input_ids (torch.Tensor): Token ids with shape [batch_size, seq_len].

        Returns:
            tuple[torch.Tensor, torch.Tensor]: The scalar next-token loss and
            the logits with shape [batch_size, seq_len, vocab_size].
        """
        logits = self.predict_logits(input_ids)
        loss = self.compute_loss(logits, input_ids)
        return loss, logits

    def predict_logits(self, input_ids: torch.Tensor) -> torch.Tensor:
        """
        Embeds the token ids, runs them through the LSTM so each position can
        depend on the tokens before it, and projects every LSTM output onto
        the vocabulary.

        Args:
            input_ids (torch.Tensor): Token ids with shape [batch_size, seq_len].

        Returns:
            torch.Tensor: Logits with shape [batch_size, seq_len, vocab_size].
        """
        embeddings = self.embedding(input_ids)
        rnn_output, _ = self.rnn(embeddings)
        logits = self.projection(rnn_output)
        return logits

    def compute_loss(self, logits: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
        """
        Next-token cross-entropy: the logits at position i are scored against
        the token at position i + 1, with padding targets ignored.

        Args:
            logits (torch.Tensor): Shape [batch_size, seq_len, vocab_size].
            input_ids (torch.Tensor): Shape [batch_size, seq_len].

        Returns:
            torch.Tensor: Scalar loss.
        """
        # The last position has no next token and the first token is never a target.
        logits = logits[:, :-1, :].contiguous().view(-1, logits.size(-1))
        target_ids = input_ids[:, 1:].contiguous().view(-1)
        loss = self.loss_fn(logits, target_ids)
        return loss

    def generate_text(self, prompt, max_len=20, temperature=1.0):
        """
        Generates text from the model, starting with a given prompt and
        extending it one sampled token at a time until ``max_len`` tokens were
        added or the padding token is sampled.

        Args:
            prompt (list[int]): Non-empty sequence of prompt token ids.
            max_len (int, optional): The maximum number of tokens to generate.
            temperature (float, optional): Positive value that controls the
                randomness of predictions. Lower values make the model more
                confident and deterministic, while higher values make it more
                diverse and exploratory.

        Returns:
            str: Prompt plus generated tokens, decoded with the notebook-level
            ``tokenizer`` (special tokens are skipped).

        Raises:
            ValueError: If ``prompt`` is empty or ``temperature`` is not positive.
        """
        if len(prompt) == 0:
            raise ValueError("prompt must contain at least one token id")
        if temperature <= 0:
            raise ValueError("temperature must be positive")

        # Switch to Evaluation mode
        self.eval()
        # Build tensors on the same device as the model weights
        device = self.embedding.weight.device
        input_ids = torch.tensor([list(prompt)], dtype=torch.long, device=device)
        generated_tokens = list(prompt)

        # Disable Gradient computation
        with torch.no_grad():
            # Generate tokens in a loop
            for _ in range(max_len):
                logits = self.predict_logits(input_ids)
                # Logits of the last position, scaled by the temperature
                next_token_logits = logits[:, -1, :] / temperature

                # Convert logits to probabilities (batch dimension removed)
                probabilities = F.softmax(next_token_logits, dim=-1).squeeze(0)
                # Sample a token id from the probability distribution
                next_token = torch.multinomial(probabilities, 1).item()

                # End condition
                if next_token == self.padding_token_id:
                    break

                generated_tokens.append(next_token)
                new_column = torch.tensor([[next_token]], dtype=torch.long, device=device)
                input_ids = torch.cat([input_ids, new_column], dim=1)

        # Convert tokens to text
        generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
        return generated_text

## Initialize the model, optimizer, and training loop

In [118]:
# Hyper-parameters: the vocabulary size comes from the pre-trained tokenizer;
# embedding and hidden sizes match the ones used for the GRU models above.
vocab_size = tokenizer.vocab_size
embedding_dim = 64
hidden_dim = 128
epochs = 20

# The padding id is passed in so the model's loss can skip padded targets.
model = LanguageModel(vocab_size, embedding_dim, hidden_dim, padding_token_id)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

## Function to evaluate the model

In [119]:
def evaluate_model(model, dataloader):
    """
    Compute the average loss of ``model`` over ``dataloader``.

    Parameters:
        model: module that returns ``(loss, logits)`` when called with a
            batch of input ids; it is switched to evaluation mode.
        dataloader: sized iterable of batches, each a mapping with an
            "input_ids" entry.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    total_batches = len(dataloader)
    batch_losses = []

    # Evaluation only: no gradients needed.
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch_loss, _ = model(batch["input_ids"])
            batch_losses.append(batch_loss.item())

    # Average of the per-batch losses
    return sum(batch_losses) / total_batches

## Training loop

In [120]:
# One pass over the training data per epoch, followed by a validation pass.
for epoch in range(epochs):
    # evaluate_model switches the model to eval mode, so training mode is
    # re-enabled at the start of every epoch.
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        # Only "input_ids" is read here: the model derives its targets by
        # shifting the input ids itself, so the "labels" column is not used.
        input_ids = batch["input_ids"]
        optimizer.zero_grad()

        loss, _ = model(input_ids)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch training losses of this epoch
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Training Loss: {avg_train_loss:.4f}")

    # Run evaluation
    val_loss = evaluate_model(model, val_loader)
    print(f"Epoch {epoch+1} - Validation Loss: {val_loss:.4f}")

Training Epoch 1: 100%|██████████| 90/90 [01:06<00:00,  1.36it/s]


Epoch 1 - Training Loss: 7.2321


Evaluating: 100%|██████████| 23/23 [00:10<00:00,  2.10it/s]


Epoch 1 - Validation Loss: 5.7756


Training Epoch 2: 100%|██████████| 90/90 [01:06<00:00,  1.35it/s]


Epoch 2 - Training Loss: 5.4902


Evaluating: 100%|██████████| 23/23 [00:15<00:00,  1.49it/s]


Epoch 2 - Validation Loss: 5.5026


Training Epoch 3: 100%|██████████| 90/90 [01:08<00:00,  1.31it/s]


Epoch 3 - Training Loss: 5.2557


Evaluating: 100%|██████████| 23/23 [00:10<00:00,  2.23it/s]


Epoch 3 - Validation Loss: 5.3292


Training Epoch 4: 100%|██████████| 90/90 [01:07<00:00,  1.33it/s]


Epoch 4 - Training Loss: 5.0675


Evaluating: 100%|██████████| 23/23 [00:09<00:00,  2.39it/s]


Epoch 4 - Validation Loss: 5.1717


Training Epoch 5: 100%|██████████| 90/90 [01:06<00:00,  1.36it/s]


Epoch 5 - Training Loss: 4.9100


Evaluating: 100%|██████████| 23/23 [00:10<00:00,  2.27it/s]


Epoch 5 - Validation Loss: 5.0607


Training Epoch 6: 100%|██████████| 90/90 [01:05<00:00,  1.37it/s]


Epoch 6 - Training Loss: 4.7718


Evaluating: 100%|██████████| 23/23 [00:09<00:00,  2.36it/s]


Epoch 6 - Validation Loss: 4.9507


Training Epoch 7: 100%|██████████| 90/90 [01:04<00:00,  1.39it/s]


Epoch 7 - Training Loss: 4.6475


Evaluating: 100%|██████████| 23/23 [00:09<00:00,  2.41it/s]


Epoch 7 - Validation Loss: 4.8591


Training Epoch 8: 100%|██████████| 90/90 [01:03<00:00,  1.42it/s]


Epoch 8 - Training Loss: 4.5352


Evaluating: 100%|██████████| 23/23 [00:09<00:00,  2.32it/s]


Epoch 8 - Validation Loss: 4.7859


Training Epoch 9: 100%|██████████| 90/90 [01:04<00:00,  1.39it/s]


Epoch 9 - Training Loss: 4.4450


Evaluating: 100%|██████████| 23/23 [00:10<00:00,  2.18it/s]


Epoch 9 - Validation Loss: 4.7243


Training Epoch 10: 100%|██████████| 90/90 [01:06<00:00,  1.35it/s]


Epoch 10 - Training Loss: 4.3472


Evaluating: 100%|██████████| 23/23 [00:09<00:00,  2.32it/s]


Epoch 10 - Validation Loss: 4.6653


Training Epoch 11: 100%|██████████| 90/90 [01:07<00:00,  1.34it/s]


Epoch 11 - Training Loss: 4.2635


Evaluating: 100%|██████████| 23/23 [00:09<00:00,  2.32it/s]


Epoch 11 - Validation Loss: 4.6166


Training Epoch 12: 100%|██████████| 90/90 [01:06<00:00,  1.36it/s]


Epoch 12 - Training Loss: 4.1817


Evaluating: 100%|██████████| 23/23 [00:11<00:00,  1.94it/s]


Epoch 12 - Validation Loss: 4.5791


Training Epoch 13: 100%|██████████| 90/90 [01:07<00:00,  1.33it/s]


Epoch 13 - Training Loss: 4.1101


Evaluating: 100%|██████████| 23/23 [00:15<00:00,  1.44it/s]


Epoch 13 - Validation Loss: 4.5342


Training Epoch 14: 100%|██████████| 90/90 [01:06<00:00,  1.35it/s]


Epoch 14 - Training Loss: 4.0441


Evaluating: 100%|██████████| 23/23 [00:11<00:00,  2.02it/s]


Epoch 14 - Validation Loss: 4.5059


Training Epoch 15: 100%|██████████| 90/90 [01:06<00:00,  1.36it/s]


Epoch 15 - Training Loss: 3.9806


Evaluating: 100%|██████████| 23/23 [00:11<00:00,  1.96it/s]


Epoch 15 - Validation Loss: 4.4776


Training Epoch 16: 100%|██████████| 90/90 [01:06<00:00,  1.36it/s]


Epoch 16 - Training Loss: 3.9242


Evaluating: 100%|██████████| 23/23 [00:11<00:00,  2.08it/s]


Epoch 16 - Validation Loss: 4.4533


Training Epoch 17: 100%|██████████| 90/90 [01:06<00:00,  1.36it/s]


Epoch 17 - Training Loss: 3.8552


Evaluating: 100%|██████████| 23/23 [00:11<00:00,  2.03it/s]


Epoch 17 - Validation Loss: 4.4337


Training Epoch 18: 100%|██████████| 90/90 [01:05<00:00,  1.38it/s]


Epoch 18 - Training Loss: 3.8035


Evaluating: 100%|██████████| 23/23 [00:11<00:00,  2.01it/s]


Epoch 18 - Validation Loss: 4.4108


Training Epoch 19: 100%|██████████| 90/90 [01:06<00:00,  1.36it/s]


Epoch 19 - Training Loss: 3.7471


Evaluating: 100%|██████████| 23/23 [00:11<00:00,  2.00it/s]


Epoch 19 - Validation Loss: 4.3986


Training Epoch 20: 100%|██████████| 90/90 [01:06<00:00,  1.35it/s]


Epoch 20 - Training Loss: 3.6950


Evaluating: 100%|██████████| 23/23 [00:11<00:00,  1.97it/s]

Epoch 20 - Validation Loss: 4.3813





## Generate text from a prompt

In [176]:
prompt_text = "The rabbit"
# encode() returns a plain list of ids wrapped in the tokenizer's special
# tokens. The leading one is kept, since the training sequences were
# tokenized the same way. The trailing separator is dropped: in training it
# was followed only by padding, which the loss ignores, so the model never
# learned what comes after it.
prompt_tokens = tokenizer.encode(prompt_text)
if prompt_tokens and prompt_tokens[-1] == tokenizer.sep_token_id:
    prompt_tokens = prompt_tokens[:-1]

generated_text = model.generate_text(prompt_tokens, max_len=10, temperature=0.8)
print(f"Generated text: {generated_text}")

Generated text: the rabbit notice her face _ that
