# NLP Exercise 2: Neural Network
---

## Going from raw text to tokens

### Loading necessary libraries.

In [None]:
%pip install transformers
%pip install datasets
%pip install tokenizers
%pip install torch
%pip install tqdm

### Create a dataset

Using Datasets from Hugging Face could be useful when we create our own dataset.

In [2]:
import datasets

def create_dataset(raw_sentences: list[str]) -> datasets.Dataset:
    """
    Create a HuggingFace Dataset with a single string column named "text".

    Parameters:
        raw_sentences: the sentences to store, one per row. Any iterable of
            strings is accepted; it is materialised into a list first.

    Returns:
        A `datasets.Dataset` whose only column, "text", holds the sentences.
    """
    # Materialise the input so tuples and generators work as well as lists.
    dataset_dict = {"text": list(raw_sentences)}

    # Define the schema explicitly so the column is always declared as a string.
    dataset_features = datasets.Features({"text": datasets.Value("string")})

    # Create the dataset
    return datasets.Dataset.from_dict(dataset_dict, features=dataset_features)

  from .autonotebook import tqdm as notebook_tqdm


Read the book and extract sentences from it.

In [3]:
import re

with open('alice_in_wonderland.txt', 'r', encoding='utf-8') as file:
    story_text = file.read()

# Extract the story body: everything from the opening words of the narrative
# up to (not including) the closing "THE END" marker.
chapter_start = "Alice was beginning"
chapter_end = "THE END"
start_idx = story_text.find(chapter_start)
if start_idx == -1:
    # str.find returns -1 when the marker is absent; slicing from -1 would
    # silently produce the wrong text, so fail loudly instead.
    raise ValueError(f"Start marker {chapter_start!r} not found in the book text")
# Look for the end marker only after the start marker.
end_idx = story_text.find(chapter_end, start_idx)
if end_idx == -1:
    # No end marker: keep everything up to the end of the file.
    end_idx = len(story_text)
chapter_1_text = story_text[start_idx:end_idx].strip()

# Split into fragments.
# The regular expression splits at sentence-ending marks ('.', '!', '?') and
# also at '*', ';' and ',', so many "sentences" are really clauses.
split_sentences = re.split(r'[.!?*;,]', chapter_1_text)

# Keep only fragments with more than 2 words, stripped of surrounding whitespace
long_sentences = [sentence.strip() for sentence in split_sentences if len(sentence.split()) > 2]

# Collect the kept fragments in the list used by the rest of the notebook
raw_sentences = list(long_sentences)

In [4]:
# Wrap the extracted sentences in a Hugging Face Dataset (single "text" column).
own_dataset = create_dataset(raw_sentences)

# Convert to a pandas DataFrame and print it for a quick look at the rows.
print(own_dataset.to_pandas())

                                                   text
0     Alice was beginning to get very tired of sitti...
1     and of having nothing to do: once or twice she...
2        but it had no pictures or\nconversations in it
3                        “and what is the use of a book
4     ” thought Alice\n“without pictures or conversa...
...                                                 ...
3568  and make _their_ eyes bright and eager with ma...
3569  perhaps even with the dream of Wonderland of l...
3570      and find a pleasure in all\ntheir simple joys
3571                     remembering her own child-life
3572                         and the happy summer\ndays

[3573 rows x 1 columns]


### Tokenization

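Compare the WordPiece and WordLevel tokenizers: WordPiece can split a rare word into subword pieces (e.g. `sneez` + `##ing`), while WordLevel keeps each word as a single token.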

In [5]:
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast

Function to train WordLevel Tokenizer

In [6]:
def train_word_level_tokenizer(
        sentences: list[str],
        unk_token: str = "[UNK]",
        pad_token: str = "[PAD]",
        start_of_seq_token: str = "<s>",
        end_of_seq_token: str = "</s>",
        vocab_size: int = 1000
) -> PreTrainedTokenizerFast:
    """
    Fit a WordLevel tokenizer on `sentences` and wrap it for use with transformers.

    Parameters:
        sentences: training texts, passed to `train_from_iterator`.
        unk_token: token the WordLevel model uses for unknown words.
        pad_token: token used when padding is enabled.
        start_of_seq_token: token placed before every encoded sequence.
        end_of_seq_token: token placed after every encoded sequence.
        vocab_size: vocabulary size given to the trainer.

    Returns:
        A `PreTrainedTokenizerFast` wrapping the trained tokenizer.
    """
    # Special tokens are handed to the trainer in this order. The code below
    # uses each token's position in this list as its id.
    reserved_tokens = [unk_token, pad_token, start_of_seq_token, end_of_seq_token]
    pad_id = reserved_tokens.index(pad_token)
    bos_id = reserved_tokens.index(start_of_seq_token)
    eos_id = reserved_tokens.index(end_of_seq_token)

    word_level = Tokenizer(models.WordLevel(unk_token=unk_token))

    # NFD unicode normalisation followed by stripping surrounding whitespace
    word_level.normalizer = normalizers.Sequence(
        [normalizers.NFD(), normalizers.Strip()]
    )
    # Split each input with the Whitespace pre-tokenizer
    word_level.pre_tokenizer = pre_tokenizers.Whitespace()
    # Wrap every single sequence in the start/end tokens
    word_level.post_processor = TemplateProcessing(
        single=f"{start_of_seq_token} $A {end_of_seq_token}",
        special_tokens=[
            (start_of_seq_token, bos_id),
            (end_of_seq_token, eos_id),
        ],
    )

    word_level_trainer = trainers.WordLevelTrainer(
        vocab_size=vocab_size,
        special_tokens=reserved_tokens,
        show_progress=True,
    )
    word_level.train_from_iterator(sentences, trainer=word_level_trainer)

    word_level.enable_padding(pad_id=pad_id, pad_token=pad_token)

    return PreTrainedTokenizerFast(
        tokenizer_object=word_level,
        unk_token=unk_token,
        pad_token=pad_token,
        bos_token=start_of_seq_token,
        eos_token=end_of_seq_token,
    )

Function to train WordPiece Tokenizer

In [7]:
def train_wordpiece_tokenizer(
    sentences: list[str],
    unk_token: str = "[UNK]",
    pad_token: str = "[PAD]",
    start_of_seq_token: str = "<s>",
    end_of_seq_token: str = "</s>",
    vocab_size: int = 1000  # Set a smaller vocab size to force subword splits
) -> PreTrainedTokenizerFast:
    """
    Fit a WordPiece tokenizer on `sentences` and wrap it for use with transformers.

    Parameters:
        sentences: training texts, passed to `train_from_iterator`.
        unk_token: token the WordPiece model uses for unknown pieces.
        pad_token: token used when padding is enabled.
        start_of_seq_token: token placed before every encoded sequence.
        end_of_seq_token: token placed after every encoded sequence.
        vocab_size: vocabulary size given to the trainer.

    Returns:
        A `PreTrainedTokenizerFast` wrapping the trained tokenizer.
    """
    # Special tokens are handed to the trainer in this order. The code below
    # uses each token's position in this list as its id.
    reserved_tokens = [unk_token, pad_token, start_of_seq_token, end_of_seq_token]
    pad_id = reserved_tokens.index(pad_token)
    bos_id = reserved_tokens.index(start_of_seq_token)
    eos_id = reserved_tokens.index(end_of_seq_token)

    wordpiece = Tokenizer(models.WordPiece(unk_token=unk_token))

    # NFD unicode normalisation followed by stripping surrounding whitespace
    wordpiece.normalizer = normalizers.Sequence(
        [normalizers.NFD(), normalizers.Strip()]
    )
    # Split each input with the Whitespace pre-tokenizer
    wordpiece.pre_tokenizer = pre_tokenizers.Whitespace()
    # Wrap every single sequence in the start/end tokens
    wordpiece.post_processor = TemplateProcessing(
        single=f"{start_of_seq_token} $A {end_of_seq_token}",
        special_tokens=[
            (start_of_seq_token, bos_id),
            (end_of_seq_token, eos_id),
        ],
    )

    wordpiece_trainer = trainers.WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=reserved_tokens,
        show_progress=True,
    )
    wordpiece.train_from_iterator(sentences, trainer=wordpiece_trainer)

    wordpiece.enable_padding(pad_id=pad_id, pad_token=pad_token)

    return PreTrainedTokenizerFast(
        tokenizer_object=wordpiece,
        unk_token=unk_token,
        pad_token=pad_token,
        bos_token=start_of_seq_token,
        eos_token=end_of_seq_token,
    )


Train both tokenizers

In [8]:
# Fit both tokenizers on the same sentences, each with its function's default
# arguments, so their outputs can be compared directly.
word_level_tokenizer = train_word_level_tokenizer(raw_sentences)
wordpiece_tokenizer = train_wordpiece_tokenizer(raw_sentences)

Visualize how tokenizers work

In [11]:
import random

# Pick one random sentence and show how each tokenizer splits it.
# The upper bound is derived from the data, so this cell keeps working (and can
# reach every sentence) when the number of extracted sentences changes.
index = random.randint(0, len(raw_sentences) - 1)
sentence = raw_sentences[index]

# encode() returns token ids; convert_ids_to_tokens() maps them back to token strings
wordpiece_encoded_example = wordpiece_tokenizer.encode(sentence)
wordpiece_tokens_example = wordpiece_tokenizer.convert_ids_to_tokens(wordpiece_encoded_example)

wordlevel_encoded_example = word_level_tokenizer.encode(sentence)
wordlevel_tokens_example = word_level_tokenizer.convert_ids_to_tokens(wordlevel_encoded_example)

# Using WordPiece Tokenizer
print(f"Sentence: {sentence}")
print(f"WordPiece Tokens: {wordpiece_tokens_example}")
print(f"WordPiece Token IDs: {wordpiece_encoded_example}")
print()

# Using WordLevel Tokenizer
print(f"WordLevel Tokens: {wordlevel_tokens_example}")
print(f"WordLevel Token IDs: {wordlevel_encoded_example}")

Sentence: by the way the people near the door began sneezing all at once
WordPiece Tokens: ['<s>', 'by', 'the', 'way', 'the', 'people', 'near', 'the', 'door', 'began', 'sneez', '##ing', 'all', 'at', 'once', '</s>']
WordPiece Token IDs: [2, 360, 125, 350, 125, 889, 519, 125, 493, 356, 994, 130, 202, 187, 508, 3]

WordLevel Tokens: ['<s>', 'by', 'the', 'way', 'the', 'people', 'near', 'the', 'door', 'began', 'sneezing', 'all', 'at', 'once', '</s>']
WordLevel Token IDs: [2, 89, 4, 88, 4, 311, 268, 4, 158, 87, 612, 30, 25, 156, 3]


## Build RNN model based on different tokenizations.

In [12]:
import torch.nn as nn
import torch
import torch.optim as optim
import torch.nn.functional as F

Convert to PyTorch tensors

In [13]:
# Define a fixed maximum length for padding
# (with padding='max_length' and truncation=True every encoded sentence is
# requested at exactly this many token ids)
max_length = 20
# Tokenize the whole dataset, once per tokenizer, producing one list of ids per sentence
wordpiece_encoded = [wordpiece_tokenizer.encode(sentence, max_length=max_length, padding='max_length', truncation=True) for sentence in raw_sentences]
wordlevel_encoded = [word_level_tokenizer.encode(sentence, max_length=max_length, padding='max_length', truncation=True) for sentence in raw_sentences]

In [14]:
# Convert the nested id lists to integer tensors (torch.long is the dtype
# nn.Embedding expects for indices). This relies on every row having the same
# length, which the fixed-length padding/truncation above is meant to guarantee.
wordpiece_data = torch.tensor(wordpiece_encoded, dtype=torch.long)
wordlevel_data = torch.tensor(wordlevel_encoded, dtype=torch.long)

In [15]:
class RNNLanguageModel(nn.Module):
    """GRU language model: token embedding -> GRU -> linear projection to vocabulary logits."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """
        Args:
            vocab_size: number of distinct token ids (embedding rows and output logits).
            embedding_dim: size of each token embedding.
            hidden_dim: size of the GRU hidden state.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # batch_first=True: inputs and outputs are laid out as [batch, seq, features]
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """
        Args:
            x: token ids of shape [batch, seq_len].
            hidden: optional GRU hidden state to continue from; None lets the GRU
                start from its default initial state.

        Returns:
            (logits, hidden): logits of shape [batch, seq_len, vocab_size] and the
            GRU hidden state after the last time step.
        """
        gru_out, next_hidden = self.gru(self.embedding(x), hidden)
        return self.fc(gru_out), next_hidden

In [17]:
from torch.utils.data import DataLoader, TensorDataset

In [18]:
# Example batch size
batch_size = 32

# Create Tensor datasets
# Each dataset holds a single tensor, so every item (and every batch) is a 1-tuple.
wordpiece_dataset = TensorDataset(wordpiece_data)
wordlevel_dataset = TensorDataset(wordlevel_data)

# Make sure to shuffle your data to avoid sequence patterns during training
# num_workers=2 asks the DataLoader to load batches in two worker subprocesses.
wordpiece_loader = DataLoader(wordpiece_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
wordlevel_loader = DataLoader(wordlevel_dataset, batch_size=batch_size, shuffle=True, num_workers=2)

def train_model(model, data_loader, vocab_size, epochs=10, pad_token_id=None):
    """
    Train a next-token language model with Adam and cross-entropy loss.

    Each batch of token ids is split into inputs (all positions but the last)
    and targets (all positions but the first), so the model learns to predict
    token t+1 from the tokens up to t.

    Args:
        model: module called as `model(inputs)` that returns `(logits, hidden)`
            with logits of shape [batch, seq_len, vocab_size].
        data_loader: iterable with `len()` yielding 1-tuples `(token_ids,)` where
            `token_ids` has shape [batch, seq_len].
        vocab_size: size of the last logits dimension.
        epochs: number of passes over `data_loader`.
        pad_token_id: optional id of the padding token. When given, target
            positions holding this id are excluded from the loss. When None
            (default) every position counts, as before.

    Returns:
        None. The average loss of each epoch is printed.
    """
    # -100 is CrossEntropyLoss's default ignore_index and matches no real token id,
    # so leaving pad_token_id unset keeps every target in the loss.
    ignore_index = -100 if pad_token_id is None else pad_token_id
    criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

    num_batches = len(data_loader)

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for batch in data_loader:
            token_ids, = batch
            # Shift by one position: the target at step t is the input token at step t+1
            targets = token_ids[:, 1:].contiguous()
            inputs = token_ids[:, :-1].contiguous()

            optimizer.zero_grad()
            logits, _ = model(inputs)
            # Flatten [batch, seq, vocab] -> [batch*seq, vocab] to match the flat targets
            loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # An empty loader has no loss to average; report NaN instead of dividing by zero
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")


In [19]:
# Initialize models
# Both models share the same embedding and hidden sizes; only the vocabulary
# size (taken from each trained tokenizer) differs.
embedding_dim = 64
hidden_dim = 128
vocab_size_wordpiece = len(wordpiece_tokenizer.get_vocab())
vocab_size_wordlevel = len(word_level_tokenizer.get_vocab())
wordpiece_model = RNNLanguageModel(vocab_size_wordpiece, embedding_dim, hidden_dim)
wordlevel_model = RNNLanguageModel(vocab_size_wordlevel, embedding_dim, hidden_dim)


In [20]:
# Train the model on the WordPiece-tokenized data.
# NOTE(review): this run uses epochs=20 while the WordLevel run uses epochs=10,
# so the final losses of the two models are not directly comparable.
print("Training WordPiece Model:")
train_model(wordpiece_model, wordpiece_loader, vocab_size_wordpiece, epochs=20)

Training WordPiece Model:
Epoch 1/20, Loss: 3.8572
Epoch 2/20, Loss: 3.0665
Epoch 3/20, Loss: 2.8793
Epoch 4/20, Loss: 2.7321
Epoch 5/20, Loss: 2.5960
Epoch 6/20, Loss: 2.4772
Epoch 7/20, Loss: 2.3693
Epoch 8/20, Loss: 2.2760
Epoch 9/20, Loss: 2.1966
Epoch 10/20, Loss: 2.1259
Epoch 11/20, Loss: 2.0644
Epoch 12/20, Loss: 2.0061
Epoch 13/20, Loss: 1.9519
Epoch 14/20, Loss: 1.9032
Epoch 15/20, Loss: 1.8577
Epoch 16/20, Loss: 1.8127
Epoch 17/20, Loss: 1.7718
Epoch 18/20, Loss: 1.7317
Epoch 19/20, Loss: 1.6935
Epoch 20/20, Loss: 1.6536


In [21]:
# Train the model on the WordLevel-tokenized data.
# NOTE(review): this run uses epochs=10 while the WordPiece run uses epochs=20,
# so the final losses of the two models are not directly comparable.
print("Training WordLevel Model:")
train_model(wordlevel_model, wordlevel_loader, vocab_size_wordlevel, epochs=10)

Training WordLevel Model:
Epoch 1/10, Loss: 3.0236
Epoch 2/10, Loss: 2.1691
Epoch 3/10, Loss: 2.0183
Epoch 4/10, Loss: 1.9143
Epoch 5/10, Loss: 1.8363
Epoch 6/10, Loss: 1.7729
Epoch 7/10, Loss: 1.7179
Epoch 8/10, Loss: 1.6706
Epoch 9/10, Loss: 1.6286
Epoch 10/10, Loss: 1.5915


In [23]:
def generate_text(model, tokenizer, start_text, max_length=25, temperature=0.7):
    """
    Sample a continuation of `start_text` from a recurrent language model.

    The prompt is fed through the model once; afterwards only the newly sampled
    token is fed at each step, with the recurrent hidden state carried along.

    Args:
        model: module called as `model(input_ids, hidden)` that returns
            `(logits, hidden)` with logits of shape [batch, seq_len, vocab_size].
        tokenizer: object providing `encode(text) -> list[int]`,
            `decode(list[int]) -> str`, `pad_token_id` and `eos_token_id`.
        start_text: prompt to continue.
        max_length: maximum number of tokens to sample.
        temperature: softmax temperature; values below 1 sharpen the
            distribution, values above 1 flatten it. Must be positive.

    Returns:
        `start_text` followed by the decoded sampled tokens, separated by spaces.

    Raises:
        ValueError: if `temperature` is not positive or the prompt encodes to
            no tokens.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()

    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id

    tokens = list(tokenizer.encode(start_text))
    # encode() may append the end-of-sequence token. Conditioning on it would ask
    # the model what follows a finished sentence, so drop it from the prompt.
    if tokens and eos_id is not None and tokens[-1] == eos_id:
        tokens = tokens[:-1]
    if not tokens:
        raise ValueError("start_text produced no tokens to condition on")

    # Create the input tensors on the same device as the model's parameters
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    input_tensor = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)

    generated_text = start_text
    hidden = None

    # Inference only: no gradients are needed
    with torch.no_grad():
        for _ in range(max_length):
            logits, hidden = model(input_tensor, hidden)

            # Keep only the prediction for the last position
            logits = logits[:, -1, :] / temperature
            if pad_id is not None:
                # Never sample the padding token. Masking before the softmax keeps
                # the remaining probabilities a valid distribution.
                logits[:, pad_id] = float("-inf")
            probs = F.softmax(logits, dim=-1)

            next_token = torch.multinomial(probs, num_samples=1).item()

            if next_token == eos_id:
                break

            next_word = tokenizer.decode([next_token])
            generated_text += " " + next_word
            # Feed only the new token; the hidden state carries the history
            input_tensor = torch.tensor([[next_token]], dtype=torch.long, device=device)

    return generated_text


In [34]:
# Sample a continuation of the same prompt from each trained model.
# Sampling is random, so the generated text differs between runs.
start_text = "Alice"
print("Generated Text using WordPiece Model:")
print(generate_text(wordpiece_model, wordpiece_tokenizer, start_text))

print("\nGenerated Text using WordLevel Model:")
print(generate_text(wordlevel_model, word_level_tokenizer, start_text))

Generated Text using WordPiece Model:
Alice ##ed s ##ever ##el

Generated Text using WordLevel Model:
Alice shoes a very [UNK]


# LSTM Model

## Import Libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import numpy as np

## Load and Preprocess Data

In [None]:
# Load the training split of WikiText-2 (raw variant) from the Hugging Face hub
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

# Split into training and validation set
# A fixed seed makes the random 80/20 split identical on every run, so training
# and validation numbers are comparable after a kernel restart.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

# Visualize the dataset
dataset_df = dataset.to_pandas()
print(dataset_df.head())

## Initialize a pre-trained tokenizer

In [None]:
# Load the pretrained tokenizer; its padding id is reused later for label
# shifting and for the loss's ignore_index.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
padding_token_id = tokenizer.pad_token_id

# Example for using tokenizer with padding
# padding='longest' pads every sequence in the batch to the length of the longest one.
sequence = ["I am a student at Tampere University", "I live in Finland"]
model_inputs = tokenizer(sequence, padding='longest', truncation=True) # try with padding = 'max_length'
print(model_inputs['input_ids'])

## Tokenize and prepare data for language modeling

- The 'labels' are a shifted version of 'input_ids': the token at position 'i' in 'labels' is the token at position 'i+1' in 'input_ids'.
- The model's prediction at position 'i' should therefore match 'labels' at position 'i', i.e. the next token in 'input_ids'.

In [None]:
def tokenize_and_shift(examples):
    """
    Tokenize a batch of texts and attach next-token labels.

    Uses the module-level `tokenizer` and `padding_token_id`.

    Args:
        examples: batch mapping with a "text" entry holding the raw strings.

    Returns:
        The tokenizer output with an added "labels" entry. Each labels row is
        the matching "input_ids" row with its first id dropped and one padding
        id appended, so labels[t] is input_ids[t + 1].
    """
    # Pad or truncate every text to 128 token ids
    encoded = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

    # Shift one position to the left; the freed last slot is filled with padding
    encoded["labels"] = [row[1:] + [padding_token_id] for row in encoded["input_ids"]]
    return encoded

## Mapping the function to the Dataset

In [None]:
# Apply the function to both datasets
# batched=True passes a batch of rows to tokenize_and_shift at a time; the raw
# "text" column is dropped once it has been tokenized.
tokenized_train_dataset = train_dataset.map(tokenize_and_shift, batched=True, remove_columns=["text"])
tokenized_val_dataset = val_dataset.map(tokenize_and_shift, batched=True, remove_columns=["text"])

# Set the format for PyTorch
# Only these two columns are exposed as tensors.
# NOTE(review): the training and evaluation loops in this notebook read only
# batch["input_ids"]; LanguageModel.compute_loss derives its targets itself, so
# the "labels" column is currently unused.
tokenized_train_dataset.set_format("torch", columns=["input_ids", "labels"])
tokenized_val_dataset.set_format("torch", columns=["input_ids", "labels"])

## Create DataLoader

In [None]:
# Training batches are reshuffled every epoch; validation batches keep a fixed order.
train_loader = DataLoader(tokenized_train_dataset, batch_size=8, shuffle=True, num_workers=2)
val_loader = DataLoader(tokenized_val_dataset, batch_size=8, shuffle=False, num_workers=2)

## Define the Language Model Class

In [None]:
class LanguageModel(nn.Module):
    """LSTM language model: token embedding -> LSTM -> linear projection to vocabulary logits."""

    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int, padding_token_id: int):
        """
        Args:
            vocab_size: number of distinct token ids (embedding rows and output logits).
            embedding_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state.
            padding_token_id: id of the padding token; ignored by the loss and
                used as a stop signal during generation.
        """
        super().__init__()
        self.padding_token_id = padding_token_id

        # Model components
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.projection = nn.Linear(hidden_dim, vocab_size)

        # Loss function, ignoring padding tokens
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=padding_token_id)

    def forward(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Compute the next-token loss and the logits for a batch of token ids.

        Args:
            input_ids: token ids of shape [batch_size, seq_len].

        Returns:
            (loss, logits): the scalar loss and logits of shape
            [batch_size, seq_len, vocab_size].
        """
        logits = self.predict_logits(input_ids)
        loss = self.compute_loss(logits, input_ids)
        return loss, logits

    def predict_logits(self, input_ids: torch.Tensor) -> torch.Tensor:
        """
        Embed the token ids, run them through the LSTM and project every time
        step onto the vocabulary.

        Args:
            input_ids (torch.Tensor): token ids of shape [batch_size, seq_len].

        Returns:
            torch.Tensor: logits of shape [batch_size, seq_len, vocab_size].
        """
        embeddings = self.embedding(input_ids)
        rnn_output, _ = self.rnn(embeddings)
        logits = self.projection(rnn_output)
        return logits

    def compute_loss(self, logits: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
        """
        Cross-entropy between the prediction at each position and the token that
        follows it in `input_ids`.

        Args:
            logits: tensor of shape [batch_size, seq_len, vocab_size].
            input_ids: token ids of shape [batch_size, seq_len].

        Returns:
            torch.Tensor: scalar loss; targets equal to the padding id are ignored.
        """
        # The last position has no next token, so its prediction is dropped;
        # the first token is never a target.
        logits = logits[:, :-1, :].contiguous().view(-1, logits.size(-1))
        target_ids = input_ids[:, 1:].contiguous().view(-1)
        loss = self.loss_fn(logits, target_ids)
        return loss

    def generate_text(self, prompt, max_len=20, temperature=1.0, eos_token_id=None):
        """
        Generate text by extending a prompt one sampled token at a time.

        Decoding uses the module-level `tokenizer`.

        Args:
            prompt (list[int]): token ids to start from; must not be empty.
            max_len (int, optional): maximum number of tokens to generate.
            temperature (float, optional): controls the randomness of sampling.
                Lower values make the model more deterministic, higher values
                make it more diverse. Must be positive.
            eos_token_id (int, optional): if given, generation also stops when
                this token id is sampled. By default only the padding token stops it.

        Returns:
            str: the prompt plus the generated tokens, decoded with special
            tokens skipped.

        Raises:
            ValueError: if `prompt` is empty or `temperature` is not positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive")
        generated_tokens = list(prompt)
        if not generated_tokens:
            raise ValueError("prompt must contain at least one token id")

        # Switch to evaluation mode
        self.eval()
        # Build the input on the same device as the model's weights
        device = self.embedding.weight.device
        input_ids = torch.tensor([generated_tokens], dtype=torch.long, device=device)

        # Disable gradient computation
        with torch.no_grad():
            # Generate tokens in a loop
            for _ in range(max_len):
                logits = self.predict_logits(input_ids)
                # Keep only the logits of the last position
                next_token_logits = logits[:, -1, :] / temperature

                # Convert logits to probabilities and sample one token id
                probabilities = F.softmax(next_token_logits, dim=-1).squeeze(0)
                next_token = torch.multinomial(probabilities, 1).item()

                # End condition
                if next_token == self.padding_token_id:
                    break
                if eos_token_id is not None and next_token == eos_token_id:
                    break

                generated_tokens.append(next_token)
                next_column = torch.tensor([[next_token]], dtype=torch.long, device=device)
                input_ids = torch.cat([input_ids, next_column], dim=1)

        # Convert tokens to text
        generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
        return generated_text

## Initialize the model, optimizer, and training loop

In [None]:
# Hyper-parameters for the LSTM language model.
# The vocabulary size comes from the pretrained tokenizer.
vocab_size = tokenizer.vocab_size
embedding_dim = 64
hidden_dim = 128
epochs = 3

model = LanguageModel(vocab_size, embedding_dim, hidden_dim, padding_token_id)
# Adam with a small weight decay for regularisation
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

## Function to evaluate the model

In [None]:
def evaluate_model(model, dataloader):
    """
    Evaluate a language model on every batch of `dataloader`.

    Args:
        model: module called as `model(input_ids)` that returns `(loss, logits)`.
        dataloader: iterable with `len()` whose batches are mappings with an
            "input_ids" entry.

    Returns:
        (avg_loss, perplexity): the mean of the per-batch losses and its exponential.
    """
    model.eval()
    num_batches = len(dataloader)
    batch_losses = []

    # Evaluation only: no gradients are needed
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch_loss, _ = model(batch["input_ids"])
            batch_losses.append(batch_loss.item())

    # Perplexity is the exponential of the average loss
    avg_loss = sum(batch_losses) / num_batches
    return avg_loss, np.exp(avg_loss)

## Training loop

In [None]:
# One pass over the training data per epoch, followed by an evaluation on the
# validation set.
for epoch in range(epochs):
    # evaluate_model() switches the model to eval mode, so re-enable training mode every epoch
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch["input_ids"]
        optimizer.zero_grad()

        # The model computes the next-token loss from input_ids itself
        loss, _ = model(input_ids)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Training Loss: {avg_train_loss:.4f}")

    # Run evaluation
    val_loss, val_perplexity = evaluate_model(model, val_loader)
    print(f"Epoch {epoch+1} - Validation Loss: {val_loss:.4f}, Perplexity: {val_perplexity:.4f}")

## Generate text from a prompt

In [None]:
prompt_text = ""
# Build the prompt as [CLS] + prompt tokens, without the trailing [SEP] that
# tokenizer.encode() adds by default. Ending the prompt with [SEP] would ask the
# model to continue after an end-of-sequence marker. encode() already returns a
# plain list of ids, so no tensor round-trip is needed.
prompt_tokens = [tokenizer.cls_token_id] + tokenizer.encode(prompt_text, add_special_tokens=False)

generated_text = model.generate_text(prompt_tokens, max_len=50, temperature=0.8)
print(f"Generated text: {generated_text}")

# Text Generation using transformers 

In [None]:
from transformers import pipeline

# Build a text-generation pipeline around the pretrained 'distilgpt2' checkpoint.
generator = pipeline('text-generation', model='distilgpt2')
# The call is the last expression of the cell, so the notebook displays its return value.
generator(
    "In this course, we will teach you about",
    max_length=30,
    truncation=True,
    num_return_sequences=1,
    # Use the end-of-sequence id as the padding id.
    # NOTE(review): presumably because this tokenizer defines no pad token — confirm.
    pad_token_id=generator.tokenizer.eos_token_id
)