# Deep Learning Applications: Laboratory #2

In this laboratory we studied large language models (LLMs) and some of their uses.

## Exercise 1: Basic GPT model
For this exercise we implement a simple character-level GPT-style language model. It is trained on a text such as Dante's Inferno, tokenized into individual characters, and then used to generate new text that is similar in style and wording.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import wandb

In [None]:
class TextDataset:
    """Character-level dataset built from a single UTF-8 text file.

    The vocabulary is the sorted set of distinct characters of the *whole*
    file; it is built before the train/validation split, so both splits share
    the same character <-> index mapping.

    Args:
        filename: Path of the text file to read.
        train: If True keep the first ``train_size`` fraction of the
            characters, otherwise keep the remaining ones.
        train_size: Fraction of the characters assigned to the training split.
        block_size: Length (in tokens) of every sampled context window.
    """

    def __init__(self, filename, train=True, train_size=0.9, block_size=8):
        self.block_size = block_size
        self.train = train

        with open(filename, 'r', encoding='utf-8') as f:
            raw_data = f.read()

        self.tokens = sorted(set(raw_data))
        self.num_tokens = len(self.tokens)
        self.char2idx = {c: i for i, c in enumerate(self.tokens)}
        self.idx2char = {i: c for c, i in self.char2idx.items()}

        split_idx = int(len(raw_data) * train_size)
        raw_data = raw_data[:split_idx] if train else raw_data[split_idx:]

        self.data = torch.tensor(self.encode(raw_data), dtype=torch.long)

    def encode(self, text):
        """Return the list of vocabulary indices for the characters of ``text``."""
        return [self.char2idx[c] for c in text]

    def decode(self, indices):
        """Return the string spelled by an iterable of vocabulary indices."""
        return ''.join(self.idx2char[i] for i in indices)

    def get_batch(self, batch_size):
        """Sample ``batch_size`` random windows and their next-token targets.

        Returns:
            A pair ``(x, y)`` of ``(batch_size, block_size)`` long tensors moved
            to the module-level ``device``; ``y`` is ``x`` shifted one token to
            the right.

        Raises:
            ValueError: If the split holds fewer than ``block_size + 1`` tokens.
        """
        # len(self) already excludes the last block_size positions, so every
        # start index in [0, len(self)) leaves room for x and the shifted y.
        # Subtracting block_size a second time would never sample the final
        # windows and would break on short splits.
        num_windows = len(self)
        if num_windows <= 0:
            raise ValueError(
                f"Need at least block_size + 1 = {self.block_size + 1} tokens, "
                f"got {len(self.data)}"
            )
        idx = torch.randint(0, num_windows, (batch_size,))
        x = torch.stack([self.data[i:i+self.block_size] for i in idx])
        y = torch.stack([self.data[i+1:i+self.block_size+1] for i in idx])
        return x.to(device), y.to(device)

    def __len__(self):
        """Number of valid window start positions (never negative)."""
        return max(0, len(self.data) - self.block_size)

In [None]:
class SelfAttentionHead(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Args:
        embed_dim: Size of the last dimension of the input.
        head_size: Size of the key/query/value projections and of the output.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, embed_dim, head_size, dropout=0.2):
        super().__init__()
        self.key = nn.Linear(embed_dim, head_size, bias=False)
        self.query = nn.Linear(embed_dim, head_size, bias=False)
        self.value = nn.Linear(embed_dim, head_size, bias=False)
        # Registered as a buffer so the mask follows the module through
        # .to(device)/.cuda(); non-persistent so state_dict keys stay the same
        # as when it was a plain attribute.
        self.register_buffer(
            "tril", torch.tril(torch.ones(embed_dim, embed_dim)), persistent=False
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape ``(B, T, embed_dim)``; returns ``(B, T, head_size)``."""
        _, T, _ = x.shape
        k, q, v = self.key(x), self.query(x), self.value(x)
        # Scale by the key dimension (head_size), not by the embedding size.
        attn_scores = (q @ k.transpose(-2, -1)) * (k.shape[-1] ** -0.5)
        if T <= self.tril.shape[0]:
            causal = self.tril[:T, :T]
        else:
            # The cached mask is embed_dim x embed_dim; build a larger one on
            # the fly for longer sequences instead of failing to broadcast.
            causal = torch.tril(torch.ones(T, T, device=x.device))
        # Position t may only attend to positions <= t.
        attn_scores = attn_scores.masked_fill(causal == 0, float('-inf'))
        attn_probs = F.softmax(attn_scores, dim=-1)
        attn_probs = self.dropout(attn_probs)
        return attn_probs @ v

In [None]:
class MultiHeadAttention(nn.Module):
    """Run several SelfAttentionHead modules in parallel and merge their outputs.

    The head outputs are concatenated along the last dimension, projected back
    to ``embed_dim`` by a linear layer and passed through dropout.
    """

    def __init__(self, embed_dim, num_heads, head_size, dropout=0.2):
        super().__init__()
        heads = []
        for _ in range(num_heads):
            heads.append(SelfAttentionHead(embed_dim, head_size, dropout))
        self.heads = nn.ModuleList(heads)
        concat_dim = num_heads * head_size
        self.proj = nn.Linear(concat_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to ``x`` and return the projected concatenation."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

In [None]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x the width, ReLU, project back, dropout."""

    def __init__(self, embed_dim, dropout=0.2):
        super().__init__()
        hidden_dim = 4 * embed_dim
        expand = nn.Linear(embed_dim, hidden_dim)
        contract = nn.Linear(hidden_dim, embed_dim)
        # Same layer order as before so the state_dict keys (net.0, net.2) match.
        self.net = nn.Sequential(expand, nn.ReLU(), contract, nn.Dropout(dropout))

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        out = self.net(x)
        return out

In [None]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual.

    Every attention head gets ``embed_dim // num_heads`` channels.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.attn = MultiHeadAttention(embed_dim, num_heads, embed_dim // num_heads)
        self.ffwd = FeedForward(embed_dim)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # LayerNorm is applied before each sub-layer, then the result is added back.
        attended = x + self.attn(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

In [None]:
class GPTModel(nn.Module):
    """Small decoder-only transformer language model.

    Args:
        vocab_size: Number of distinct tokens.
        block_size: Maximum context length (number of learned positions).
        embed_dim: Width of the token and position embeddings.
        num_heads: Attention heads per transformer block.
        num_layers: Number of stacked transformer blocks.
    """

    def __init__(self, vocab_size, block_size, embed_dim, num_heads, num_layers):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Embedding(block_size, embed_dim)
        self.blocks = nn.Sequential(*[TransformerBlock(embed_dim, num_heads) for _ in range(num_layers)])
        self.ln_f = nn.LayerNorm(embed_dim)
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: ``(B, T)`` long tensor of token indices, ``T <= block_size``.
            targets: Optional ``(B, T)`` long tensor of target indices.

        Returns:
            ``(logits, loss)`` where ``logits`` is ``(B, T, vocab_size)`` and
            ``loss`` is a scalar tensor, or None when ``targets`` is None.
        """
        _, T = idx.shape
        # Build the positions on the input's own device instead of relying on
        # a module-level ``device`` global being defined and in sync.
        positions = torch.arange(T, device=idx.device)
        x = self.embed(idx) + self.pos_embed(positions)
        x = self.blocks(x)
        logits = self.lm_head(self.ln_f(x))
        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.reshape(-1, logits.shape[-1]), targets.reshape(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_tokens):
        """Autoregressively sample ``max_tokens`` new tokens after ``idx``.

        Sampling runs in eval mode (dropout off) and without autograd; the
        previous training mode is restored afterwards.

        Args:
            idx: ``(B, T)`` long tensor holding the starting context.
            max_tokens: Number of tokens to append.

        Returns:
            ``(B, T + max_tokens)`` long tensor: the context plus the samples.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_tokens):
                # Only the last block_size tokens have a position embedding.
                idx_cond = idx[:, -self.pos_embed.num_embeddings:]
                logits, _ = self(idx_cond)
                probs = F.softmax(logits[:, -1, :], dim=-1)
                idx_next = torch.multinomial(probs, 1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

In [None]:
def train_model(model, optimizer, dataset, epochs=5000, eval_interval=500, batch_size=6):
    """Train ``model`` on random batches from ``dataset`` and log to Weights & Biases.

    Args:
        model: Module called as ``model(X, Y)`` and returning ``(logits, loss)``.
        optimizer: Optimizer already bound to the model's parameters.
        dataset: Object exposing ``get_batch(batch_size) -> (X, Y)``.
        epochs: Number of optimisation steps (one random batch per step).
        eval_interval: Report and log the loss every this many steps.
        batch_size: Number of sequences per batch.

    The wandb run config is read from the module-level ``block_size``,
    ``num_heads``, ``embed_dim``, ``num_layers`` and ``lr`` variables, which
    must be defined before this function is called.
    """
    wandb.init(project="dante-model", config={
        "block_size": block_size,
        "num_heads": num_heads,
        "embed_dim": embed_dim,
        "num_layers": num_layers,
        "learning_rate": lr,
    })
    wandb.watch(model, log="all")

    try:
        for epoch in tqdm(range(epochs)):
            if epoch % eval_interval == 0:
                # Report the loss with dropout disabled and without building an
                # autograd graph, then restore the previous mode.
                was_training = model.training
                model.eval()
                with torch.no_grad():
                    X, Y = dataset.get_batch(batch_size)
                    _, loss = model(X, Y)
                model.train(was_training)
                print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
                wandb.log({"epoch": epoch, "loss": loss.item()})

            X, Y = dataset.get_batch(batch_size)
            optimizer.zero_grad()
            _, loss = model(X, Y)
            loss.backward()
            optimizer.step()
    finally:
        # Close the run even when training is interrupted or raises.
        wandb.finish()

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters; train_model() also reads these names for its wandb config.
block_size = 64
num_heads = 4
embed_dim = 128
num_layers = 6
lr = 3e-4

ds_train = TextDataset("1ddcd09.txt", train=True, block_size=block_size)

model = GPTModel(
    ds_train.num_tokens,
    block_size=block_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    num_layers=num_layers,
)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
train_model(model, optimizer, ds_train)

In [None]:
# Check an example of generated text: sample 500 tokens starting from token index 0.
start_tokens = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(start_tokens, 500)
print(ds_train.decode(generated[0].tolist()))

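We trained a couple of versions of the model to see how the generated text evolves with longer training.
After training for **1,000** iterations (one random batch per iteration, called "epochs" in the code) we obtained text such as: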

"besti solcitomerso, que' rivant'aIffio.

Marli,  mandima te lede che saffraste' ste.

Obborda de di fuo, non setch'atto si
  XVer eregla fic'io piu' du balchio sosto
  mosco con fosti pere naddi il porge
  fe' ridieglinti al e rpietta da che alu coi.

Non de son danzi o sestartei quar zaveastro.

L'a gio te sbenta, so tui e fesu osci.

; luro in a vio` ch'ovessi' chia Frezzia,

non  che abbuosiana e ducia chi me ssoppre>>.

Tianteco e s'arla ior cae` ritaliora,
  parsosero a tenindo e' olora"

At first glance this text looks similar to the Divina Commedia, but on closer inspection many things do not make sense, such as the punctuation and some of the words.

For **5.000** epochs we get something like:

"mi richio in ciel chiusto ' l'abbialio tanta,
  qual esso more or pozzo i sei gento,

di' voltrova si coneo>>, ch'a s'elli avessa
  nom'io lamor Diegon con suoi rivolse

Chi fron fonde addo' ora gi,
  e prel Nascino uscimo a rice, ai, compagne
  e voniziol non sono` in monestro:
  e; che' semmo ingogno i parti>>.

Io la bella cosa;
  e 'l lanzo ma sua sigua li spatti.

L'acque a Dio avina credera loce,
  buon hanno del podo indio, andar s'ogne.

E non la patica leodia, e ul Zabra:
  dal biatto"

The wording is much better, even if still not perfect.

## Exercise 2: Working with LLMs

In this exercise we will see how to use Hugging Face models and datasets.

## Exercise 2.1: Text tokenization

The key classes to work with when using GPT-2 for text generation are GPT2Tokenizer and GPT2LMHeadModel. 
The GPT2Tokenizer encodes raw text into sub-word tokens. These tokens are then mapped to integer IDs that GPT-2 can process. For text generation the model must also include the language modeling head. This head is attached on top of the final hidden layer of the architecture and enables the model to generate text by predicting the next token in a sequence based on the context.

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

# Sample input text.
input_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."

# Encode the text into sub-word token IDs (a batch holding one sequence).
inputs = tokenizer(input_text, return_tensors="pt")
token_ids = inputs["input_ids"]
num_chars = len(input_text)
num_tokens = token_ids.shape[1]

print("Input text:", input_text)
print("Length of input text (characters):", num_chars)
print("Token IDs:", token_ids)
print("Length of encoded sequence (tokens):", num_tokens)

# Decode the token IDs back to text.
first_sequence = token_ids[0]
decoded_text = tokenizer.decode(first_sequence, skip_special_tokens=True)
print("Decoded text:", decoded_text)

We get these lengths for the raw and the encoded text:
Length of input text (characters): 445
Length of encoded sequence (tokens): 153

Some of the reasons why the encoded sequence is so much shorter than the character count are:
* many short words map directly to single tokens
* spaces and punctuation are handled efficiently
  
If the text contains rare words or complex structures, the token count would be higher relative to the character count.

## Exercise 2.2: Generating Text

In this exercise we instantiate a pre-trained GPT2LMHeadModel and use its built-in generate() method to generate text given a prompt.


In [None]:
import torch
# Fixed the garbled "import" keyword, which made this cell a syntax error.
from transformers import GPT2Tokenizer, GPT2LMHeadModel

We then will use a new function generate_text() that given an input and the model, will return a new phrase. Some of the parameters to be set in this function are:
* do_sample: whether to use sampling or not, meaning greedy or probabilistic approach.
* temperature: the creativity level, with lower temperature it becomes more deterministic.

In [None]:
# Tokenizer and language-model weights come from the same Hub checkpoint.
gpt2_checkpoint = "openai-community/gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_checkpoint)
model = GPT2LMHeadModel.from_pretrained(gpt2_checkpoint)

def generate_text(prompt, max_length=50, do_sample=False, temperature=1.0):
    """Continue ``prompt`` with the module-level GPT-2 ``model`` and ``tokenizer``.

    Args:
        prompt: Text the generation starts from.
        max_length: Maximum total length in tokens, prompt included.
        do_sample: If False use greedy decoding, otherwise sample from the
            predicted distribution.
        temperature: Sampling temperature; only used when ``do_sample`` is True.

    Returns:
        The decoded text (prompt plus continuation) without special tokens.
    """
    # Keep the inputs on the same device as the model.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {
        # Pass the mask explicitly; generate() otherwise has to infer it.
        "attention_mask": encoded["attention_mask"],
        "max_length": max_length,
        "do_sample": do_sample,
        # GPT-2 defines no pad token; reuse EOS, which generate() would
        # otherwise fall back to with a warning.
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Temperature has no effect on greedy decoding, so only forward it
        # when sampling.
        generation_kwargs["temperature"] = temperature

    output_ids = model.generate(encoded["input_ids"], **generation_kwargs)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
prompt = "The dinner is"

# Deterministic baseline: greedy decoding.
greedy_text = generate_text(prompt, do_sample=False)
print("\nGreedy decoding:")
print(greedy_text)

# Sampling at a low and at a moderate temperature.
for temperature in (0.2, 0.7):
    sampled_text = generate_text(prompt, do_sample=True, temperature=temperature)
    print(f"\nSampled decoding (temperature={temperature}):")
    print(sampled_text)

The result phrases that we got are:

*Greedy decoding*:
The dinner is a bit of a mess, but I'm glad I did it. I'm glad I did it. I'm glad I did it. I'm glad I did it. I'm glad I did it. I'm glad I did

*Sampled decoding (temperature=0.2)*:
The dinner is a bit of a mess. I'm not sure if it's because I'm not a good cook or if I'm just not good at it. I'm not sure if I'm a good cook. I'm not sure if I

*Sampled decoding (temperature=0.7)*:
The dinner is hosted by the University of Ottawa's School of Family and Community Studies. The event is also attended by Canada's first-ever U.S. ambassador to Canada, William J. St. John, and the United States'


We can see that when using the default settings of the generate function, so the greedy approach, our phrase repeats the same 5 or so words, such as "I'm glad I did it." likely because it is often used in a sentence of this type. Another thing to note is that the phrase generated is always the same because the greedy behaviour does not have any randomness in it to change the possible outcome.

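In our examples a higher *temperature* gave a better result: with 0.7 the text is more creative and less repetitive than with a small temperature such as 0.2, which stays close to the greedy output. This holds only within a moderate range, since very high temperatures make the text increasingly random.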

If we want to generate good phrases that are connected and make sense together we should not use the greedy approach, instead we should sample to create new sentences.

## Exercise 3: Text classification using an LLM

In this exercise, we used the pre-trained language model DistilBERT for a Natural Language Processing task. Since DistilBERT outputs a hidden state for the special classification ([CLS]) token at the start of every sequence, we used that vector directly as the feature representation of each text. The goal is to select a moderately sized dataset, such as IMDb, and train a logistic regression model on these features to perform sentiment classification.

In [None]:
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import numpy as np
import wandb
import time
import random
from sklearn.linear_model import LogisticRegression

In [None]:
dataset = load_dataset("imdb")
train_split, test_split = dataset["train"], dataset["test"]

# Review texts and their labels as NumPy arrays, per split.
x_train, y_train = train_split["text"], np.array(train_split["label"])
x_test, y_test = test_split["text"], np.array(test_split["label"])

In [None]:
model_name = "distilbert-base-uncased"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)
model = model.to(device)
# The model is only used for inference here, so switch it to eval mode.
model.eval()

In [None]:
# Open the W&B run that receives the feature-extraction timings.
wandb.init(
    project="imdb-feature-extraction",
    name="distilbert-cpu",
)

def extract_features(texts, batch_size=4, max_length=32):
    """Encode ``texts`` with the module-level DistilBERT and return first-token embeddings.

    Args:
        texts: Sequence of strings; sliced in chunks of ``batch_size``.
        batch_size: Number of texts encoded per forward pass.
        max_length: Texts are truncated to at most this many tokens.

    Returns:
        A 2-D NumPy array with one row per text, holding the last hidden state
        at sequence position 0 (the classification token).

    Per-batch and total extraction times are logged to the active wandb run.
    """
    features = []
    # perf_counter is monotonic, so durations are unaffected by clock changes.
    start_time = time.perf_counter()

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i : i + batch_size]
        batch_start_time = time.perf_counter()

        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Move to host memory first: .numpy() raises on a CUDA tensor.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        features.append(cls_embeddings)

        batch_time = time.perf_counter() - batch_start_time
        wandb.log({"batch_time (s)": batch_time, "batch_index": i // batch_size})

    total_time = time.perf_counter() - start_time
    wandb.log({"total_extraction_time (s)": total_time})

    return np.vstack(features)

# Extract features for the training split first, then for the test split.
train_features, test_features = (
    extract_features(split) for split in (x_train, x_test)
)

wandb.finish()

In [None]:
# Fit a linear classifier on the extracted features and score it on the test split.
clf = LogisticRegression(max_iter=1000)
clf.fit(train_features, y_train)

y_pred = clf.predict(test_features)
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")

After extracting the features with the pre-trained DistilBERT model (which is not fine-tuned here) and fitting the classifier on them, we get a Logistic Regression accuracy of 0.72.


In [None]:
num_samples = 5
sample_indices = random.sample(range(len(x_test)), num_samples)

separator = "-" * 80
print("\nSample Predictions:")
for idx in sample_indices:
    # Show only the first 200 characters of each review.
    snippet = x_test[idx][:200]
    predicted, actual = y_pred[idx], y_test[idx]
    print(f"Text: {snippet}...")
    print(f"Predicted Label: {predicted} | Actual Label: {actual}")
    print(separator)

Using the trained model and classifier these are some of the sample predictions that we got:

Text: Plot in a nutshell - Duchess (voice of Eva Gabor) is the well polished single mother cat of three little kittens. When their owner, the wealthy elderly woman known as Madame Adelaide, realizes that he...
Predicted Label: 1 | Actual Label: 1

Text: 2:37 is an intense and fascinating drama which has some similarities in tone and subject with films like Bully, Elephant and Kids (although, by my point of view, 2:37 is a superior film to those three...
Predicted Label: 1 | Actual Label: 1

Text: Set in 1962 Hong Kong (in turbulent times, as we are informed), this extremely intimate story of a failed romance between a two married people tied to their traditions manages to recall the essence of...
Predicted Label: 1 | Actual Label: 1

Text: I honestly don't know where to begin when reviewing a movie as pathetic as Ernest Goes to Africa. Aside from two or three good laughs dispersed throughout the film, there is nothing positive about thi...
Predicted Label: 0 | Actual Label: 0

Text: In my analysis of "Trois couleurs: Blanc" I wrote that its tone is much lighter than the tone of "Trois couleurs: Bleu". I think it's the same with this film. This time it's not because of a tragic co...
Predicted Label: 1 | Actual Label: 1

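This means that in these five cases the classifier was always correct. Its overall accuracy can still be improved, for example by feeding DistilBERT more than the first 32 tokens of each review (the `max_length` used during feature extraction), or by training for longer, i.e. fine-tuning DistilBERT itself instead of only fitting a classifier on frozen features.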