### Load results from previous lectures

In [None]:
%%capture
%run -n ../full_achitecture_and_loaded_parameters/gpt2_small_124M.ipynb

In [2]:
# NOTE(review): `torch` and `tiktoken` are used here before the import cell
# below runs; presumably they are brought into scope by the notebook executed
# with %run above -- confirm.
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seed shared by the pandas sampling calls and the torch.manual_seed calls below.
m_seed = 123
# tiktoken's "gpt2" encoding, used for every text-to-token-id conversion below.
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
import urllib.request
import ssl
import zipfile
import os
from pathlib import Path

import pandas as pd
from torch.utils.data import Dataset, DataLoader

import torch
import torch.nn as nn
import tiktoken

import time
import matplotlib.pyplot as plt

### Data preparation

In [4]:
# Location of the zipped SMS spam collection to download.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
# Local file the downloaded archive is written to.
zip_path = "sms_spam_collection.zip"
# Directory the archive is extracted into.
extracted_path = "sms_spam_collection"
# Final name of the extracted data file (renamed to carry a .tsv suffix).
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

In [5]:
def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download the zipped data set, unpack it and rename the data file.

    Args:
        url: Address of the zip archive to download.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: ``pathlib.Path`` of the final data file. When it
            already exists, nothing is downloaded or extracted.
    """
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # NOTE(review): certificate verification is disabled for this request;
    # confirm this is intentional before reusing the helper elsewhere.
    insecure_context = ssl._create_unverified_context()

    # Fetch the archive and store it on disk.
    with urllib.request.urlopen(url, context=insecure_context) as response, \
            open(zip_path, "wb") as archive_file:
        archive_file.write(response.read())

    # Unpack everything into the target directory.
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extracted_path)

    # Rename the extracted "SMSSpamCollection" file to the requested path.
    Path(extracted_path).joinpath("SMSSpamCollection").rename(data_file_path)
    print(f"file downloaded and saved as {data_file_path}")

In [None]:
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

In [None]:
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
df

In [None]:
print(df["Label"].value_counts())

In [9]:
def create_balanced_dataset(df):
    """Downsample the "ham" rows so both labels occur equally often.

    Args:
        df: DataFrame with a "Label" column holding "spam" / "ham" strings.

    Returns:
        DataFrame made of the sampled "ham" rows followed by all "spam" rows.
    """
    spam_rows = df[df["Label"] == "spam"]
    ham_rows = df[df["Label"] == "ham"]

    # Draw exactly as many "ham" rows as there are "spam" rows (seeded).
    sampled_ham = ham_rows.sample(len(spam_rows), random_state=m_seed)

    return pd.concat([sampled_ham, spam_rows])

In [None]:
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

In [11]:
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

In [12]:
def random_split(df, train_frac, validation_frac):
    """Shuffle ``df`` and cut it into train, validation and test parts.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows assigned to the training part.
        validation_frac: Fraction of rows assigned to the validation part.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``; the test part receives
        every row left over after the first two cuts.
    """
    # Seeded shuffle of all rows, with a fresh 0..n-1 index.
    shuffled = df.sample(frac=1, random_state=m_seed).reset_index(drop=True)

    total_rows = len(shuffled)
    train_end = int(total_rows * train_frac)
    validation_end = train_end + int(total_rows * validation_frac)

    return (
        shuffled[:train_end],
        shuffled[train_end:validation_end],
        shuffled[validation_end:],
    )

In [13]:
train_df, validation_df, test_df = random_split(balanced_df, train_frac=0.7, validation_frac=0.1)

In [14]:
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

In [15]:
class SpamDataset(Dataset):
    """Tokenized text-classification dataset backed by a CSV file.

    The CSV must provide a "Text" and a "Label" column. Every text is encoded
    with ``tokenizer``, cut to ``max_length`` tokens and right-padded with
    ``pad_token_id`` so that all samples share the same length.

    Args:
        csv_file: Path of the CSV file to load.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        max_length: Target sequence length. ``None`` uses the length of the
            longest encoded text in this file.
        pad_token_id: Token id appended to texts shorter than the target length.
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        super().__init__()
        self.data = pd.read_csv(csv_file)

        self.encoded_texts = [tokenizer.encode(text) for text in self.data["Text"]]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length

        # Truncate and pad in one pass. When max_length was derived from the
        # data the slice is a no-op; for over-long texts the pad count is
        # negative, which yields an empty padding list.
        self.encoded_texts = [
            tokens[:self.max_length] + [pad_token_id] * (self.max_length - len(tokens))
            for tokens in self.encoded_texts
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for one sample as ``torch.long`` tensors."""
        encoded = self.encoded_texts[index]
        # Index the label column directly instead of materializing the whole row.
        label = self.data["Label"].iloc[index]

        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Return the length of the longest encoded text (0 when there is none)."""
        return max((len(tokens) for tokens in self.encoded_texts), default=0)


In [None]:
train_dataset = SpamDataset(
    csv_file="train.csv",
    max_length=None,
    tokenizer=tokenizer
)

print(train_dataset.max_length)

In [17]:
validation_dataset = SpamDataset(
    csv_file="validation.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

test_dataset = SpamDataset(
    csv_file="test.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

In [18]:
# Loader settings shared by the three splits.
num_workers = 0
batch_size = 8

# Seed torch's RNG before the loaders are built so the shuffled training
# order is repeatable.
torch.manual_seed(m_seed)

# Training batches are shuffled; drop_last=True discards a trailing batch that
# is smaller than batch_size.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True
)

# Validation and test loaders keep the stored order and every example.
validation_loader = DataLoader(
    dataset=validation_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False
)

In [None]:
# test data loaders
print("Train loader:")
for i_batch, t_batch in train_loader:
    pass

print("Input batch dimentions:", i_batch.shape)
print("Target batch dimentions:", t_batch.shape)

In [None]:
print(f"Training loader contains {len(train_loader)} batches")
print(f"Also we have {len(validation_loader)} validation batches")
print(f"And {len(test_loader)} batches in test loader")

### Prepare model for fine-tuning

In [4]:
# Freeze every parameter of the loaded model; selected parts are re-enabled
# for training in the following cells.
for param in gpt.parameters():
    param.requires_grad = False

In [5]:
# Seed first so the random initialization of the new layer is repeatable.
torch.manual_seed(m_seed)

# Replace the output head with a linear layer that maps the embedding
# dimension to the two classes.
num_classes = 2
gpt.out_head = nn.Linear(in_features=GPT_CONFIG_124M["emb_dim"], out_features=num_classes)
# Parameters of a newly created nn.Linear have requires_grad=True by default,
# so the new output head is trainable without any further unfreezing.

In [6]:
# Make the final normalization layer trainable again.
for param in gpt.final_norm.parameters():
    param.requires_grad = True

# Make the last transformer block trainable again.
for param in gpt.trf_blocks[-1].parameters():
    param.requires_grad = True

### Model evaluation

In [None]:
inputs = tokenizer.encode("Do you have time")
inputs = torch.tensor(inputs).unsqueeze(0)

print("Inputs:", inputs)
print("Inputs dimensions:", inputs.shape)

In [None]:
with torch.no_grad():
    outputs = gpt(inputs)

print("Outputs:", outputs)
print("Outputs dimentions:", outputs.shape)

In [None]:
probs = torch.softmax(outputs[:, -1, :], dim=-1)

print(probs)
label = torch.argmax(probs)
print("Class label:", label.item())

In [27]:
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    """Return the fraction of correctly classified examples in ``data_loader``.

    The predicted class of an example is the argmax over the model output at
    the last sequence position.

    Args:
        data_loader: Sized iterable yielding ``(inputs, targets)`` batches.
        model: Model to evaluate; it is switched to eval mode and left there.
        device: Device the batches are moved to before the forward pass.
        num_batches: Maximum number of batches to evaluate. ``None`` evaluates
            every batch; larger values are capped at ``len(data_loader)``.

    Returns:
        Accuracy as a float, or ``float("nan")`` when no example was evaluated
        (empty loader or ``num_batches`` of 0) instead of dividing by zero.
    """
    model.eval()

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    correct_predictions, num_examples = 0, 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for i, (i_batch, t_batch) in enumerate(data_loader):
            if i >= num_batches:
                break

            i_batch, t_batch = i_batch.to(device), t_batch.to(device)

            # Only the output at the last position is used for classification.
            logits = model(i_batch)[:, -1, :]
            predicted_labels = torch.argmax(logits, dim=-1)

            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == t_batch).sum().item()

    if num_examples == 0:
        return float("nan")

    return correct_predictions / num_examples

In [None]:
torch.manual_seed(m_seed)

train_accuracy = calc_accuracy_loader(train_loader, gpt, device, num_batches=10)
print(f"Train accuracy: {train_accuracy * 100:.2f}%")

validation_accuracy = calc_accuracy_loader(validation_loader, gpt, device, num_batches=10)
print(f"Validation accuracy: {validation_accuracy * 100:.2f}%")

test_accuracy = calc_accuracy_loader(test_loader, gpt, device, num_batches=10)
print(f"Test accuracy: {test_accuracy * 100:.2f}%")

### Loss function

In [29]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on a single batch.

    Args:
        input_batch: Tensor of inputs passed to the model.
        target_batch: Tensor of class labels.
        model: Callable producing an output indexed as ``[:, -1, :]``.
        device: Device both tensors are moved to before the forward pass.

    Returns:
        The loss tensor returned by ``nn.functional.cross_entropy``.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)

    # Score only the output at the final sequence position against the labels.
    last_position_logits = model(inputs)[:, -1, :]

    return nn.functional.cross_entropy(last_position_logits, targets)

In [30]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Return the mean batch loss over the first ``num_batches`` batches.

    Args:
        data_loader: Sized iterable yielding ``(inputs, targets)`` batches.
        model: Model passed through to ``calc_loss_batch``.
        device: Device passed through to ``calc_loss_batch``.
        num_batches: Maximum number of batches to average over. ``None`` uses
            every batch; larger values are capped at ``len(data_loader)``.

    Returns:
        The average loss as a float, or ``float("nan")`` when there is nothing
        to average (empty loader, or ``num_batches`` of 0 or less).
    """
    if len(data_loader) == 0:
        return float("nan")

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    # Same result as for an empty loader; avoids dividing by zero (or by a
    # negative count) further down.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.0
    for i, (i_batch, t_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        total_loss += calc_loss_batch(i_batch, t_batch, model, device).item()

    return total_loss / num_batches

In [None]:
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, gpt, device, num_batches=5)
    print(f"Training loss: {train_loss:.3f}")

    validation_loss = calc_loss_loader(validation_loader, gpt, device, num_batches=5)
    print(f"Validation loss: {validation_loss:.3f}")

    test_loss = calc_loss_loader(test_loader, gpt, device, num_batches=5)
    print(f"Test loss: {test_loss:.3f}")

### Fine-tuning

In [32]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Compute the training and validation loss without tracking gradients.

    Args:
        model: Model to evaluate; put into eval mode for the measurement and
            switched to train mode before returning.
        train_loader: Loader providing the training batches.
        val_loader: Loader providing the validation batches.
        device: Device passed through to ``calc_loss_loader``.
        eval_iter: Maximum number of batches used per loader.

    Returns:
        Tuple ``(train_loss, val_loss)``.
    """
    model.eval()

    with torch.no_grad():
        # Training loader first, validation loader second.
        losses = tuple(
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )

    model.train()

    return losses


def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter):
    """Train ``model`` for ``num_epochs`` epochs with periodic evaluation.

    Losses are measured every ``eval_freq`` optimizer steps (including the
    very first one); accuracies are measured once at the end of every epoch.

    Args:
        model: Model to train.
        train_loader: Loader providing the training batches.
        val_loader: Loader providing the validation batches.
        optimizer: Optimizer updating the model parameters.
        device: Device used for the forward and backward passes.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Number of steps between two loss evaluations.
        eval_iter: Maximum number of batches used per evaluation.

    Returns:
        Tuple ``(train_losses, val_losses, train_accs, val_accs,
        examples_seen)``.
    """
    loss_log_train, loss_log_val = [], []
    acc_log_train, acc_log_val = [], []
    examples_seen = 0
    step = -1  # incremented before use, so the first step is number 0

    for epoch_index in range(num_epochs):
        model.train()

        for inputs, targets in train_loader:
            # One optimization step on the current batch.
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(inputs, targets, model, device)
            batch_loss.backward()
            optimizer.step()

            examples_seen += inputs.shape[0]
            step += 1

            if step % eval_freq != 0:
                continue

            # Periodic loss report.
            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            loss_log_train.append(train_loss)
            loss_log_val.append(val_loss)
            print(f"Epoch {epoch_index + 1} (Step {step:06d}): "
                  f"Train loss = {train_loss:.03f}, Val loss = {val_loss:.03f}")

        # End-of-epoch accuracy report.
        epoch_train_acc = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        epoch_val_acc = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {epoch_train_acc * 100:.02f}% | ", end="")
        print(f"Validation accuracy: {epoch_val_acc * 100:.02f}%")
        acc_log_train.append(epoch_train_acc)
        acc_log_val.append(epoch_val_acc)

    return loss_log_train, loss_log_val, acc_log_train, acc_log_val, examples_seen

In [None]:
# Wall-clock timer around the whole fine-tuning run.
start_time = time.time()

torch.manual_seed(m_seed)

# All parameters are handed to the optimizer, including those whose
# requires_grad flag was set to False earlier in the notebook.
optimizer = torch.optim.AdamW(gpt.parameters(), lr=5e-5, weight_decay=0.1)

# Evaluate the losses every 50 steps on at most 5 batches per loader.
num_epochs = 5
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
    gpt, train_loader, validation_loader, optimizer, device,
    num_epochs=num_epochs,
    eval_freq=50,
    eval_iter=5)

end_time = time.time()
# Convert the elapsed seconds to minutes for the report.
execution_time_minutes = (end_time - start_time) / 60
print(f"Fine-tuning completed in {execution_time_minutes:.2f} mins")

In [34]:
def plot_values(epoches_seen, examples_seen, train_values, val_values, label="loss"):
    """Plot training and validation curves, then save and show the figure.

    Args:
        epoches_seen: X values for the lower (epoch) axis.
        examples_seen: X values for the upper (examples seen) axis.
        train_values: Training metric values.
        val_values: Validation metric values.
        label: Metric name used in the legend, the y-axis label and the
            output file name ``"{label}-plot.pdf"``.
    """
    figure, epoch_axis = plt.subplots(figsize=(9, 6))

    # Both curves share the epoch axis.
    curves = (
        (train_values, {"label": f"Training {label}"}),
        (val_values, {"label": f"Validation {label}", "linestyle": "-."}),
    )
    for values, plot_options in curves:
        epoch_axis.plot(epoches_seen, values, **plot_options)

    epoch_axis.set_xlabel("Epoches")
    epoch_axis.set_ylabel(label.capitalize())
    epoch_axis.legend()

    # Second x-axis on top; the fully transparent curve only sets its range.
    examples_axis = epoch_axis.twiny()
    examples_axis.plot(examples_seen, train_values, alpha=0)
    examples_axis.set_xlabel("Examples seen")

    figure.tight_layout()
    plt.savefig(f"{label}-plot.pdf")
    plt.show()

In [None]:
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
examples_seen_tensor = torch.linspace(0, examples_seen, len(train_losses))

plot_values(epochs_tensor, examples_seen_tensor, train_losses, val_losses)

In [None]:
# X positions for the accuracy values (one value per epoch was recorded).
epochs_tensor = torch.linspace(0, num_epochs, len(train_accs))
examples_seen_tensor = torch.linspace(0, examples_seen, len(train_accs))

# Pass the label explicitly: with plot_values' default ("loss") the accuracy
# curves would be labelled as loss and saved as "loss-plot.pdf", overwriting
# the loss figure.
plot_values(epochs_tensor, examples_seen_tensor, train_accs, val_accs, label="accuracy")

In [None]:
torch.manual_seed(m_seed)

train_accuracy = calc_accuracy_loader(train_loader, gpt, device)
print(f"Train accuracy: {train_accuracy * 100:.2f}%")

validation_accuracy = calc_accuracy_loader(validation_loader, gpt, device)
print(f"Validation accuracy: {validation_accuracy * 100:.2f}%")

test_accuracy = calc_accuracy_loader(test_loader, gpt, device)
print(f"Test accuracy: {test_accuracy * 100:.2f}%")

In [9]:
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
    """Classify ``text`` as spam or not spam with a fine-tuned model.

    Args:
        text: Text to classify.
        model: Model exposing ``pos_emb.weight``, whose first dimension is
            taken as the supported context length.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        device: Device the input tensor is created on.
        max_length: Length the token sequence is truncated / padded to.
            ``None`` uses the supported context length; larger values are
            capped at it so the input never exceeds the context.
        pad_token_id: Token id appended to inputs shorter than the target length.

    Returns:
        ``"SPAM"`` when the predicted label is 1, otherwise ``"NOT spam"``.
    """
    model.eval()

    supported_context_length = model.pos_emb.weight.shape[0]
    if max_length is None:
        target_length = supported_context_length
    else:
        target_length = min(max_length, supported_context_length)

    # Truncate, then pad up to the same target length. A new list is built so
    # the list returned by the tokenizer is not modified.
    input_ids = tokenizer.encode(text)[:target_length]
    input_ids = input_ids + [pad_token_id] * (target_length - len(input_ids))
    input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0)

    with torch.no_grad():
        # Only the output at the last position is used for classification.
        logits = model(input_tensor)[:, -1, :]
    predicted_label = torch.argmax(logits, dim=-1).item()

    return "SPAM" if predicted_label == 1 else "NOT spam"

In [None]:
text_01 = (
    "You are the winner, you have been specialy"
    " selected to receive $1000 cash or a $2k reword!"
)

print(f"Text:\n {text_01}")
print("was classified as", classify_review(text_01, gpt, tokenizer, device, max_length=train_dataset.max_length))

In [None]:
text_02 = (
    "Hey, just wanted to check if we r still on"
    " for dinner tn? Let me know!"
)

print(f"Text:\n {text_02}")
print("was classified as", classify_review(text_02, gpt, tokenizer, device, max_length=train_dataset.max_length))

In [41]:
torch.save(gpt.state_dict(), "review_classifier.pth")

In [None]:
# map_location remaps the stored tensors onto the current device, so the
# checkpoint also loads on a machine without the device it was saved from.
# weights_only=True restricts unpickling to tensors and plain containers.
model_state_dict = torch.load("review_classifier.pth", map_location=device, weights_only=True)
gpt.load_state_dict(model_state_dict)