In [2]:
import sys
import os

# Manually set root if in notebook
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
import torch
from models.gpt2.gpt_model_v1 import GPTModel
from utils.gpt2_utils import download_and_load_gpt2, load_weights_into_gpt
from datasets.loader import get_spam_instruction_loaders
import tiktoken
import os
import urllib.request
import zipfile
from pathlib import Path
import pandas as pd


2025-04-14 22:19:56.913975: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744649396.925623  269064 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744649396.929160  269064 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744649396.938493  269064 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744649396.938506  269064 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744649396.938507  269064 computation_placer.cc:177] computation placer alr

In [4]:
# ------------------ Download Spam Dataset ------------------ #
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

In [5]:
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [6]:
# ------------------ Prepare CSVs ------------------ #
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])

def create_balanced_dataset(df):
    num_spam = df[df["Label"] == "spam"].shape[0]
    ham_subset = df[df["Label"] == "ham"].sample(num_spam, random_state=123)
    return pd.concat([ham_subset, df[df["Label"] == "spam"]])

balanced_df = create_balanced_dataset(df)
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})


def random_split(df, train_frac=0.7, validation_frac=0.1):
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)
    train_end = int(len(df) * train_frac)
    val_end = train_end + int(len(df) * validation_frac)
    return df[:train_end], df[train_end:val_end], df[val_end:]

train_df, val_df, test_df = random_split(balanced_df)
train_df.to_csv("train.csv", index=False)
val_df.to_csv("validation.csv", index=False)
test_df.to_csv("test.csv", index=False)

In [7]:
# ------------------ Config ------------------ #
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Classify the message as spam or not spam."

BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024, 
    "drop_rate": 0.1,
    "qkv_bias": True
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])


In [8]:

# ------------------ Tokenizer & Data ------------------ #
tokenizer = tiktoken.get_encoding("gpt2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_loader, val_loader, test_loader = get_spam_instruction_loaders(
    "train.csv", "validation.csv", "test.csv", tokenizer, device
)

In [9]:
# ------------------ Model ------------------ #
settings, params = download_and_load_gpt2("124M", "gpt2")
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)

# Freeze base model except final norm and last block
for param in model.parameters():
    param.requires_grad = False

model.out_head = torch.nn.Linear(BASE_CONFIG["emb_dim"], 2)  # Binary class

for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True
for param in model.final_norm.parameters():
    param.requires_grad = True

model.to(device)

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [10]:
# ------------------ Loss & Train Utils ------------------ #
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch)[:, -1, :]
    return torch.nn.functional.cross_entropy(logits, target_batch)

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = sum(calc_loss_batch(x, y, model, device).item() for i, (x, y) in enumerate(train_loader) if i < eval_iter) / eval_iter
        val_loss = sum(calc_loss_batch(x, y, model, device).item() for i, (x, y) in enumerate(val_loader) if i < eval_iter) / eval_iter
    model.train()
    return train_loss, val_loss

In [11]:
def train_instruction_classifier(model, train_loader, val_loader, device, num_epochs=3, eval_freq=50, eval_iter=5):
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5, weight_decay=0.1)
    global_step = 0

    for epoch in range(num_epochs):
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()

            global_step += 1
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                print(f"Epoch {epoch+1}, Step {global_step}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

    return model

In [12]:
trained_model = train_instruction_classifier(model, train_loader, val_loader, device)

Epoch 1, Step 50: Train Loss = 1.1667, Val Loss = 1.0194
Epoch 1, Step 100: Train Loss = 0.7033, Val Loss = 0.6926
Epoch 2, Step 150: Train Loss = 0.7010, Val Loss = 0.6768
Epoch 2, Step 200: Train Loss = 0.6003, Val Loss = 0.6167
Epoch 2, Step 250: Train Loss = 0.5569, Val Loss = 0.5978
Epoch 3, Step 300: Train Loss = 0.5335, Val Loss = 0.5536
Epoch 3, Step 350: Train Loss = 0.4751, Val Loss = 0.4783


In [13]:
def classify_review(
        text, model, tokenizer, device, max_length=None,
        pad_token_id=50256):
    model.eval()

    input_ids = tokenizer.encode(text)          #1
    supported_context_length = model.pos_emb.weight.shape[0]

    input_ids = input_ids[:min(              #2
        max_length, supported_context_length
    )]

    input_ids += [pad_token_id] * (max_length - len(input_ids))    #3

    input_tensor = torch.tensor(
        input_ids, device=device
    ).unsqueeze(0)              #4

    with torch.no_grad():                                #5
        logits = model(input_tensor)[:, -1, :]     #6
    predicted_label = torch.argmax(logits, dim=-1).item()

    return "spam" if predicted_label == 1 else "not spam" 

In [15]:
text_1 = (
    "You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award."
)

print(classify_review(
    text_1, model, tokenizer, device, max_length=120
))

not spam
