In [1]:
import os

# Set before any tokenizer is created in the cells below.
# NOTE(review): presumably this silences the Hugging Face tokenizers
# fork/parallelism warning when batches are drawn through a DataLoader — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM

# Read both splits with identical parser settings. `on_bad_lines="skip"` drops
# malformed rows silently, so the row counts may be lower than the raw files.
df_train, df_test = (
    pd.read_csv(csv_path, engine="python", on_bad_lines="skip")
    for csv_path in ("data/train.csv", "data/test.csv")
)

# Reuse the end-of-sequence token as the padding token so that batches can be
# padded later by the collator.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

In [3]:
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorWithPadding
import torch


class TextDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Items are returned unpadded (``padding=False``); batching code is expected
    to pad them, e.g. through a padding collate function.

    Args:
        df: Frame with a ``"text"`` column (passed to the tokenizer) and a
            ``"label"`` column (numeric).
        tokenizer: A loaded tokenizer instance, i.e. what
            ``AutoTokenizer.from_pretrained`` returns, not the ``AutoTokenizer``
            factory itself. It is called as
            ``tokenizer(text, truncation=True, padding=False, max_length=...)``
            and must return ``"input_ids"`` and ``"attention_mask"``.
        max_length: Truncation limit forwarded to the tokenizer.
        label_offset: Value subtracted from the raw label before it is turned
            into a tensor. The default of 1 keeps the original behaviour
            (presumably the CSV labels are 1-based — confirm against the data).
    """

    def __init__(
        self,
        df: pd.DataFrame,
        # Quoted so the hint is not evaluated when the class is defined.
        tokenizer: "AutoTokenizer",
        max_length: int = 512,
        label_offset: int = 1,
    ) -> None:
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_offset = label_offset

    def __len__(self) -> int:
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx: int) -> dict:
        """Tokenize row ``idx`` and return its tensors.

        Returns:
            Dict with ``"input_ids"``, ``"attention_mask"``, ``"text_id"``
            (the positional index ``idx``) and ``"labels"`` (raw label minus
            ``label_offset``).
        """
        # Positional lookup: `idx` is the row position, not the frame's index label.
        row = self.df.iloc[idx]

        encodings = self.tokenizer(
            row["text"],
            truncation=True,
            padding=False,
            max_length=self.max_length,
        )

        return {
            "input_ids": torch.tensor(encodings["input_ids"]),
            "attention_mask": torch.tensor(encodings["attention_mask"]),
            "text_id": torch.tensor(idx),
            "labels": torch.tensor(row["label"] - self.label_offset),
        }

In [4]:
import torch.nn as nn
import torch.nn.functional as F

class GPTBlock(nn.Module):
    """Residual self-attention followed by a residual feed-forward MLP.

    Args:
        d_model: Feature width of the input and output.
        n_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward MLP.

    Notes:
        * Only a key-padding mask is passed to the attention layer; there is
          no causal mask, so each position attends to every non-padded
          position in the sequence.
        * ``ln1`` normalises the MLP input only; attention runs on the raw
          residual stream.
        * ``ln2`` is registered but never applied in ``forward``. It is kept
          so the module's parameters and state_dict keys stay unchanged.
    """

    def __init__(self, d_model: int, n_heads: int, d_ff: int):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.ln1 = nn.LayerNorm(d_model)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x, padding_mask=None):
        """Apply the block.

        Args:
            x: Batch-first input (the attention layer is built with
                ``batch_first=True``).
            padding_mask: Optional boolean mask forwarded as
                ``key_padding_mask``; True marks key positions to ignore.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The attention weights were discarded by the caller anyway; skipping
        # them avoids materialising and averaging the per-head weight matrix.
        attn_out, _ = self.attn(
            x, x, x,
            key_padding_mask=padding_mask,
            need_weights=False,
        )

        x = x + attn_out
        x = x + self.mlp(self.ln1(x))
        return x


class TinyGPT(nn.Module):
    """Small attention-based sequence classifier.

    Pipeline: token embedding -> ``n_layers`` x ``GPTBlock`` -> mean over the
    sequence axis -> linear head producing ``num_labels`` logits per sequence.
    No positional information is added to the token embeddings.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding / hidden width.
        n_heads: Attention heads per block.
        d_ff: Feed-forward hidden width per block.
        n_layers: Number of stacked blocks.
        num_labels: Output logits per sequence.
        mask_padding_in_pool: When True and an ``attention_mask`` is given,
            padded positions are excluded from the mean pooling, so a
            sequence's logits no longer depend on how much padding its batch
            carries. Defaults to False, which reproduces the original plain
            mean over every position. Weights trained with one setting should
            be evaluated with the same setting.
    """

    def __init__(self, vocab_size, d_model=64, n_heads=4, d_ff=256, n_layers=2, num_labels=1,
                 mask_padding_in_pool=False):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.blocks = nn.ModuleList([
            GPTBlock(d_model, n_heads, d_ff) for _ in range(n_layers)
        ])
        # NOTE: `ln` is never applied in forward(); it is kept so the parameter
        # set and state_dict keys stay unchanged.
        self.ln = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model, num_labels)
        self.mask_padding_in_pool = mask_padding_in_pool

    def forward(self, input_ids, attention_mask=None):
        """Return logits of shape ``(batch, num_labels)``.

        Args:
            input_ids: Integer token ids, batch-first.
            attention_mask: Optional mask with 1 for real tokens and 0 for
                padding; converted to the boolean key-padding mask the blocks
                expect.
        """
        x = self.embed(input_ids)

        # key_padding_mask expects True where padding should be ignored
        padding_mask = (attention_mask == 0) if attention_mask is not None else None

        for block in self.blocks:
            x = block(x, padding_mask=padding_mask)

        if self.mask_padding_in_pool and attention_mask is not None:
            # Average over real tokens only; clamp guards an all-padding row.
            keep = attention_mask.to(x.dtype).unsqueeze(-1)
            pooled = (x * keep).sum(dim=-2) / keep.sum(dim=-2).clamp(min=1.0)
        else:
            # Original behaviour: padded positions contribute to the mean.
            pooled = x.mean(dim=-2)

        return self.fc(pooled)

In [5]:
# Pads every batch to the length of its longest sequence.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# The training loader is shuffled so epochs do not replay the CSV's row order;
# the seeded generator keeps the order reproducible across kernel restarts.
train_loader = DataLoader(
    TextDataset(df_train, tokenizer),
    batch_size=32,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
    collate_fn=collator,
)
# Evaluation keeps file order.
test_loader = DataLoader(TextDataset(df_test, tokenizer), batch_size=32, collate_fn=collator)


gpt = TinyGPT(vocab_size=tokenizer.vocab_size)

# Smoke test: one forward pass on a single batch, without building an autograd graph.
batch = next(iter(train_loader))
with torch.no_grad():
    out = gpt(batch["input_ids"], batch["attention_mask"])

# Displayed as the cell result: (parameter count, logits shape).
sum(p.numel() for p in gpt.parameters()), out.shape

(3316609, torch.Size([32, 1]))

In [6]:
def choose_device() -> str:
    """Return the preferred torch device name.

    Preference order is ``"cuda"``, then ``"mps"``, then ``"cpu"``; the first
    backend that reports itself available wins.
    """
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

In [7]:
# Restore previously trained weights into the (still CPU-resident) model.
# map_location="cpu" lets the file load on machines that lack the accelerator
# the tensors were saved from; weights_only=True restricts unpickling to
# tensors and plain containers instead of arbitrary Python objects.
gpt.load_state_dict(
    torch.load("slm.pt", map_location="cpu", weights_only=True)
)

<All keys matched successfully>

In [8]:
import torch
import torch.nn as nn
from tqdm import tqdm

# Hyperparameters; in this cell they are referenced only by the commented-out
# training loop below.
epochs = 5
grad_clip = 10.0
# Pick the accelerator if one is available, otherwise fall back to CPU.
device = torch.device(choose_device())
print(f"Training on device: {device}")

# Move the model to the chosen device. Only comments follow, so this is the
# cell's last expression and the notebook echoes the module structure.
gpt.to(device)

# ---------------------------------------------------------------------------
# Training loop, intentionally left commented out: the weights are restored
# from "slm.pt" in an earlier cell instead of being retrained on every run.
# The final commented line writes to that same path. Uncomment the whole
# block to retrain.
# ---------------------------------------------------------------------------
# criterion = nn.BCEWithLogitsLoss()
# optimizer = torch.optim.AdamW(gpt.parameters())
#
# for epoch in range(1, epochs + 1):
#     gpt.train()
#     total_loss = 0.0
#     correct = 0
#     total = 0
#
#     progress = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch}/{epochs}")
#
#     for i, batch in progress:
#         batch = {k: v.to(device) for k, v in batch.items()}
#
#         optimizer.zero_grad()
#         out = gpt(batch["input_ids"], batch["attention_mask"])
#
#         loss = criterion(out.view(-1), batch["labels"].view(-1).float())
#         loss.backward()
#
#         torch.nn.utils.clip_grad_norm_(gpt.parameters(), grad_clip)
#         optimizer.step()
#
#         total_loss += loss.item()
#
#         preds = torch.sigmoid(out).view(-1)  # convert logits to probabilities
#         predicted_labels = (preds >= 0.5).long()
#         correct += (predicted_labels == batch["labels"].view(-1)).sum().item()
#         total += batch["labels"].numel()
#
#         avg_loss = total_loss / (i + 1)
#         acc = correct / total
#
#         progress.set_postfix({"loss": f"{avg_loss:.4f}", "acc": f"{acc:.4f}", "lr": optimizer.param_groups[0]["lr"]})
#
#     print(f"Epoch {epoch} done | Avg Loss: {avg_loss:.4f} | Accuracy: {acc:.4f}")
#
# torch.save(gpt.state_dict(), "slm.pt")


Training on device: mps


TinyGPT(
  (embed): Embedding(50257, 64)
  (blocks): ModuleList(
    (0-1): 2 x GPTBlock(
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
      )
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): ReLU()
        (2): Linear(in_features=256, out_features=64, bias=True)
      )
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    )
  )
  (ln): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)

In [9]:

# Score the model on the test loader, keeping per-batch probabilities,
# thresholded predictions and true labels, plus a running accuracy that is
# shown in the progress bar.
progress = tqdm(test_loader, total=len(test_loader), desc="Test evaluation")
gpt.eval()

pred_labels = []
probas = []
real_labels = []

correct = 0
total = 0

# inference_mode: no autograd bookkeeping is needed for evaluation.
with torch.inference_mode():
    for batch in progress:
        batch = {k: v.to(device) for k, v in batch.items()}
        out = gpt(batch["input_ids"], batch["attention_mask"])

        targets = batch["labels"].view(-1)
        preds = torch.sigmoid(out).view(-1)  # convert logits to probabilities
        predicted_labels = (preds >= 0.5).long()

        probas.append(preds)
        pred_labels.append(predicted_labels)
        real_labels.append(targets)

        correct += (predicted_labels == targets).sum().item()
        total += targets.numel()
        acc = correct / total

        progress.set_postfix({"acc": f"{acc:.4f}"})

Test evaluation: 100%|██████████| 1188/1188 [00:58<00:00, 20.34it/s, acc=0.9308]


In [10]:
# Join the per-batch tensors collected during evaluation into one tensor each.
# The names are rebound from list to tensor, so run this once per evaluation pass.
probas, real_labels = (
    torch.concatenate(chunks) for chunks in (probas, real_labels)
)

In [11]:
# Bring both tensors back to host memory before computing metrics on them.
probas = probas.to("cpu")
real_labels = real_labels.to("cpu")

In [12]:
from sklearn.metrics import accuracy_score, f1_score

# Threshold with `>=` so these labels follow the same rule as the evaluation
# loop (`preds >= 0.5`); the two previously disagreed on a probability of
# exactly 0.5.
preds = (probas >= 0.5).long()
acc = accuracy_score(real_labels, preds)
f1_macro = f1_score(real_labels, preds, average="macro")
f1_weighted = f1_score(real_labels, preds, average="weighted")

print("Accuracy:", acc)
print("F1 macro:", f1_macro)
print("F1 weighted:", f1_weighted)

Accuracy: 0.9307894736842105
F1 macro: 0.9307420849630599
F1 weighted: 0.9307420849630599


In [13]:
# Total number of scalar parameters in the model (shown as the cell result).
sum(weights.numel() for weights in gpt.parameters())

3316609