In [49]:
import os

# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
# NOTE(review): presumably this disables the tokenizers library's internal
# parallelism (and its fork-related warning) — confirm against the library docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [50]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source of truth for the checkpoint name, so the tokenizer and the
# model can never be loaded from two different checkpoints by accident.
MODEL_NAME = "distilgpt2"

# Train / test splits are read from CSV files relative to the working directory.
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
gpt = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Use the end-of-sequence token as the padding token so that the tokenizer
# has a pad token available when batches are padded later on.
tokenizer.pad_token = tokenizer.eos_token

In [51]:
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorWithPadding
import torch


class TextDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per lookup.

    Every item is a dict of tensors with four keys:

    * ``"input_ids"`` / ``"attention_mask"`` – the tokenizer output for the
      row's ``"text"`` value, truncated to at most 512 tokens and left
      unpadded (padding is expected to happen at collate time).
    * ``"text_id"`` – the positional index that was requested.
    * ``"labels"`` – the row's ``"label"`` value shifted down by one.
    """

    def __init__(self, df: pd.DataFrame, tokenizer: AutoTokenizer) -> None:
        """Keep references to the source frame and the tokenizer callable."""
        self.tokenizer = tokenizer
        self.df = df

    def __len__(self) -> int:
        """Number of rows in the wrapped DataFrame."""
        return len(self.df.index)

    def __getitem__(self, idx: int):
        """Tokenize the row at position ``idx`` and return it as tensors."""
        record = self.df.iloc[idx]
        encoded = self.tokenizer(record["text"], truncation=True, padding=False, max_length=512)

        return {
            "input_ids": torch.tensor(encoded["input_ids"]),
            "attention_mask": torch.tensor(encoded["attention_mask"]),
            "text_id": torch.tensor(idx),
            # Labels are stored one higher than the value the model is trained on.
            "labels": torch.tensor(record["label"] - 1),
        }

In [52]:
import torch.nn as nn

class FinalLayer(nn.Module):
    """Classification head: average over the second-to-last axis, then project.

    Args:
        hidden_dim: Size of the last axis of the incoming tensor.
        num_classes: Number of output logits per example.
    """

    def __init__(self, hidden_dim: int, num_classes: int):
        super().__init__()
        self.classifier = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, x):
        """Mean-pool ``x`` along axis ``-2`` and apply the linear classifier.

        The mean runs over every position on that axis; no mask is available
        here, so any padded positions present in ``x`` are averaged in too.
        """
        pooled = x.mean(dim=-2)
        logits = self.classifier(pooled)
        return logits

In [53]:
# Pads every batch to the length of its longest sequence at collate time.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Shuffle the training data each epoch so batches do not follow the row order
# of the CSV; the test loader keeps a fixed order.
train_loader = DataLoader(TextDataset(df_train, tokenizer), batch_size=32, shuffle=True, collate_fn=collator)
test_loader = DataLoader(TextDataset(df_test, tokenizer), batch_size=32, collate_fn=collator)

batch = next(iter(train_loader))

# Replace the language-modelling head with a single-logit classification head.
# The input width is read from the model config instead of being hard-coded.
final_layer = FinalLayer(gpt.config.hidden_size, 1)
gpt.lm_head = final_layer

# Smoke-test forward pass: only the output shape is inspected, so skip autograd.
# The attention mask is passed because the batch contains padded positions.
with torch.no_grad():
    out = gpt(batch["input_ids"], attention_mask=batch["attention_mask"])

# Displayed as the cell result: (total parameter count, logits shape).
sum(p.numel() for p in gpt.parameters()), out.logits.shape

(81913345, torch.Size([32, 1]))

In [54]:
# Freeze everything reachable from `gpt`, then re-enable gradients for the
# classification head only. The order matters: the head is registered on `gpt`
# as `lm_head`, so the first pass freezes it and the second pass unfreezes it.
for module, trainable in ((gpt, False), (final_layer, True)):
    for param in module.parameters():
        param.requires_grad = trainable

In [55]:
def choose_device() -> str:
    """Return the name of the preferred available torch device.

    Preference order is ``"cuda"``, then ``"mps"``, then ``"cpu"``. The MPS
    backend is only queried when CUDA is not available.
    """
    if torch.cuda.is_available():
        return "cuda"
    return "mps" if torch.backends.mps.is_available() else "cpu"

In [56]:
import torch
import torch.nn as nn
from tqdm import tqdm

epochs = 3

# Pick the best available accelerator and move the whole model onto it.
device = torch.device(choose_device())
print(f"Training on device: {device}")

gpt.to(device)
# Binary classification on a single logit per example.
criterion = nn.BCEWithLogitsLoss()
# Only the classification head's parameters are handed to the optimizer.
optimizer = torch.optim.AdamW(final_layer.parameters())

for epoch in range(1, epochs + 1):
    # NOTE(review): train mode also applies to the backbone modules; if the
    # backbone is meant to act as a fixed feature extractor, confirm this is intended.
    gpt.train()
    total_loss = 0.0
    correct = 0
    total = 0
    # Defined up front so the epoch summary below still works when the loader
    # yields no batches at all.
    avg_loss = float("nan")
    acc = float("nan")

    progress = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch}/{epochs}")

    for i, batch in progress:
        # Move every tensor of the batch to the training device.
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch["labels"].view(-1)

        optimizer.zero_grad()
        # Pass the attention mask so padded positions are not attended to.
        out = gpt(batch["input_ids"], attention_mask=batch["attention_mask"]).logits

        loss = criterion(out.view(-1), labels.float())
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Metric bookkeeping does not need gradients.
        with torch.no_grad():
            preds = torch.sigmoid(out).view(-1)  # convert logits to probabilities
            predicted_labels = (preds >= 0.5).long()
            correct += (predicted_labels == labels).sum().item()
        total += labels.numel()

        # Running averages over the batches seen so far in this epoch.
        avg_loss = total_loss / (i + 1)
        acc = correct / total

        progress.set_postfix({"loss": f"{avg_loss:.4f}", "acc": f"{acc:.4f}", "lr": optimizer.param_groups[0]["lr"]})

    print(f"Epoch {epoch} done | Avg Loss: {avg_loss:.4f} | Accuracy: {acc:.4f}")


Training on device: mps


Epoch 1/3:   0%|          | 67/17500 [00:35<2:35:20,  1.87it/s, loss=0.6610, acc=0.6068, lr=0.001]


KeyboardInterrupt: 