In [4]:
import os

# Set the tokenizers parallelism switch for this kernel process.
# NOTE(review): presumably this is here to avoid the Hugging Face tokenizers
# warning/deadlock when DataLoader forks workers, and presumably it has to be
# set before the tokenizer is first used in the cells below — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [5]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source of truth for the checkpoint, so the tokenizer and the model
# can never be loaded from two different names by accident.
MODEL_NAME = "distilgpt2"

# Train / test splits; later cells read their "text" and "label" columns.
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
gpt = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Reuse EOS as the padding token so the collator can pad batches.
# NOTE(review): presumably this checkpoint's tokenizer ships without a pad
# token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [6]:
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorWithPadding
import torch


class TextDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Every item is a dict holding the un-padded ``input_ids`` and
    ``attention_mask`` tensors, the positional row index as ``text_id`` and
    the row's label shifted down by one as ``labels``. No padding happens
    here (``padding=False``); batching code is expected to pad.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, max_length: int = 512) -> None:
        """Store the frame, the tokenizer and the truncation length.

        Args:
            df: Frame providing a ``"text"`` and a ``"label"`` column.
            tokenizer: Callable invoked as
                ``tokenizer(text, truncation=True, padding=False, max_length=...)``
                that returns a mapping with ``"input_ids"`` and
                ``"attention_mask"`` entries.
            max_length: Maximum number of tokens per example passed to the
                tokenizer. Defaults to 512, the value that used to be
                hard-coded in ``__getitem__``.
        """
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx: int) -> dict:
        """Tokenize the row at position ``idx`` and return it as tensors.

        Args:
            idx: Positional (``iloc``) index of the row.

        Returns:
            Dict with ``input_ids``, ``attention_mask``, ``text_id`` and
            ``labels`` tensors.
        """
        row = self.df.iloc[idx]

        encodings = self.tokenizer(
            row["text"],
            truncation=True,
            padding=False,
            max_length=self.max_length,
        )

        return {
            "input_ids": torch.tensor(encodings["input_ids"]),
            "attention_mask": torch.tensor(encodings["attention_mask"]),
            # Positional index, so a prediction can be traced back to its row.
            "text_id": torch.tensor(idx),
            # Shift the stored label down by one.
            # NOTE(review): presumably the CSV encodes classes as 1/2 and this
            # maps them to 0/1 for the binary head — confirm.
            "labels": torch.tensor(row["label"] - 1),
        }

In [7]:
import torch.nn as nn

class FinalLayer(nn.Module):
    """Classification head: mean-pool over the token axis, then a linear layer."""

    def __init__(self, hidden_dim: int, num_classes: int):
        """
        Args:
            hidden_dim: Size of the last axis of the incoming hidden states.
            num_classes: Number of output logits per example.
        """
        super().__init__()
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, attention_mask=None):
        """Pool ``x`` over its second-to-last axis and classify the result.

        Args:
            x: Hidden states; the second-to-last axis is averaged away
                (e.g. ``(batch, seq, hidden)`` -> ``(batch, hidden)``).
            attention_mask: Optional 0/1 tensor shaped like ``x`` without its
                last axis. When given, only positions marked 1 contribute to
                the average. When omitted (the default, and the only form
                used when this module is called with hidden states alone),
                every position is averaged, padding included, exactly as
                before.

        Returns:
            Logits with ``num_classes`` entries on the last axis.
        """
        if attention_mask is None:
            pooled = x.mean(dim=-2)
        else:
            weights = attention_mask.unsqueeze(-1).to(x.dtype)
            # Clamp so an all-zero mask row yields zeros instead of NaN.
            token_counts = weights.sum(dim=-2).clamp(min=1.0)
            pooled = (x * weights).sum(dim=-2) / token_counts
        return self.classifier(pooled)

In [8]:
BATCH_SIZE = 32

# Items come out of TextDataset un-padded; the collator pads each batch.
collator = DataCollatorWithPadding(tokenizer=tokenizer)
# NOTE(review): train_loader keeps DataLoader's default order (no shuffling);
# enable shuffling before re-running the disabled training loop.
train_loader = DataLoader(TextDataset(df_train, tokenizer), batch_size=BATCH_SIZE, collate_fn=collator)
test_loader = DataLoader(TextDataset(df_test, tokenizer), batch_size=BATCH_SIZE, collate_fn=collator)

batch = next(iter(train_loader))

# Size the head from the model's own config instead of a literal 768, then
# swap it in for the language-model head so `.logits` is one score per example.
final_layer = FinalLayer(gpt.config.hidden_size, 1)
gpt.lm_head = final_layer

# Smoke test only: no gradients are needed to inspect the output shape.
# NOTE(review): the model is called without an attention mask, and the head
# averages over every position, so padded positions enter the average.
with torch.no_grad():
    out = gpt(batch["input_ids"])

sum(p.numel() for p in gpt.parameters()), out.logits.shape

(81913345, torch.Size([32, 1]))

In [9]:
# Restore the fine-tuned weights, mapping tensors to CPU regardless of the
# device they were saved from. `weights_only=True` limits unpickling to
# tensors and plain containers, so a tampered checkpoint cannot run code.
gpt.load_state_dict(torch.load("finetuned.pt", map_location="cpu", weights_only=True))

<All keys matched successfully>

In [10]:
# for param in gpt.parameters():
#     param.requires_grad = False

# for param in final_layer.parameters():
#     param.requires_grad = True

In [10]:
def choose_device() -> str:
    """Name the preferred torch backend: "cuda" first, then "mps", else "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

In [11]:
import torch
import torch.nn as nn
from tqdm import tqdm

# Resolve the compute device and report which one was picked.
device = torch.device(choose_device())
print(f"Training on device: {device}")

# Epoch budget read by the (currently commented-out) training loop below.
epochs = 3

# Move the model to the chosen device. This is the cell's last expression,
# so the module repr is displayed as the cell output.
gpt.to(device)
# criterion = nn.BCEWithLogitsLoss()
# optimizer = torch.optim.AdamW(final_layer.parameters())

# for epoch in range(1, epochs + 1):
#     gpt.train()
#     total_loss = 0.0
#     correct = 0
#     total = 0

#     progress = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch}/{epochs}")

#     for i, batch in progress:
#         # move batch to device
#         batch = {k: v.to(device) for k, v in batch.items()}

#         optimizer.zero_grad()
#         out = gpt(batch["input_ids"]).logits
        
#         loss = criterion(out.view(-1), batch["labels"].view(-1).float())
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#         preds = torch.sigmoid(out).view(-1)  # convert logits to probabilities
#         predicted_labels = (preds >= 0.5).long()
#         correct += (predicted_labels == batch["labels"].view(-1)).sum().item()
#         total += batch["labels"].numel()

#         avg_loss = total_loss / (i + 1)
#         acc = correct / total

#         progress.set_postfix({"loss": f"{avg_loss:.4f}", "acc": f"{acc:.4f}", "lr": optimizer.param_groups[0]["lr"]})

#     print(f"Epoch {epoch} done | Avg Loss: {avg_loss:.4f} | Accuracy: {acc:.4f}")


Training on device: mps


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): FinalLayer(
    (classifier): Linear(in_features=768, out_features=1, bias=True)
  )
)

In [14]:
progress = tqdm(test_loader, total=len(test_loader), desc="Test evaluation")
gpt.eval()

# Per-batch results, concatenated by a later cell.
pred_labels, probas, real_labels = [], [], []

# Running counters behind the accuracy shown on the progress bar.
correct = total = 0

with torch.inference_mode():
    for batch in progress:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        targets = batch["labels"].view(-1)

        # One logit per example; sigmoid turns it into a probability.
        out = gpt(batch["input_ids"]).logits
        preds = torch.sigmoid(out).view(-1)
        predicted_labels = (preds >= 0.5).long()

        probas.append(preds)
        pred_labels.append(predicted_labels)
        real_labels.append(targets)

        correct += (predicted_labels == targets).sum().item()
        total += targets.numel()
        acc = correct / total

        progress.set_postfix({"acc": f"{acc:.4f}"})

Test evaluation: 100%|██████████| 1188/1188 [08:39<00:00,  2.29it/s, acc=0.8771]


In [None]:
# Collapse the per-batch lists into single 1-D tensors. The names are rebound
# from list to tensor, so guard on the type: re-running this cell is then a
# no-op instead of an error.
if isinstance(probas, list):
    probas = torch.cat(probas)
if isinstance(real_labels, list):
    real_labels = torch.cat(real_labels)

In [16]:
# Bring both result tensors back to host memory before computing metrics.
probas = probas.cpu()
real_labels = real_labels.cpu()

In [17]:
from sklearn.metrics import accuracy_score, f1_score

# Use the same inclusive 0.5 cut-off as the evaluation loop (`preds >= 0.5`),
# so a probability of exactly 0.5 is classified identically in both places.
preds = (probas >= 0.5).long()
acc = accuracy_score(real_labels, preds)
f1_macro = f1_score(real_labels, preds, average="macro")
f1_weighted = f1_score(real_labels, preds, average="weighted")

print("Accuracy:", acc)
print("F1 macro:", f1_macro)
print("F1 weighted:", f1_weighted)

Accuracy: 0.8771315789473684
F1 macro: 0.876833628833608
F1 weighted: 0.8768336288336082


In [18]:
# Total number of scalar parameters in the model, classification head included.
sum(param.numel() for param in gpt.parameters())

81913345