<a href="https://colab.research.google.com/github/nickeubank/leaa_subj/blob/main/bert_training_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import os

import numpy as np
import numpy.random as npr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (  # AdamW,
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

pd.set_option("mode.copy_on_write", True)


In [9]:
from google.colab import drive
drive.mount('/content/gdrive')



Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [10]:

dir = "https://github.com/nickeubank/leaa_subj/raw/refs/heads/main/"
grants = pd.read_parquet(dir + "subj_text_and_labels.parquet")

#########
# Split into train test and for predict
#########
grants = grants.drop_duplicates("description")

labeled = grants[grants["label_1"].notnull()]


# Encode labels. For 1 digit codes, not important, but
# the two digits aren't sequential so let's just use.
label_encoder = LabelEncoder()
labeled["label_1_encoded"] = label_encoder.fit_transform(labeled["label_1"])

#for speed in testing
npr.seed(43)
labeled = labeled.sample(n=300)

labeled = labeled.sort_values("description")


train_label, test_label, train_text, test_text = train_test_split(
    labeled["label_1_encoded"].values,
    labeled["description"].values,
    test_size=0.8,   # super inflated for hyper-param-tuning
    random_state=42,
    stratify=labeled["label_1"],
)
print(len(train_label))
print(len(test_label))

60
240


In [11]:

########
# Preprocess
########


class ClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }




In [12]:
hypers = {"lr":[], "batch_size": [], "accuracy": []}

In [None]:
# Parameters
for mlen in [64, 128, 256, 512]:
  for batch_size in [8, 16, 32]:
      MAX_LEN = mlen
      BATCH_SIZE = batch_size
      EPOCHS = 3
      LEARNING_RATE = 1e-4

      # Define dataset
      model = "bert-base-uncased"
      tokenizer = BertTokenizer.from_pretrained(model)
      train_dataset = ClassificationDataset(train_text, train_label, tokenizer, MAX_LEN)
      test_dataset = ClassificationDataset(test_text, test_label, tokenizer, MAX_LEN)

      train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
      val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

      # Model and Device Setup
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
      model = BertForSequenceClassification.from_pretrained(
          model, num_labels=grants["label_1"].nunique()
      )
      model.to(device)

      # Optimizer
      optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

      # Training Loop
      checkpoint_dir = "/content/gdrive/MyDrive/leaa/checkpoints"
      os.makedirs(checkpoint_dir, exist_ok=True)  # Create directory if it doesn't exist

      for epoch in range(EPOCHS):
          model.train()
          total_loss = 0
          loop = tqdm(train_loader, leave=True)

          if epoch > 0:
              checkpoint_path = os.path.join(checkpoint_dir, f"epoch_{epoch}_1digit.pth")
              torch.save(
                  {
                      "epoch": epoch + 1,
                      "model_state_dict": model.state_dict(),
                      "optimizer_state_dict": optimizer.state_dict(),
                      "loss": total_loss,
                  },
                  checkpoint_path,
              )
              print(f"Checkpoint saved to {checkpoint_path}")

          for batch in loop:
              optimizer.zero_grad()
              input_ids = batch["input_ids"].to(device)
              attention_mask = batch["attention_mask"].to(device)
              labels = batch["label"].to(device)

              outputs = model(
                  input_ids=input_ids, attention_mask=attention_mask, labels=labels
              )
              loss = outputs.loss
              total_loss += loss.item()

              loss.backward()
              optimizer.step()

              loop.set_description(f"Epoch {epoch}")
              loop.set_postfix(loss=loss.item())

          print(f"Epoch {epoch} Loss: {total_loss / len(train_loader)}")


      # Evaluation
      model.eval()
      correct = 0
      total = 0

      with torch.no_grad():
          for batch in val_loader:
              input_ids = batch["input_ids"].to(device)
              attention_mask = batch["attention_mask"].to(device)
              labels = batch["label"].to(device)

              outputs = model(input_ids=input_ids, attention_mask=attention_mask)
              predictions = torch.argmax(outputs.logits, dim=1)

              correct += (predictions == labels).sum().item()
              total += labels.size(0)

      accuracy = correct / total
      print(f"Validation Accuracy: {accuracy:.4f}")

      hypers["lr"].append(learn_rate)
      hypers["batch_size"].append(batch_size)
      hypers["accuracy"].append(accuracy)
      print(f"LR: {learn_rate}, Batch: {batch_size}, Accuracy: {accuracy}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 8/8 [00:48<00:00,  6.02s/it, loss=1.37]


Epoch 0 Loss: 1.531050980091095


  0%|          | 0/8 [00:00<?, ?it/s]

Checkpoint saved to /content/gdrive/MyDrive/leaa/checkpoints/epoch_1_1digit.pth


Epoch 1:  62%|██████▎   | 5/8 [03:05<01:04, 21.34s/it, loss=1.28]

In [14]:
dir = "/content/gdrive/MyDrive/leaa/"

pd.DataFrame(hypers).to_parquet(dir + "hyperparameters_1digit.parquet")

In [15]:
pd.DataFrame(hypers)

Unnamed: 0,lr,batch_size,accuracy
0,0.001,8,0.425
1,0.0005,8,0.425
2,0.0001,8,0.658333
3,2e-05,8,0.445833


In [16]:
# Save
dir = "/content/gdrive/MyDrive/leaa/"
model.save_pretrained(dir + "bert_grant_classifier_1digit")
tokenizer.save_pretrained(dir + "bert_grant_classifier_1digit")
torch.save(label_encoder, dir + "label_encoder_1digit.pth")
