In [1]:
import datetime
import os

os.environ["PATH"] = (
    "/opt/apps/rhel9/cuda-12.4/bin:/opt/apps/rhel9/cuda-12.4/" + os.environ["PATH"]
)
os.environ["LD_LIBRARY_PATH"] = (
    "/opt/apps/rhel9/cuda-12.4/bin:/opt/apps/rhel9/cuda-12.4"
    + os.environ.get("LD_LIBRARY_PATH", "")
)

from huggingface_hub import HfApi

import numpy as np
import numpy.random as npr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (  # AdamW,
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

pd.set_option("mode.copy_on_write", True)
repo_id = "nickeubank/leaa_grant_subjects"
workingdir = "/hpc/group/ssri/nce8/leaa_subj/"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
assert torch.cuda.is_available()

In [3]:
# dir = "https://github.com/nickeubank/leaa_subj/raw/refs/heads/main/"
grants = pd.read_parquet(workingdir + "subj_text_and_labels.parquet")

#########
# Split into train test and for predict
#########
grants = grants.drop_duplicates("description")

labeled = grants[grants["label_1"].notnull()]
labeled["label_1_encoded"] = labeled["label_1"] - 1

In [4]:
labeled = labeled.sort_values("description")

train_label, test_label, train_text, test_text = train_test_split(
    labeled["label_1_encoded"].values,
    labeled["description"].values,
    test_size=0.2,
    random_state=45,
    stratify=labeled["label_1"],
)
print(len(train_label))
print(len(test_label))

64148
16038


In [8]:
########
# Preprocess
########


class ClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

In [6]:
# Hypertuning Parameters
# hypers = {"lr": [], "mlen": [], "batch_size": [], "accuracy": []}
# hypers = pd.read_parquet("hyperparams_1digit_bigbert.parquet").to_dict()
# for k in hypers.keys():
#     hypers[k] = list(hypers[k].values())

# df = pd.read_parquet("hyperparams_1digit_bigbert.parquet")
# df.sort_values("accuracy", ascending=False)

# params = [
#     {"mlen": mlen, "batch_size": batch_size, "lr": lr}
#     for batch_size in [16, 32]
#     for mlen in [128, 256, 512]
#     for lr in [1e-7, 1e-6, 1e-5, 1e-4]
# ]
# params

In [7]:
# for p in params[11:]:
# MAX_LEN = p["mlen"]
# BATCH_SIZE = p["batch_size"]
# EPOCHS = 5
# LEARNING_RATE = p["lr"]

MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 0.000010

model = "bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(model)
train_dataset = ClassificationDataset(train_text, train_label, tokenizer, MAX_LEN)
test_dataset = ClassificationDataset(test_text, test_label, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Model and Device Setup
assert torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained(
    model, num_labels=grants["label_1"].nunique()
)
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, leave=True)

    # Checkpoints
    if (epoch > 0) and (epoch % 3 == 0):

        time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")

        model.push_to_hub(
            repo_id, commit_message=f"checkpoint_large_{time}_epoch{epoch}"
        )

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch} Loss: {total_loss / len(train_loader)}")

time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")
model.push_to_hub(repo_id, commit_message=f"trained_largebert_{time}_epoch{epoch}")
tokenizer.push_to_hub(
    repo_id,
    commit_message=f"trained_largebert_{time}_epoch{epoch}",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 4010/4010 [26:40<00:00,  2.51it/s, loss=0.951] 


Epoch 0 Loss: 0.40974746229001635


Epoch 1: 100%|██████████| 4010/4010 [24:41<00:00,  2.71it/s, loss=0.0107]


Epoch 1 Loss: 0.29572485191654646


Epoch 2: 100%|██████████| 4010/4010 [24:41<00:00,  2.71it/s, loss=0.119]  


Epoch 2 Loss: 0.24037385647756482


model.safetensors: 100%|██████████| 1.34G/1.34G [00:28<00:00, 47.7MB/s]
No files have been modified since last commit. Skipping to prevent empty commit.
Epoch 3: 100%|██████████| 4010/4010 [25:18<00:00,  2.64it/s, loss=0.252]  


Epoch 3 Loss: 0.17574234464498437


Epoch 4: 100%|██████████| 4010/4010 [24:40<00:00,  2.71it/s, loss=0.387]  


Epoch 4 Loss: 0.1245785100787049


Epoch 5: 100%|██████████| 4010/4010 [24:36<00:00,  2.72it/s, loss=0.00836] 


Epoch 5 Loss: 0.08930210874716697


model.safetensors: 100%|██████████| 1.34G/1.34G [00:30<00:00, 44.1MB/s]
No files have been modified since last commit. Skipping to prevent empty commit.
Epoch 6: 100%|██████████| 4010/4010 [25:13<00:00,  2.65it/s, loss=0.00175] 


Epoch 6 Loss: 0.06819923999231596


Epoch 7: 100%|██████████| 4010/4010 [24:39<00:00,  2.71it/s, loss=0.00193] 


Epoch 7 Loss: 0.05406233600261383


Epoch 8: 100%|██████████| 4010/4010 [24:40<00:00,  2.71it/s, loss=0.0191]  


Epoch 8 Loss: 0.044383052300148385


model.safetensors: 100%|██████████| 1.34G/1.34G [00:45<00:00, 29.7MB/s]
No files have been modified since last commit. Skipping to prevent empty commit.
Epoch 9: 100%|██████████| 4010/4010 [25:29<00:00,  2.62it/s, loss=0.00621] 


Epoch 9 Loss: 0.0405330149601906


model.safetensors: 100%|██████████| 1.34G/1.34G [00:40<00:00, 32.9MB/s]  
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/nickeubank/leaa_grant_subjects/commit/c80c14f3f8781cb484dece36cae457f2df921e7e', commit_message='trained_largebert_2025_04_25_22_41_epoch9', commit_description='', oid='c80c14f3f8781cb484dece36cae457f2df921e7e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nickeubank/leaa_grant_subjects', endpoint='https://huggingface.co', repo_type='model', repo_id='nickeubank/leaa_grant_subjects'), pr_revision=None, pr_num=None)

In [10]:
# Evaluation

#############
# Load Model if not immediately after train
############
BATCH_SIZE = 16
MAX_LEN = 256

train_dataset = ClassificationDataset(train_text, train_label, tokenizer, MAX_LEN)
test_dataset = ClassificationDataset(test_text, test_label, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

assert torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained(repo_id).to(device)
tokenizer = BertTokenizer.from_pretrained(repo_id)

In [12]:
############
# Back to main flow
############

model.eval()
correct = 0
total = 0
all_predictions = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        correct += (predictions == labels).sum().item()
        total += labels.size(0)

        formatted_to_list = list(map(lambda x: x.item(), predictions))
        all_predictions.extend(formatted_to_list)

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.4f}")

# hypers["lr"].append(LEARNING_RATE)
# hypers["mlen"].append(MAX_LEN)
# hypers["batch_size"].append(BATCH_SIZE)
# hypers["accuracy"].append(accuracy)
# print(hypers)
# pd.DataFrame(hypers).to_parquet(workingdir + "hyperparams_1digit_bigbert.parquet")

Validation Accuracy: 0.8855


In [13]:
predictions_and_actual = pd.DataFrame(
    {"actual": test_label, "predicted": all_predictions}
)
pd.crosstab(predictions_and_actual["actual"], predictions_and_actual["predicted"])

predicted,0,1,2,3,4
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,1904,89,114,79,48
1.0,118,6258,35,17,149
2.0,118,80,3151,91,95
3.0,116,52,154,2184,100
4.0,81,159,73,68,705


In [None]:
confusion = pd.crosstab(
    predictions_and_actual["actual"],
    predictions_and_actual["predicted"],
    normalize=True,
)
confusion.to_parquet("large_bert_confusion_matrix.parquet")
confusion_to_print = np.round(confusion * 100, decimals=1)
confusion_to_print

predicted,0,1,2,3,4
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,11.9,0.6,0.7,0.5,0.3
1.0,0.7,39.0,0.2,0.1,0.9
2.0,0.7,0.5,19.6,0.6,0.6
3.0,0.7,0.3,1.0,13.6,0.6
4.0,0.5,1.0,0.5,0.4,4.4


In [25]:
predictions_and_actual["actual"].value_counts(normalize=True).sort_index()

actual
0.0    0.139294
1.0    0.410089
2.0    0.220414
3.0    0.162489
4.0    0.067714
Name: proportion, dtype: float64

In [37]:
confusion = pd.crosstab(
    predictions_and_actual["actual"],
    predictions_and_actual["predicted"],
    normalize="all",
    margins=True,
)
# confusion.to_parquet("large_bert_confusion_matrix_margins.parquet")
confusion_to_print = np.round(confusion * 100, decimals=1)
confusion_to_print

predicted,0,1,2,3,4,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,11.9,0.6,0.7,0.5,0.3,13.9
1.0,0.7,39.0,0.2,0.1,0.9,41.0
2.0,0.7,0.5,19.6,0.6,0.6,22.0
3.0,0.7,0.3,1.0,13.6,0.6,16.2
4.0,0.5,1.0,0.5,0.4,4.4,6.8
All,14.6,41.4,22.0,15.2,6.8,100.0


In [32]:
pd.crosstab?

[31mSignature:[39m
pd.crosstab(
    index,
    columns,
    values=[38;5;28;01mNone[39;00m,
    rownames=[38;5;28;01mNone[39;00m,
    colnames=[38;5;28;01mNone[39;00m,
    aggfunc=[38;5;28;01mNone[39;00m,
    margins: [33m'bool'[39m = [38;5;28;01mFalse[39;00m,
    margins_name: [33m'Hashable'[39m = [33m'All'[39m,
    dropna: [33m'bool'[39m = [38;5;28;01mTrue[39;00m,
    normalize: [33m"bool | Literal[0, 1, 'all', 'index', 'columns']"[39m = [38;5;28;01mFalse[39;00m,
) -> [33m'DataFrame'[39m
[31mDocstring:[39m
Compute a simple cross tabulation of two (or more) factors.

By default, computes a frequency table of the factors unless an
array of values and an aggregation function are passed.

Parameters
----------
index : array-like, Series, or list of arrays/Series
    Values to group by in the rows.
columns : array-like, Series, or list of arrays/Series
    Values to group by in the columns.
values : array-like, optional
    Array of values to aggregate accord