In [1]:
import datetime
import os

# os.environ["PATH"] = (
#     "/opt/apps/rhel9/cuda-12.4/bin:/opt/apps/rhel9/cuda-12.4/" + os.environ["PATH"]
# )
# os.environ["LD_LIBRARY_PATH"] = (
#     "/opt/apps/rhel9/cuda-12.4/bin:/opt/apps/rhel9/cuda-12.4"
#     + os.environ.get("LD_LIBRARY_PATH", "")
# )

from huggingface_hub import HfApi

import numpy as np
import numpy.random as npr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (  # AdamW,
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

# Hugging Face Hub repository used for model/tokenizer pushes and reloads below.
repo_id = "nickeubank/leaa_grant_subjects_invweighted"
# Project directory on the cluster (not referenced by the cells shown in this notebook).
workingdir = "/hpc/group/ssri/nce8/leaa_subj/"

# Turn on pandas copy-on-write for every DataFrame operation that follows.
pd.set_option("mode.copy_on_write", True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fail fast, with an explanation, when no CUDA device is visible to PyTorch.
assert torch.cuda.is_available(), "No CUDA device available; this notebook requires a GPU."

In [3]:
# dir = "https://github.com/nickeubank/leaa_subj/raw/refs/heads/main/"
# Grant descriptions together with their subject labels.
grants = pd.read_parquet("../00_source_data/subj_text_and_labels.parquet")

#########
# Split into train test and for predict
#########
# Keep a single row per distinct description text.
grants = grants.drop_duplicates(subset="description")

# Only rows that carry a first-level label are used for supervised training.
has_label = grants["label_1"].notna()
labeled = grants[has_label]
# Shift the label codes down by one (presumably so classes start at 0 -- confirm
# against the raw label scheme).
labeled["label_1_encoded"] = labeled["label_1"].sub(1)

In [4]:
# Share of labeled rows falling in each class, ordered by class code.
proportions = labeled["label_1_encoded"].value_counts(normalize=True).sort_index()

# Inverse-frequency weights, rescaled to sum to one and stored as float32
# in class-code order.
inv_proportions = 1 / proportions
normalised = inv_proportions / inv_proportions.sum()
inverse_weights = normalised.astype(np.float32).values

# Just make sure ordered right since that's critical:
# the largest weight must sit at position 4 and the smallest at position 1.
assert inverse_weights.max() == inverse_weights[4]
assert inverse_weights.min() == inverse_weights[1]

In [5]:
# Sort by description text before splitting so the row order fed to the
# seeded split does not depend on the order of the source file.
labeled = labeled.sort_values(by="description")

# 80/20 split of labels and texts, stratified on the encoded label.
train_label, test_label, train_text, test_text = train_test_split(
    labeled["label_1_encoded"],
    labeled["description"],
    stratify=labeled["label_1_encoded"],
    test_size=0.2,
    random_state=47,
)

# Report the size of each partition, one per line.
print(len(train_label), len(test_label), sep="\n")

64148
16038


In [7]:
# Capture the index labels of each partition before the Series are replaced
# by their underlying values below.
training_indices = train_label.index
testing_indices = test_label.index

# Persist the split membership.
pd.DataFrame({"train_indices": training_indices}).to_parquet(
    "../20_intermediate_data/1digit_training_indices.parquet"
)
pd.DataFrame({"test_indices": testing_indices}).to_parquet(
    "../20_intermediate_data/1digit_testing_indices.parquet"
)

# Replace each Series with its underlying values. The dataset class indexes
# with `[idx]`, which on a Series would look up index labels rather than positions.
# NOTE: the names are rebound here, so re-running this cell without re-running
# the split cell will fail at `.index`.
train_label, test_label = train_label.values, test_label.values
train_text, test_text = train_text.values, test_text.values

In [8]:
# Show the stored hyperparameter-search results, highest accuracy first.
hyperparam_results = pd.read_parquet(
    "../30_results/hyperparams_1digit_bigbert_invweights.parquet"
)
hyperparam_results.sort_values(by="accuracy", ascending=False)

Unnamed: 0,mlen,batch_size,lr,accuracy
14,128,16,1e-05,0.873447
6,256,8,1e-05,0.873198
10,512,8,1e-05,0.867112
2,128,8,1e-05,0.865716
30,256,32,1e-05,0.86387
9,512,8,1e-06,0.862274
26,128,32,1e-05,0.861775
22,512,16,1e-05,0.860777
18,256,16,1e-05,0.859031
1,128,8,1e-06,0.853095


In [7]:
########
# Preprocess
########


class ClassificationDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Collection of input strings, indexed with ``texts[idx]``.
        labels: Collection of class codes aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)``; its result
            must expose ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of examples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``label`` for item ``idx``."""
        text, label = self.texts[idx], self.labels[idx]

        # Pad/truncate to max_len and ask for PyTorch tensors back.
        tokenized = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # squeeze(0) drops the leading size-1 dimension of each returned tensor.
        sample = {
            key: tokenized[key].squeeze(0) for key in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(label, dtype=torch.long)
        return sample

In [22]:
# Hyperparameter tuning

# # Run 1
# params = [
#     {"mlen": mlen, "batch_size": batch_size, "lr": lr}
#     for batch_size in [8, 16, 32]
#     for mlen in [128, 256, 512]
#     for lr in [1e-7, 1e-6, 1e-5, 1e-4]
# ]

# df = pd.DataFrame(params)
# df["accuracy"] = np.nan
# df["batch_size"] = df["batch_size"].astype("int")
# df["mlen"] = df["mlen"].astype("int")
# df.to_parquet("hyperparams_1digit_bigbert_invweights.parquet")

# # Later Runs
# hypers = pd.read_parquet("hyperparams_1digit_bigbert_invweights.parquet")
# hypers

In [23]:
# for p in hypers[hypers["accuracy"].isnull()].iterrows():
# print("starting: ")
# print(p)
# MAX_LEN = int(p[1]["mlen"])
# BATCH_SIZE = int(p[1]["batch_size"])
# EPOCHS = 3
# LEARNING_RATE = p[1]["lr"]

# Hyperparameters for this training run.
MAX_LEN = 256  # forwarded to the tokenizer as max_length (pad/truncate target)
BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 1e-6


# Device Setup: the assert guarantees CUDA is present, so no CPU fallback is needed.
assert torch.cuda.is_available()
device = torch.device("cuda")

##########
# Load model or checkpoint
##########

# Keep the checkpoint name in its own variable so `model` only ever refers to
# the network itself.
base_model_name = "bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(base_model_name)
model = BertForSequenceClassification.from_pretrained(
    base_model_name, num_labels=labeled["label_1_encoded"].nunique()
)
model.to(device)


# Alternative start: resume from the checkpoint stored on the Hub.
# model = "bert-large-uncased"
# tokenizer = BertTokenizer.from_pretrained(model)
# tokenizer.push_to_hub(repo_id, commit_message=f"transfer over large bert tokenizer")
# model = BertForSequenceClassification.from_pretrained(repo_id).to(device)
# tokenizer = BertTokenizer.from_pretrained(repo_id)
# model.eval()

# Data Prep
train_dataset = ClassificationDataset(train_text, train_label, tokenizer, MAX_LEN)
test_dataset = ClassificationDataset(test_text, test_label, tokenizer, MAX_LEN)

# Shuffle training batches; keep evaluation order fixed so predictions line up
# with `test_label`.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Need weights as tensors on gpu.
# Tensor.to() returns a new tensor instead of moving the original in place, so
# the result has to be bound back to `weights`.
weights = torch.from_numpy(inverse_weights).to(device)
weights

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  weights = torch.from_numpy(inverse_weights)


tensor([0.2046, 0.0695, 0.1293, 0.1755, 0.4210], device='cuda:0')

In [24]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Use inverse weights and cross entropy. The criterion is built once rather
# than per batch; moving the module to `device` also moves its weight tensor,
# so this works even if `weights` itself is still on the CPU.
criterion = nn.CrossEntropyLoss(weight=weights).to(device)

# Actual training
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, leave=True)
    loop.set_description(f"Epoch {epoch}")

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # `labels` is deliberately not passed to the model: that would make it
        # compute its own unweighted loss, which is never used here.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = criterion(outputs.logits, labels)
        batch_loss = loss.item()
        total_loss += batch_loss

        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=batch_loss)

    # Checkpoints: push every third epoch, skipping epoch 0.
    if (epoch > 0) and (epoch % 3 == 0):
        time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")
        model.push_to_hub(
            repo_id, commit_message=f"checkpoint_large_{time}_epoch{epoch}"
        )

    print(f"Epoch {epoch} Loss: {total_loss / len(train_loader)}")

    # Test data eval
    model.eval()
    correct = 0
    total = 0
    all_predictions = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)

            correct += (predictions == labels).sum().item()
            total += labels.size(0)

            # One Python int per example, in loader order.
            all_predictions.extend(predictions.tolist())

    accuracy = correct / total
    print(f"Validation Accuracy for epoch {epoch}: {accuracy:.4f}")

    # Revert to train
    model.train()

# Final upload. The timestamp is taken here rather than reused from the last
# checkpoint: that name is only bound inside the checkpoint branch, so it is
# missing for short runs (EPOCHS < 4) and stale otherwise.
time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")
model.push_to_hub(
    repo_id, commit_message=f"trained_invweight_large_{time}_epoch{epoch}"
)
tokenizer.push_to_hub(
    repo_id, commit_message=f"trained_invweight_large_{time}_epoch{epoch}"
)

  0%|          | 0/4010 [00:00<?, ?it/s]

Epoch 0: 100%|██████████| 4010/4010 [24:38<00:00,  2.71it/s, loss=0.224] 


Epoch 0 Loss: 0.7710346212633827
Validation Accuracy for epoch 0: 0.8548


Epoch 1: 100%|██████████| 4010/4010 [24:36<00:00,  2.72it/s, loss=0.105] 


Epoch 1 Loss: 0.4918659721842579
Validation Accuracy for epoch 1: 0.8610


Epoch 2: 100%|██████████| 4010/4010 [24:29<00:00,  2.73it/s, loss=0.0948]


Epoch 2 Loss: 0.4414804197485533
Validation Accuracy for epoch 2: 0.8692


Epoch 3: 100%|██████████| 4010/4010 [24:28<00:00,  2.73it/s, loss=1.02]  
model.safetensors: 100%|██████████| 1.34G/1.34G [00:26<00:00, 51.0MB/s]  


Epoch 3 Loss: 0.4041970289653711
Validation Accuracy for epoch 3: 0.8685


Epoch 4: 100%|██████████| 4010/4010 [24:33<00:00,  2.72it/s, loss=0.931] 


Epoch 4 Loss: 0.37446743553211714
Validation Accuracy for epoch 4: 0.8808


Epoch 5: 100%|██████████| 4010/4010 [24:28<00:00,  2.73it/s, loss=0.376] 


Epoch 5 Loss: 0.35527168098892875
Validation Accuracy for epoch 5: 0.8811


Epoch 6: 100%|██████████| 4010/4010 [24:28<00:00,  2.73it/s, loss=0.22]  
model.safetensors: 100%|██████████| 1.34G/1.34G [00:25<00:00, 51.8MB/s]  


Epoch 6 Loss: 0.33053363625173854
Validation Accuracy for epoch 6: 0.8796


Epoch 7: 100%|██████████| 4010/4010 [24:28<00:00,  2.73it/s, loss=0.024] 


Epoch 7 Loss: 0.30262619467252433
Validation Accuracy for epoch 7: 0.8832


Epoch 8: 100%|██████████| 4010/4010 [24:29<00:00,  2.73it/s, loss=0.0637]


Epoch 8 Loss: 0.2827641599779099
Validation Accuracy for epoch 8: 0.8827


Epoch 9: 100%|██████████| 4010/4010 [24:28<00:00,  2.73it/s, loss=0.213] 
model.safetensors: 100%|██████████| 1.34G/1.34G [00:24<00:00, 55.6MB/s] 


Epoch 9 Loss: 0.26099166046860545
Validation Accuracy for epoch 9: 0.8850


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/nickeubank/leaa_grant_subjects_invweighted/commit/892a88b08735cbf80de99257e93f941022ba96ae', commit_message='trained_invweight_large_2025_06_07_14_36_epoch9', commit_description='', oid='892a88b08735cbf80de99257e93f941022ba96ae', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nickeubank/leaa_grant_subjects_invweighted', endpoint='https://huggingface.co', repo_type='model', repo_id='nickeubank/leaa_grant_subjects_invweighted'), pr_revision=None, pr_num=None)

In [10]:
# Same settings as the training cell.
MAX_LEN, BATCH_SIZE = 256, 16
EPOCHS, LEARNING_RATE = 10, 1e-6


# Device Setup
assert torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Reload the model and tokenizer stored under `repo_id` and switch the model
# to evaluation mode.
model = BertForSequenceClassification.from_pretrained(repo_id)
model.to(device)
tokenizer = BertTokenizer.from_pretrained(repo_id)
model.eval()

# Data Prep
train_dataset = ClassificationDataset(train_text, train_label, tokenizer, MAX_LEN)
test_dataset = ClassificationDataset(test_text, test_label, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Test data eval: accumulate the hit count and per-example predictions over
# the unshuffled evaluation loader.
correct, total = 0, 0
all_predictions = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        predictions = logits.argmax(dim=1)

        correct += (predictions == labels).sum().item()
        total += labels.shape[0]
        all_predictions.extend(predictions.tolist())

accuracy = correct / total

NameError: name 'epoch' is not defined

In [11]:
# Overall accuracy computed by the evaluation cell.
print(f"Validation Accuracy for epoch: {accuracy:.4f}")

# Line up each true label with the model's prediction for the same example.
predictions_and_actual = pd.DataFrame(
    dict(actual=test_label, predicted=all_predictions)
)

# Raw-count confusion matrix: rows are true classes, columns are predictions.
pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
)

Validation Accuracy for epoch: 0.8850


predicted,0,1,2,3,4
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,1915,64,90,100,65
1.0,159,6054,80,49,235
2.0,125,41,3152,121,96
3.0,96,17,84,2298,111
4.0,62,91,75,84,774


In [17]:
# Confusion matrix as a share of all evaluated examples (cells sum to 1).
confusion = pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
    normalize=True,
)

# Store the unrounded shares in both formats.
confusion.to_parquet("../30_results/large_bert_confusion_matrix_1digit.parquet")
confusion.to_csv("../30_results/large_bert_confusion_matrix_1digit.csv")

# Display as percentages with one decimal place.
confusion_to_print = (confusion * 100).round(decimals=1)
confusion_to_print

predicted,0,1,2,3,4
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,11.9,0.4,0.6,0.6,0.4
1.0,1.0,37.7,0.5,0.3,1.5
2.0,0.8,0.3,19.7,0.8,0.6
3.0,0.6,0.1,0.5,14.3,0.7
4.0,0.4,0.6,0.5,0.5,4.8


In [20]:
# Same overall-share confusion matrix, now with "All" row/column totals.
confusion = pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
    margins=True,
    normalize="all",
)
confusion.to_csv("../30_results/large_bert_confusion_matrix_1digit_margins.csv")

# Display as percentages with one decimal place.
confusion_to_print = (confusion * 100).round(decimals=1)
confusion_to_print

predicted,0,1,2,3,4,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,11.9,0.4,0.6,0.6,0.4,13.9
1.0,1.0,37.7,0.5,0.3,1.5,41.0
2.0,0.8,0.3,19.7,0.8,0.6,22.0
3.0,0.6,0.1,0.5,14.3,0.7,16.2
4.0,0.4,0.6,0.5,0.5,4.8,6.8
All,14.7,39.1,21.7,16.5,8.0,100.0


In [21]:
# Row-normalised confusion matrix: each row shows how examples of one true
# class were distributed across the predicted classes.
confusion = pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
    margins=True,
    normalize="index",
)
confusion.to_csv(
    "../30_results/large_bert_confusion_matrix_1digit_share_true_in_each.csv"
)

# Display as percentages with one decimal place.
confusion_to_print = (confusion * 100).round(decimals=1)
confusion_to_print

predicted,0,1,2,3,4
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,85.7,2.9,4.0,4.5,2.9
1.0,2.4,92.0,1.2,0.7,3.6
2.0,3.5,1.2,89.2,3.4,2.7
3.0,3.7,0.7,3.2,88.2,4.3
4.0,5.7,8.4,6.9,7.7,71.3
All,14.7,39.1,21.7,16.5,8.0
