In [2]:
import datetime
import json
import os

from huggingface_hub import HfApi, upload_file, hf_hub_download
import numpy as np
import numpy.random as npr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (  # AdamW,
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

# Notebook-wide configuration.
# Turn on pandas copy-on-write for the rest of the session.
pd.options.mode.copy_on_write = True

# Hugging Face Hub repository that receives the label mapping, checkpoints and tokenizer.
repo_id = "nickeubank/leaa_grant_subjects_2digits_invweighted"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fail fast when no CUDA device is visible, before the data loading and
# model setup cells below are run.
assert torch.cuda.is_available()

In [4]:
#######
# Get Data
#######

# Read the grants table and keep a single row per distinct description.
grants = pd.read_parquet(
    "../20_intermediate_data/predicted_labels_1digit_weighted.parquet"
).drop_duplicates(subset="description")

# Only rows that carry a 2-digit label can be used for supervised training.
has_two_digit_label = grants["label_2"].notna()

# Try to stabilize labels: fix the row order by (label, description).
labeled = grants.loc[has_two_digit_label].sort_values(by=["label_2", "description"])

In [6]:
##########
# Encode (2 digits trickier than 1 digit) and save encoding
# In the future, just make it a dict. Sigh.
##########
# File name used both locally and inside the Hub repo for the label -> id mapping.
encoder_name = "label_mapping_2digit_invweighted.json"

# Save away: fit an encoder on the observed 2-digit labels and persist
# {label: integer id}. JSON object keys are always strings, so the labels are
# written as strings and must be converted back when the file is re-read.
label_encoder = LabelEncoder()
label_encoder.fit(labeled["label_2"])
mapping = {label: int(idx) for idx, label in enumerate(label_encoder.classes_)}
encoder_file = f"../20_intermediate_data/{encoder_name}"
# Write with the same explicit encoding that is used to read the file back below.
with open(encoder_file, "w", encoding="utf-8") as f:
    json.dump(mapping, f)
upload_file(
    path_or_fileobj=encoder_file,
    path_in_repo=encoder_name,
    repo_id=repo_id,
    repo_type="model",
)

# Now read back in (helpful to ensure worked) encoder:
file_path = hf_hub_download(repo_id=repo_id, filename=encoder_name, repo_type="model")
with open(file_path, "r", encoding="utf-8") as f:
    encodings_json = json.load(f)

labels_to_fit = list(encodings_json.keys())

# Validate `.keys()` ordered correctly: the i-th key must map to id i.
assert [encodings_json[key] for key in labels_to_fit] == list(
    range(len(labels_to_fit))
), "Label mapping from the Hub is not ordered by encoded id"

label_encoder = LabelEncoder()

# Destring. NOTE: despite its name, `labels_to_fit_str` holds floats (the JSON
# string keys converted back to numbers); the name is kept for compatibility.
labels_to_fit_str = list(map(float, labels_to_fit))
label_encoder.fit(labels_to_fit_str)

# `LabelEncoder` sorts its classes, so confirm the re-fitted encoder assigns
# exactly the ids stored in the saved mapping before encoding anything with it.
assert (
    list(label_encoder.classes_) == labels_to_fit_str
), "Re-fitted LabelEncoder does not reproduce the saved label -> id mapping"

labeled["label_2_encoded"] = label_encoder.transform(labeled["label_2"])

No files have been modified since last commit. Skipping to prevent empty commit.


In [7]:
# Share of labeled rows in each encoded class, ordered by encoded class id.
proportions = labeled["label_2_encoded"].value_counts(normalize=True).sort_index()

# The weights are used positionally (entry i belongs to class id i), both by
# the loss function and by the boolean-mask checks below. That only holds when
# every encoder class occurs at least once in `labeled`.
assert list(proportions.index) == list(
    range(len(label_encoder.classes_))
), "Some encoded classes have no labeled rows; weights would be misaligned"

# Inverse-frequency weights, normalised so that they sum to 1.
inv_proportions = 1 / proportions
inverse_weights = (inv_proportions / inv_proportions.sum()).astype(np.float32).values

# Check a few: label 24 should be weighted below the average, and label 22
# should carry the smallest weight of all (i.e. be the most frequent label).
assert inverse_weights[label_encoder.classes_ == 24] < inverse_weights.mean()
assert inverse_weights[label_encoder.classes_ == 22] == inverse_weights.min()

In [8]:
# Order rows by description before splitting (presumably so the seeded split
# always sees the rows in the same order).
labeled = labeled.sort_values(by="description")

# # Sample for hyper param tuning
# labeled = labeled.sample(frac=0.4)

# 80/20 split, stratified on the encoded 2-digit label.
encoded_targets = labeled["label_2_encoded"]
(train_label, test_label, train_text, test_text) = train_test_split(
    encoded_targets,
    labeled["description"],
    stratify=encoded_targets,
    test_size=0.2,
    random_state=50,
)

# Persist the index values of `labeled` that ended up in each split.
for column_name, split_series, parquet_path in (
    (
        "train_indices",
        train_label,
        "../20_intermediate_data/2digit_training_indices.parquet",
    ),
    (
        "test_indices",
        test_label,
        "../20_intermediate_data/2digit_testing_indices.parquet",
    ),
):
    pd.DataFrame({column_name: split_series.index}).to_parquet(parquet_path)

# From here on the splits are plain arrays rather than pandas Series.
train_label, test_label = train_label.values, test_label.values
train_text, test_text = train_text.values, test_text.values

In [9]:
# Data holder


class ClassificationDataset(Dataset):
    """Dataset pairing each text with its label and tokenizing on access.

    Args:
        texts: Indexable collection of texts; its length is the dataset size.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)``. Its
            result must expose ``"input_ids"`` and ``"attention_mask"``
            entries that support ``.squeeze(0)`` (assumed to be tensors with
            a leading batch dimension of size one).
        max_len: Forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it with its label.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (tokenizer
            output with dimension 0 squeezed) and ``"label"`` (a
            ``torch.long`` tensor built from the stored label).
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]

        # Pad or truncate to `max_len` and request PyTorch tensors back.
        tokenizer_options = {
            "add_special_tokens": True,
            "max_length": self.max_len,
            "padding": "max_length",
            "truncation": True,
            "return_tensors": "pt",
        }
        tokenized = self.tokenizer(sample_text, **tokenizer_options)

        # Drop the leading dimension so a DataLoader can stack the items.
        item = {
            field: tokenized[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(sample_label, dtype=torch.long)
        return item

In [11]:
# # Hypertuning Parameters

# # Run 1
# params = [
#     {"mlen": mlen, "batch_size": batch_size, "lr": lr}
#     for batch_size in [8, 16, 32]
#     for mlen in [128, 256, 512]
#     for lr in [1e-7, 1e-6, 1e-5, 1e-4]
# ]

# df = pd.DataFrame(params)
# df["accuracy"] = np.nan
# df["batch_size"] = df["batch_size"].astype("int")
# df["mlen"] = df["mlen"].astype("int")

# hypers_file = "../30_results/hyperparams_2digit_bigbert_invweights.parquet"
# df.to_parquet(hypers_file)

In [13]:
# # Later Runs
# hypers = pd.read_parquet(hypers_file).sort_values("accuracy", ascending=False)
# hypers

Unnamed: 0,mlen,batch_size,lr,accuracy
6,256,8,1e-05,0.753508
10,512,8,1e-05,0.752572
14,128,16,1e-05,0.745401
30,256,32,1e-05,0.735549
18,256,16,1e-05,0.728191
26,128,32,1e-05,0.72208
22,512,16,1e-05,0.718464
9,512,8,1e-06,0.676311
1,128,8,1e-06,0.673754
5,256,8,1e-06,0.637401


In [14]:
# for p in hypers[hypers["accuracy"].isnull()].iterrows():
#     print("starting: ")
#     print(p)
#     MAX_LEN = int(p[1]["mlen"])
#     BATCH_SIZE = int(p[1]["batch_size"])
#     EPOCHS = 2
#     LEARNING_RATE = p[1]["lr"]

# Going a little slower than hyper params suggest given issues with collapse in
# later epochs of 1 digit with 1e-5.
# Hyperparameters for this run.
MAX_LEN = 256  # passed to the tokenizer as max_length (pad / truncate target)
BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 1e-6

# Device Setup
assert torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##########
# Load model or checkpoint
##########

# ####
# # Load naive bert
# ####
# Keep the checkpoint name in its own variable so that `model` only ever
# refers to the network itself.
base_model_name = "bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(base_model_name)
model = BertForSequenceClassification.from_pretrained(
    base_model_name, num_labels=labeled["label_2_encoded"].nunique()
)
model.to(device)

####
# OR Load checkpoint if loading from checkpoint
####

# model = BertForSequenceClassification.from_pretrained(repo_id).to(device)
# tokenizer = BertTokenizer.from_pretrained(repo_id)
# model.eval()


# Data Prep
train_dataset = ClassificationDataset(train_text, train_label, tokenizer, MAX_LEN)
test_dataset = ClassificationDataset(test_text, test_label, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Need weights as tensors on gpu. `Tensor.to` returns a new tensor rather than
# moving in place, so the class weights are created directly on the target
# device. `torch.tensor` copies the array, which also avoids the warning that
# `torch.from_numpy` emitted for this array in earlier runs.
weights = torch.tensor(inverse_weights, dtype=torch.float32, device=device)

# Use inverse weights and cross entropy. The criterion does not change between
# batches, so it is built once here instead of inside the training loop.
criterion = nn.CrossEntropyLoss(weight=weights)

# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Actual training
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch}")

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Any loss the model computes itself from `labels` is ignored; only the
        # logits are used, with the class-weighted criterion defined above.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

        loss = criterion(outputs.get("logits"), labels)
        batch_loss = loss.item()
        total_loss += batch_loss

        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=batch_loss)

    # Checkpoints: push the weights to the Hub after epochs 3, 6, 9, ...
    if (epoch > 0) and (epoch % 3 == 0):

        time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")

        model.push_to_hub(
            repo_id, commit_message=f"checkpoint_large_{time}_epoch{epoch}"
        )

    print(f"Epoch {epoch} Loss: {total_loss / len(train_loader)}")

    # Test data eval
    model.eval()
    correct = 0
    total = 0
    # Reset every epoch, so after the loop this holds the predictions of the
    # final epoch, in `val_loader` order (which is not shuffled).
    all_predictions = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)

            correct += (predictions == labels).sum().item()
            total += labels.size(0)

            # One transfer per batch instead of one `.item()` call per element.
            all_predictions.extend(predictions.tolist())

    accuracy = correct / total
    print(f"Validation Accuracy for epoch {epoch}: {accuracy:.4f}")

    # Revert to train
    model.train()


# Stamp the final upload with the current time. Previously `time` was only
# assigned inside the checkpoint branch, so it was undefined when no
# checkpoint had been written (EPOCHS < 4) and stale otherwise.
time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")
final_commit_message = f"trained_invweight_2digit_large_{time}_epoch{epoch}"

model.push_to_hub(repo_id, commit_message=final_commit_message)
tokenizer.push_to_hub(repo_id, commit_message=final_commit_message)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  weights = torch.from_numpy(inverse_weights)
Epoch 0: 100%|██████████| 4010/4010 [24:14<00:00,  2.76it/s, loss=2.31]


Epoch 0 Loss: 2.6334186179679526
Validation Accuracy for epoch 0: 0.6719


Epoch 1: 100%|██████████| 4010/4010 [24:08<00:00,  2.77it/s, loss=0.407]


Epoch 1 Loss: 1.496499703687028
Validation Accuracy for epoch 1: 0.7210


Epoch 2: 100%|██████████| 4010/4010 [24:00<00:00,  2.78it/s, loss=0.59] 


Epoch 2 Loss: 1.1255777285209023
Validation Accuracy for epoch 2: 0.7454


Epoch 3: 100%|██████████| 4010/4010 [23:59<00:00,  2.79it/s, loss=0.189]
model.safetensors: 100%|██████████| 1.34G/1.34G [00:24<00:00, 54.4MB/s]


Epoch 3 Loss: 0.9686059132664281
Validation Accuracy for epoch 3: 0.7657


Epoch 4: 100%|██████████| 4010/4010 [24:10<00:00,  2.76it/s, loss=0.798]


Epoch 4 Loss: 0.8714034999435085
Validation Accuracy for epoch 4: 0.7660


Epoch 5: 100%|██████████| 4010/4010 [24:05<00:00,  2.77it/s, loss=0.32]  


Epoch 5 Loss: 0.8031697056407198
Validation Accuracy for epoch 5: 0.7743


Epoch 6: 100%|██████████| 4010/4010 [24:06<00:00,  2.77it/s, loss=1.54] 
model.safetensors: 100%|██████████| 1.34G/1.34G [00:24<00:00, 54.6MB/s]


Epoch 6 Loss: 0.7497206343881359
Validation Accuracy for epoch 6: 0.7774


Epoch 7: 100%|██████████| 4010/4010 [24:02<00:00,  2.78it/s, loss=0.497] 


Epoch 7 Loss: 0.698364432323306
Validation Accuracy for epoch 7: 0.7810


Epoch 8: 100%|██████████| 4010/4010 [24:09<00:00,  2.77it/s, loss=0.271] 


Epoch 8 Loss: 0.660501117621574
Validation Accuracy for epoch 8: 0.7853


Epoch 9: 100%|██████████| 4010/4010 [24:15<00:00,  2.76it/s, loss=0.0991]
model.safetensors: 100%|██████████| 1.34G/1.34G [00:29<00:00, 45.6MB/s]  


Epoch 9 Loss: 0.620305609665904
Validation Accuracy for epoch 9: 0.7889


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/nickeubank/leaa_grant_subjects_2digits_invweighted/commit/d5aa1b19928b1a945b233dc00136983dbd394142', commit_message='trained_invweight_2digit_large_2025_06_09_16_16_epoch9', commit_description='', oid='d5aa1b19928b1a945b233dc00136983dbd394142', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nickeubank/leaa_grant_subjects_2digits_invweighted', endpoint='https://huggingface.co', repo_type='model', repo_id='nickeubank/leaa_grant_subjects_2digits_invweighted'), pr_revision=None, pr_num=None)

In [15]:
# Translate encoded class ids back to the original 2-digit labels.
decoded_actual = label_encoder.inverse_transform(test_label)
decoded_predictions = label_encoder.inverse_transform(all_predictions)

# One row per held-out example: true label next to predicted label.
predictions_and_actual = pd.DataFrame(
    dict(actual=decoded_actual, predicted=decoded_predictions)
)

# Raw-count confusion matrix (rows: actual, columns: predicted).
pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
)

predicted,10.0,11.0,12.0,20.0,21.0,22.0,23.0,24.0,25.0,30.0,...,40.0,41.0,42.0,43.0,44.0,50.0,51.0,52.0,53.0,54.0
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.0,496,9,2,22,9,10,4,0,1,0,...,0,3,0,0,1,4,0,2,4,0
11.0,10,1022,40,2,1,28,0,3,0,70,...,23,8,45,2,5,16,11,34,13,6
12.0,1,12,228,0,0,2,0,1,0,9,...,8,0,6,0,0,4,1,4,0,0
20.0,12,0,0,150,18,4,2,16,0,0,...,0,0,0,0,0,1,1,0,0,0
21.0,13,0,3,90,1079,7,26,23,8,2,...,1,0,0,0,0,1,4,2,1,1
22.0,17,34,7,14,10,2012,24,32,0,12,...,3,1,1,0,0,1,140,9,2,5
23.0,3,0,0,2,9,5,481,16,3,0,...,2,0,0,5,0,1,31,1,3,2
24.0,20,4,5,84,118,52,75,1643,34,0,...,4,10,1,5,0,6,4,37,12,6
25.0,0,0,0,0,0,0,0,4,45,0,...,0,3,0,0,0,0,0,0,0,2
30.0,0,36,18,0,0,8,0,0,0,384,...,4,5,16,0,1,7,0,1,5,2


In [16]:
# Show every column when wide frames are displayed.
pd.set_option("display.max_columns", None)

# Confusion matrix in which each cell is its share of ALL test examples.
confusion = pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
    normalize="all",
)

# Store the unrounded shares in both formats.
confusion.to_parquet("../30_results/large_bert_confusion_matrix_2digit.parquet")
confusion.to_csv("../30_results/large_bert_confusion_matrix_2digit.csv")

# Display as percentages with one decimal place.
confusion_to_print = (confusion * 100).round(decimals=1)
confusion_to_print

predicted,10.0,11.0,12.0,20.0,21.0,22.0,23.0,24.0,25.0,30.0,31.0,32.0,33.0,34.0,35.0,40.0,41.0,42.0,43.0,44.0,50.0,51.0,52.0,53.0,54.0
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
10.0,3.1,0.1,0.0,0.1,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11.0,0.1,6.4,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.4,0.2,0.0,0.0,0.0,0.0,0.1,0.0,0.3,0.0,0.0,0.1,0.1,0.2,0.1,0.0
12.0,0.0,0.1,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20.0,0.1,0.0,0.0,0.9,0.1,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21.0,0.1,0.0,0.0,0.6,6.7,0.0,0.2,0.1,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22.0,0.1,0.2,0.0,0.1,0.1,12.5,0.1,0.2,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.1,0.0,0.0
23.0,0.0,0.0,0.0,0.0,0.1,0.0,3.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0
24.0,0.1,0.0,0.0,0.5,0.7,0.3,0.5,10.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.2,0.1,0.0
25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30.0,0.0,0.2,0.1,0.0,0.0,0.0,0.0,0.0,0.0,2.4,0.2,0.0,0.0,0.1,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Confusion matrix normalised within each actual label, with margins added.
confusion = pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
    margins=True,
    normalize="index",  # rows sum to 100.
)

# Store the unrounded shares.
confusion.to_csv(
    "../30_results/large_bert_confusion_matrix_2digit_share_true_in_each.csv"
)

# Display as percentages with one decimal place.
confusion_to_print = (confusion * 100).round(decimals=1)
confusion_to_print

predicted,10.0,11.0,12.0,20.0,21.0,22.0,23.0,24.0,25.0,30.0,31.0,32.0,33.0,34.0,35.0,40.0,41.0,42.0,43.0,44.0,50.0,51.0,52.0,53.0,54.0
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
10.0,87.2,1.6,0.4,3.9,1.6,1.8,0.7,0.0,0.2,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.2,0.7,0.0,0.4,0.7,0.0
11.0,0.7,74.0,2.9,0.1,0.1,2.0,0.0,0.2,0.0,5.1,2.5,0.4,0.0,0.1,0.2,1.7,0.6,3.3,0.1,0.4,1.2,0.8,2.5,0.9,0.4
12.0,0.4,4.2,80.3,0.0,0.0,0.7,0.0,0.4,0.0,3.2,2.8,0.0,0.0,0.0,0.0,2.8,0.0,2.1,0.0,0.0,1.4,0.4,1.4,0.0,0.0
20.0,5.8,0.0,0.0,72.8,8.7,1.9,1.0,7.8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0
21.0,1.0,0.0,0.2,7.0,84.2,0.5,2.0,1.8,0.6,0.2,0.1,0.2,1.4,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.1,0.3,0.2,0.1,0.1
22.0,0.7,1.5,0.3,0.6,0.4,86.2,1.0,1.4,0.0,0.5,0.1,0.0,0.2,0.0,0.1,0.1,0.0,0.0,0.0,0.0,0.0,6.0,0.4,0.1,0.2
23.0,0.5,0.0,0.0,0.4,1.6,0.9,84.2,2.8,0.5,0.0,0.0,0.0,0.4,0.0,0.9,0.4,0.0,0.0,0.9,0.0,0.2,5.4,0.2,0.5,0.4
24.0,0.9,0.2,0.2,3.9,5.5,2.4,3.5,77.1,1.6,0.0,0.0,0.0,0.4,0.1,0.0,0.2,0.5,0.0,0.2,0.0,0.3,0.2,1.7,0.6,0.3
25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.3,81.8,0.0,0.0,0.0,0.0,0.0,1.8,0.0,5.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.6
30.0,0.0,6.6,3.3,0.0,0.0,1.5,0.0,0.0,0.0,70.5,6.1,0.4,1.5,2.4,0.4,0.7,0.9,2.9,0.0,0.2,1.3,0.0,0.2,0.9,0.4
