In [1]:
import datetime
import os

# os.environ["PATH"] = (
#     "/opt/apps/rhel9/cuda-12.4/bin:/opt/apps/rhel9/cuda-12.4/" + os.environ["PATH"]
# )
# os.environ["LD_LIBRARY_PATH"] = (
#     "/opt/apps/rhel9/cuda-12.4/bin:/opt/apps/rhel9/cuda-12.4"
#     + os.environ.get("LD_LIBRARY_PATH", "")
# )

from huggingface_hub import HfApi

import numpy as np
import numpy.random as npr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (  # AdamW,
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

# Turn on pandas' copy-on-write mode for the whole notebook.
pd.options.mode.copy_on_write = True

# Hugging Face Hub repository the model/tokenizer are pulled from and pushed to.
repo_id = "nickeubank/leaa_grant_subjects_invweighted"

# Directory prefix for the input parquet file (trailing slash is required because
# paths are built by plain string concatenation).
workingdir = "/hpc/group/ssri/nce8/leaa_subj/"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fail fast when torch cannot see a CUDA device.
assert torch.cuda.is_available()

In [3]:
# dir = "https://github.com/nickeubank/leaa_subj/raw/refs/heads/main/"
#########
# Load the grant descriptions, keeping one row per distinct description
#########
grants = pd.read_parquet(workingdir + "subj_text_and_labels.parquet").drop_duplicates(
    "description"
)

#########
# Labeled rows feed the train/test split; rows without label_1 are left in `grants`
#########
# Shift label_1 down by one to get the encoded class id used for training.
labeled = grants.loc[grants["label_1"].notna()].assign(
    label_1_encoded=lambda frame: frame["label_1"] - 1
)

In [4]:
# Share of each encoded class, sorted by class id so the weight order follows
# the label order.
proportions = labeled["label_1_encoded"].value_counts(normalize=True).sort_index()

# Reciprocal of each share, rescaled so the weights sum to one.
inv_proportions = proportions.rdiv(1)
inverse_weights = (
    inv_proportions.div(inv_proportions.sum()).astype(np.float32).values
)

# Ordering is critical: the largest weight must sit at position 4 and the
# smallest at position 1.
assert inverse_weights.max() == inverse_weights[4]
assert inverse_weights.min() == inverse_weights[1]

In [5]:
# Sort by description before splitting (presumably so the seeded split does
# not depend on the incoming row order -- confirm).
labeled = labeled.sort_values("description")

encoded_labels = labeled["label_1_encoded"]
descriptions = labeled["description"]

# 80/20 split, stratified on the encoded label, with a fixed seed.
train_label, test_label, train_text, test_text = train_test_split(
    encoded_labels.values,
    descriptions.values,
    test_size=0.2,
    random_state=47,
    stratify=encoded_labels,
)
print(len(train_label), len(test_label), sep="\n")

64148
16038


In [6]:
########
# Preprocess
########


class ClassificationDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus the keyword arguments
            ``add_special_tokens``, ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors``; its result must expose
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for item ``idx``."""
        encoded = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Drop the leading dimension of each tokenizer tensor with squeeze(0).
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [7]:
# Hyperparameter tuning

# # Run 1
# params = [
#     {"mlen": mlen, "batch_size": batch_size, "lr": lr}
#     for batch_size in [8, 16, 32]
#     for mlen in [128, 256, 512]
#     for lr in [1e-7, 1e-6, 1e-5, 1e-4]
# ]

# df = pd.DataFrame(params)
# df["accuracy"] = np.nan
# df["batch_size"] = df["batch_size"].astype("int")
# df["mlen"] = df["mlen"].astype("int")
# df.to_parquet("hyperparams_1digit_bigbert_invweights.parquet")

# # Later Runs
# hypers = pd.read_parquet("hyperparams_1digit_bigbert_invweights.parquet")
# hypers

In [None]:
# for p in hypers[hypers["accuracy"].isnull()].iterrows():
# print("starting: ")
# print(p)
# MAX_LEN = int(p[1]["mlen"])
# BATCH_SIZE = int(p[1]["batch_size"])
# EPOCHS = 3
# LEARNING_RATE = p[1]["lr"]

# Fixed hyperparameters for this run (the commented-out loop above read them
# from the `hypers` grid instead).
MAX_LEN = 256
BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 0.000010


# Device Setup
assert torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##########
# Load model or checkpoint
##########

# model = "bert-large-uncased"
# tokenizer = BertTokenizer.from_pretrained(model)
# model = BertForSequenceClassification.from_pretrained(
#     model, num_labels=labeled["label_1_encoded"].nunique()
# )
# model.to(device)
#
# model = "bert-large-uncased"
# tokenizer = BertTokenizer.from_pretrained(model)
# tokenizer.push_to_hub(repo_id, commit_message=f"transfer over large bert tokenizer")

# Load the model and tokenizer stored in the hub repository.
model = BertForSequenceClassification.from_pretrained(repo_id).to(device)
tokenizer = BertTokenizer.from_pretrained(repo_id)
model.eval()

# Data Prep
train_dataset = ClassificationDataset(train_text, train_label, tokenizer, MAX_LEN)
test_dataset = ClassificationDataset(test_text, test_label, tokenizer, MAX_LEN)

# Only the training batches are shuffled; validation order is kept so the
# predictions line up with `test_label`.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Need weights as tensors on gpu.
# `Tensor.to` returns a new tensor rather than moving in place, so the result
# has to be bound to `weights`. `torch.tensor` copies the numpy data, so the
# tensor does not share memory with the pandas-backed array.
weights = torch.tensor(inverse_weights, dtype=torch.float32, device=device)
weights

  weights = torch.from_numpy(inverse_weights)


tensor([0.2046, 0.0695, 0.1293, 0.1755, 0.4210], device='cuda:0')

In [None]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Use inverse weights and cross entropy. Built once, outside the batch loop;
# `.to(device)` moves the module's weight buffer to the GPU even if `weights`
# itself is still a CPU tensor.
criterion = nn.CrossEntropyLoss(weight=weights).to(device)

# First epoch index to run.
# NOTE(review): starting at 4 looks like a resume from the checkpoint loaded
# from the hub -- set to 0 when training from scratch.
START_EPOCH = 4

# Actual training
for epoch in range(START_EPOCH, EPOCHS):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, leave=True)
    loop.set_description(f"Epoch {epoch}")

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Labels are deliberately not passed to the model: doing so makes it
        # compute its own unweighted loss, which was never used.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        loss = criterion(outputs.logits, labels)
        batch_loss = loss.item()
        total_loss += batch_loss

        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=batch_loss)

    # Checkpoints
    if (epoch > 0) and (epoch % 3 == 0):
        checkpoint_stamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")
        model.push_to_hub(
            repo_id,
            commit_message=f"checkpoint_large_{checkpoint_stamp}_epoch{epoch}",
        )

    print(f"Epoch {epoch} Loss: {total_loss / len(train_loader)}")

# Final upload. The timestamp is taken here rather than reused from the last
# checkpoint, so it is always defined and reflects when training finished.
# Skipped entirely when no epoch ran, because `epoch` would be unbound.
if START_EPOCH < EPOCHS:
    final_stamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")
    final_message = f"trained_invweight_large_{final_stamp}_epoch{epoch}"
    model.push_to_hub(repo_id, commit_message=final_message)
    tokenizer.push_to_hub(repo_id, commit_message=final_message)

In [8]:
############
# Back to main flow
############

# Score the held-out split: count exact matches and keep every predicted class
# id (in loader order) for the confusion tables further down.
model.eval()
correct, total = 0, 0
all_predictions = []

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "label")
        )

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        predictions = logits.argmax(dim=1)

        correct += (predictions == labels).sum().item()
        total += labels.size(0)

        # Plain Python ints, one per example in the batch.
        all_predictions.extend(predictions.tolist())

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.4f}")

# hypers.loc[
#     (hypers["mlen"] == MAX_LEN)
#     & (hypers["batch_size"] == BATCH_SIZE)
#     & (hypers["lr"] == LEARNING_RATE),
#     "accuracy",
# ] = accuracy
# print(
#     hypers[
#         (hypers["mlen"] == MAX_LEN)
#         & (hypers["batch_size"] == BATCH_SIZE)
#         & (hypers["lr"] == LEARNING_RATE)
#     ]
# )
# pd.DataFrame(hypers).to_parquet(
#     workingdir + "hyperparams_1digit_bigbert_invweights.parquet"
# )

Validation Accuracy: 0.8819


In [10]:
# Pair each test label with the model's prediction, then tabulate raw counts
# (rows: actual class, columns: predicted class).
predictions_and_actual = pd.DataFrame(
    dict(actual=test_label, predicted=all_predictions)
)
pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
)

predicted,0,1,2,3,4
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,1849,74,135,121,55
1.0,133,6134,93,44,173
2.0,116,47,3177,109,86
3.0,100,27,121,2251,107
4.0,62,124,78,89,733


In [11]:
# Confusion matrix as a share of all test examples; saved as fractions and
# displayed as percentages rounded to one decimal.
confusion = pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
    normalize=True,
)
confusion.to_parquet("large_bert_confusion_matrix.parquet")
confusion_to_print = confusion.mul(100).round(1)
confusion_to_print

predicted,0,1,2,3,4
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,11.5,0.5,0.8,0.8,0.3
1.0,0.8,38.2,0.6,0.3,1.1
2.0,0.7,0.3,19.8,0.7,0.5
3.0,0.6,0.2,0.8,14.0,0.7
4.0,0.4,0.8,0.5,0.6,4.6


In [12]:
# Same normalized confusion matrix, with "All" row/column totals added.
confusion = pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
    normalize="all",
    margins=True,
)
# confusion.to_parquet("large_bert_confusion_matrix_margins.parquet")
confusion_to_print = confusion.mul(100).round(1)
confusion_to_print

predicted,0,1,2,3,4,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,11.5,0.5,0.8,0.8,0.3,13.9
1.0,0.8,38.2,0.6,0.3,1.1,41.0
2.0,0.7,0.3,19.8,0.7,0.5,22.0
3.0,0.6,0.2,0.8,14.0,0.7,16.2
4.0,0.4,0.8,0.5,0.6,4.6,6.8
All,14.1,39.9,22.5,16.3,7.2,100.0
