In [1]:
import datetime
import os

# os.environ["PATH"] = (
#     "/opt/apps/rhel9/cuda-12.4/bin:/opt/apps/rhel9/cuda-12.4/" + os.environ["PATH"]
# )
# os.environ["LD_LIBRARY_PATH"] = (
#     "/opt/apps/rhel9/cuda-12.4/bin:/opt/apps/rhel9/cuda-12.4"
#     + os.environ.get("LD_LIBRARY_PATH", "")
# )

from huggingface_hub import HfApi

import numpy as np
import numpy.random as npr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (  # AdamW,
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

# Opt in to pandas copy-on-write so that frames derived by filtering can be
# modified without touching (or warning about) the frame they came from.
pd.set_option("mode.copy_on_write", True)
# Hugging Face Hub repository used below for checkpoint pushes and for
# reloading the fine-tuned model and tokenizer.
repo_id = "nickeubank/leaa_grant_subjects_invweighted"
# Directory the input parquet is read from and the sweep results are written
# to. NOTE(review): absolute, cluster-specific path; adjust when running elsewhere.
workingdir = "/hpc/group/ssri/nce8/leaa_subj/"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fail fast: stop the notebook here if PyTorch cannot see a CUDA device.
assert torch.cuda.is_available()

In [3]:
# Input table is read from the working directory. A commented-out earlier
# variant pointed at:
# dir = "https://github.com/nickeubank/leaa_subj/raw/refs/heads/main/"
grants_path = workingdir + "subj_text_and_labels.parquet"

# Keep a single row per distinct description.
grants = pd.read_parquet(grants_path).drop_duplicates("description")

#########
# Rows with a label are used for train/test; labels are shifted down by one
# to produce the encoded class ids.
#########
has_label = grants["label_1"].notnull()
labeled = grants.loc[has_label].assign(
    label_1_encoded=lambda frame: frame["label_1"] - 1
)

In [4]:
# Share of each encoded class, ordered by class id so that position i of the
# resulting weight vector belongs to class i.
proportions = (
    labeled["label_1_encoded"]
    .value_counts(normalize=True)
    .sort_index()
)

# Reciprocal shares, rescaled to sum to one, as a float32 NumPy array.
inv_proportions = proportions.rdiv(1)
inverse_weights = inv_proportions.div(inv_proportions.sum()).to_numpy(
    dtype=np.float32
)

# Just make sure ordered right since that's critical: position 4 must carry
# the largest weight and position 1 the smallest.
assert inverse_weights.max() == inverse_weights[4]
assert inverse_weights.min() == inverse_weights[1]

In [5]:
# Seed shared by the subsample and the split so the train/test partition is
# reproducible across kernel restarts.
SPLIT_SEED = 45

# Sorting first fixes the row order, so the seeded draws below always see the
# same input.
labeled = labeled.sort_values("description")

# Subsample to keep hyperparameter tuning affordable. Previously this draw was
# unseeded, which made every run use a different train/test partition even
# though train_test_split itself was seeded.
labeled = labeled.sample(frac=0.5, random_state=SPLIT_SEED)  # CHANGE AFTER FINE TUNING

# Stratified split on the encoded label so both halves keep the class mix.
train_label, test_label, train_text, test_text = train_test_split(
    labeled["label_1_encoded"].values,
    labeled["description"].values,
    test_size=0.5,  # CHANGE AFTER FINE TUNING
    random_state=SPLIT_SEED,
    stratify=labeled["label_1_encoded"],
)
print(len(train_label))
print(len(test_label))

20046
20047


In [6]:
########
# Preprocess
########


class ClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Parameters
    ----------
    texts
        Indexable collection of input texts.
    labels
        Indexable collection of class ids, aligned position-wise with
        ``texts``.
    tokenizer
        Callable invoked with the text plus the keyword arguments
        ``add_special_tokens``, ``max_length``, ``padding``, ``truncation``
        and ``return_tensors``; its result must expose ``"input_ids"`` and
        ``"attention_mask"`` tensors.
    max_len
        Length passed to the tokenizer as ``max_length`` (texts are padded to
        it and truncated at it).
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of examples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for one example."""
        text, label = self.texts[idx], self.labels[idx]
        tokenized = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading dimension of size one for the
        # single text; drop it so the DataLoader can stack examples.
        item = {
            key: tokenized[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(label, dtype=torch.long)
        return item

In [7]:
# Hyperparameter grid
#
# The results file holds one row per (mlen, batch_size, lr) combination plus
# an "accuracy" column that stays NaN until that combination has been run.

HYPERS_FILENAME = "hyperparams_1digit_bigbert_invweights.parquet"

# # Run 1 only: build the grid and seed the results file with empty accuracies.
# params = [
#     {"mlen": mlen, "batch_size": batch_size, "lr": lr}
#     for batch_size in [8, 16, 32]
#     for mlen in [128, 256, 512]
#     for lr in [1e-7, 1e-6, 1e-5, 1e-4]
# ]
#
# df = pd.DataFrame(params)
# df["accuracy"] = np.nan
# df["batch_size"] = df["batch_size"].astype("int")
# df["mlen"] = df["mlen"].astype("int")
# df.to_parquet(os.path.join(workingdir, HYPERS_FILENAME))

# Later runs: resume from the saved results. The sweep cell writes its
# progress under `workingdir`, so read from there first; otherwise a restart
# from a different current directory would reload a stale grid and repeat
# finished configurations. Fall back to the original relative location when no
# copy exists in `workingdir` yet.
hypers_path = os.path.join(workingdir, HYPERS_FILENAME)
if not os.path.exists(hypers_path):
    hypers_path = HYPERS_FILENAME
hypers = pd.read_parquet(hypers_path)

In [8]:
# Hyperparameter sweep.
#
# For every row of `hypers` whose accuracy is still missing: fine-tune a fresh
# bert-large-uncased classifier with that row's settings, score it on the
# held-out split, write the accuracy back into `hypers`, and save the table so
# an interrupted sweep can pick up where it stopped.

MODEL_NAME = "bert-large-uncased"
EPOCHS = 3

# Commented-out manual override kept for reference:
# MAX_LEN = 256
# BATCH_SIZE = 16
# EPOCHS = 10
# LEARNING_RATE = 0.000010

# ---- Setup that does not depend on the hyperparameters (done once) ----
assert torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Class weights must live on the same device as the logits. Tensor.to()
# returns a new tensor instead of moving in place, so build the tensor directly
# on the device; torch.tensor also copies, so the source array is never
# aliased.
weights = torch.tensor(inverse_weights, dtype=torch.float32, device=device)

# Weighted cross entropy; built once instead of once per batch.
criterion = nn.CrossEntropyLoss(weight=weights)

# The classifier head needs exactly one output per class weight.
NUM_LABELS = len(inverse_weights)

for p in hypers[hypers["accuracy"].isnull()].iterrows():
    print("starting: ")
    print(p)

    # iterrows() yields (index, row); the row is float-typed, hence the casts.
    settings = p[1]
    MAX_LEN = int(settings["mlen"])
    BATCH_SIZE = int(settings["batch_size"])
    LEARNING_RATE = settings["lr"]

    train_dataset = ClassificationDataset(train_text, train_label, tokenizer, MAX_LEN)
    test_dataset = ClassificationDataset(test_text, test_label, tokenizer, MAX_LEN)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Fresh pretrained model for every configuration.
    model = BertForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=NUM_LABELS
    )
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

    # ---- Training ----
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0

        # Checkpoint every third epoch (skipping epoch 0). With EPOCHS = 3 the
        # condition is never met; it matters only for longer runs.
        if (epoch > 0) and (epoch % 3 == 0):
            timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")
            model.push_to_hub(
                repo_id,
                commit_message=f"checkpoint_large_{timestamp}_epoch{epoch}",
            )

        loop = tqdm(train_loader, desc=f"Epoch {epoch}", leave=True)
        for batch in loop:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            # Labels are deliberately not passed to the model: its built-in
            # loss is unweighted and was never used. Only the logits are
            # needed for the weighted criterion.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            loop.set_postfix(loss=batch_loss)

        print(f"Epoch {epoch} Loss: {total_loss / len(train_loader)}")

    # ---- Evaluation on the held-out split ----
    model.eval()
    correct = 0
    total = 0
    all_predictions = []  # read by the confusion-matrix cells further down

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)

            correct += (predictions == labels).sum().item()
            total += labels.size(0)
            all_predictions.extend(predictions.tolist())

    accuracy = correct / total
    print(f"Validation Accuracy: {accuracy:.4f}")

    # ---- Record the result and persist progress ----
    this_config = (
        (hypers["mlen"] == MAX_LEN)
        & (hypers["batch_size"] == BATCH_SIZE)
        & (hypers["lr"] == LEARNING_RATE)
    )
    hypers.loc[this_config, "accuracy"] = accuracy
    print(hypers[this_config])
    hypers.to_parquet(workingdir + "hyperparams_1digit_bigbert_invweights.parquet")

(0, mlen          1.280000e+02
batch_size    8.000000e+00
lr            1.000000e-07
accuracy               NaN
Name: 0, dtype: float64)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  weights = torch.from_numpy(inverse_weights)
Epoch 0: 100%|██████████| 2506/2506 [06:54<00:00,  6.05it/s, loss=1.43]


Epoch 0 Loss: 1.5895964227860582
Validation Accuracy: 0.4234
   mlen  batch_size            lr  accuracy
0   128           8  1.000000e-07  0.423355


Epoch 1: 100%|██████████| 2506/2506 [06:53<00:00,  6.06it/s, loss=1.39] 


Epoch 1 Loss: 1.4508955907032002
Validation Accuracy: 0.5574
   mlen  batch_size            lr  accuracy
0   128           8  1.000000e-07   0.55739


Epoch 2: 100%|██████████| 2506/2506 [06:53<00:00,  6.07it/s, loss=1.24] 


Epoch 2 Loss: 1.2920053475443496
Validation Accuracy: 0.7161
   mlen  batch_size            lr  accuracy
0   128           8  1.000000e-07  0.716117
(1, mlen          128.000000
batch_size      8.000000
lr              0.000001
accuracy             NaN
Name: 1, dtype: float64)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 2506/2506 [06:52<00:00,  6.07it/s, loss=0.971]


Epoch 0 Loss: 1.129651780026918
Validation Accuracy: 0.8221
   mlen  batch_size        lr  accuracy
1   128           8  0.000001  0.822118


Epoch 1: 100%|██████████| 2506/2506 [06:52<00:00,  6.08it/s, loss=1.11]  


Epoch 1 Loss: 0.5935281920205899
Validation Accuracy: 0.8506
   mlen  batch_size        lr  accuracy
1   128           8  0.000001  0.850551


Epoch 2: 100%|██████████| 2506/2506 [06:53<00:00,  6.06it/s, loss=0.307] 


Epoch 2 Loss: 0.5082022297580363
Validation Accuracy: 0.8531
   mlen  batch_size        lr  accuracy
1   128           8  0.000001  0.853095
(2, mlen          128.00000
batch_size      8.00000
lr              0.00001
accuracy            NaN
Name: 2, dtype: float64)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 2506/2506 [06:51<00:00,  6.09it/s, loss=0.795] 


Epoch 0 Loss: 0.6153049379853266
Validation Accuracy: 0.8636
   mlen  batch_size       lr  accuracy
2   128           8  0.00001  0.863571


Epoch 1: 100%|██████████| 2506/2506 [06:51<00:00,  6.09it/s, loss=0.0653] 


Epoch 1 Loss: 0.40511492495473356
Validation Accuracy: 0.8797
   mlen  batch_size       lr  accuracy
2   128           8  0.00001  0.879733


Epoch 2: 100%|██████████| 2506/2506 [06:52<00:00,  6.08it/s, loss=0.401]  


Epoch 2 Loss: 0.3058922572051385
Validation Accuracy: 0.8657
   mlen  batch_size       lr  accuracy
2   128           8  0.00001  0.865716
(3, mlen          128.0000
batch_size      8.0000
lr              0.0001
accuracy           NaN
Name: 3, dtype: float64)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 2506/2506 [06:51<00:00,  6.09it/s, loss=1.42]


Epoch 0 Loss: 1.644203078718254
Validation Accuracy: 0.4106
   mlen  batch_size      lr  accuracy
3   128           8  0.0001  0.410635


Epoch 1: 100%|██████████| 2506/2506 [06:51<00:00,  6.09it/s, loss=1.6] 


Epoch 1 Loss: 1.6386883894348754
Validation Accuracy: 0.4106
   mlen  batch_size      lr  accuracy
3   128           8  0.0001  0.410635


Epoch 2: 100%|██████████| 2506/2506 [06:50<00:00,  6.10it/s, loss=1.7] 


Epoch 2 Loss: 1.6391388923762231
Validation Accuracy: 0.1394
   mlen  batch_size      lr  accuracy
3   128           8  0.0001  0.139372
(4, mlen          2.560000e+02
batch_size    8.000000e+00
lr            1.000000e-07
accuracy               NaN
Name: 4, dtype: float64)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 2506/2506 [10:04<00:00,  4.15it/s, loss=1.73]


Epoch 0 Loss: 1.6444270133686751
Validation Accuracy: 0.2728
   mlen  batch_size            lr  accuracy
4   256           8  1.000000e-07  0.272759


Epoch 1:  93%|█████████▎| 2320/2506 [09:25<00:45,  4.10it/s, loss=1.24] 


KeyboardInterrupt: 

In [10]:
########
#
# Test Sample Evaluation
#
#############
# Reload the fine-tuned model and tokenizer from the Hub. Needed when this
# cell is not run immediately after training.
############
BATCH_SIZE = 16
MAX_LEN = 256

assert torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and model BEFORE building the datasets. The datasets keep
# a reference to the tokenizer they are given, so creating them first would
# either fail with a NameError on a fresh kernel or silently keep whatever
# tokenizer an earlier cell left behind.
tokenizer = BertTokenizer.from_pretrained(repo_id)
model = BertForSequenceClassification.from_pretrained(repo_id).to(device)

train_dataset = ClassificationDataset(train_text, train_label, tokenizer, MAX_LEN)
test_dataset = ClassificationDataset(test_text, test_label, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [13]:
# Pair every held-out label with the prediction stored for the same position,
# then tabulate raw counts (rows: actual class, columns: predicted class).
predictions_and_actual = pd.DataFrame(
    dict(actual=test_label, predicted=all_predictions)
)
pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
)

predicted,0,1,2,3,4
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,1904,89,114,79,48
1.0,118,6258,35,17,149
2.0,118,80,3151,91,95
3.0,116,52,154,2184,100
4.0,81,159,73,68,705


In [None]:
# Confusion matrix as shares of all held-out examples, saved to parquet and
# then shown as percentages rounded to one decimal.
actual_labels = predictions_and_actual["actual"]
predicted_labels = predictions_and_actual["predicted"]

confusion = pd.crosstab(index=actual_labels, columns=predicted_labels, normalize=True)
confusion.to_parquet("large_bert_confusion_matrix.parquet")

confusion_to_print = (confusion * 100).round(decimals=1)
confusion_to_print

predicted,0,1,2,3,4
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,11.9,0.6,0.7,0.5,0.3
1.0,0.7,39.0,0.2,0.1,0.9
2.0,0.7,0.5,19.6,0.6,0.6
3.0,0.7,0.3,1.0,13.6,0.6
4.0,0.5,1.0,0.5,0.4,4.4


In [37]:
# Same table as shares of all held-out examples, with row and column totals
# added, shown as percentages rounded to one decimal.
actual_labels = predictions_and_actual["actual"]
predicted_labels = predictions_and_actual["predicted"]

confusion = pd.crosstab(
    index=actual_labels,
    columns=predicted_labels,
    margins=True,
    normalize="all",
)
# Saving is currently disabled:
# confusion.to_parquet("large_bert_confusion_matrix_margins.parquet")

confusion_to_print = (confusion * 100).round(decimals=1)
confusion_to_print

predicted,0,1,2,3,4,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,11.9,0.6,0.7,0.5,0.3,13.9
1.0,0.7,39.0,0.2,0.1,0.9,41.0
2.0,0.7,0.5,19.6,0.6,0.6,22.0
3.0,0.7,0.3,1.0,13.6,0.6,16.2
4.0,0.5,1.0,0.5,0.4,4.4,6.8
All,14.6,41.4,22.0,15.2,6.8,100.0


In [32]:
# Interactive docstring lookup (IPython `?` syntax). Scratch cell: nothing
# below depends on it, so it can be deleted before sharing the notebook.
pd.crosstab?

[31mSignature:[39m
pd.crosstab(
    index,
    columns,
    values=[38;5;28;01mNone[39;00m,
    rownames=[38;5;28;01mNone[39;00m,
    colnames=[38;5;28;01mNone[39;00m,
    aggfunc=[38;5;28;01mNone[39;00m,
    margins: [33m'bool'[39m = [38;5;28;01mFalse[39;00m,
    margins_name: [33m'Hashable'[39m = [33m'All'[39m,
    dropna: [33m'bool'[39m = [38;5;28;01mTrue[39;00m,
    normalize: [33m"bool | Literal[0, 1, 'all', 'index', 'columns']"[39m = [38;5;28;01mFalse[39;00m,
) -> [33m'DataFrame'[39m
[31mDocstring:[39m
Compute a simple cross tabulation of two (or more) factors.

By default, computes a frequency table of the factors unless an
array of values and an aggregation function are passed.

Parameters
----------
index : array-like, Series, or list of arrays/Series
    Values to group by in the rows.
columns : array-like, Series, or list of arrays/Series
    Values to group by in the columns.
values : array-like, optional
    Array of values to aggregate accord