In [12]:
import datetime
import os

# Put the CUDA 12.4 toolkit directories ahead of whatever is already on the
# search paths. Entries are joined with os.pathsep and empty values are
# dropped, so the last CUDA directory is never glued onto the first
# pre-existing entry, and an unset variable neither raises nor leaves a stray
# separator behind.
os.environ["PATH"] = os.pathsep.join(
    entry
    for entry in (
        "/opt/apps/rhel9/cuda-12.4/bin",
        "/opt/apps/rhel9/cuda-12.4/",
        os.environ.get("PATH", ""),
    )
    if entry
)
os.environ["LD_LIBRARY_PATH"] = os.pathsep.join(
    entry
    for entry in (
        "/opt/apps/rhel9/cuda-12.4/bin",
        "/opt/apps/rhel9/cuda-12.4",
        os.environ.get("LD_LIBRARY_PATH", ""),
    )
    if entry
)

from huggingface_hub import HfApi

import numpy as np
import numpy.random as npr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (  # AdamW,
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

# Project locations: the cluster directory the input parquet file is read
# from, and the Hugging Face Hub repository that model pushes go to.
workingdir = "/hpc/group/ssri/nce8/leaa_subj/"
repo_id = "nickeubank/leaa_grant_subjects"

# Turn on pandas copy-on-write mode for every frame created in this notebook.
pd.set_option("mode.copy_on_write", True)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Fail fast, with an explanation, when PyTorch cannot see a CUDA device.
assert torch.cuda.is_available(), (
    "torch.cuda.is_available() returned False: no CUDA device is visible to "
    "PyTorch, and this notebook expects to run on a GPU."
)

In [14]:
# dir = "https://github.com/nickeubank/leaa_subj/raw/refs/heads/main/"

#########
# Load the grants table, keep one row per distinct description, and pull out
# the rows that already carry a label (the rest are left for prediction).
#########
grants = pd.read_parquet(
    workingdir + "subj_text_and_labels.parquet"
).drop_duplicates("description")

# label_1_encoded is label_1 shifted down by one
# (presumably because label_1 starts at 1 -- TODO confirm).
labeled = grants.loc[grants["label_1"].notnull()].assign(
    label_1_encoded=lambda frame: frame["label_1"] - 1
)

In [15]:
# Order the rows by description before the seeded split (presumably so the
# split does not depend on the order rows were stored in -- TODO confirm).
labeled = labeled.sort_values("description")

# 80/20 split of (encoded label, description) pairs, stratified on label_1.
split_arrays = train_test_split(
    labeled["label_1_encoded"].values,
    labeled["description"].values,
    stratify=labeled["label_1"],
    random_state=45,
    test_size=0.2,
)
train_label, test_label, train_text, test_text = split_arrays

# Report the size of each split: train first, then test.
for split_labels in (train_label, test_label):
    print(len(split_labels))

64148
16038


In [16]:
########
# Preprocess
########


class ClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of class ids, indexed in step with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)``; its
            result must expose ``"input_ids"`` and ``"attention_mask"``
            entries that support ``.squeeze(0)``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it as a dict of tensors.

        Returns:
            A dict with ``"input_ids"`` and ``"attention_mask"`` (the
            tokenizer outputs with dimension 0 squeezed away) and ``"label"``
            (the class id as a ``torch.long`` tensor).
        """
        text, label = self.texts[idx], self.labels[idx]

        # Pad/truncate to max_len and ask for PyTorch tensors.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(text, **tokenizer_options)

        # Drop dimension 0 of each tokenizer output before returning it.
        item = {
            key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(label, dtype=torch.long)
        return item

In [17]:
# Hyperparameter tuning (grid-search bookkeeping, currently commented out)
# hypers = {"lr": [], "mlen": [], "batch_size": [], "accuracy": []}
# hypers = pd.read_parquet("hyperparams_1digit_bigbert.parquet").to_dict()
# for k in hypers.keys():
#     hypers[k] = list(hypers[k].values())

# df = pd.read_parquet("hyperparams_1digit_bigbert.parquet")
# df.sort_values("accuracy", ascending=False)

# params = [
#     {"mlen": mlen, "batch_size": batch_size, "lr": lr}
#     for batch_size in [16, 32]
#     for mlen in [128, 256, 512]
#     for lr in [1e-7, 1e-6, 1e-5, 1e-4]
# ]
# params

In [17]:
# for p in params[11:]:
# MAX_LEN = p["mlen"]
# BATCH_SIZE = p["batch_size"]
# EPOCHS = 5
# LEARNING_RATE = p["lr"]

# ---- Run configuration ----
MAX_LEN = 256  # max_length handed to the tokenizer (pad / truncate target)
BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 0.000010
MODEL_NAME = "bert-large-uncased"  # checkpoint name; `model` is reserved for the network
SEED = 45  # same value as the train/test split's random_state
CHECKPOINT_EVERY = 3  # push an intermediate checkpoint every N epochs

# Seed PyTorch before anything random happens: the classifier head is newly
# initialized and the training loader shuffles, so without a seed two runs of
# this cell do not start from the same state.
torch.manual_seed(SEED)

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
train_dataset = ClassificationDataset(train_text, train_label, tokenizer, MAX_LEN)
test_dataset = ClassificationDataset(test_text, test_label, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Model and device setup. The assertion guarantees a GPU, so there is no CPU
# fallback to select.
assert torch.cuda.is_available(), "No CUDA device is visible to PyTorch."
device = torch.device("cuda")

# One output unit per distinct label_1 value.
# NOTE(review): assumes label_1 takes the contiguous values 1..K so that
# label_1_encoded spans 0..K-1 -- TODO confirm.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=grants["label_1"].nunique()
)
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)


def push_snapshot(tag, epoch):
    """Push the model and tokenizer to ``repo_id`` with one shared commit message.

    Args:
        tag: Prefix of the commit message (e.g. ``"checkpoint_large"``).
        epoch: Epoch index recorded at the end of the commit message.
    """
    stamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")
    message = f"{tag}_{stamp}_epoch{epoch}"
    model.push_to_hub(repo_id, commit_message=message)
    tokenizer.push_to_hub(repo_id, commit_message=message)


for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    # Before starting every CHECKPOINT_EVERY-th epoch (never the first),
    # upload the weights produced by the epochs completed so far.
    if epoch > 0 and epoch % CHECKPOINT_EVERY == 0:
        push_snapshot("checkpoint_large", epoch)

    # The description is fixed for the whole epoch, so set it once here.
    loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch}")

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running sum and the bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    print(f"Epoch {epoch} Loss: {total_loss / len(train_loader)}")

# Final upload of the fully trained model; `epoch` is the last index run.
push_snapshot("trained_largebert", epoch)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 1003/1003 [11:49<00:00,  1.41it/s, loss=1.39]


Epoch 0 Loss: 1.4512595287408097


Epoch 1: 100%|██████████| 1003/1003 [11:49<00:00,  1.41it/s, loss=1.53]


Epoch 1 Loss: 1.4740087474925687


Epoch 2: 100%|██████████| 1003/1003 [11:49<00:00,  1.41it/s, loss=1.28]


Epoch 2 Loss: 1.4759330496949665


Epoch 3: 100%|██████████| 1003/1003 [11:49<00:00,  1.41it/s, loss=1.48]


Epoch 3 Loss: 1.4737451928681653


Epoch 4: 100%|██████████| 1003/1003 [11:49<00:00,  1.41it/s, loss=1.67]


Epoch 4 Loss: 1.4760060884422461
Validation Accuracy: 0.4120
{'lr': [1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001], 'mlen': [128, 128, 128, 128, 128, 128, 128, 128, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 512, 512], 'batch_size': [8, 8, 8, 8, 16, 16, 16, 16, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 8, 8, 8, 8, 8, 16, 16, 16, 16], 'accuracy': [0.7602419405125647, 0.8706740662218619, 0.8705493546174472, 0.4105506017334913, 0.6138928727318077, 0.8669327180894182, 0.8797780133441416, 0.4105506017334913, 0.7128515308349442, 0.7245744216499346, 0.8767225790359793, 0.875413107189624, 0.4079940138429881, 0.6717590571802706, 0.8716717590571803, 0.8793415227286899, 0.4079940138429881, 0.5928166115857081, 0.865623246243063, 0.7568747271933653, 0.7764544490864875, 0.8709858452328989, 0.87260709609

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 502/502 [03:05<00:00,  2.71it/s, loss=1.29]


Epoch 0 Loss: 1.584797185730649


Epoch 1: 100%|██████████| 502/502 [03:05<00:00,  2.71it/s, loss=1.8] 


Epoch 1 Loss: 1.4694474814422578


Epoch 2: 100%|██████████| 502/502 [03:03<00:00,  2.73it/s, loss=1.49]


Epoch 2 Loss: 1.4187838698763295


Epoch 3: 100%|██████████| 502/502 [03:03<00:00,  2.73it/s, loss=1.65]


Epoch 3 Loss: 1.3611481560178962


Epoch 4: 100%|██████████| 502/502 [03:03<00:00,  2.73it/s, loss=1.32] 


Epoch 4 Loss: 1.2689757027711526
Validation Accuracy: 0.5851
{'lr': [1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07], 'mlen': [128, 128, 128, 128, 128, 128, 128, 128, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 512, 512, 128], 'batch_size': [8, 8, 8, 8, 16, 16, 16, 16, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32], 'accuracy': [0.7602419405125647, 0.8706740662218619, 0.8705493546174472, 0.4105506017334913, 0.6138928727318077, 0.8669327180894182, 0.8797780133441416, 0.4105506017334913, 0.7128515308349442, 0.7245744216499346, 0.8767225790359793, 0.875413107189624, 0.4079940138429881, 0.6717590571802706, 0.8716717590571803, 0.8793415227286899, 0.4079940138429881, 0.5928166115857081, 0.865623246243063, 0.7568747271933653, 0.7764544490864875, 0.870985845232898

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 502/502 [03:06<00:00,  2.69it/s, loss=0.916]


Epoch 0 Loss: 1.2199355082445411


Epoch 1: 100%|██████████| 502/502 [03:05<00:00,  2.70it/s, loss=0.962]


Epoch 1 Loss: 0.7081139040183261


Epoch 2: 100%|██████████| 502/502 [03:08<00:00,  2.66it/s, loss=0.139]


Epoch 2 Loss: 0.5491387742270987


Epoch 3: 100%|██████████| 502/502 [03:06<00:00,  2.69it/s, loss=1.33] 


Epoch 3 Loss: 0.4761932602975948


Epoch 4: 100%|██████████| 502/502 [03:06<00:00,  2.69it/s, loss=0.278]


Epoch 4 Loss: 0.4338548208969048
Validation Accuracy: 0.8651
{'lr': [1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06], 'mlen': [128, 128, 128, 128, 128, 128, 128, 128, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 512, 512, 128, 128], 'batch_size': [8, 8, 8, 8, 16, 16, 16, 16, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32], 'accuracy': [0.7602419405125647, 0.8706740662218619, 0.8705493546174472, 0.4105506017334913, 0.6138928727318077, 0.8669327180894182, 0.8797780133441416, 0.4105506017334913, 0.7128515308349442, 0.7245744216499346, 0.8767225790359793, 0.875413107189624, 0.4079940138429881, 0.6717590571802706, 0.8716717590571803, 0.8793415227286899, 0.4079940138429881, 0.5928166115857081, 0.865623246243063, 0.7568747271933653, 0.7764544490864875, 0

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 502/502 [03:06<00:00,  2.69it/s, loss=0.929] 


Epoch 0 Loss: 0.5770211536214647


Epoch 1: 100%|██████████| 502/502 [03:08<00:00,  2.67it/s, loss=0.108] 


Epoch 1 Loss: 0.33906245411214125


Epoch 2: 100%|██████████| 502/502 [03:06<00:00,  2.69it/s, loss=0.0604]


Epoch 2 Loss: 0.2689175803689249


Epoch 3: 100%|██████████| 502/502 [03:05<00:00,  2.71it/s, loss=0.0585]


Epoch 3 Loss: 0.19729286674187477


Epoch 4: 100%|██████████| 502/502 [03:06<00:00,  2.70it/s, loss=0.0152]


Epoch 4 Loss: 0.1512523057881698
Validation Accuracy: 0.8742
{'lr': [1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05], 'mlen': [128, 128, 128, 128, 128, 128, 128, 128, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 512, 512, 128, 128, 128], 'batch_size': [8, 8, 8, 8, 16, 16, 16, 16, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 32], 'accuracy': [0.7602419405125647, 0.8706740662218619, 0.8705493546174472, 0.4105506017334913, 0.6138928727318077, 0.8669327180894182, 0.8797780133441416, 0.4105506017334913, 0.7128515308349442, 0.7245744216499346, 0.8767225790359793, 0.875413107189624, 0.4079940138429881, 0.6717590571802706, 0.8716717590571803, 0.8793415227286899, 0.4079940138429881, 0.5928166115857081, 0.865623246243063, 0.7568747271933653, 0.776

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 502/502 [03:04<00:00,  2.72it/s, loss=1.4]  


Epoch 0 Loss: 1.0994551437665743


Epoch 1: 100%|██████████| 502/502 [03:05<00:00,  2.71it/s, loss=1.34]


Epoch 1 Loss: 1.4746730054517192


Epoch 2: 100%|██████████| 502/502 [03:03<00:00,  2.73it/s, loss=1.18]


Epoch 2 Loss: 1.4722514777069549


Epoch 3: 100%|██████████| 502/502 [03:05<00:00,  2.71it/s, loss=1.6] 


Epoch 3 Loss: 1.469263973464054


Epoch 4: 100%|██████████| 502/502 [03:05<00:00,  2.70it/s, loss=1.44]


Epoch 4 Loss: 1.4726735626558858
Validation Accuracy: 0.4120
{'lr': [1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001], 'mlen': [128, 128, 128, 128, 128, 128, 128, 128, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 512, 512, 128, 128, 128, 128], 'batch_size': [8, 8, 8, 8, 16, 16, 16, 16, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 32, 32], 'accuracy': [0.7602419405125647, 0.8706740662218619, 0.8705493546174472, 0.4105506017334913, 0.6138928727318077, 0.8669327180894182, 0.8797780133441416, 0.4105506017334913, 0.7128515308349442, 0.7245744216499346, 0.8767225790359793, 0.875413107189624, 0.4079940138429881, 0.6717590571802706, 0.8716717590571803, 0.8793415227286899, 0.4079940138429881, 0.5928166115857081, 0.865623246243063, 0.756874

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 502/502 [05:39<00:00,  1.48it/s, loss=1.59]


Epoch 0 Loss: 1.5676026759869548


Epoch 1: 100%|██████████| 502/502 [05:39<00:00,  1.48it/s, loss=1.38]


Epoch 1 Loss: 1.457659854356986


Epoch 2: 100%|██████████| 502/502 [05:39<00:00,  1.48it/s, loss=1.21]


Epoch 2 Loss: 1.3493546967012473


Epoch 3: 100%|██████████| 502/502 [05:38<00:00,  1.48it/s, loss=1.05] 


Epoch 3 Loss: 1.2456859879759679


Epoch 4: 100%|██████████| 502/502 [05:39<00:00,  1.48it/s, loss=0.887]


Epoch 4 Loss: 1.1657552784420104
Validation Accuracy: 0.6451
{'lr': [1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07], 'mlen': [128, 128, 128, 128, 128, 128, 128, 128, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 512, 512, 128, 128, 128, 128, 256], 'batch_size': [8, 8, 8, 8, 16, 16, 16, 16, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 32, 32, 32], 'accuracy': [0.7602419405125647, 0.8706740662218619, 0.8705493546174472, 0.4105506017334913, 0.6138928727318077, 0.8669327180894182, 0.8797780133441416, 0.4105506017334913, 0.7128515308349442, 0.7245744216499346, 0.8767225790359793, 0.875413107189624, 0.4079940138429881, 0.6717590571802706, 0.8716717590571803, 0.8793415227286899, 0.4079940138429881, 0.5928166115857081, 0.865623246

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 502/502 [05:38<00:00,  1.48it/s, loss=1.08] 


Epoch 0 Loss: 1.2904329822357907


Epoch 1: 100%|██████████| 502/502 [05:39<00:00,  1.48it/s, loss=0.132]


Epoch 1 Loss: 0.6628474878718653


Epoch 2: 100%|██████████| 502/502 [05:38<00:00,  1.48it/s, loss=0.144]


Epoch 2 Loss: 0.5124314470713832


Epoch 3: 100%|██████████| 502/502 [05:39<00:00,  1.48it/s, loss=2.97] 


Epoch 3 Loss: 0.4534685044323068


Epoch 4: 100%|██████████| 502/502 [05:38<00:00,  1.48it/s, loss=0.623]


Epoch 4 Loss: 0.41530938010173013
Validation Accuracy: 0.8695
{'lr': [1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06], 'mlen': [128, 128, 128, 128, 128, 128, 128, 128, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 512, 512, 128, 128, 128, 128, 256, 256], 'batch_size': [8, 8, 8, 8, 16, 16, 16, 16, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32], 'accuracy': [0.7602419405125647, 0.8706740662218619, 0.8705493546174472, 0.4105506017334913, 0.6138928727318077, 0.8669327180894182, 0.8797780133441416, 0.4105506017334913, 0.7128515308349442, 0.7245744216499346, 0.8767225790359793, 0.875413107189624, 0.4079940138429881, 0.6717590571802706, 0.8716717590571803, 0.8793415227286899, 0.4079940138429881, 0.592816611585

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 502/502 [05:39<00:00,  1.48it/s, loss=0.0514]


Epoch 0 Loss: 0.6044451224272588


Epoch 1: 100%|██████████| 502/502 [05:39<00:00,  1.48it/s, loss=1.02]  


Epoch 1 Loss: 0.36407584045453373


Epoch 2: 100%|██████████| 502/502 [05:40<00:00,  1.48it/s, loss=0.387] 


Epoch 2 Loss: 0.28590205368174026


Epoch 3: 100%|██████████| 502/502 [05:37<00:00,  1.49it/s, loss=0.164] 


Epoch 3 Loss: 0.2253346254203127


Epoch 4: 100%|██████████| 502/502 [05:40<00:00,  1.48it/s, loss=0.053] 


Epoch 4 Loss: 0.176599095753613
Validation Accuracy: 0.8787
{'lr': [1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05], 'mlen': [128, 128, 128, 128, 128, 128, 128, 128, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 512, 512, 128, 128, 128, 128, 256, 256, 256], 'batch_size': [8, 8, 8, 8, 16, 16, 16, 16, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32], 'accuracy': [0.7602419405125647, 0.8706740662218619, 0.8705493546174472, 0.4105506017334913, 0.6138928727318077, 0.8669327180894182, 0.8797780133441416, 0.4105506017334913, 0.7128515308349442, 0.7245744216499346, 0.8767225790359793, 0.875413107189624, 0.4079940138429881, 0.6717590571802706, 0.8716717590571803, 0.8793415227286899, 0.4079940138429881, 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 502/502 [05:36<00:00,  1.49it/s, loss=1.93] 


Epoch 0 Loss: 1.3388535781685575


Epoch 1: 100%|██████████| 502/502 [05:37<00:00,  1.49it/s, loss=1.56]


Epoch 1 Loss: 1.4747830579480328


Epoch 2: 100%|██████████| 502/502 [05:37<00:00,  1.49it/s, loss=1.42]


Epoch 2 Loss: 1.4717651087924304


Epoch 3: 100%|██████████| 502/502 [05:37<00:00,  1.49it/s, loss=1.48]


Epoch 3 Loss: 1.470814642678219


Epoch 4: 100%|██████████| 502/502 [05:38<00:00,  1.48it/s, loss=1.71]


Epoch 4 Loss: 1.4680953384395614
Validation Accuracy: 0.4120
{'lr': [1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001], 'mlen': [128, 128, 128, 128, 128, 128, 128, 128, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 512, 512, 128, 128, 128, 128, 256, 256, 256, 256], 'batch_size': [8, 8, 8, 8, 16, 16, 16, 16, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32, 32], 'accuracy': [0.7602419405125647, 0.8706740662218619, 0.8705493546174472, 0.4105506017334913, 0.6138928727318077, 0.8669327180894182, 0.8797780133441416, 0.4105506017334913, 0.7128515308349442, 0.7245744216499346, 0.8767225790359793, 0.875413107189624, 0.4079940138429881, 0.6717590571802706, 0.8716717590571803, 0.8793415227286899, 0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 502/502 [11:15<00:00,  1.35s/it, loss=1.72]


Epoch 0 Loss: 1.567781114958197


Epoch 1: 100%|██████████| 502/502 [11:15<00:00,  1.35s/it, loss=1.29]


Epoch 1 Loss: 1.433726050939218


Epoch 2: 100%|██████████| 502/502 [11:14<00:00,  1.34s/it, loss=1.27]


Epoch 2 Loss: 1.3487888455865868


Epoch 3: 100%|██████████| 502/502 [11:14<00:00,  1.34s/it, loss=1.17] 


Epoch 3 Loss: 1.251213628457362


Epoch 4: 100%|██████████| 502/502 [11:16<00:00,  1.35s/it, loss=0.57] 


Epoch 4 Loss: 1.1524583887293993
Validation Accuracy: 0.6388
{'lr': [1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07], 'mlen': [128, 128, 128, 128, 128, 128, 128, 128, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 512, 512, 128, 128, 128, 128, 256, 256, 256, 256, 512], 'batch_size': [8, 8, 8, 8, 16, 16, 16, 16, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32, 32, 32], 'accuracy': [0.7602419405125647, 0.8706740662218619, 0.8705493546174472, 0.4105506017334913, 0.6138928727318077, 0.8669327180894182, 0.8797780133441416, 0.4105506017334913, 0.7128515308349442, 0.7245744216499346, 0.8767225790359793, 0.875413107189624, 0.4079940138429881, 0.6717590571802706, 0.8716717590571803, 0.8793

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 502/502 [11:15<00:00,  1.35s/it, loss=0.992]


Epoch 0 Loss: 1.2188468352257018


Epoch 1: 100%|██████████| 502/502 [11:15<00:00,  1.34s/it, loss=0.806]


Epoch 1 Loss: 0.7336343935880053


Epoch 2: 100%|██████████| 502/502 [11:15<00:00,  1.35s/it, loss=0.539]


Epoch 2 Loss: 0.5262224125375311


Epoch 3: 100%|██████████| 502/502 [11:15<00:00,  1.35s/it, loss=0.229]


Epoch 3 Loss: 0.4504181856801548


Epoch 4: 100%|██████████| 502/502 [11:14<00:00,  1.34s/it, loss=0.0338]


Epoch 4 Loss: 0.40821864440354455
Validation Accuracy: 0.8692
{'lr': [1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06], 'mlen': [128, 128, 128, 128, 128, 128, 128, 128, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 512, 512, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512], 'batch_size': [8, 8, 8, 8, 16, 16, 16, 16, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32], 'accuracy': [0.7602419405125647, 0.8706740662218619, 0.8705493546174472, 0.4105506017334913, 0.6138928727318077, 0.8669327180894182, 0.8797780133441416, 0.4105506017334913, 0.7128515308349442, 0.7245744216499346, 0.8767225790359793, 0.875413107189624, 0.4079940138429881, 0.6717590571802706, 0.8716717

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 502/502 [11:15<00:00,  1.35s/it, loss=0.163] 


Epoch 0 Loss: 0.6194193438289175


Epoch 1: 100%|██████████| 502/502 [11:15<00:00,  1.35s/it, loss=0.0344]


Epoch 1 Loss: 0.3523119836451998


Epoch 2: 100%|██████████| 502/502 [11:15<00:00,  1.35s/it, loss=0.509] 


Epoch 2 Loss: 0.2721680678071016


Epoch 3: 100%|██████████| 502/502 [11:15<00:00,  1.35s/it, loss=0.0401]


Epoch 3 Loss: 0.20883439047506963


Epoch 4: 100%|██████████| 502/502 [11:15<00:00,  1.35s/it, loss=0.0218] 


Epoch 4 Loss: 0.14877604184100826
Validation Accuracy: 0.8749
{'lr': [1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-07, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05, 0.0001, 1e-07, 1e-06, 1e-05], 'mlen': [128, 128, 128, 128, 128, 128, 128, 128, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 512, 512, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512], 'batch_size': [8, 8, 8, 8, 16, 16, 16, 16, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 8, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32], 'accuracy': [0.7602419405125647, 0.8706740662218619, 0.8705493546174472, 0.4105506017334913, 0.6138928727318077, 0.8669327180894182, 0.8797780133441416, 0.4105506017334913, 0.7128515308349442, 0.7245744216499346, 0.8767225790359793, 0.875413107189624, 0.4079940138429881, 0.67175905718

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/502 [00:01<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacity of 31.50 GiB of which 172.12 MiB is free. Including non-PyTorch memory, this process has 31.33 GiB memory in use. Of the allocated memory 28.32 GiB is allocated by PyTorch, and 2.64 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Evaluation: accuracy of the current model over val_loader.
model.eval()
correct, total = 0, 0

with torch.no_grad():  # inference only, so no gradients are tracked
    for batch in val_loader:
        labels = batch["label"].to(device)
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits

        # Predicted class = index of the largest logit in each row.
        predictions = logits.argmax(dim=1)

        total += labels.size(0)
        correct += (predictions == labels).sum().item()

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.4f}")

# hypers["lr"].append(LEARNING_RATE)
# hypers["mlen"].append(MAX_LEN)
# hypers["batch_size"].append(BATCH_SIZE)
# hypers["accuracy"].append(accuracy)
# print(hypers)
# pd.DataFrame(hypers).to_parquet(workingdir + "hyperparams_1digit_bigbert.parquet")

In [17]:
# # Look at fine tuning
# df = pd.DataFrame(hypers).sort_values("accuracy", ascending=False)
# df.groupby("lr")["accuracy"].mean()