In [1]:
import datetime
import json
import os

from huggingface_hub import HfApi, upload_file
import numpy as np
import numpy.random as npr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (  # AdamW,
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

pd.set_option("mode.copy_on_write", True)
repo_id = "nickeubank/leaa_grant_subjects_2digits_invweighted"
workingdir = "/hpc/group/ssri/nce8/leaa_subj/"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
assert torch.cuda.is_available()

In [3]:
#######
# Get Data
#######

grants = pd.read_parquet(workingdir + "00_source_data/subj_text_and_labels.parquet")
grants = grants.drop_duplicates("description")
labeled = grants[grants["label_2"].notnull()]
labeled = labeled.sort_values(
    ["label_2", "description"], ascending=True
)  # Try to stabilize labels.

In [4]:
##########
# Encode (2 digits trickier than 1 digit) and save encoding
##########
label_encoder = LabelEncoder()
labeled["label_2_encoded"] = label_encoder.fit_transform(labeled["label_2"])

# Save away
mapping = {label: int(idx) for idx, label in enumerate(label_encoder.classes_)}
encoder_name = "label_mapping_2digit_invweighted.json"
encoder_file = f"../20_intermediate_data/{encoder_name}"
with open(encoder_file, "w") as f:
    json.dump(mapping, f)
upload_file(
    path_or_fileobj=encoder_file,
    path_in_repo=encoder_name,
    repo_id=repo_id,
    repo_type="model",
)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/nickeubank/leaa_grant_subjects_2digits_invweighted/commit/9efde8f961cbabadf3e57d9e3b422115f8c03bb4', commit_message='Upload label_mapping_2digit_invweighted.json with huggingface_hub', commit_description='', oid='9efde8f961cbabadf3e57d9e3b422115f8c03bb4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nickeubank/leaa_grant_subjects_2digits_invweighted', endpoint='https://huggingface.co', repo_type='model', repo_id='nickeubank/leaa_grant_subjects_2digits_invweighted'), pr_revision=None, pr_num=None)

In [5]:
proportions = labeled["label_2_encoded"].value_counts(normalize=True).sort_index()
inv_proportions = 1 / proportions
inverse_weights = (inv_proportions / inv_proportions.sum()).astype(np.float32).values
inverse_weights

# Check a few
assert inverse_weights[label_encoder.classes_ == 24] < inverse_weights.mean()
assert inverse_weights[label_encoder.classes_ == 22] == inverse_weights.min()

In [6]:
labeled = labeled.sort_values("description")

# # Sample for hyper param tuning
# labeled = labeled.sample(frac=0.4)

train_label, test_label, train_text, test_text = train_test_split(
    labeled["label_2_encoded"].values,
    labeled["description"].values,
    test_size=0.2,
    random_state=48,
    stratify=labeled["label_2_encoded"],
)
print(len(train_label))
print(len(test_label))

64148
16038


In [7]:
########
# Preprocess
########


class ClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

In [8]:
# # Hypertuning Parameters

# # Run 1
# params = [
#     {"mlen": mlen, "batch_size": batch_size, "lr": lr}
#     for batch_size in [8, 16, 32]
#     for mlen in [128, 256, 512]
#     for lr in [1e-7, 1e-6, 1e-5, 1e-4]
# ]

# df = pd.DataFrame(params)
# df["accuracy"] = np.nan
# df["batch_size"] = df["batch_size"].astype("int")
# df["mlen"] = df["mlen"].astype("int")

# hypers_file = "../20_intermediate_data/hyperparams_2digit_bigbert_invweights.parquet"
# df.to_parquet(hypers_file)

In [9]:
# # Later Runs
# hypers = pd.read_parquet(hypers_file)
# hypers

In [None]:
# for p in hypers[hypers["accuracy"].isnull()].iterrows():
#     print("starting: ")
#     print(p)
#     MAX_LEN = int(p[1]["mlen"])
#     BATCH_SIZE = int(p[1]["batch_size"])
#     EPOCHS = 2
#     LEARNING_RATE = p[1]["lr"]

MAX_LEN = 256
BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 0.000010

# Device Setup
assert torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##########
# Load model or checkpoint
##########

####
# Load naive bert
####
model = "bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(model)
model = BertForSequenceClassification.from_pretrained(
    model, num_labels=labeled["label_2_encoded"].nunique()
)
model.to(device)

####
# OR Load checkpoint if loading from checkpoint
####
# model = "bert-large-uncased"
# tokenizer = BertTokenizer.from_pretrained(model)
# tokenizer.push_to_hub(repo_id, commit_message=f"transfer over large bert tokenizer")
#
# model = BertForSequenceClassification.from_pretrained(repo_id).to(device)
# tokenizer = BertTokenizer.from_pretrained(repo_id)
# model.eval()

# Data Prep
train_dataset = ClassificationDataset(train_text, train_label, tokenizer, MAX_LEN)
test_dataset = ClassificationDataset(test_text, test_label, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Need weights as tensors on gpu
weights = torch.from_numpy(inverse_weights)
weights.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Actual training
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, leave=True)

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

        # Use inverse weights and cross entropy
        criterion = nn.CrossEntropyLoss(weight=weights)
        criterion.to(device)

        loss = criterion(outputs.get("logits"), labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

    # Checkpoints
    if (epoch > 0) and (epoch % 3 == 0):

        time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")

        model.push_to_hub(
            repo_id, commit_message=f"checkpoint_large_{time}_epoch{epoch}"
        )

    print(f"Epoch {epoch} Loss: {total_loss / len(train_loader)}")

model.push_to_hub(
    repo_id, commit_message=f"trained_invweight_2digit_large_{time}_epoch{epoch}"
)
tokenizer.push_to_hub(
    repo_id, commit_message=f"trained_invweight_2digit_large_{time}_epoch{epoch}"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  weights = torch.from_numpy(inverse_weights)
  0%|          | 0/8019 [00:00<?, ?it/s]

Epoch 0: 100%|██████████| 8019/8019 [31:54<00:00,  4.19it/s, loss=0.55]  


Epoch 0 Loss: 1.1661675127525328


Epoch 1: 100%|██████████| 8019/8019 [31:48<00:00,  4.20it/s, loss=0.346] 


Epoch 1 Loss: 0.7831538256661009


Epoch 2: 100%|██████████| 8019/8019 [31:51<00:00,  4.20it/s, loss=0.0858]


Epoch 2 Loss: 0.6297987360399937


Epoch 3: 100%|██████████| 8019/8019 [31:51<00:00,  4.20it/s, loss=1.22]   
model.safetensors: 100%|██████████| 1.34G/1.34G [00:29<00:00, 46.1MB/s]


Epoch 3 Loss: 0.49782873771205305


Epoch 4: 100%|██████████| 8019/8019 [31:53<00:00,  4.19it/s, loss=0.0155] 


Epoch 4 Loss: 0.39033394336711125


Epoch 5: 100%|██████████| 8019/8019 [31:49<00:00,  4.20it/s, loss=0.0228] 


Epoch 5 Loss: 0.3065021473117436


Epoch 6: 100%|██████████| 8019/8019 [31:52<00:00,  4.19it/s, loss=0.168]  
model.safetensors:   4%|▎         | 47.8M/1.34G [00:00<00:18, 69.1MB/s]

In [None]:

    ############
    # Back to main flow
    ############

    model.eval()
    correct = 0
    total = 0
    all_predictions = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)

            correct += (predictions == labels).sum().item()
            total += labels.size(0)

            formatted_to_list = list(map(lambda x: x.item(), predictions))
            all_predictions.extend(formatted_to_list)

    accuracy = correct / total
    print(f"Validation Accuracy: {accuracy:.4f}")

    hypers.loc[
        (hypers["mlen"] == MAX_LEN)
        & (hypers["batch_size"] == BATCH_SIZE)
        & (hypers["lr"] == LEARNING_RATE),
        "accuracy",
    ] = accuracy
    print(
        hypers[
            (hypers["mlen"] == MAX_LEN)
            & (hypers["batch_size"] == BATCH_SIZE)
            & (hypers["lr"] == LEARNING_RATE)
        ]
    )
    pd.DataFrame(hypers).to_parquet(hypers_file)

In [24]:
hypers.sort_values("accuracy", ascending=False)

Unnamed: 0,mlen,batch_size,lr,accuracy
6,256,8,1e-05,0.753508
10,512,8,1e-05,0.752572
14,128,16,1e-05,0.745401
30,256,32,1e-05,0.735549
18,256,16,1e-05,0.728191
26,128,32,1e-05,0.72208
22,512,16,1e-05,0.718464
9,512,8,1e-06,0.676311
1,128,8,1e-06,0.673754
5,256,8,1e-06,0.637401


In [10]:
predictions_and_actual = pd.DataFrame(
    {"actual": test_label, "predicted": all_predictions}
)
pd.crosstab(predictions_and_actual["actual"], predictions_and_actual["predicted"])

predicted,0,1,2,3,4
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,1849,74,135,121,55
1.0,133,6134,93,44,173
2.0,116,47,3177,109,86
3.0,100,27,121,2251,107
4.0,62,124,78,89,733


In [11]:
confusion = pd.crosstab(
    predictions_and_actual["actual"],
    predictions_and_actual["predicted"],
    normalize=True,
)
confusion.to_parquet("large_bert_confusion_matrix.parquet")
confusion_to_print = np.round(confusion * 100, decimals=1)
confusion_to_print

predicted,0,1,2,3,4
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,11.5,0.5,0.8,0.8,0.3
1.0,0.8,38.2,0.6,0.3,1.1
2.0,0.7,0.3,19.8,0.7,0.5
3.0,0.6,0.2,0.8,14.0,0.7
4.0,0.4,0.8,0.5,0.6,4.6


In [12]:
confusion = pd.crosstab(
    predictions_and_actual["actual"],
    predictions_and_actual["predicted"],
    normalize="all",
    margins=True,
)
# confusion.to_parquet("large_bert_confusion_matrix_margins.parquet")
confusion_to_print = np.round(confusion * 100, decimals=1)
confusion_to_print

predicted,0,1,2,3,4,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,11.5,0.5,0.8,0.8,0.3,13.9
1.0,0.8,38.2,0.6,0.3,1.1,41.0
2.0,0.7,0.3,19.8,0.7,0.5,22.0
3.0,0.6,0.2,0.8,14.0,0.7,16.2
4.0,0.4,0.8,0.5,0.6,4.6,6.8
All,14.1,39.9,22.5,16.3,7.2,100.0
