In [1]:
import datetime
import json
import os

from huggingface_hub import HfApi, upload_file, hf_hub_download
import numpy as np
import numpy.random as npr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (  # AdamW,
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

# Hugging Face Hub repository used below for the label mapping, the model
# checkpoints and the tokenizer.
repo_id = "nickeubank/leaa_grant_subjects_2digits_invweighted"

# Turn on pandas copy-on-write semantics for the rest of the notebook.
pd.options.mode.copy_on_write = True

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fail fast, with an actionable message, when no CUDA device is visible:
# later cells assert the same thing before training and evaluation.
assert torch.cuda.is_available(), "No CUDA device visible; this notebook expects a GPU."

In [4]:
#######
# Get Data
#######

# `workingdir` is not assigned in any visible cell (execution counts skip In[3]),
# so a fresh Restart & Run All would raise NameError on the read below.
# Keep a value that already exists in the kernel; otherwise fall back to the
# relative root that the other paths in this notebook use
# ("../20_intermediate_data", "../30_results").
# NOTE(review): confirm "../" is the intended project root.
if "workingdir" not in globals():
    workingdir = "../"

grants = pd.read_parquet(workingdir + "00_source_data/subj_text_and_labels.parquet")

# One row per distinct description, then keep only rows that carry a 2-digit label.
grants = grants.drop_duplicates("description")
labeled = grants[grants["label_2"].notnull()]

# Try to stabilize labels: fixed row order by label, then description.
labeled = labeled.sort_values(["label_2", "description"], ascending=True)

In [None]:
##########
# Encode (2 digits trickier than 1 digit) and save encoding
##########
encoder_name = "label_mapping_2digit_invweighted.json"

# Original save-and-upload step, left commented out:
# label_encoder = LabelEncoder()

# mapping = {label: int(idx) for idx, label in enumerate(label_encoder.classes_)}
# encoder_file = f"../20_intermediate_data/{encoder_name}"
# with open(encoder_file, "w") as f:
#     json.dump(mapping, f)
# upload_file(
#     path_or_fileobj=encoder_file,
#     path_in_repo=encoder_name,
#     repo_id=repo_id,
#     repo_type="model",
# )

# Now read encoder:
file_path = hf_hub_download(repo_id=repo_id, filename=encoder_name, repo_type="model")
with open(file_path, "r", encoding="utf-8") as f:
    encodings_json = json.load(f)

labels_to_fit = list(encodings_json.keys())
expected_codes = list(range(len(labels_to_fit)))

# Validate `.keys()` ordered correctly: the i-th key must map to code i.
assert [
    encodings_json[key] for key in labels_to_fit
] == expected_codes, "Stored label mapping is not ordered 0..n-1"

label_encoder = LabelEncoder()

# Destring: JSON object keys are always strings, so convert them back to floats.
# (The variable keeps its historical name even though it holds floats.)
labels_to_fit_str = list(map(float, labels_to_fit))
label_encoder.fit(labels_to_fit_str)

# LabelEncoder stores its classes sorted, so the re-fitted encoder only matches
# the stored mapping if the stored order is also the sorted numeric order.
# Check that explicitly; a mismatch would silently shift every class id.
assert (
    label_encoder.transform(labels_to_fit_str).tolist() == expected_codes
), "Re-fitted LabelEncoder does not reproduce the stored label mapping"

labeled["label_2_encoded"] = label_encoder.transform(labeled["label_2"])

In [42]:
# Inverse-frequency class weights, ordered by encoded label and scaled so that
# they sum to 1.
proportions = labeled["label_2_encoded"].value_counts(normalize=True).sort_index()
inv_proportions = 1 / proportions
inverse_weights = (
    inv_proportions.div(inv_proportions.sum()).astype(np.float32).values
)

# Check a few: label 24 must weigh less than average, and label 22 must carry
# the smallest weight of all.
is_label_24 = label_encoder.classes_ == 24
is_label_22 = label_encoder.classes_ == 22
assert inverse_weights[is_label_24] < inverse_weights.mean()
assert inverse_weights[is_label_22] == inverse_weights.min()

In [43]:
# Put rows in a fixed order before splitting.
labeled = labeled.sort_values("description")

# # Sample for hyper param tuning
# labeled = labeled.sample(frac=0.4)

# 80/20 split, stratified on the encoded label. train_test_split returns a
# (train, test) pair per input array, in the order the arrays are passed:
# labels first, then text.
encoded_labels = labeled["label_2_encoded"]
train_label, test_label, train_text, test_text = train_test_split(
    encoded_labels.values,
    labeled["description"].values,
    test_size=0.2,
    random_state=48,
    stratify=encoded_labels,
)
for split_labels in (train_label, test_label):
    print(len(split_labels))

24056
24056


In [44]:
########
# Preprocess
########


class ClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of texts; its length is the dataset length.
        labels: Indexable collection of labels, looked up with the same index.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=True,
            max_length=..., padding="max_length", truncation=True,
            return_tensors="pt")``. Its result must expose ``"input_ids"`` and
            ``"attention_mask"`` entries that support ``.squeeze(0)``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``label`` for item ``idx``."""
        raw_text, raw_label = self.texts[idx], self.labels[idx]
        tokens = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) drops the leading axis of each tensor
        # (presumably the size-1 batch axis added by return_tensors="pt").
        item = {
            field: tokens[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(raw_label, dtype=torch.long)
        return item

In [8]:
# # Hypertuning Parameters

# # Run 1
# params = [
#     {"mlen": mlen, "batch_size": batch_size, "lr": lr}
#     for batch_size in [8, 16, 32]
#     for mlen in [128, 256, 512]
#     for lr in [1e-7, 1e-6, 1e-5, 1e-4]
# ]

# df = pd.DataFrame(params)
# df["accuracy"] = np.nan
# df["batch_size"] = df["batch_size"].astype("int")
# df["mlen"] = df["mlen"].astype("int")

# hypers_file = "../30_results/hyperparams_2digit_bigbert_invweights.parquet"
# df.to_parquet(hypers_file)

In [9]:
# # Later Runs
# hypers = pd.read_parquet(hypers_file)
# hypers

In [45]:
# for p in hypers[hypers["accuracy"].isnull()].iterrows():
#     print("starting: ")
#     print(p)
#     MAX_LEN = int(p[1]["mlen"])
#     BATCH_SIZE = int(p[1]["batch_size"])
#     EPOCHS = 2
#     LEARNING_RATE = p[1]["lr"]

MAX_LEN = 256
BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 0.000010

# Both load options used to be commented out, so this cell only ran when `model`
# and `tokenizer` were left over in the kernel. Choose the starting point here.
#   False -> start from the pretrained "bert-large-uncased" weights
#   True  -> continue from the checkpoint stored in `repo_id`
RESUME_FROM_CHECKPOINT = False

# Device Setup
assert torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##########
# Load model or checkpoint
##########
if RESUME_FROM_CHECKPOINT:
    model = BertForSequenceClassification.from_pretrained(repo_id)
    tokenizer = BertTokenizer.from_pretrained(repo_id)
else:
    base_model_name = "bert-large-uncased"
    tokenizer = BertTokenizer.from_pretrained(base_model_name)
    # One logit per class weight, so the weighted loss below always matches
    # the size of the classification head.
    model = BertForSequenceClassification.from_pretrained(
        base_model_name, num_labels=len(inverse_weights)
    )
model.to(device)

# Data Prep
train_dataset = ClassificationDataset(train_text, train_label, tokenizer, MAX_LEN)
test_dataset = ClassificationDataset(test_text, test_label, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Need weights as tensors on gpu. Tensor.to returns a new tensor instead of
# moving the tensor in place, so the result has to be kept.
weights = torch.from_numpy(inverse_weights).to(device)

# Use inverse weights and cross entropy. The criterion does not change between
# batches, so build it once.
criterion = nn.CrossEntropyLoss(weight=weights)

# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Actual training
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, leave=True)

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Labels are not passed to the model: its built-in loss is unweighted
        # and was never used. Only the logits are needed.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = criterion(outputs.logits, labels)
        batch_loss = loss.item()
        total_loss += batch_loss

        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=batch_loss)

    # Checkpoints
    if (epoch > 0) and (epoch % 3 == 0):
        timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")
        model.push_to_hub(
            repo_id, commit_message=f"checkpoint_large_{timestamp}_epoch{epoch}"
        )

    print(f"Epoch {epoch} Loss: {total_loss / len(train_loader)}")

# Take a fresh timestamp for the final push. The old code reused the variable set
# inside the checkpoint branch, which is unbound when no checkpoint epoch ran.
timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")
final_message = f"trained_invweight_2digit_large_{timestamp}_epoch{epoch}"
model.push_to_hub(repo_id, commit_message=final_message)
tokenizer.push_to_hub(repo_id, commit_message=final_message)

KeyError: 'predicted_label_1'

In [None]:
########
# Hyper Param Tuning Block
########


# ############
# # Back to main flow
# ############

# model.eval()
# correct = 0
# total = 0
# all_predictions = []

# with torch.no_grad():
#     for batch in val_loader:
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["label"].to(device)

#         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#         predictions = torch.argmax(outputs.logits, dim=1)

#         correct += (predictions == labels).sum().item()
#         total += labels.size(0)

#         formatted_to_list = list(map(lambda x: x.item(), predictions))
#         all_predictions.extend(formatted_to_list)

# accuracy = correct / total
# print(f"Validation Accuracy: {accuracy:.4f}")

# hypers.loc[
#     (hypers["mlen"] == MAX_LEN)
#     & (hypers["batch_size"] == BATCH_SIZE)
#     & (hypers["lr"] == LEARNING_RATE),
#     "accuracy",
# ] = accuracy
# print(
#     hypers[
#         (hypers["mlen"] == MAX_LEN)
#         & (hypers["batch_size"] == BATCH_SIZE)
#         & (hypers["lr"] == LEARNING_RATE)
#     ]
# )
# pd.DataFrame(hypers).to_parquet(hypers_file)

In [16]:
########
# Test actual fit
########


MAX_LEN = 256
BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 0.000010

assert torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Get model. Load the checkpoint and its tokenizer *before* building the
# datasets: the datasets used to be built first, so they depended on whatever
# `tokenizer` was already in the kernel.
model = BertForSequenceClassification.from_pretrained(repo_id).to(device)
tokenizer = BertTokenizer.from_pretrained(repo_id)
model.eval()

# Data Prep
train_dataset = ClassificationDataset(train_text, train_label, tokenizer, MAX_LEN)
test_dataset = ClassificationDataset(test_text, test_label, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# shuffle=False keeps predictions in the same order as `test_label`.
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Need weights as tensors on gpu. Tensor.to returns a new tensor, so keep the result.
weights = torch.from_numpy(inverse_weights).to(device)

############
# Back to main flow
############

correct = 0
total = 0
all_predictions = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row.
        predictions = torch.argmax(outputs.logits, dim=1)

        correct += (predictions == labels).sum().item()
        total += labels.size(0)

        # Plain Python ints, collected in loader order.
        all_predictions.extend(predictions.tolist())

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.7945


In [17]:
# Translate encoded class ids back into the original label values.
decoded_actual = label_encoder.inverse_transform(test_label)
decoded_predictions = label_encoder.inverse_transform(all_predictions)

predictions_and_actual = pd.DataFrame(
    {"actual": decoded_actual, "predicted": decoded_predictions}
)

# Raw counts: one row per actual label, one column per predicted label.
pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
)

predicted,10.0,11.0,12.0,20.0,21.0,22.0,23.0,24.0,25.0,30.0,...,40.0,41.0,42.0,43.0,44.0,50.0,51.0,52.0,53.0,54.0
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.0,486,4,0,23,5,13,1,7,0,0,...,0,3,0,0,0,4,5,1,9,0
11.0,5,1074,28,0,0,36,0,0,0,96,...,9,4,27,1,4,16,4,18,16,3
12.0,3,16,216,0,0,2,0,0,1,8,...,5,0,8,1,0,3,0,4,1,0
20.0,12,0,1,170,5,3,2,9,0,1,...,0,0,0,0,0,2,0,0,0,0
21.0,18,0,0,161,949,16,20,77,2,2,...,0,0,0,0,0,3,2,3,8,2
22.0,11,31,4,10,3,2167,13,24,0,14,...,1,1,2,0,0,4,23,8,2,1
23.0,1,0,0,5,13,11,478,23,0,1,...,1,1,0,3,0,3,25,0,1,4
24.0,16,2,0,110,70,62,56,1709,10,3,...,1,20,0,4,0,8,3,34,8,5
25.0,0,0,0,1,1,2,0,8,34,0,...,0,4,0,1,0,0,0,0,0,4
30.0,0,54,26,0,0,5,0,0,0,387,...,1,3,14,0,0,2,0,1,7,0


In [None]:
# Lift the column display limit so the whole matrix is shown.
pd.set_option("display.max_columns", None)

# Confusion matrix where each cell is a share of all evaluated rows.
confusion = pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
    normalize="all",
)
confusion.to_parquet("../30_results/large_bert_confusion_matrix_2digit.parquet")

# Display as percentages with one decimal.
confusion_to_print = (confusion * 100).round(1)
confusion_to_print

predicted,10.0,11.0,12.0,20.0,21.0,22.0,23.0,24.0,25.0,30.0,31.0,32.0,33.0,34.0,35.0,40.0,41.0,42.0,43.0,44.0,50.0,51.0,52.0,53.0,54.0
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
10.0,3.0,0.0,0.0,0.1,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0
11.0,0.0,6.7,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.6,0.2,0.0,0.0,0.0,0.0,0.1,0.0,0.2,0.0,0.0,0.1,0.0,0.1,0.1,0.0
12.0,0.0,0.1,1.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20.0,0.1,0.0,0.0,1.1,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21.0,0.1,0.0,0.0,1.0,5.9,0.1,0.1,0.5,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22.0,0.1,0.2,0.0,0.1,0.0,13.5,0.1,0.1,0.0,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0
23.0,0.0,0.0,0.0,0.0,0.1,0.1,3.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0
24.0,0.1,0.0,0.0,0.7,0.4,0.4,0.3,10.7,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0
25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30.0,0.0,0.3,0.2,0.0,0.0,0.0,0.0,0.0,0.0,2.4,0.1,0.0,0.1,0.1,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Confusion matrix normalised within each actual label (rows sum to 100 once
# scaled to percentages), with margins enabled.
confusion = pd.crosstab(
    index=predictions_and_actual["actual"],
    columns=predictions_and_actual["predicted"],
    margins=True,
    normalize="index",
)
# confusion.to_parquet("large_bert_confusion_matrix_margins.parquet")

# Display as percentages with one decimal.
confusion_to_print = (confusion * 100).round(1)
confusion_to_print

predicted,10.0,11.0,12.0,20.0,21.0,22.0,23.0,24.0,25.0,30.0,31.0,32.0,33.0,34.0,35.0,40.0,41.0,42.0,43.0,44.0,50.0,51.0,52.0,53.0,54.0
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
10.0,85.4,0.7,0.0,4.0,0.9,2.3,0.2,1.2,0.0,0.0,0.7,0.0,0.5,0.2,0.0,0.0,0.5,0.0,0.0,0.0,0.7,0.9,0.2,1.6,0.0
11.0,0.4,77.7,2.0,0.0,0.0,2.6,0.0,0.0,0.0,6.9,2.5,0.1,0.0,0.4,0.0,0.7,0.3,2.0,0.1,0.3,1.2,0.3,1.3,1.2,0.2
12.0,1.1,5.6,76.1,0.0,0.0,0.7,0.0,0.0,0.4,2.8,4.9,0.4,0.4,0.0,0.0,1.8,0.0,2.8,0.4,0.0,1.1,0.0,1.4,0.4,0.0
20.0,5.8,0.0,0.5,82.5,2.4,1.5,1.0,4.4,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
21.0,1.4,0.0,0.0,12.6,74.0,1.2,1.6,6.0,0.2,0.2,0.2,0.2,1.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.6,0.2
22.0,0.5,1.3,0.2,0.4,0.1,92.8,0.6,1.0,0.0,0.6,0.4,0.0,0.0,0.1,0.1,0.0,0.0,0.1,0.0,0.0,0.2,1.0,0.3,0.1,0.0
23.0,0.2,0.0,0.0,0.9,2.3,1.9,83.7,4.0,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.2,0.2,0.0,0.5,0.0,0.5,4.4,0.0,0.2,0.7
24.0,0.8,0.1,0.0,5.2,3.3,2.9,2.6,80.2,0.5,0.1,0.0,0.0,0.2,0.1,0.1,0.0,0.9,0.0,0.2,0.0,0.4,0.1,1.6,0.4,0.2
25.0,0.0,0.0,0.0,1.8,1.8,3.6,0.0,14.5,61.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.3,0.0,1.8,0.0,0.0,0.0,0.0,0.0,7.3
30.0,0.0,9.9,4.8,0.0,0.0,0.9,0.0,0.0,0.0,71.0,3.1,0.2,1.7,3.1,0.2,0.2,0.6,2.6,0.0,0.0,0.4,0.0,0.2,1.3,0.0
