In [None]:
# Utilities/Misc
import os
from tqdm import tqdm
from pathlib import Path
import pickle as pkl
from typing import Tuple

# Data handling
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
from sklearn.preprocessing import LabelEncoder

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# ML/Huggingface tools
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.optim import AdamW

# Scoring
import torch.nn.functional as F
from sklearn.metrics import (
    f1_score,
    balanced_accuracy_score,
    roc_auc_score,
)

In [None]:
# Project directory layout, all resolved relative to the notebook's working directory
CWD = Path.cwd()
DATA_DIR = CWD.joinpath("data")
CORPUS_DIR = DATA_DIR.joinpath("corpus_files")      # pickled corpora
OBJ_DIR = DATA_DIR.joinpath("objects")              # fitted encoders / model weights
MODEL_SEARCH = DATA_DIR.joinpath("model_search_results")  # training-progress tables

In [None]:
# Read the pickled document corpus into memory and print a structural summary
# (column names, dtypes, non-null counts) as a quick sanity check.
doc_corpus_path = CORPUS_DIR.joinpath("document_corpus.pkl")
doc_df: pd.DataFrame = pd.read_pickle(doc_corpus_path)
doc_df.info()

### Get Training (65%) / Testing (25%) / Validation (10%) Sets

In [None]:
# Target share of the modelling data for each split (train 65% / test 25% / validation 10%).
TRAIN_FRAC = 0.65
TEST_FRAC = 0.25
SPLIT_SEED = 29359
# Train + validation together form the non-test pool (75%); this is the share of
# that pool that must stay in training for the overall fractions to come out right
# (0.65 / 0.75). The previous value of 65/76 gave roughly 64.1% / 10.9% instead.
TRAIN_SHARE_OF_POOL = TRAIN_FRAC / (1 - TEST_FRAC)

if "split" not in doc_df.columns:
    # First run: no split has been persisted yet. "Background" and "Compendium"
    # documents are excluded from modelling and keep split == None.
    model_data = doc_df[~doc_df["submission_flair"].isin(["Background", "Compendium"])]
    train_pool, test_part = train_test_split(
        model_data, train_size=1 - TEST_FRAC, random_state=SPLIT_SEED
    )
    train_part, valid_part = train_test_split(
        train_pool, train_size=TRAIN_SHARE_OF_POOL, random_state=SPLIT_SEED
    )
    doc_df["split"] = None
    doc_df.loc[doc_df["UID"].isin(train_part["UID"]), "split"] = "train"
    doc_df.loc[doc_df["UID"].isin(test_part["UID"]), "split"] = "test"
    doc_df.loc[doc_df["UID"].isin(valid_part["UID"]), "split"] = "validation"
    doc_df.to_pickle(doc_corpus_path)
elif "validation" not in doc_df["split"].unique():
    # A train/test split was persisted earlier without a validation set: carve the
    # validation rows out of the existing training rows and record them in the
    # "split" column BEFORE pickling, so the assignment survives a kernel restart.
    is_train = doc_df["split"] == "train"
    _, valid_part = train_test_split(
        doc_df[is_train], train_size=TRAIN_SHARE_OF_POOL, random_state=SPLIT_SEED
    )
    doc_df.loc[is_train & doc_df["UID"].isin(valid_part["UID"]), "split"] = "validation"
    doc_df.to_pickle(doc_corpus_path)

# Every branch ends with the assignment stored in doc_df["split"], so the working
# frames are always built the same way. The .copy() makes them independent of
# doc_df, which avoids SettingWithCopyWarning when columns are added to them later.
train_df = doc_df[doc_df["split"] == "train"].copy()
valid_df = doc_df[doc_df["split"] == "validation"].copy()
test_df = doc_df[doc_df["split"] == "test"].copy()

In [None]:
# Reuse the persisted label encoder when one exists so class ids stay stable across
# runs; otherwise fit a new one on the training flairs and persist it.
lab_enc_path = OBJ_DIR / "label_encoder.pkl"
if not lab_enc_path.is_file():
    lab_enc = LabelEncoder()
    lab_enc.fit(train_df["submission_flair"])
    lab_enc_path.write_bytes(pkl.dumps(lab_enc))
else:
    # NOTE: unpickling executes arbitrary code; only load files this project wrote.
    lab_enc = pkl.loads(lab_enc_path.read_bytes())

# Encode the flair of every split with the same encoder.
for split_frame in (train_df, test_df, valid_df):
    split_frame["label"] = lab_enc.transform(split_frame["submission_flair"])

In [None]:
# Function to tokenize and format the input data
def tokenize_data(data: pd.DataFrame, tokenizer: RobertaTokenizer, data_str: str, max_length: int = 512):
    """
    Tokenize the ``clean_text`` column and bundle it with the labels.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a ``clean_text`` column (the text to tokenize) and a
        ``label`` column (integer class ids).
    tokenizer : RobertaTokenizer
        Tokenizer used to encode each text.
    data_str : str
        Name of the split, only used in the progress-bar description.
    max_length : int
        Every sequence is truncated/padded to exactly this many tokens.
        Defaults to 512, the value that was previously hard-coded.

    Returns
    -------
    TensorDataset
        ``(input_ids, attention_mask, label)`` per row, where the first two
        have shape ``(len(data), max_length)``.
    """
    input_ids = []
    attention_masks = []

    for text in tqdm(data['clean_text'], desc=f"Tokenizing ({data_str} data)"):
        # Calling the tokenizer directly replaces the deprecated `encode_plus`;
        # for a single string it returns tensors of shape (1, max_length).
        tokens = tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            return_tensors='pt'
        )
        input_ids.append(tokens['input_ids'])
        attention_masks.append(tokens['attention_mask'])

    if not input_ids:
        # torch.cat raises on an empty list; return a well-formed empty dataset instead.
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        empty_mask = torch.empty((0, max_length), dtype=torch.long)
        return TensorDataset(empty_ids, empty_mask, torch.empty(0, dtype=torch.long))

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    # Cross-entropy targets must be int64, so do not depend on the column's numpy dtype.
    labels = torch.tensor(data["label"].values, dtype=torch.long)

    return TensorDataset(input_ids, attention_masks, labels)

In [None]:
# Load the pretrained RoBERTa tokenizer
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenize each split (training, then testing, then validation) into a TensorDataset
train_dataset, test_dataset, val_dataset = (
    tokenize_data(split_frame, roberta_tokenizer, split_name)
    for split_frame, split_name in (
        (train_df, "training"),
        (test_df, "testing"),
        (valid_df, "validation"),
    )
)

In [None]:
# Pretrained RoBERTa with a classification head sized to the number of distinct training labels
n_train_labels = train_df['label'].nunique()
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=n_train_labels,
)

# Optimizer over all model parameters, and the loss used when evaluating
OPTIMIZER = AdamW(model.parameters(), lr=2e-5)
CRITERION = torch.nn.CrossEntropyLoss()

# Prefer the GPU when one is available, then move the model there
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
# Training hyper-parameters
BATCH_SIZE = 2    # Samples per batch
MAX_EPOCHS = 100  # Upper bound on training epochs
MIN_EPOCHS = 1    # Lower bound on training epochs

# Class weights: inverse training frequency per label (ordered by label id),
# normalized so the weights sum to one.
label_counts = train_df['label'].value_counts().sort_index().values
inverse_counts = [1.0 / n_docs for n_docs in label_counts]
inverse_total = sum(inverse_counts)
class_weights = [weight / inverse_total for weight in inverse_counts]


def _weight_for_label(label_id):
    """Look up the sampling weight of one encoded label."""
    return class_weights[label_id]


# Class-balanced sampling (with replacement) for the training and validation sets
train_sample_weights = torch.tensor(train_df['label'].map(_weight_for_label).values)
train_weighted_rand_sampler = WeightedRandomSampler(
    weights=train_sample_weights,
    num_samples=len(train_dataset),
    replacement=True,
)
val_sample_weights = torch.tensor(valid_df['label'].map(_weight_for_label).values)
val_weighted_rand_sampler = WeightedRandomSampler(
    weights=val_sample_weights,
    num_samples=len(val_dataset),
    replacement=True,
)

# Batch loaders: train/validation draw through the balanced samplers, test is read in order
train_loader = DataLoader(train_dataset, sampler=train_weighted_rand_sampler, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, sampler=val_weighted_rand_sampler, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
def numeric_labels_to_one_hot(labels, num_classes: int = 8):
    """
    Build a one-hot matrix from integer class ids.

    Args:
    labels (list or numpy array): Integer class id of each sample.
    num_classes (int): Number of columns in the result (one per class).

    Returns:
    numpy array: Float array of shape (len(labels), num_classes) holding a 1
    in each row at that sample's class column and 0 elsewhere.
    """
    sample_count = len(labels)
    encoded = np.zeros(shape=(sample_count, num_classes))
    row_positions = np.arange(sample_count)
    # Pair every row with its class column and switch that cell on.
    encoded[row_positions, labels] = 1
    return encoded

def eval_model(model: RobertaForSequenceClassification, loader: DataLoader, data_str: str, n_iter: int = 5) -> Tuple[float, float, float, float]:
    """
    Evaluate the model for mean loss, balanced accuracy, macro F1 score, macro OVR ROC-AUC.

    The model is switched to eval mode and left in that mode on return. The
    function relies on the module-level ``device`` and ``CRITERION``.

    Parameters
    ----------
    model : RobertaForSequenceClassification
        The RoBERTa model to evaluate
    loader : DataLoader
        PyTorch data loader yielding ``(input_ids, attention_mask, labels)`` batches
    data_str : str
        A string describing the data loader; used as a prefix in the printed report
    n_iter : int
        Number of full passes over ``loader``. All passes are pooled before the
        metrics are computed, which is useful when the loader samples randomly.

    Returns
    -------
    Tuple[float, float, float, float]
        mean_loss, balanced_accuracy, macro_f1, macro_ovr_auc

    Raises
    ------
    ValueError
        If the loader yields no samples (``n_iter`` < 1 or an empty loader).
    """
    model.eval()
    total_loss = 0.0
    prediction_batches = []
    label_batches = []
    prob_batches = []

    with torch.no_grad():
        for _ in range(n_iter):
            for input_ids, attention_mask, _labels in loader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                _labels = _labels.to(device).long()

                logits = model(input_ids, attention_mask=attention_mask).logits
                # CRITERION (CrossEntropyLoss with default reduction) returns the
                # mean over the batch, so weight it by the batch size. Dividing
                # the sum by the sample count below then gives a true per-sample
                # mean, even when the last batch is smaller.
                total_loss += CRITERION(logits, _labels).item() * _labels.size(0)

                prediction_batches.append(torch.argmax(logits, dim=1).cpu().numpy())
                prob_batches.append(F.softmax(logits, dim=1).cpu().numpy())
                label_batches.append(_labels.cpu().numpy())

    if not label_batches:
        raise ValueError(f"No samples to evaluate for {data_str!r}: the loader is empty or n_iter < 1")

    predictions = np.concatenate(prediction_batches)
    labels = np.concatenate(label_batches)
    probs = np.concatenate(prob_batches)

    # Size the one-hot targets from the model's own output width instead of
    # relying on the helper's default number of classes.
    one_hot = numeric_labels_to_one_hot(labels, num_classes=probs.shape[1])
    accuracy = balanced_accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='macro')
    roc_auc = roc_auc_score(one_hot, probs, multi_class="ovr", average="macro")
    mean_loss = total_loss / len(labels)

    print(f"Mean {data_str} Loss: {mean_loss}")
    print(f"{data_str} Balanced Accuracy: {accuracy}")
    print(f"{data_str} Macro F1 Score: {f1}")
    print(f"{data_str} Macro OVR AUC-ROC: {roc_auc}")

    return mean_loss, accuracy, f1, roc_auc

In [None]:
# Where the fine-tuned model's state_dict is stored. The training cell checks for this
# file: if it exists the weights are loaded from it, otherwise the model is trained.
model_output_path = OBJ_DIR / "roberta_final.pth"

In [None]:
# Per-epoch metrics recorded during training (one column each).
RESULT_COLUMNS = [
    "epoch",
    "train_loss", "train_accuracy", "train_f1", "train_auc",
    "val_loss", "val_accuracy", "val_f1", "val_auc",
]
progress_path = MODEL_SEARCH / "roberta_training_progress.pkl"

if not model_output_path.is_file():
    # Create the output folders up front so a long run cannot fail at save time.
    model_output_path.parent.mkdir(parents=True, exist_ok=True)
    progress_path.parent.mkdir(parents=True, exist_ok=True)

    prev_val_loss = np.inf
    best_val_loss = np.inf
    best_state = None  # CPU copy of the weights with the lowest validation loss so far
    results = {column: [] for column in RESULT_COLUMNS}

    # Exactly MAX_EPOCHS passes at most (the old range(MAX_EPOCHS+1) ran one extra).
    for epoch in range(MAX_EPOCHS):
        epoch_number = epoch + 1
        model.train()  # eval_model leaves the model in eval mode

        # Train in batches
        for batch in tqdm(train_loader, desc="Epoch {}".format(epoch_number)):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            OPTIMIZER.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels.long())
            loss = outputs.loss
            loss.backward()
            OPTIMIZER.step()

        # Report and record training and validation performance
        print(f"Epoch {epoch_number}/{MAX_EPOCHS}")
        train_loss, train_accuracy, train_f1, train_roc_auc = eval_model(model, train_loader, "Training", n_iter=3)
        val_loss, val_accuracy, val_f1, val_roc_auc = eval_model(model, val_loader, "Validation", n_iter=3)

        epoch_metrics = (
            epoch_number,
            train_loss, train_accuracy, train_f1, train_roc_auc,
            val_loss, val_accuracy, val_f1, val_roc_auc,
        )
        for column, value in zip(RESULT_COLUMNS, epoch_metrics):
            results[column].append(value)

        # Remember the best weights seen so far. They are only written to disk once
        # training has finished, so an interrupted run never leaves behind a file
        # that would make the next run skip training.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

        # Stop once validation loss has risen by more than the 1% tolerance relative
        # to the previous epoch, but never before MIN_EPOCHS epochs have completed.
        should_stop = (prev_val_loss < val_loss * .99) and (epoch_number >= MIN_EPOCHS)
        prev_val_loss = val_loss
        if should_stop:
            break

    # Restore the best weights (not the ones from the epoch that triggered the stop)
    # and persist them.
    if best_state is not None:
        model.load_state_dict(best_state)
    torch.save(model.state_dict(), model_output_path)

    results_df = pd.DataFrame(results)
    results_df.to_pickle(progress_path)
else:
    # Load the best model; map_location lets a GPU-trained checkpoint load on a CPU-only machine.
    model.load_state_dict(torch.load(model_output_path, map_location=device))
    # Recover the training history too, so cells that use results_df still run
    # when training is skipped. Fall back to an empty table if none was saved.
    if progress_path.is_file():
        results_df = pd.read_pickle(progress_path)
    else:
        results_df = pd.DataFrame(columns=RESULT_COLUMNS)

In [None]:
# Single in-order pass over the held-out test set.
test_loss, test_accuracy, test_f1, test_roc_auc = eval_model(model, test_loader, "Evaluating Test Performance", n_iter=1)
# eval_model already returns a mean loss; dividing it again by the number of
# batches (as was done before) under-reported the test loss.
print(f"Mean Testing Loss: {test_loss}")
print(f"Testing Balanced Accuracy: {test_accuracy}")
print(f"Testing Macro F1 Score: {test_f1}")
print(f"TestingMacro OVR AUC-ROC: {test_roc_auc}")

In [None]:
# One figure per metric, comparing the validation and training curves across epochs
progress_by_epoch = results_df.set_index("epoch", drop=True)
for metric in ("loss", "accuracy", "f1", "auc"):
    fig, ax = plt.subplots()
    metric_curves = progress_by_epoch[[f"val_{metric}", f"train_{metric}"]]
    sns.lineplot(data=metric_curves, ax=ax)
    ax.set_title(f"{metric.capitalize()} vs Epoch")
    plt.show()
    plt.close(fig)