In [1]:
# Only do this install once, for experimenting with hugging face's transformers
# CPU usage, just to ensure that model architecture will be working.
# Will later likely need to use GPUs using a virtual environment,
# IF: our dataset is too big, possibly not the case.
#pip install transformers[torch]

# Same here, run once
#!pip install datasets

In [9]:
# imports need necessary installs found above

from naiveModel import NBLangIDModel
from typing import Any, Dict, List, Optional, Tuple
import NaiveBayesUtil
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
from util import get_dataloader
# don't need cuda until using virtual machine
from BERTModel import BERTGenreClassification, train_model
from torch import manual_seed, cuda
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
naiveBayes = NBLangIDModel()

In [11]:
# load data, train test split
descriptions = pd.read_csv("cleanedData.csv")

print("Shape before dropping NaN values:", descriptions.shape)
descriptions = descriptions.dropna()
print("Shape after dropping NaN values:", descriptions.shape)

#TESTING THIS
# print("Rows to be dropped:")
# print(descriptions[descriptions['description'].isna() | ~descriptions['description'].apply(lambda x: isinstance(x, str))])
# #drop the 511 rows where description column is NaN or not a string
# descriptions = descriptions.dropna(subset=['description'])
# descriptions = descriptions[descriptions['description'].apply(lambda x: isinstance(x, str))]

train, test = train_test_split(descriptions, test_size=0.2)

Shape before dropping NaN values: (42661, 5)
Shape after dropping NaN values: (42660, 5)


In [12]:
train = train.drop("Unnamed: 0", axis= 1)
test = test.drop("Unnamed: 0", axis= 1)


In [13]:
train_X = train['description']
train_y1 = train['genre1']
train_y2 = train['genre2']
train_y3 = train['genre3']

In [14]:
test_X = test['description']
test_y1 = test['genre1']
test_y2 = test['genre2']
test_y3 = test['genre3']

In [15]:
# fit the NB model 
naiveBayes.fit(train_X.tolist(), train_y1.tolist())

In [16]:
#predict using the NB model 
predictDict = naiveBayes.predict_one_log_proba(test_X.to_string())

In [17]:
def argmaxThree(score_dict: Dict[Any, float]) -> Any:
    """
    Returns the key with the highest value in the dictionary

    Args:
        score_dict (Dict[Any, float]): the dictionary

    Returns:
        Any: the key with the highest value
    """
    sorted_scores = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
    topThreeKeys = [key for key, _ in sorted_scores[:3]]
    return topThreeKeys

In [178]:
# position-dependent accuracy w/ breakdown by position (which genre1/2/3 placement was predicted most frequently accurately)
def AccPosDepend(argmaxPreds, Y1, Y2, Y3):
    totalRows = len(argmaxPreds)
    correctPreds1 = 0
    correctPreds2 = 0
    correctPreds3 = 0

    for pred, y1, y2, y3 in zip(argmaxPreds, Y1, Y2, Y3):
        pred_y1, pred_y2, pred_y3 = pred
        if pred_y1 == y1:
            correctPreds1 += 1
        if pred_y2 == y2:
            correctPreds2 += 1
        if pred_y3 == y3:
            correctPreds3 += 1

    correctPredsTotal = (correctPreds1 + correctPreds2 + correctPreds3 ) /3
    genre1Acc = correctPreds1 / totalRows
    genre2Acc = correctPreds2 / totalRows
    genre3Acc = correctPreds3 / totalRows
    accuracy = correctPredsTotal / totalRows
    return accuracy, genre1Acc, genre2Acc, genre3Acc


In [179]:
#position non-dependent accuracy, w/ breakdown by order in which NB model predicted label breakdown (so like which one the model thought was the greatest chance of being the true label)
def AccPosNonDepend(argmaxPreds, Y1, Y2, Y3):
    totalRows = len(argmaxPreds)
    correctPreds1 = 0
    correctPreds2 = 0
    correctPreds3 = 0

    for pred, y1, y2, y3 in zip(argmaxPreds, Y1, Y2, Y3):
        pred_y1, pred_y2, pred_y3 = pred
        if pred_y1 == y1 or pred_y1 == y2 or pred_y1 == y3:
            correctPreds1 += 1
        if pred_y2 == y1 or pred_y2 == y2 or pred_y2 == y3:
            correctPreds2 += 1
        if pred_y3 == y1 or pred_y3 == y2 or pred_y3 == y3:
            correctPreds3 += 1

    correctPredsTotal = (correctPreds1 + correctPreds2 + correctPreds3) /3
    genre1Acc = correctPreds1 / totalRows
    genre2Acc = correctPreds2 / totalRows
    genre3Acc = correctPreds3 / totalRows
    accuracy = correctPredsTotal / totalRows
    return accuracy, genre1Acc, genre2Acc, genre3Acc

In [180]:
#accuracy for how many you want to see correct, at least 1, at least 2, at least 3... 
def AccLevels(argmaxPreds, Y1, Y2, Y3, howManyCorrect):
    totalRows = len(argmaxPreds)
    correctPreds = 0  # Initialize counter for correct predictions

    for pred, y1, y2, y3 in zip(argmaxPreds, Y1, Y2, Y3):
        pred_y1, pred_y2, pred_y3 = pred
        # Count how many predictions match exactly one of the labels
        correct_labels = 0
        if pred_y1 in [y1, y2, y3]:
            correct_labels += 1
        if pred_y2 in [y1, y2, y3]:
            correct_labels += 1
        if pred_y3 in [y1, y2, y3]:
            correct_labels += 1
        # Increment correctPreds if exactly one label is matched correctly for a prediction
        if correct_labels >= howManyCorrect:
            correctPreds += 1

    accuracy = correctPreds / totalRows
    genreAccs = [accuracy] * 3  # Since all genres share the same accuracy

    return accuracy

In [None]:
predictions = test_X.apply(lambda sentence: naiveBayes.predict_one_log_proba(sentence))
argmaxPreds = [argmaxThree(dictionary) for dictionary in predictions]

In [174]:
accuracyPos, genre1Acc, genre2Acc, genre3Acc = AccPosDepend(argmaxPreds, test_y1, test_y2, test_y3)
accuracyNonPos, pred1Acc, pred2Acc, pred3Acc = AccPosNonDepend(argmaxPreds, test_y1, test_y2, test_y3)
accuracyLevels = AccLevels(argmaxPreds, test_y1, test_y2, test_y3)

0.3156743241131427

# Implementing Custom BERT MultiClassificationModel

In [None]:
print(device)

cpu


In [None]:
# taking away leading whitespace
train['genre1'] = train['genre1'].str.strip()
train['genre2'] = train['genre2'].str.strip()
train['genre3'] = train['genre3'].str.strip()

In [None]:
from transformers import AutoTokenizer

label_vocab = naiveBayes.labels

label_as_id = {l:k  for k, l in enumerate(label_vocab)}
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
bert_train, bert_val = train_test_split(train, test_size= 0.2)
#bert_train.drop("Unnamed: 0", axis= 1)
#bert_val.drop("Unnamed: 0", axis= 1)


In [None]:
from datasets import Dataset

raw_bert_train = Dataset.from_pandas(bert_train)
raw_bert_val = Dataset.from_pandas(bert_val)

ds = {'train': raw_bert_train, 'validation': raw_bert_val}

In [None]:
#latest test
from util import BERT_preprocess
from torch.utils.data import DataLoader

label_vocab = naiveBayes.labels
id2label = {k:l  for k, l in enumerate(label_vocab)}
label2id = {l:k  for k, l in enumerate(label_vocab)}

for split in ds:
    ds[split] = ds[split].map(lambda x: BERT_preprocess(x, id2label, tokenizer), remove_columns= ['description', 'genre1', 'genre2', 'genre3'])
    #ds[split] = DataLoader(ds[split], batch_size=batch_size)

Map: 100%|██████████| 23985/23985 [00:21<00:00, 1135.63 examples/s]
Map: 100%|██████████| 5997/5997 [00:05<00:00, 1174.36 examples/s]


In [None]:
# test
import numpy as np
BERT_preprocess({
    'description': "Blood sings to blood, Froi . . . Those born last will make the first . . . For Charyn will be barren no more. \n Three years after the curse on Lumatere was lifted, Froi has found his home... Or so he believes...Fiercely loyal to the Queen and Finnikin, Froi has been trained roughly and lovingly by the Guard sworn to protect the royal family, and has learned to control his quick temper. But when he is sent on a secretive mission to the kingdom of Charyn, nothing could have prepared him for what he finds. Here he encounters a damaged people who are not who they seem, and must unravel both the dark bonds of kinship and the mysteries of a half-mad Princess.And in this barren and mysterious place, he will discover that there is a song sleeping in his blood, and though Froi would rather not, the time has come to listen.Gripping and intense, complex and richly imagined, Froi of the Exiles is a dazzling sequel to Finnikin of the Rock, from the internationally best-selling and multi-award-winning author of Looking for Alibrandi, Saving Francesca, On the Jellicoe Road and The Piper's Son.",
    'genre1': 'Fantasy',
    'genre2': 'Young Adult',
    'genre3': 'Romance'
}, id2label, tokenizer)

{'input_ids': [101, 2668, 10955, 2000, 2668, 1010, 10424, 10448, 1012, 1012, 1012, 2216, 2141, 2197, 2097, 2191, 1996, 2034, 1012, 1012, 1012, 2005, 25869, 6038, 2097, 2022, 20225, 2053, 2062, 1012, 2093, 2086, 2044, 1996, 8364, 2006, 11320, 8585, 2890, 2001, 4196, 1010, 10424, 10448, 2038, 2179, 2010, 2188, 1012, 1012, 1012, 2030, 2061, 2002, 7164, 1012, 1012, 1012, 16265, 8884, 2000, 1996, 3035, 1998, 9303, 17471, 2078, 1010, 10424, 10448, 2038, 2042, 4738, 5560, 1998, 8295, 2135, 2011, 1996, 3457, 10741, 2000, 4047, 1996, 2548, 2155, 1010, 1998, 2038, 4342, 2000, 2491, 2010, 4248, 12178, 1012, 2021, 2043, 2002, 2003, 2741, 2006, 1037, 28607, 3260, 2000, 1996, 2983, 1997, 25869, 6038, 1010, 2498, 2071, 2031, 4810, 2032, 2005, 2054, 2002, 4858, 1012, 2182, 2002, 11340, 1037, 5591, 2111, 2040, 2024, 2025, 2040, 2027, 4025, 1010, 1998, 2442, 4895, 22401, 2140, 2119, 1996, 2601, 9547, 1997, 27866, 1998, 1996, 15572, 1997, 1037, 2431, 1011, 5506, 4615, 1012, 1998, 1999, 2023, 20225, 1998,

In [None]:
import torch
from sklearn.metrics import f1_score, accuracy_score, classification_report

def get_preds_from_logits(logits):
    ret = torch.zeros(logits.shape)
    _, idx = logits.topk(3, dim=0, largest= True)
    ret[idx] = 1.0
    return ret

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    final_metrics = {}

    predictions = get_preds_from_logits(logits)

    final_metrics['f1_micro'] = f1_score(labels, predictions, average= 'micro')
    final_metrics['f1_macro'] = f1_score(labels, predictions, average= 'macro')

    #Classification report
    print('Classification report:')
    print(classification_report(labels, predictions, zero_division = 0))

    return final_metrics

In [None]:
LEARNING_RATE = 1e-4
MAX_LENGTH = 256
BATCH_SIZE = 16
EPOCHS = 1

In [None]:
from transformers import  AutoModelForSequenceClassification, Trainer, TrainerCallback, TrainingArguments

class MultiTaskClassificationTrainer(Trainer):
    def __init__(self, group_weights = None, **kwargs):
        super().__init__(**kwargs)
        self.group_weights = group_weights

    def compute_loss(self, model, inputs, return_outputs = False):
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs[0]

        loss = torch.nn.functional.cross_entropy(logits, labels)

        #loss = self.group_weights[0]*loss

        return (loss, outputs) if return_outputs else loss
    
class PrinterCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, logs=None, **kwargs):
        print(f'Epoch {state.epoch}: ')





In [None]:
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', id2label=id2label, label2id=label2id).to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir= './distil-fine-tuned',
    learning_rate= LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="f1_macro",
    load_best_model_at_end=True,
    weight_decay=0.01,
)

trainer = MultiTaskClassificationTrainer(
    model = model,
    args = training_args,
    train_dataset = ds['train'],
    eval_dataset = ds['validation'],
    compute_metrics = compute_metrics,
    callbacks = [PrinterCallback]
)

THIS WILL TAKE **3** HOURS TO TRAIN

In [None]:
trainer.train()

 33%|███▎      | 500/1500 [47:18<1:29:41,  5.38s/it]

{'loss': 9.6332, 'grad_norm': 13.228955268859863, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.33}


 67%|██████▋   | 1000/1500 [1:32:27<44:54,  5.39s/it] 

{'loss': 7.9138, 'grad_norm': 14.959891319274902, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.67}


100%|██████████| 1500/1500 [2:17:37<00:00,  4.00s/it]

{'loss': 7.5415, 'grad_norm': 38.83592987060547, 'learning_rate': 0.0, 'epoch': 1.0}
Epoch 1.0: 




AttributeError: 'numpy.ndarray' object has no attribute 'topk'

# Everything past this point is implementing HW5 BERT
I kind of give up here

In [None]:
next(iter(ds['train']))['input_ids']

[tensor([101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
         101, 101]),
 tensor([ 2668, 14015,  2129,  1996,  2641,  2182,  3393,  6832,  1999,  3243,
          1999,  1000,  1996, 11810, 10881,  4098]),
 tensor([10955,  4832,  2079,  9957,  2028,  2003, 22294,  4454,  1996,  9200,
          2023,  9049,  6622,  2038, 23075, 11338]),
 tensor([2000, 2012, 2017, 1004, 1997, 1996, 2063, 2001, 3500, 2028, 7893, 1010,
         1997, 2196, 2038, 7847]),
 tensor([2668, 1037, 8081, 7015, 4701, 2034, 1040, 2019, 1997, 2851, 3117, 2601,
         6792, 9505, 2467, 9257]),
 tensor([ 1010, 16760,  1037,  3319,  1005,  2047,  1005,  2248,  2687,  2003,
          1010,  1010,  3640,  1996,  2908,  2015]),
 tensor([10424,  1012,  2293, 10359,  1055,  5449,  4300,  9575,  1010,  1996,
          2990,  1998,  7070,  1000,  2005,  3268]),
 tensor([10448,  2016,  2008,  2178,  4602,  1997,  2003,  1010, 12849,  2338,
          2003, 17950, 20062,  6825,  1996,  1037]),
 tensor(

In [None]:
# initialize model

# change later, if using virtual machine
device ='cpu' #"cuda" if cuda.is_available() else "cpu"

# seed the model before initializing weights so that your code is deterministic
manual_seed(457)

freeze_bert = False # change later
batch_size = 1 # default
epochs = 5 # default
learning_rate = 1e-2 # default

model = BERTGenreClassification(freeze_bert = freeze_bert).to(device)

In [None]:
train_model(model= model, train_dataloader= ds['train'], 
            dev_dataloader= ds['validation'], epochs= epochs, learning_rate= learning_rate)

torch.Size([256, 16])
torch.Size([256, 16])
torch.Size([303, 16])
torch.Size([16, 303])
torch.Size([16, 303])


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [16, 303]], which is output 0 of SigmoidBackward0, is at version 2; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [None]:
emb_length    = len(ds['train'][0]['input_ids'])
output_length = len(ds['train'][0]['label'])

print(f'Length of input embeddings: {emb_length}')
print(f'Length of output layer: {output_length}')

TypeError: 'DataLoader' object is not subscriptable

In [None]:
train_model()

In [None]:
'''
from util import BERT_preprocess
bert_train = BERT_preprocess(bert_train, label_as_id)
bert_val = BERT_preprocess(bert_val, label_as_id)
'''

In [None]:
'''
bert_val.head()
print(sum(bert_val.iloc[3, 6]))
'''

3


In [None]:
# BERT data loaders and training
#from util import BERT_preprocess
#bert_train = BERT_preprocess(bert_train, label_as_id)
#bert_val = BERT_preprocess(bert_val, label_as_id)

# have to change these to implement from df, not from csv
train_dataloader = get_dataloader(bert_train, batch_size=batch_size)
val_dataloader = get_dataloader(bert_val, batch_size=batch_size)


# currently, embedding dimensions are wrong
#train_model(model, train_dataloader, dev_dataloader, epochs, learning_rate)

TypeError: <lambda>() got an unexpected keyword argument 'batched'