In [5]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
import os
import pandas as pd

# Set TOKENIZERS_PARALLELISM to "false" for this process before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Read the three pre-split review CSVs into pandas frames (train / val / test).
train_set, val_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/letterboxd_250movie_reviews_train.csv',
        './data/letterboxd_250movie_reviews_val.csv',
        './data/letterboxd_250movie_reviews_test.csv',
    )
)

# Wrap each frame as a Hugging Face Dataset, keyed by split name.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (
        ('train', train_set),
        ('validation', val_set),
        ('test', test_set),
    )
})

# Pretrained checkpoint identifier; the tokenizer is loaded from it here.
model_path = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Class index -> star-rating symbol, in ascending order from half a star (0)
# to five stars (9). The position in the tuple is the class id.
id2label = dict(enumerate((
    "½",
    "★",
    "★½",
    "★★",
    "★★½",
    "★★★",
    "★★★½",
    "★★★★",
    "★★★★½",
    "★★★★★",
)))

# Inverse lookup (symbol -> class index), derived from id2label so the two
# maps always agree.
label2id = {symbol: class_id for class_id, symbol in id2label.items()}
# Load the pretrained checkpoint with a sequence-classification head that has
# one output per rating class; the label maps are passed through to the
# from_pretrained call.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    Args:
        examples: mapping that provides the review text(s) under the key
            ``"text"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    review_texts = examples["text"]
    encoded = tokenizer(review_texts, truncation=True)
    return encoded

# Tokenize every split of the DatasetDict; batched=True passes preprocess_function
# a batch of rows per call instead of one row at a time.
tokenized_data = dataset.map(preprocess_function, batched=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2234 [00:00<?, ? examples/s]

Map:   0%|          | 0/279 [00:00<?, ? examples/s]

Map:   0%|          | 0/280 [00:00<?, ? examples/s]

In [7]:
# Batch collator built from the tokenizer; it is handed to the Trainer below.
# NOTE(review): presumably pads each batch to its longest sequence at collation
# time (the tokenizer call above does not pad) — confirm against the
# DataCollatorWithPadding docs.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
# hyperparameters (learning rate, per-device batch size, number of epochs)
lr, batch_size, num_epochs = 2e-4, 16, 10

# Logging, evaluation and checkpointing all use the "epoch" strategy, and
# load_best_model_at_end is enabled. The same batch size is used for training
# and evaluation.
training_args = TrainingArguments(
    output_dir="bert-letterbox-reviews-classifier_teacher",
    num_train_epochs=num_epochs,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)


In [10]:
# Sanity check: display the tokenized validation split (its features and row count).
tokenized_data["validation"]

Dataset({
    features: ['original_id', 'text', 'label', 'movie', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 279
})

In [11]:
# Freeze the base model: a parameter of `model.base_model` stays trainable only
# if its name contains "pooler"; every other base-model parameter has gradients
# disabled. Parameters outside `model.base_model` are not touched by this loop.
for param_name, param in model.base_model.named_parameters():
    param.requires_grad = "pooler" in param_name

In [12]:
# load metrics
# `accuracy` and `auc_score` are module-level metric objects that
# compute_metrics calls via their .compute(...) method. The second argument to
# evaluate.load selects the "multiclass" configuration of the roc_auc metric.
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc", "multiclass")

def compute_metrics(eval_pred):
    """Compute accuracy and one-vs-rest ROC AUC from raw model outputs.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores (logits) with one row per example; ``labels``
            holds the reference class ids.

    Returns:
        dict with keys ``"Accuracy"`` and ``"AUC"``, each rounded to three
        decimals.
    """
    # get predictions
    predictions, labels = eval_pred
    logits = np.asarray(predictions)

    # Softmax over the class axis. The row-wise maximum is subtracted first:
    # the result is mathematically unchanged, but np.exp can no longer
    # overflow to inf (which would turn the probabilities into NaN).
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    probabilities = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)

    # compute multiclass auc using all class probabilities
    auc_result = auc_score.compute(
        prediction_scores=probabilities,
        references=labels,
        multi_class='ovr',
    )
    auc = np.round(auc_result['roc_auc'], 3)

    # predict most probable class (argmax of the logits equals argmax of the
    # probabilities, so the raw scores are used directly)
    predicted_classes = np.argmax(logits, axis=1)

    # compute accuracy
    acc_result = accuracy.compute(predictions=predicted_classes, references=labels)
    acc = np.round(acc_result['accuracy'], 3)

    return {"Accuracy": acc, "AUC": auc}

In [13]:
# Wire the model, training arguments, tokenized splits, collator and metric
# callback into a Trainer. Evaluation during training uses the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    # `tokenizer=` is deprecated in Trainer.__init__ (it emits a FutureWarning
    # and is slated for removal in transformers 5.0); `processing_class=` is
    # its replacement. NOTE(review): needs transformers >= 4.46 — confirm the
    # installed version.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned training summary is kept in `training_output`.
training_output = trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Auc
1,2.2681,2.219996,0.143,0.683
2,2.1389,2.106627,0.211,0.702
3,2.0482,2.075116,0.201,0.715
4,1.9958,2.062008,0.211,0.72
5,1.9555,2.0435,0.194,0.725
6,1.9279,2.019959,0.201,0.726
7,1.8997,2.027881,0.208,0.728
8,1.882,2.001136,0.208,0.729
9,1.8612,2.00744,0.201,0.729
10,1.851,2.010026,0.215,0.729




In [None]:
# Apply the model to the held-out *test* split (not the validation split used
# during training).
predictions = trainer.predict(tokenized_data["test"])

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Score the test predictions with the same compute_metrics function that was
# passed to the Trainer.
metrics = compute_metrics((logits, labels))

# Hand-kept log of metrics per experiment.
# NOTE(review): the first baseline entry is still unfilled ('???').
# 200 per rating baseline: ???
# 200 per rating, {'Accuracy': np.float64(0.265), 'AUC': np.float64(0.738)} lr = 2e-4 batch_size = 16 num_epochs = 10
# 300 per rating baseline:{'Accuracy': np.float64(0.114), 'AUC': np.float64(0.526)}
# 300 per rating, {'Accuracy': np.float64(0.193), 'AUC': np.float64(0.716)}

print(metrics)




{'Accuracy': np.float64(0.193), 'AUC': np.float64(0.716)}


In [15]:
import torch
def predict_single(review: str):
    """Return the rating label the classifier predicts for one review.

    Runs a single forward pass of the module-level ``model`` directly instead
    of building a one-row ``Dataset`` and going through ``trainer.predict``.
    The earlier ``torch.no_grad()`` wrapped ``trainer.predict`` rather than the
    forward pass itself, and the bare ``argmax()`` flattened across the batch.

    Args:
        review: raw review text.

    Returns:
        The ``model.config.id2label`` entry for the highest-scoring class.
    """
    # Tokenize the review as a batch of one and move the tensors to the device
    # the model lives on.
    inputs = tokenizer(review, return_tensors="pt", truncation=True).to(model.device)

    # Switch to evaluation mode and skip gradient tracking for inference.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # argmax over the class axis; the batch holds exactly one row.
    predicted_class_id = logits.argmax(dim=-1).item()
    return model.config.id2label[predicted_class_id]

def predict_and_print(review: str):
    """Print ``review`` followed by the label ``predict_single`` returns for it.

    The printed line has the form ``<review> -> <label>``.
    """
    stars = predict_single(review)
    print(f"{review} -> {stars}")
# Spot-check the classifier on a handful of hand-picked reviews, in this order.
for sample_review in (
    "a whole movie about coming down from apex mountain. Safdie's worthy, long-overdue follow-up to Lenny Cooke",
    "the rock is evolving. he is now the boulder",
    "That guy with the long hair was so unnecessarily rude.",
    "utterly sauceless",
    "decent as a (very) muted romance, much louder and more effective as an argument for physical archival media.",
    "That was incredibly upsetting but I knew it would be",
    "you’re not depressed, you just love new england",
):
    predict_and_print(sample_review)



a whole movie about coming down from apex mountain. Safdie's worthy, long-overdue follow-up to Lenny Cooke -> ★★★★★


the rock is evolving. he is now the boulder -> ★★★★★


That guy with the long hair was so unnecessarily rude. -> ½


utterly sauceless -> ½


decent as a (very) muted romance, much louder and more effective as an argument for physical archival media. -> ★★★


That was incredibly upsetting but I knew it would be -> ★★★★★


you’re not depressed, you just love new england -> ★★★★★
