In [8]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, set_seed
import evaluate
import numpy as np
import os
import pandas as pd
import random
import torch
from sklearn.metrics import cohen_kappa_score, mean_absolute_error

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
set_seed(42)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

train_set = pd.read_csv('./data/letterboxd_250movie_reviews_train.csv')
val_set  = pd.read_csv('./data/letterboxd_250movie_reviews_val.csv')
test_set  = pd.read_csv('./data/letterboxd_250movie_reviews_test.csv')

for dset in [train_set, val_set, test_set]:
    dset.rename(columns={'review': 'text'}, inplace=True)
    dset['label'] = dset['rating'].apply(lambda r: int(round((r - 0.5) / 0.5)))
    dset.drop(columns=['rating'], inplace=True)

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_set),
    'validation': Dataset.from_pandas(val_set),
    'test': Dataset.from_pandas(test_set)
})

model_path = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_path)

def rating_to_bin(r: float) -> int:
    # map 0.5→0, 1.0→1, ..., 5.0→9
    return int(round((r - 0.5) / 0.5))

def bin_to_rating(b: int) -> float:
    return 0.5 + 0.5 * b

id2label = {i: bin_to_rating(i) for i in range(10)}
label2id = {bin_to_rating(i): i for i in range(10)}
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=10,
    id2label=id2label,
    label2id=label2id,
)

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_data = dataset.map(preprocess_function, batched=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
# hyperparameters
lr = 2e-4
batch_size = 8
num_epochs = 10

training_args = TrainingArguments(
    output_dir="bert-letterbox-reviews-classifier_teacher",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)


In [11]:
tokenized_data["validation"]

Dataset({
    features: ['original_id', 'text', 'movie', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})

In [12]:
for name, param in model.base_model.named_parameters():
    param.requires_grad = False
    
for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True

# for name, param in list(model.named_parameters())[20:]:
#     if "layer.11." in name:
#         param.requires_grad = True

In [13]:
# load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc", "multiclass")

def compute_metrics(eval_pred):
    # get probabilities and true_ratings
    probabilities, true_ratings = eval_pred
    # predict most probable class
    predicted_classes = np.argmax(probabilities, axis=1)
    # debug distribution of predicted classes:
    print("predicted_classes distribution:", np.bincount(predicted_classes))
    true_classes = [rating_to_bin(r) for r in true_ratings]
    print(f"predicted_classes.head: {predicted_classes[:15]}")
    print(f"true_classes.head: {true_classes[:15]}")
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, references=true_classes)['accuracy'],3)
    
    # Map to half-star ratings for MAE
    pred_ratings = np.array([bin_to_rating(int(c)) for c in predicted_classes])

    qwk = cohen_kappa_score(y1=true_classes, y2=predicted_classes, weights="quadratic")
    print(f'true_ratings.head: {true_ratings[:15]}')
    print(f'pred_ratings.head: {pred_ratings[:15]}')
    mae = mean_absolute_error(true_ratings, pred_ratings)

    return {"Accuracy": acc, "QWK": qwk, "MAE": mae}
    
    return {"Accuracy": acc, "AUC": auc}

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

training_output = trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Qwk,Mae
1,2.2942,2.187266,0.03,0.263749,2.3125
2,2.1283,2.103565,0.15,0.359657,2.3175
3,2.0374,2.073277,0.085,0.38406,2.3
4,1.9602,2.044806,0.075,0.444357,2.085
5,1.9052,2.072948,0.11,0.488711,1.99
6,1.8863,2.053073,0.1,0.46955,2.04
7,1.8428,2.036499,0.07,0.461849,2.165
8,1.8268,2.021681,0.1,0.472664,2.0825
9,1.8088,2.015529,0.09,0.475336,2.0675
10,1.7918,2.014541,0.085,0.471845,2.075


predicted_classes distribution: [ 22   0   3   0 100   0   0  47   0  28]
predicted_classes.head: [9 4 7 7 4 0 9 7 4 9 7 4 7 7 9]
true_classes.head: [15, 3, 7, 15, 17, 7, 13, 1, 9, 15, 17, 3, 11, 9, 13]
true_ratings.head: [8 2 4 8 9 4 7 1 5 8 9 2 6 5 7]
pred_ratings.head: [5.  2.5 4.  4.  2.5 0.5 5.  4.  2.5 5.  4.  2.5 4.  4.  5. ]




predicted_classes distribution: [15 92  1  0  0 25  8  0 11 48]
predicted_classes.head: [9 9 9 9 1 0 9 1 1 9 6 1 8 1 9]
true_classes.head: [15, 3, 7, 15, 17, 7, 13, 1, 9, 15, 17, 3, 11, 9, 13]
true_ratings.head: [8 2 4 8 9 4 7 1 5 8 9 2 6 5 7]
pred_ratings.head: [5.  5.  5.  5.  1.  0.5 5.  1.  1.  5.  3.5 1.  4.5 1.  5. ]




predicted_classes distribution: [58 18 22 13  1 26  7 19  0 36]
predicted_classes.head: [9 4 7 7 0 0 9 3 2 9 6 0 0 5 9]
true_classes.head: [15, 3, 7, 15, 17, 7, 13, 1, 9, 15, 17, 3, 11, 9, 13]
true_ratings.head: [8 2 4 8 9 4 7 1 5 8 9 2 6 5 7]
pred_ratings.head: [5.  2.5 4.  4.  0.5 0.5 5.  2.  1.5 5.  3.5 0.5 0.5 3.  5. ]




predicted_classes distribution: [ 4 25 33 19 35  8 28  7 14 27]
predicted_classes.head: [9 4 7 7 2 2 8 2 2 9 6 3 8 4 9]
true_classes.head: [15, 3, 7, 15, 17, 7, 13, 1, 9, 15, 17, 3, 11, 9, 13]
true_ratings.head: [8 2 4 8 9 4 7 1 5 8 9 2 6 5 7]
pred_ratings.head: [5.  2.5 4.  4.  1.5 1.5 4.5 1.5 1.5 5.  3.5 2.  4.5 2.5 5. ]




predicted_classes distribution: [27 22 15  3 11 31  1 22 38 30]
predicted_classes.head: [9 4 7 7 1 8 8 8 5 9 8 0 8 8 9]
true_classes.head: [15, 3, 7, 15, 17, 7, 13, 1, 9, 15, 17, 3, 11, 9, 13]
true_ratings.head: [8 2 4 8 9 4 7 1 5 8 9 2 6 5 7]
pred_ratings.head: [5.  2.5 4.  4.  1.  4.5 4.5 4.5 3.  5.  4.5 0.5 4.5 4.5 5. ]




predicted_classes distribution: [18  8 38 13 25 21  9 32  2 34]
predicted_classes.head: [9 4 7 7 2 9 9 3 2 9 6 2 2 4 9]
true_classes.head: [15, 3, 7, 15, 17, 7, 13, 1, 9, 15, 17, 3, 11, 9, 13]
true_ratings.head: [8 2 4 8 9 4 7 1 5 8 9 2 6 5 7]
pred_ratings.head: [5.  2.5 4.  4.  1.5 5.  5.  2.  1.5 5.  3.5 1.5 1.5 2.5 5. ]




predicted_classes distribution: [44  7  8 40 18 13  2 24 24 20]
predicted_classes.head: [1 4 7 7 2 0 8 3 2 9 8 0 8 4 9]
true_classes.head: [15, 3, 7, 15, 17, 7, 13, 1, 9, 15, 17, 3, 11, 9, 13]
true_ratings.head: [8 2 4 8 9 4 7 1 5 8 9 2 6 5 7]
pred_ratings.head: [1.  2.5 4.  4.  1.5 0.5 4.5 2.  1.5 5.  4.5 0.5 4.5 2.5 5. ]




predicted_classes distribution: [32 12  5 31 22 26 22 17 12 21]
predicted_classes.head: [1 4 7 7 5 0 8 3 4 9 6 0 8 4 9]
true_classes.head: [15, 3, 7, 15, 17, 7, 13, 1, 9, 15, 17, 3, 11, 9, 13]
true_ratings.head: [8 2 4 8 9 4 7 1 5 8 9 2 6 5 7]
pred_ratings.head: [1.  2.5 4.  4.  3.  0.5 4.5 2.  2.5 5.  3.5 0.5 4.5 2.5 5. ]




predicted_classes distribution: [29 15 11 20 26 28 15 18 16 22]
predicted_classes.head: [1 4 7 7 5 8 8 3 4 9 6 0 8 4 9]
true_classes.head: [15, 3, 7, 15, 17, 7, 13, 1, 9, 15, 17, 3, 11, 9, 13]
true_ratings.head: [8 2 4 8 9 4 7 1 5 8 9 2 6 5 7]
pred_ratings.head: [1.  2.5 4.  4.  3.  4.5 4.5 2.  2.5 5.  3.5 0.5 4.5 2.5 5. ]




predicted_classes distribution: [29 14 12 22 24 28 18 14 17 22]
predicted_classes.head: [1 4 7 7 5 8 8 3 2 9 6 0 8 4 9]
true_classes.head: [15, 3, 7, 15, 17, 7, 13, 1, 9, 15, 17, 3, 11, 9, 13]
true_ratings.head: [8 2 4 8 9 4 7 1 5 8 9 2 6 5 7]
pred_ratings.head: [1.  2.5 4.  4.  3.  4.5 4.5 2.  1.5 5.  3.5 0.5 4.5 2.5 5. ]


In [15]:
# apply model to validation dataset
predictions = trainer.predict(tokenized_data["test"])

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))

# pool layer unfrozen
# 200 per rating baseline: {'Accuracy': np.float64(0.1), 'AUC': np.float64(0.536)}
# 200 per rating, {'Accuracy': np.float64(0.265), 'AUC': np.float64(0.738)} lr = 2e-4 batch_size = 16 num_epochs = 10
# 300 per rating baseline: {'Accuracy': np.float64(0.114), 'AUC': np.float64(0.526)}
# 300 per rating, {'Accuracy': np.float64(0.193), 'AUC': np.float64(0.716)} lr = 2e-4 batch_size = 16 num_epochs = 10
# 300 per rating, {'Accuracy': np.float64(0.200), 'AUC': np.float64(0.716)} lr = 2e-4 batch_size = 8 num_epochs = 10
# 300 per rating, {'Accuracy': np.float64(0.154), 'AUC': np.float64(0.663)} lr = 2e-5 batch_size = 8 num_epochs = 10
# 300 per rating, {'Accuracy': np.float64(0.214), 'AUC': np.float64(0.716)} lr = 2e-4 batch_size = 8 num_epochs = 20
# 200 per rating, {'Accuracy': np.float64(0.265), 'AUC': np.float64(0.745)} lr = 2e-4 batch_size = 8 num_epochs = 20
# 200 per rating, {'Accuracy': np.float64(0.275), 'AUC': np.float64(0.745)} lr = 2e-4 batch_size = 16 num_epochs = 20
# 200 per rating, {'Accuracy': np.float64(0.180), 'AUC': np.float64(0.677)} lr = 2e-5 batch_size = 16 num_epochs = 20
print(metrics)

# also layer 11 unfrozen
# 200 per rating, {'Accuracy': np.float64(0.23), 'AUC': np.float64(0.776)} lr = 2e-4 batch_size = 8 num_epochs = 10
# 200 per rating, {'Accuracy': np.float64(0.31), 'AUC': np.float64(0.785)} lr = 2e-4 batch_size = 16 num_epochs = 10




predicted_classes distribution: [30 29  5 14 15 21 24 12  9 41]
predicted_classes.head: [0 2 1 1 9 4 0 3 5 6 9 1 9 0 0]
true_classes.head: [7, 3, 1, -1, 13, 7, 17, 11, 13, 9, 3, 5, -1, 7, -1]
true_ratings.head: [4 2 1 0 7 4 9 6 7 5 2 3 0 4 0]
pred_ratings.head: [0.5 1.5 1.  1.  5.  2.5 0.5 2.  3.  3.5 5.  1.  5.  0.5 0.5]
predicted_classes distribution: [30 29  5 14 15 21 24 12  9 41]
predicted_classes.head: [0 2 1 1 9 4 0 3 5 6 9 1 9 0 0]
true_classes.head: [7, 3, 1, -1, 13, 7, 17, 11, 13, 9, 3, 5, -1, 7, -1]
true_ratings.head: [4 2 1 0 7 4 9 6 7 5 2 3 0 4 0]
pred_ratings.head: [0.5 1.5 1.  1.  5.  2.5 0.5 2.  3.  3.5 5.  1.  5.  0.5 0.5]
{'Accuracy': np.float64(0.085), 'QWK': 0.4547498964340433, 'MAE': 2.2725}


In [16]:
import torch
def predict_single(review: str):
    inputs = tokenizer(review, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = trainer.predict(Dataset.from_dict(inputs))
        predicted_class_id = outputs.predictions.argmax().item()

        return model.config.id2label[predicted_class_id]

def predict_and_print(review: str):
    prediction = predict_single(review)
    print(f"{review} -> {prediction}")
predict_and_print("a whole movie about coming down from apex mountain. Safdie's worthy, long-overdue follow-up to Lenny Cooke")
predict_and_print("the rock is evolving. he is now the boulder")
predict_and_print("That guy with the long hair was so unnecessarily rude.")
predict_and_print("utterly sauceless")
predict_and_print("decent as a (very) muted romance, much louder and more effective as an argument for physical archival media.")
predict_and_print("That was incredibly upsetting but I knew it would be")
predict_and_print("you’re not depressed, you just love new england")



a whole movie about coming down from apex mountain. Safdie's worthy, long-overdue follow-up to Lenny Cooke -> 5.0


the rock is evolving. he is now the boulder -> 5.0


That guy with the long hair was so unnecessarily rude. -> 0.5


utterly sauceless -> 0.5


decent as a (very) muted romance, much louder and more effective as an argument for physical archival media. -> 1.5


That was incredibly upsetting but I knew it would be -> 5.0


you’re not depressed, you just love new england -> 4.0
