In [1]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)
from transformers.utils import logging
import evaluate
import numpy as np
import pandas as pd
import torch
import os
import random
from sklearn.metrics import mean_squared_error

# Set seeds for reproducibility
os.environ["TOKENIZERS_PARALLELISM"] = "false"
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
set_seed(42)

dataset = pd.read_csv("./letterboxd_250movie_reviews_normalized_sampled.csv")
model_path = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_path)

id2label = {
    0: 0.5,
    1: 1.0,
    2: 1.5,
    3: 2.0,
    4: 2.5,
    5: 3.0,
    6: 3.5,
    7: 4.0,
    8: 4.5,
    9: 5.0,
}
label2id = {
    0.5: 1,
    1.0: 2,
    1.5: 3,
    2.0: 4,
    2.5: 5,
    3.0: 6,
    3.5: 7,
    4.0: 8,
    4.5: 9,
    5.0: 10,
}
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=10,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [3]:
review = "This movie was fantastic! Great acting and amazing plot."
inputs = tokenizer(
    review, return_tensors="pt", padding=True, truncation=True, max_length=1024
)
print("Tokenized input:")
print(f"Input IDs shape: {inputs['input_ids'].shape}")
print(f"Attention mask shape: {inputs['attention_mask'].shape}")

# Make prediction (no gradient calculation needed for inference)
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

print(f"\nRaw logits: {outputs.logits}")
print(f"Probabilities: {predictions}")
print(f"Predicted class: {torch.argmax(predictions, dim=-1).item()}")


Tokenized input:
Input IDs shape: torch.Size([1, 13])
Attention mask shape: torch.Size([1, 13])

Raw logits: tensor([[ 0.2907,  0.4949, -0.2167,  0.8362, -0.0250, -0.8328,  0.1779,  0.3750,
         -0.5198, -0.3578]])
Probabilities: tensor([[0.1169, 0.1433, 0.0704, 0.2016, 0.0852, 0.0380, 0.1044, 0.1271, 0.0520,
         0.0611]])
Predicted class: 3


In [4]:
test_set = pd.read_csv("./data/letterboxd_250movie_reviews_test.csv")

inputs = tokenizer(
    test_set["text"].tolist(),
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)


In [5]:
# predicted_labels = torch.argmax(predictions, dim=1)
true_labels = torch.from_numpy(
    test_set["rating"].to_numpy()
)

In [6]:
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc", "multiclass")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    labels_ids = [label2id[label] for label in labels]
    
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, keepdims=True)
    # compute multiclass auc using all class probabilities
    auc = np.round(auc_score.compute(prediction_scores=probabilities, references=labels_ids, multi_class="ovr")['roc_auc'],3)
    
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

compute_metrics((outputs.logits.numpy(), true_labels.numpy()))
# with max 200 samples per rating: {'Accuracy': np.float64(0.1), 'AUC': np.float64(0.536)}

{'Accuracy': np.float64(0.19), 'AUC': np.float64(0.536)}