In [1]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from transformers.utils import logging
import evaluate
import numpy as np
import pandas as pd
import torch
import os
from sklearn.metrics import mean_squared_error

os.environ["TOKENIZERS_PARALLELISM"] = "false"

dataset = pd.read_csv("./letterboxd_250movie_reviews_normalized_sampled.csv")
model_path = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_path)

id2label = {
    0: "½",
    1: "★",
    2: "★½",
    3: "★★",
    4: "★★½",
    5: "★★★",
    6: "★★★½",
    7: "★★★★",
    8: "★★★★½",
    9: "★★★★★",
}
label2id = {
    "½": 0,
    "★": 1,
    "★½": 2,
    "★★": 3,
    "★★½": 4,
    "★★★": 5,
    "★★★½": 6,
    "★★★★": 7,
    "★★★★½": 8,
    "★★★★★": 9,
}
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=10,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [3]:
review = "This movie was fantastic! Great acting and amazing plot."
inputs = tokenizer(
    review, return_tensors="pt", padding=True, truncation=True, max_length=1024
)
print("Tokenized input:")
print(f"Input IDs shape: {inputs['input_ids'].shape}")
print(f"Attention mask shape: {inputs['attention_mask'].shape}")

# Make prediction (no gradient calculation needed for inference)
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

print(f"\nRaw logits: {outputs.logits}")
print(f"Probabilities: {predictions}")
print(f"Predicted class: {torch.argmax(predictions, dim=-1).item()}")


Tokenized input:
Input IDs shape: torch.Size([1, 13])
Attention mask shape: torch.Size([1, 13])

Raw logits: tensor([[ 0.1682, -0.7521, -0.0206,  0.5124, -0.3735, -0.2041, -0.7227,  0.2623,
         -0.4108,  0.1616]])
Probabilities: tensor([[0.1255, 0.0500, 0.1039, 0.1770, 0.0730, 0.0865, 0.0515, 0.1378, 0.0703,
         0.1246]])
Predicted class: 3


In [4]:
test_set = pd.read_csv("./data/letterboxd_250movie_reviews_test.csv")

inputs = tokenizer(
    test_set["text"].tolist(),
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)


In [6]:
# predicted_labels = torch.argmax(predictions, dim=1)
true_labels = torch.from_numpy(
    test_set["label"].to_numpy()
)

In [10]:
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, keepdims=True)
    # compute multiclass auc using all class probabilities
    auc = np.round(auc_score.compute(prediction_scores=probabilities, references=labels, multi_class="ovr")['roc_auc'],3)
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

# acc_result = accuracy.compute(
#     predictions=predicted_labels, references=true_labels, normalize=True
# )
# print("Accuracy:", acc_result['accuracy'])

# mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
# print("MSE:", mse)
compute_metrics((outputs.logits.numpy(), true_labels.numpy()))

ValueError: Module inputs don't match the expected format.
Expected format: {'references': Value('int32'), 'prediction_scores': Value('float32')},
Input references: [4 2 1 0 7 4 9 6 7 5 2 3 0 4 0 3 0 6 8 8 2 3 8 2 1 5 6 4 2 1 2 4 3 0 2 2 9
 3 4 0 7 7 6 6 3 2 8 9 6 9 6 8 0 4 8 8 7 4 8 7 0 6 6 0 2 6 5 7 1 9 1 9 6 6
 7 8 9 2 8 2 6 3 8 4 2 4 8 4 6 5 4 0 7 1 3 1 8 3 8 9 0 2 7 2 0 1 4 6 7 0 6
 6 0 1 0 6 8 9 2 7 0 0 1 5 9 5 3 7 5 2 8 9 8 2 8 5 9 3 9 2 1 0 6 0 3 5 0 1
 7 1 2 3 9 1 3 2 0 4 7 6 7 0 3 2 8 9 5 7 6 8 3 5 8 3 0 0 8 2 5 4 9 4 3 6 5
 9 0 8 5 7 9 4 6 4 6 6 5 3 0 6],
Input prediction_scores: [[0.09784653 0.07659286 0.08248442 ... 0.13397627 0.071133   0.13050076]
 [0.1277937  0.05590996 0.0991604  ... 0.12975194 0.07107601 0.1238809 ]
 [0.1148041  0.06491779 0.08334477 ... 0.14174904 0.0662249  0.11772063]
 ...
 [0.14523251 0.03998476 0.10717452 ... 0.13240436 0.0682165  0.11420897]
 [0.12342191 0.06038117 0.08614881 ... 0.13221551 0.06676943 0.13375297]
 [0.10228305 0.06011888 0.11044272 ... 0.12273306 0.08708755 0.1221231 ]]