In [1]:
from datasets import DatasetDict, Dataset
from dataclasses import dataclass
import math, re, numpy as np
from typing import Dict, Any, Optional, List

import torch
import torch.nn as nn
from torch.nn import functional as F

from transformers import (
    AutoTokenizer, AutoModel,
    PreTrainedModel, PretrainedConfig,
    TrainingArguments, Trainer, DataCollatorWithPadding
)

from datasets import Dataset, load_dataset
from sklearn.metrics import cohen_kappa_score, mean_absolute_error
import pandas as pd
from transformers.modeling_outputs import SequenceClassifierOutput
import evaluate

# ---------- Config ----------
# Checkpoint name used for the tokenizer and as the default encoder backbone.
MODEL_NAME = "microsoft/deberta-base-mnli"
# Token limit applied when reviews are tokenized with truncation.
MAX_LEN = 384
# Half-star scale 0.5, 1.0, ..., 5.0 => 10 ordinal bins.
NUM_BINS = 10
# Rating represented by each bin, derived from NUM_BINS: [0.5, 1.0, ..., 5.0]
BIN_VALUES = 0.5 * np.arange(1, NUM_BINS + 1)

def rating_to_bin(r: float) -> int:
    """Convert a half-star rating into its zero-based ordinal bin index.

    The scale starts at 0.5 and advances in steps of 0.5, so
    0.5 -> 0, 1.0 -> 1, ..., 5.0 -> 9.
    """
    half_star_steps = (r - 0.5) / 0.5
    return int(round(half_star_steps))

def bin_to_rating(b: int) -> float:
    """Convert a zero-based bin index back to its half-star rating (0 -> 0.5)."""
    lowest_rating = 0.5
    step = 0.5
    return lowest_rating + step * b

def class_to_cumulative_targets(y: torch.Tensor, num_bins: int) -> torch.Tensor:
    """Encode class indices as cumulative binary targets.

    For a class ``c`` and each threshold ``k = 0 .. num_bins - 2`` the target
    is 1.0 when ``c > k`` and 0.0 otherwise.

    Args:
        y: class indices, documented by the caller as a ``(B,)`` long tensor.
        num_bins: number of ordinal classes ``K``.

    Returns:
        A float tensor of shape ``(B, K - 1)``.
    """
    batch = y.shape[0]
    # One row of threshold indices per example, on the same device as y.
    cut_points = torch.arange(num_bins - 1, device=y.device)
    cut_points = cut_points[None, :].expand(batch, -1)
    exceeds = y[:, None] > cut_points
    return exceeds.to(torch.float32)

def preprocess(ex):
    """Tokenize one review and attach its ordinal class label.

    Reads ``ex["text"]`` and ``ex["rating"]`` and returns the tokenizer
    output with an extra ``"labels"`` entry holding the rating's bin index.
    """
    encoded = tokenizer(ex["text"], truncation=True, max_length=MAX_LEN)
    # The rating is cast to float, then mapped onto a class index by rating_to_bin.
    encoded["labels"] = rating_to_bin(float(ex["rating"]))
    return encoded

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# One CSV per split; preprocess() reads the "text" and "rating" columns.
train_set = pd.read_csv('./data/letterboxd_250movie_reviews_train.csv')
val_set = pd.read_csv('./data/letterboxd_250movie_reviews_val.csv')
test_set = pd.read_csv('./data/letterboxd_250movie_reviews_test.csv')

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_set),
    'validation': Dataset.from_pandas(val_set),
    'test': Dataset.from_pandas(test_set),
})

# DatasetDict.map runs preprocess over every split and returns a DatasetDict,
# so the result keeps the DatasetDict API instead of degrading to a plain dict.
# preprocess handles a single example (it calls float() on the rating), so the
# mapping must stay un-batched.
dataset = dataset.map(preprocess)

# ---------- CORAL Model ----------
class CoralConfig(PretrainedConfig):
    """Configuration for the CORAL ordinal-regression model.

    Attributes:
        base_model_name: checkpoint name stored for the encoder backbone.
        num_bins: number of ordinal classes.
    """

    model_type = "coral"

    def __init__(
        self,
        base_model_name: str = MODEL_NAME,
        num_bins: int = NUM_BINS,
        **kwargs,
    ):
        # Any remaining keyword arguments are forwarded to PretrainedConfig.
        super().__init__(**kwargs)
        self.base_model_name = base_model_name
        self.num_bins = num_bins

class CoralForOrdinalRegression(PreTrainedModel):
    """Transformer encoder with a CORAL head for ordinal regression.

    The masked mean of the encoder's last hidden state is projected to a
    single score by a bias-free linear layer. Adding ``num_bins - 1`` learned
    thresholds to that score yields one logit per cumulative event
    "class > k".
    """

    config_class = CoralConfig

    def __init__(self, config: CoralConfig):
        super().__init__(config)
        self.encoder = AutoModel.from_pretrained(config.base_model_name)
        hidden = self.encoder.config.hidden_size

        # Shared weight vector w (d->1), CORAL: logit_k = w^T h + b_k
        self.shared_linear = nn.Linear(hidden, 1, bias=False)
        self.thresholds = nn.Parameter(torch.zeros(config.num_bins - 1))
        # Reuse the encoder's dropout rate when it exposes one, else 0.1.
        self.dropout = nn.Dropout(getattr(self.encoder.config, "hidden_dropout_prob", 0.1))

        self.post_init()

    def forward(
        self,
        input_ids=None, attention_mask=None, token_type_ids=None,
        labels: Optional[torch.LongTensor] = None
    ):
        """Compute the cumulative logits and, when labels are given, the loss.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: optional padding mask; when omitted every
                position is treated as a real token for pooling.
            token_type_ids: optional segment ids; only forwarded to the
                encoder when provided, so encoders whose ``forward`` does not
                accept this argument keep working.
            labels: optional class indices in ``0 .. num_bins - 1``.

        Returns:
            ``SequenceClassifierOutput`` whose ``logits`` have shape
            ``(B, K - 1)`` and whose ``loss`` is the mean binary
            cross-entropy over all thresholds (``None`` without labels).
        """
        encoder_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if token_type_ids is not None:
            encoder_inputs["token_type_ids"] = token_type_ids
        out = self.encoder(**encoder_inputs)

        # Masked mean pooling over the last hidden state (not the first token).
        last = out.last_hidden_state  # (B, T, H)
        if attention_mask is None:
            # No mask supplied: average over every position.
            pool_mask = last.new_ones(last.shape[:2])
        else:
            pool_mask = attention_mask
        mask = pool_mask.unsqueeze(-1).to(last.dtype)  # (B, T, 1)
        # clamp guards against division by zero for an all-padding row.
        pooled = (last * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        pooled = self.dropout(pooled)

        s = self.shared_linear(pooled).squeeze(-1)                    # (B,)
        logits = s.unsqueeze(1) + self.thresholds.unsqueeze(0)        # (B, K-1)

        loss = None
        if labels is not None:
            targets = class_to_cumulative_targets(labels, self.config.num_bins)  # (B, K-1)
            # BCEWithLogits over all thresholds
            loss = F.binary_cross_entropy_with_logits(logits, targets, reduction="mean")

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None
        )

    @torch.no_grad()
    def predict_classes(self, input_ids, attention_mask, token_type_ids=None, threshold: float = 0.5):
        """Return the predicted class index for each example.

        The class is the number of thresholds whose sigmoid probability
        exceeds ``threshold``. This method does not switch the module to
        eval mode, so call ``model.eval()`` first if dropout must be off.
        """
        out = self.forward(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        probs = torch.sigmoid(out["logits"])           # (B, K-1)
        # predicted class = count of thresholds passed (p_k > threshold)
        return (probs > threshold).sum(dim=1)          # (B,)

accuracy = evaluate.load("accuracy")

# ---------- Metrics ----------
def compute_metrics(eval_pred):
    """Score cumulative-logit predictions against ordinal class labels.

    Args:
        eval_pred: pair ``(logits, labels)`` where ``logits`` is a
            ``(B, K-1)`` array of threshold logits and ``labels`` holds class
            indices in ``0 .. NUM_BINS - 1``.

    Returns:
        Dict with ``"qwk"`` (quadratic-weighted Cohen's kappa on class ids),
        ``"mae"`` (mean absolute error in rating units) and ``"acc"``
        (exact-match accuracy on class ids, rounded to 3 decimals).

    Raises:
        ValueError: if any label falls outside ``[0, NUM_BINS - 1]``.
    """
    logits, labels = eval_pred
    # make sure labels all fall in 0..9
    if labels.min() < 0 or labels.max() >= NUM_BINS:
        raise ValueError(f"Labels should be in the range [0, {NUM_BINS-1}]")

    # sigmoid(z) > 0.5 is the same test as z > 0; comparing the logits
    # directly avoids overflow in exp() for large-magnitude logits.
    preds_class = (logits > 0).sum(axis=1)  # 0..9
    true_class = labels

    # Map to half-star ratings for MAE
    preds_rating = np.array([bin_to_rating(int(c)) for c in preds_class])
    true_rating = np.array([bin_to_rating(int(c)) for c in true_class])

    qwk = cohen_kappa_score(true_class, preds_class, weights="quadratic")
    mae = mean_absolute_error(true_rating, preds_rating)
    # Accuracy is computed on the integer class ids: the accuracy metric is
    # meant for integer labels, so fractional ratings such as 1.0 and 1.5
    # must not be passed to it.
    acc = np.round(accuracy.compute(predictions=preds_class, references=true_class)['accuracy'], 3)
    return {"qwk": qwk, "mae": mae, "acc": acc}

# ---------- Train ----------
collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = CoralForOrdinalRegression(CoralConfig())

# Encoder layers that stay trainable; every other encoder parameter is frozen.
encode_layers_to_unfreeze = [10, 11]

# The trailing dot makes the match exact: without it "encoder.layer.1" would
# also match layers 10, 11, ... and unfreeze more than requested.
trainable_layer_tags = tuple(f"encoder.layer.{idx}." for idx in encode_layers_to_unfreeze)

for name, param in model.encoder.named_parameters():
    in_selected_layer = any(tag in name for tag in trainable_layer_tags)
    is_pooler = "pooler" in name  # for bert models
    param.requires_grad = in_selected_layer or is_pooler

# for name, param in model.encoder.named_parameters():
#     if "encoder.rel_embeddings.weight" in name: #for DeBERTa models
#         param.requires_grad = True

# The CORAL head (shared projection and thresholds) is always trained.
model.shared_linear.weight.requires_grad = True
model.thresholds.requires_grad = True

# Print trainable status for all parameters
for name, param in model.named_parameters():
    print(name, param.requires_grad)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

thresholds True
encoder.embeddings.word_embeddings.weight False
encoder.embeddings.LayerNorm.weight False
encoder.embeddings.LayerNorm.bias False
encoder.encoder.layer.0.attention.self.q_bias False
encoder.encoder.layer.0.attention.self.v_bias False
encoder.encoder.layer.0.attention.self.in_proj.weight False
encoder.encoder.layer.0.attention.self.pos_proj.weight False
encoder.encoder.layer.0.attention.self.pos_q_proj.weight False
encoder.encoder.layer.0.attention.self.pos_q_proj.bias False
encoder.encoder.layer.0.attention.output.dense.weight False
encoder.encoder.layer.0.attention.output.dense.bias False
encoder.encoder.layer.0.attention.output.LayerNorm.weight False
encoder.encoder.layer.0.attention.output.LayerNorm.bias False
encoder.encoder.layer.0.intermediate.dense.weight False
encoder.encoder.layer.0.intermediate.dense.bias False
encoder.encoder.layer.0.output.dense.weight False
encoder.encoder.layer.0.output.dense.bias False
encoder.encoder.layer.0.output.LayerNorm.weight False

In [2]:
# hyperparameters
lr, batch_size, num_epochs = 2e-4, 32, 10

args = TrainingArguments(
    output_dir="./bert-letterbox-reviews-ordinal-regression_teacher",
    # optimisation
    learning_rate=lr,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # mixed precision is requested only when CUDA is available
    fp16=torch.cuda.is_available(),
    # evaluate and checkpoint every epoch; the best checkpoint by "qwk"
    # (higher is better) is loaded when training ends
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
    # logging
    logging_steps=50,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Qwk,Mae,Acc
1,0.638,0.649151,0.451506,1.575,0.15
2,0.5703,0.602843,0.522322,1.435,0.16
3,0.5354,0.59858,0.494781,1.435,0.17
4,0.5164,0.607922,0.4954,1.4425,0.165
5,0.4898,0.623574,0.517182,1.44,0.155
6,0.484,0.619134,0.463664,1.4725,0.16
7,0.4669,0.622693,0.473825,1.485,0.15
8,0.4597,0.620729,0.490067,1.47,0.15
9,0.4531,0.618725,0.496292,1.45,0.155
10,0.4531,0.614531,0.478206,1.4575,0.155




TrainOutput(global_step=500, training_loss=0.5066825103759766, metrics={'train_runtime': 1851.0898, 'train_samples_per_second': 8.644, 'train_steps_per_second': 0.27, 'total_flos': 3657219163310592.0, 'train_loss': 0.5066825103759766, 'epoch': 10.0})

In [None]:
# Apply the model to the test split
predictions = trainer.predict(dataset["test"])

# Pull the logits and labels out of the prediction output
logits, labels = predictions.predictions, predictions.label_ids

# Score them with compute_metrics and show the result
metrics = compute_metrics((logits, labels))
print(metrics)

# google-bert/bert-base-uncased
#  200 samples per rating + only pooler unfrozen
#  {'qwk': 0.5639322133133833, 'mae': 1.2325, 'acc': np.float64(0.235)} lr = 2e-4 batch_size = 16 num_epochs = 10
#  200 samples per rating + pooler and 11th layer unfrozen
#  {'qwk': 0.5772998908234213, 'mae': 1.305, 'acc': np.float64(0.205)} lr = 2e-4 batch_size = 16 num_epochs = 10
#  200 samples per rating + pooler, 10th and 11th layers unfrozen
#  {'qwk': 0.5454352351416798, 'mae': 1.35, 'acc': np.float64(0.205)} lr = 2e-4 batch_size = 16 num_epochs = 10
# microsoft/deberta-base-mnli
#  200 samples per rating + 11th layer unfrozen
#  {'qwk': 0.6079358581652387, 'mae': 1.27, 'acc': np.float64(0.205)} lr = 2e-4 batch_size = 16 num_epochs = 10
#  200 samples per rating + 10th and 11th layers unfrozen
#  {'qwk': 0.6149355764361673, 'mae': 1.255, 'acc': np.float64(0.21)} lr = 2e-4 batch_size = 16 num_epochs = 10
#  200 samples per rating + 9th, 10th and 11th layers and pooler unfrozen
#  {'qwk': 0.554329071675058, 'mae': 1.3425, 'acc': np.float64(0.21)} lr = 2e-4 batch_size = 16 num_epochs = 10
#  200 samples per rating + 9th, 10th and 11th layers and pooler unfrozen
#  {'qwk': 0.605914123300682, 'mae': 1.26, 'acc': np.float64(0.2)} lr = 2e-4 batch_size = 24 num_epochs = 10
#  {'qwk': 0.5589545796318471, 'mae': 1.355, 'acc': np.float64(0.2)} lr = 2e-4 batch_size = 32 num_epochs = 10







{'qwk': 0.5589545796318471, 'mae': 1.355, 'acc': np.float64(0.2)}


In [4]:
def predict_single(review: str):
    """Predict the half-star rating for a single review text.

    The review is tokenized (truncated to ``MAX_LEN`` tokens), run through
    ``trainer.predict`` as a one-row dataset, and the resulting threshold
    logits are decoded into a rating.

    Args:
        review: raw review text.

    Returns:
        A one-element NumPy array containing the predicted rating.
    """
    inputs = tokenizer(review, return_tensors="pt", truncation=True, max_length=MAX_LEN)
    with torch.no_grad():
        outputs = trainer.predict(Dataset.from_dict(inputs))
        # outputs.predictions are raw logits, not probabilities. A threshold
        # is passed when sigmoid(z) > 0.5, i.e. when z > 0, so the logits are
        # compared with 0 rather than 0.5.
        predicted_class_id = (outputs.predictions > 0).sum(axis=1)

        return bin_to_rating(predicted_class_id)

# Print the prediction next to the true rating for the first ten test reviews.
first_ten = dataset["test"].to_list()[:10]
for example in first_ten:
    movie = example.get("movie", "Unknown")  # fall back when the column is absent
    text, rating = example["text"], example["rating"]
    pred = predict_single(text)
    print(f'Movie: {movie}\nText: {text}\nTrue rating: {rating}, Predicted rating: {pred}\n')




Movie: oppenheimer-2023
Text: idk if i have any large overarching thoughts yet so once again it is time for a scattered list!!!!!!!!!!!!!- truly goofy sex scenes. why is florence pugh taking a break from riding cillian murphy to pull a sanskrit book off his shelf and making him read it to her? AND absolutely smoothbrained to have her point at a random line, say, "read it" and it's literally "i am become death, destroyer of worlds" like fuck offffff with your obvious shit lol- no character in this movie is developed very deeply beyond the guy himself but florence pugh being here just to show tits and be a communist then die because apparently she's also unspecifically mentally ill is like....how are we still this bad at character christopher!!! when will you learn that a girlie hating flowers does not count as depth!!!! - last flo-related point but whoever decided cillian murphy should have his legs crossed in that scene is a coward- many weird and funny cameos but i think my favorite i

Movie: army-of-shadows
Text: I'm sorry, did I watch a different film than everyone else? This was boring as hell, I can't count how many times I fell asleep and how many times I looked away while it was on. I didn't even finish it so I shouldn't even be writing this, but there's no way I can watch the rest. I'll probably give it a chance in the future, but for now I would rather forget about it.
True rating: 1.5, Predicted rating: [0.5]



Movie: mary-and-max
Text: A niche stop motion animation that takes a really personal subject for a lot of people sounds as a promising project; yet I found this film insoportable from start to finish. This film tries too hard to be funny but is just way too quirky and in several stances asinine that its intentions of "comedy" are a pain in the ass; maybe this could be considered a subjective aspect, I mean while I believe that comedy has a lineament to determine whether it is acceptable or it isn't (like there are topics that are unacceptable to treat with mockery) , when a joke can be morally and socially considered as acceptable I think it is mostly (if not entirely) subjective.Then you have Mary and Max, a film that I wouldn't know how to put it exactly in terms of its tone and intended comedy. I wouldn't say it is completely incorrect but it is unsettling and can be easily misguided making a pun out of the characters instead out of their actions as I guess is originally intended. I

Movie: life-is-beautiful
Text: "Silence is the most powerful cry."Finding comedy in a concentration camp. I'm just going to assume the audience behind this film's reception shares a one-circle Venn diagram with the audience behind Jojo Rabbit. Sadly, I cannot get behind a vanity project that humorously infantilizes and exploits the Holocaust / industrialized genocide as a Whimsical and Kitschy, melodramatic reinforcement of "Smiling in the face of horror" and "You can get over it" - or in the former film's case: "Let everything happen to you. No feeling is final." If you ever want to make an inquiry into the fascism of sentimentality, you can start here. To Benigni, you can fuck right off, thanks.
True rating: 0.5, Predicted rating: [0.5]



Movie: sing-sing-2023
Text: Really really impactful. Made me genuinely laugh out loud throughout but also always felt so sobering and therapeutic. And made me cry too, of course
True rating: 4.0, Predicted rating: [5.]



Movie: napoleon
Text: Week 20 of the 2nd Letterboxd Season ChallengeAn unseen movie from Paste Magazine's The 100 Best Silent Films of All Time listI'm sorry......but this is a 5 and a half hour movie.I know I chose to watch it but there was nothing else I really wanted to see from that list and since this is in the Letterboxd Top 250 (which I hope to complete someday) I figured I'd just watch this.There were some creative technical aspects and editing choices in this film which were way ahead of its time. There was also one really nice shot that made me go "oooo". But making me go "oooo" does not warrant 5 and a half hours of my time.It's like going through a really long vine compilation full of Jake Paul and Lele Pons just to get to the one vine where the little kid bellydances to the remix of Rihanna's song Monster.I can't really recommend this film to anyone unless you really want to know about the entire history of Napoleon but then again you could just read his wikipedia page you

Movie: mommy-2014
Text: by the end of this i was one arm leaned up against the wall breathing like my ass just ran a 5k. i had so much i wanted to say as i was watching this, but now I'm just speechless and I want to lay down............ unbelievably beautiful in every way. amazing portrayal of dysfunctional relationships and the way we end up chasing the highs of the good times like a gambler desperate for a win just to break-even.Also not to indirect another review but someone called the inclusion of White Flag by Dido in this laughable or some shit idk I stopped reading but I'll have you know that song is incredible and warranted it's place in this film! I used to shower to that song everyday like 6 months ago and that shit saved my life
True rating: 5.0, Predicted rating: [5.]



Movie: the-thing
Text: 74%"I don't know. Thousands of years ago it crashes, and this thing... gets thrown out, or crawls out, and it ends up freezing in the ice."Basically, Carpenter's cult horror film "The Thing" is a wintry Earth-based version of "Alien". Admittedly, I really like Ridley Scott's classic, so it only hurts more to say that The Thing left me quite a bit unsatisfied. Without a doubt what Carpenter has done here is impressive. By that I mean first and foremost the practical effects he and his team crafted together and that in the 80s. But the source of inspiration is clearly visible and I just had the feeling that the film lacks originality. Above all, it also takes the almost exact same problems from Alien (that I noticed) and continues them disastrously. And somehow it lacks a skillful suspense that I was waiting for all along. Also atmospherically the film could only convince me at times. In the whole it just doesn't want to work here, as many people can call the film 

Movie: happy-together-1997
Text: Film Club #20I am going to go ahead and give this film the 4 stars I think despite how much this film really wrecked my vibe and mood and I think I may hate it for that ?? But like this was a really interesting film, that takes complex themes, but wraps it into a pretty aimless story of toxic romance. The two leads are great and they are really doing some amazing work here. However, WKW must be commended for the strong direction and writing in this. It just has such a distinct style and tone that it really is unique. Overall, this has been one of the most interesting film club picks so far and I got to hand it to Lindsay for picking this, because I might not have watched it otherwise.
True rating: 4.0, Predicted rating: [5.]



Movie: the-best-of-youth
Text: Spouse: “What are you watching this time?”Movie Obsessed Person: “A 6 hour Italian made-for-TV movie.”Spouse: looksMovie Obsessed Person: “It’s on the list.” I remember when The Best of Youth hit Netflix, it was a big deal in some circles. It currently sits at #114 on the Letterboxd Top 250 movies and has an 8.5 with 22,000 votes on IMDb. This film has made an impact on folks. It covers 4 decades in the life of an Italian family, focusing firmly on two brothers: Matteo & Nicola. In this sprawling familia epic we are introduced to our brothers at the end of an academic year. They are going before their professors for their final exams. Immediately we see that Matteo has an extensive internal life that is not shown to others, he is perhaps hiding himself from everyone. Nicola greets the world with an open mind, he seems more accepting of life on its terms. Matteo walks out of his exam when his professor disagrees with one of his opinions, while Nicola nabs 