## PyTorch Training

Uses the `Trainer` included in Hugging Face `transformers` (backed by `accelerate`), since it removes a lot of annoying boilerplate.


In [1]:
import polars as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from transformers import Trainer, TrainingArguments
from transformers.trainer_callback import TrainerCallback

In [2]:
# Lazily scan the parquet file so only the three needed columns are read,
# store the rating as float32, then materialize the frame and shuffle every
# row with a fixed seed so the split taken later is random but repeatable.
df = (
    pl.scan_parquet("movie_data_plus_embeds_all.parquet")
    .select("tconst", "averageRating", "embedding")
    .with_columns(pl.col("averageRating").cast(pl.Float32))
    .collect()
    .sample(fraction=1.0, shuffle=True, seed=42)
)

df

tconst,averageRating,embedding
str,f32,"array[f32, 768]"
"""tt0173052""",4.1,"[0.046187, 0.006053, … 0.011911]"
"""tt0266288""",7.4,"[-0.004875, -0.046969, … 0.017516]"
"""tt6263490""",4.3,"[0.005363, -0.018672, … 0.015112]"
"""tt10049110""",7.8,"[-0.009997, -0.029303, … 0.037793]"
"""tt5761612""",3.8,"[0.020259, -0.031869, … -0.01841]"
…,…,…
"""tt0079376""",6.2,"[0.062672, -0.009446, … 0.019441]"
"""tt1161064""",3.2,"[0.022779, 0.053063, … -0.009691]"
"""tt0179526""",5.7,"[0.001937, 0.003111, … -0.002453]"
"""tt0188233""",5.7,"[0.03125, 0.013802, … 0.009849]"


In [3]:
device = "cuda:0"
n_test = 20000

# The frame is already shuffled, so the last `n_test` rows serve as the
# held-out split and everything before them is used for training.
train_rows, test_rows = df[:-n_test], df[-n_test:]

# NOTE(review): `.copy()` presumably guarantees a writable, owned NumPy buffer
# before handing it to torch.from_numpy -- confirm it is still required.
X_train = torch.from_numpy(train_rows["embedding"].to_numpy().copy()).to(device)
X_test = torch.from_numpy(test_rows["embedding"].to_numpy().copy()).to(device)

y_train = torch.from_numpy(train_rows["averageRating"].to_numpy().copy()).to(device)
y_test = torch.from_numpy(test_rows["averageRating"].to_numpy().copy()).to(device)

y_train

tensor([4.1000, 7.4000, 4.3000,  ..., 6.4000, 6.0000, 6.5000], device='cuda:0')

In [4]:
# Pair each embedding with its rating so indexing a dataset yields an
# (embedding, rating) tuple; the held-out split is wrapped the same way.
test_dataset = TensorDataset(X_test, y_test)
train_dataset = TensorDataset(X_train, y_train)

In [5]:
class RatingsModel(nn.Module):
    """Feed-forward regressor mapping a 768-dim embedding to one scalar rating.

    Architecture: Linear(768 -> 1536) -> GELU -> Linear(1536 -> 768) -> GELU
    -> Linear(768 -> 1).
    """

    def __init__(self):
        super().__init__()
        self.linear_1 = nn.Linear(768, 1536)
        self.linear_2 = nn.Linear(1536, 768)
        self.output = nn.Linear(768, 1)

    def forward(self, x, targets=None):
        """Predict one value per input embedding.

        Args:
            x: Tensor whose last dimension is 768, e.g. ``(batch, 768)``.
            targets: Unused by the forward pass. Accepted so that a batch dict
                containing a ``targets`` key can be splatted into the call
                (``model(**inputs)``) without raising.

        Returns:
            The predictions with the trailing size-1 feature dimension
            removed: shape ``(batch,)`` for a ``(batch, 768)`` input.
        """
        hidden = F.gelu(self.linear_1(x))
        hidden = F.gelu(self.linear_2(hidden))
        prediction = self.output(hidden)

        # Squeeze only the trailing dim. A bare squeeze() would also drop the
        # batch dim when the batch holds a single sample, yielding a 0-d
        # tensor that no longer lines up with the (1,)-shaped targets.
        return prediction.squeeze(-1)

In [6]:
# Build the network and move its parameters to the target device in one step
# (`.to` returns the module itself); the bare name displays the layer summary.
model = RatingsModel().to(device)
model

RatingsModel(
  (linear_1): Linear(in_features=768, out_features=1536, bias=True)
  (linear_2): Linear(in_features=1536, out_features=768, bias=True)
  (output): Linear(in_features=768, out_features=1, bias=True)
)

Validation loss doesn't play nice with the `Trainer` out of the box, so it needs [some tweaks](https://discuss.huggingface.co/t/no-log-for-validation-loss-during-training-with-trainer/40094/3).


In [7]:
def collate_fn(examples):
    """Stack a list of ``(input, target)`` pairs into a keyword-argument batch.

    Args:
        examples: Sequence of indexable items where position 0 is the input
            tensor and position 1 is the target tensor.

    Returns:
        Dict with key ``"x"`` (stacked inputs) and key ``"targets"`` (stacked
        targets), each gaining a new leading batch dimension.
    """
    return {
        key: torch.stack([example[position] for example in examples])
        for position, key in enumerate(("x", "targets"))
    }


class MAETrainer(Trainer):
    """`Trainer` subclass whose loss is the mean absolute error (L1)."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=0):
        """Run the model on a batch and score it against ``inputs["targets"]``.

        Args:
            model: Module called as ``model(**inputs)``.
            inputs: Batch dict; must contain a ``"targets"`` entry.
            return_outputs: When true, also return the model predictions.
            num_items_in_batch: Accepted for signature compatibility; not used.

        Returns:
            The scalar loss, or ``(loss, predictions)`` if ``return_outputs``.
        """
        predictions = model(**inputs)
        # Mean-reduced L1 loss, i.e. mean absolute error.
        mae = F.l1_loss(predictions, inputs["targets"])

        if return_outputs:
            return (mae, predictions)
        return mae


In [12]:
training_args = TrainingArguments(
    learning_rate=1e-2,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=4096,
    per_device_eval_batch_size=4096,
    num_train_epochs=40,
    weight_decay=0.001,
    save_strategy="no",  # no checkpoints are written
    eval_strategy="steps",
    eval_steps=0.05,  # a float below 1 is a fraction of the total training steps
    logging_strategy="steps",
    logging_steps=0.05,  # log at the same cadence as evaluation
    fp16=True,
    dataloader_num_workers=0,  # since data is in memory
    dataloader_pin_memory=False,  # tensors were already moved to `device`
    dataloader_persistent_workers=False,
)

# Seed the RNG *before* building the model: the weights are initialized here,
# ahead of the Trainer being constructed, so without this every
# Restart & Run All would start from different random weights.
torch.manual_seed(training_args.seed)

# reinstantiate a clean model
model = RatingsModel()
_ = model.to(device)

trainer = MAETrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=collate_fn,
)

# Workaround so the validation loss is computed and reported during
# evaluation (see the forum thread linked in the note above this cell).
trainer.can_return_loss = True

In [13]:
# Run the full training loop with the arguments configured above; training and
# validation loss are reported at every logging/evaluation step.
trainer.train()

Step,Training Loss,Validation Loss
110,3.3057,0.91089
220,0.9648,0.939981
330,0.9345,0.837674
440,0.9005,0.826436
550,0.9063,0.872799
660,0.8509,0.816579
770,0.8481,0.822054
880,0.851,0.863146
990,0.8529,0.807504
1100,0.8248,0.798917


TrainOutput(global_step=2200, training_loss=0.9553426118330522, metrics={'train_runtime': 111.2626, 'train_samples_per_second': 80009.602, 'train_steps_per_second': 19.773, 'total_flos': 0.0, 'train_loss': 0.9553426118330522, 'epoch': 40.0})

In [14]:
# Final pass over the held-out split; `eval_loss` is the mean absolute error,
# since MAETrainer.compute_loss uses an L1 loss.
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.7821616530418396,
 'eval_runtime': 0.1714,
 'eval_samples_per_second': 116713.991,
 'eval_steps_per_second': 29.178,
 'epoch': 40.0}

## Test Model


In [15]:
_ = model.to(device)

# Slicing a TensorDataset returns a tuple of tensors (inputs, targets),
# not another dataset; take the first ten held-out examples.
eval_dataset = test_dataset[0:10]
sample_inputs, sample_targets = eval_dataset

# Gradients are not needed for inference.
with torch.no_grad():
    output = model(x=sample_inputs)
    preds = output.cpu().numpy()

# Side-by-side comparison with the absolute error rounded to two decimals.
abs_error = (pl.col("Predicted") - pl.col("Actual")).abs().round(2)
pl.DataFrame({"Predicted": preds, "Actual": sample_targets.cpu().numpy()}).with_columns(
    abs_diff=abs_error
)

Predicted,Actual,abs_diff
f32,f32,f32
6.96875,7.1,0.13
6.359375,6.5,0.14
5.4296875,4.1,1.33
4.953125,5.5,0.55
7.082031,7.2,0.12
5.3671875,6.4,1.03
6.734375,8.3,1.57
6.199219,6.3,0.1
6.2578125,5.4,0.86
5.574219,4.0,1.57
