## PyTorch Training

Uses the Trainer included in Hugging Face `transformers` (backed by `accelerate`) since it mitigates a lot of annoying boilerplate.


In [1]:
import polars as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Source parquet file and the number of rows to read from it.
DATA_PATH = "/Users/maxwoolf/Downloads/movie_data_plus_embeds_all.parquet"
N_ROWS = 40000

# Scan lazily, keep only the id / target / feature columns, and store the
# rating as float32 before materializing the frame.
lazy_movies = pl.scan_parquet(DATA_PATH, n_rows=N_ROWS)
df = (
    lazy_movies.select("tconst", "averageRating", "embedding")
    .with_columns(pl.col("averageRating").cast(pl.Float32))
    .collect()
)

df

tconst,averageRating,embedding
str,f32,"array[f32, 768]"
"""tt0000009""",5.4,"[-0.007815, -0.022642, … 0.005391]"
"""tt0000147""",5.3,"[0.012021, 0.014255, … -0.015754]"
"""tt0000574""",6.0,"[-0.010052, -0.015825, … 0.040161]"
"""tt0000591""",5.6,"[0.00765, 0.019661, … -0.010763]"
"""tt0000630""",3.2,"[0.03492, 0.00301, … 0.027586]"
…,…,…
"""tt0084637""",7.6,"[0.002538, 0.011368, … -0.010203]"
"""tt0084643""",7.4,"[-0.016075, -0.010667, … 0.009743]"
"""tt0084645""",5.4,"[0.04202, -0.009168, … 0.049604]"
"""tt0084646""",7.3,"[0.025038, 0.008065, … 0.05365]"


In [10]:
device = "cpu"

# Pull both columns out of polars as NumPy arrays. Each array is copied before
# being wrapped -- presumably so torch.from_numpy receives a writable buffer
# (TODO confirm).
embeddings_np = df["embedding"].to_numpy().copy()
ratings_np = df["averageRating"].to_numpy().copy()

tensor_embeddings = torch.from_numpy(embeddings_np).to(device)
tensor_ratings = torch.from_numpy(ratings_np).to(device)

# Pair every embedding with its rating: item i is (embedding_i, rating_i).
tensor_dataset = TensorDataset(tensor_embeddings, tensor_ratings)

In [11]:
test_proportion = 0.1
SPLIT_SEED = 42  # fixed so the train/test partition is the same after a kernel restart

# The two fractions tell random_split to size the subsets as 90% / 10% of the
# data. A dedicated, seeded generator keeps the held-out set identical on every
# run without touching the global RNG state used elsewhere.
train_dataset, test_dataset = torch.utils.data.random_split(
    tensor_dataset,
    [1 - test_proportion, test_proportion],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [40]:
class RatingsModel(nn.Module):
    """Small MLP that regresses a single rating from a 768-dim embedding.

    Architecture: 768 -> 256 -> 128 -> 1, with GELU after each hidden layer.
    """

    def __init__(self):
        super().__init__()
        self.linear_1 = nn.Linear(768, 256)
        # self.batchnorm_1 = nn.BatchNorm1d(256)
        self.linear_2 = nn.Linear(256, 128)
        # self.batchnorm_2 = nn.BatchNorm1d(128)
        self.output = nn.Linear(128, 1)

    def forward(self, x, targets=None):
        """Predict one rating per input row.

        Args:
            x: Float tensor whose last dimension is 768.
            targets: Accepted and ignored, so that a batch dict containing a
                ``"targets"`` key can be unpacked straight into the model
                (``model(**batch)``).

        Returns:
            Tensor with the trailing size-1 dimension removed, i.e. shape
            ``(batch,)`` for a ``(batch, 768)`` input.
        """
        x = F.gelu(self.linear_1(x))
        # x = self.batchnorm_1(x)
        x = F.gelu(self.linear_2(x))
        # x = self.batchnorm_2(x)
        x = self.output(x)

        # Drop only the trailing feature dim. A bare squeeze() would also
        # collapse the batch dim when the batch has a single row, returning a
        # 0-d scalar that no longer lines up with the (1,)-shaped targets.
        return x.squeeze(-1)

In [41]:
# Build the network and place it on the target device in one step
# (nn.Module.to returns the module itself), then display its layer summary.
model = RatingsModel().to(device)
model

RatingsModel(
  (linear_1): Linear(in_features=768, out_features=256, bias=True)
  (linear_2): Linear(in_features=256, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=1, bias=True)
)

Validation loss doesn't play nice with the `Trainer` out of the box, so it needs [some tweaks](https://discuss.huggingface.co/t/no-log-for-validation-loss-during-training-with-trainer/40094/3).


In [42]:
def collate_fn(examples):
    """Assemble a list of ``(input, target)`` pairs into one batch dict.

    The first element of every pair is stacked under ``"x"`` and the second
    under ``"targets"``; stacking adds a new leading batch dimension.
    """
    features = [example[0] for example in examples]
    labels = [example[1] for example in examples]

    return {"x": torch.stack(features), "targets": torch.stack(labels)}


class MAETrainer(Trainer):
    """Trainer variant whose objective is mean absolute error (L1 loss)."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=0):
        """Run the model on a collated batch and score it with MAE.

        ``inputs`` is unpacked as keyword arguments into ``model`` and must
        carry a ``"targets"`` entry. ``num_items_in_batch`` is accepted but not
        used. Returns the loss alone, or ``(loss, outputs)`` when
        ``return_outputs`` is truthy.
        """
        predictions = model(**inputs)
        mae = F.l1_loss(predictions, inputs["targets"])  # mean-reduced L1 is MAE

        if return_outputs:
            return mae, predictions
        return mae


In [43]:
# Hyperparameters, grouped by concern.
training_args = TrainingArguments(
    # --- optimisation ---
    learning_rate=1e-3,
    lr_scheduler_type="cosine_with_restarts",
    weight_decay=0.1,
    num_train_epochs=100,
    per_device_train_batch_size=1024,
    per_device_eval_batch_size=1024,
    fp16=False,
    # --- reporting: a float step count below 1 is a fraction of the whole
    # run, so 0.1 evaluates and logs ten times over training ---
    eval_strategy="steps",
    eval_steps=0.1,
    logging_strategy="steps",
    logging_steps=0.1,
    save_strategy="no",
    # --- data loading ---
    dataloader_num_workers=0,  # since data is in memory
    dataloader_pin_memory=False,
    dataloader_persistent_workers=False,
)

# Start from freshly initialised weights so nothing carries over from earlier runs.
model = RatingsModel().to(device)

trainer = MAETrainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Workaround from the forum thread linked above: mark the trainer as able to
# return a loss so that a validation loss is reported during evaluation.
trainer.can_return_loss = True

In [44]:
# Run the full training schedule configured in `training_args`; the returned
# TrainOutput (global step, average training loss, runtime metrics) is shown
# as the cell result.
trainer.train()

Step,Training Loss,Validation Loss
360,0.9686,0.676024
720,0.6548,0.671002
1080,0.6454,0.658555
1440,0.6362,0.654707
1800,0.6309,0.647676
2160,0.6254,0.644955
2520,0.6227,0.644122
2880,0.6201,0.643597
3240,0.6183,0.64314
3600,0.6178,0.643094


TrainOutput(global_step=3600, training_loss=0.6640065256754557, metrics={'train_runtime': 26.4682, 'train_samples_per_second': 136012.219, 'train_steps_per_second': 136.012, 'total_flos': 0.0, 'train_loss': 0.6640065256754557, 'epoch': 100.0})

In [45]:
# Score the trained model on the held-out split; `eval_loss` is the L1 (MAE)
# loss computed by MAETrainer.compute_loss.
trainer.evaluate(test_dataset)

{'eval_loss': 0.6430938243865967,
 'eval_runtime': 0.0619,
 'eval_samples_per_second': 64579.417,
 'eval_steps_per_second': 64.579,
 'epoch': 100.0}

## Test Model


In [47]:
_ = model.to(device)

# Slicing the held-out split gives an (inputs, ratings) pair covering its
# first ten rows: index 0 feeds the model, index 1 is the ground truth.
eval_dataset = test_dataset[:10]

# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    output = model(eval_dataset[0])
    preds = output.detach().cpu()

# Absolute error between prediction and truth, rounded to two decimals.
abs_diff_expr = (pl.col("Predicted") - pl.col("Actual")).abs().round(2)

pl.DataFrame({"Predicted": preds, "Actual": eval_dataset[1]}).with_columns(
    abs_diff=abs_diff_expr
)

Predicted,Actual,abs_diff
f32,f32,f32
6.289316,4.3,1.99
6.712427,6.7,0.01
6.127418,6.3,0.17
5.253753,5.4,0.15
6.577616,8.3,1.72
5.759141,6.1,0.34
6.300344,6.9,0.6
6.355377,6.4,0.04
5.796121,6.3,0.5
6.410416,4.5,1.91
