## PyTorch Training

Uses the `Trainer` included in Hugging Face `transformers` (backed by `accelerate`), since it removes a lot of annoying training-loop boilerplate.


In [None]:
import numpy as np
import polars as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Lazily scan only the first 20,000 rows of the parquet file; nothing is read
# until .collect() is called.
movies_lazy = pl.scan_parquet(
    "/Users/maxwoolf/Downloads/movie_data_plus_embeds_all.parquet", n_rows=20000
)

# Keep the identifier, the regression target (cast to Float32) and the
# embedding column, in that order, then materialise the frame.
df = movies_lazy.select(
    "tconst",
    pl.col("averageRating").cast(pl.Float32),
    "embedding",
).collect()

df

tconst,averageRating,embedding
str,f32,"array[f32, 768]"
"""tt0000009""",5.4,"[-0.007815, -0.022642, … 0.005391]"
"""tt0000147""",5.3,"[0.012021, 0.014255, … -0.015754]"
"""tt0000574""",6.0,"[-0.010052, -0.015825, … 0.040161]"
"""tt0000591""",5.6,"[0.00765, 0.019661, … -0.010763]"
"""tt0000630""",3.2,"[0.03492, 0.00301, … 0.027586]"
…,…,…
"""tt0052854""",6.2,"[0.007484, 0.005061, … 0.013337]"
"""tt0052858""",5.9,"[-0.004158, -0.001111, … -0.012037]"
"""tt0052860""",5.8,"[0.003819, -0.020857, … 0.005093]"
"""tt0052861""",8.1,"[0.01314, -0.004061, … 0.022055]"


In [44]:
device = "cpu"

# Pull each column out of the frame as a NumPy array; the explicit .copy()
# hands torch its own buffer (presumably to guarantee a writable array —
# TODO confirm).
embedding_array = df["embedding"].to_numpy().copy()
rating_array = df["averageRating"].to_numpy().copy()

# Wrap the arrays as tensors on the chosen device and pair them up so each
# dataset item is an (embedding, rating) tuple.
tensor_embeddings = torch.from_numpy(embedding_array).to(device)
tensor_ratings = torch.from_numpy(rating_array).to(device)
tensor_dataset = TensorDataset(tensor_embeddings, tensor_ratings)

In [45]:
test_proportion = 0.1

# Seed the split explicitly. Without a generator, random_split draws from the
# global RNG, so the train/test membership (and therefore the reported
# validation loss) changes on every kernel restart.
split_generator = torch.Generator().manual_seed(42)

# Fractional lengths: 90% of the examples go to training, 10% are held out.
train_dataset, test_dataset = torch.utils.data.random_split(
    tensor_dataset,
    [1 - test_proportion, test_proportion],
    generator=split_generator,
)

In [46]:
class RatingsModel(nn.Module):
    """MLP that regresses one scalar per example from a 768-wide input.

    Three Linear -> GELU -> BatchNorm1d stages (768 -> 1536 -> 768 -> 256)
    feed a single-unit linear output head.
    """

    def __init__(self):
        super().__init__()
        self.linear_1 = nn.Linear(768, 1536)
        self.batchnorm_1 = nn.BatchNorm1d(1536)
        self.linear_2 = nn.Linear(1536, 768)
        self.batchnorm_2 = nn.BatchNorm1d(768)
        self.linear_3 = nn.Linear(768, 256)
        self.batchnorm_3 = nn.BatchNorm1d(256)
        self.output = nn.Linear(256, 1)

    def forward(self, x, targets=None):
        """Predict one value per row of ``x``.

        Args:
            x: Float tensor of shape ``(batch, 768)``.
            targets: Ignored by the forward pass. It is accepted so a batch
                dict holding both ``"x"`` and ``"targets"`` can be unpacked
                straight into the model; the loss is computed by the caller.
                Defaults to ``None`` so the model can also be called for
                inference without labels.

        Returns:
            Tensor of shape ``(batch,)`` with the raw (unclamped) predictions.
        """
        x = F.gelu(self.linear_1(x))
        x = self.batchnorm_1(x)
        x = F.gelu(self.linear_2(x))
        x = self.batchnorm_2(x)
        x = F.gelu(self.linear_3(x))
        x = self.batchnorm_3(x)
        x = self.output(x)

        # # add logical constraints
        # x = torch.clamp(x, min=1., max=10.)
        # x = torch.round(x, decimals=1)

        # Drop only the trailing size-1 feature axis. A bare squeeze() would
        # also remove the batch axis when batch == 1 and return a 0-d tensor
        # that no longer lines up with the (1,)-shaped targets.
        return x.squeeze(-1)

In [47]:
# Build the network and place it on the target device in one step
# (Module.to returns the module itself), then display its layer summary.
model = RatingsModel().to(device)
model

RatingsModel(
  (linear_1): Linear(in_features=768, out_features=1536, bias=True)
  (batchnorm_1): BatchNorm1d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear_2): Linear(in_features=1536, out_features=768, bias=True)
  (batchnorm_2): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear_3): Linear(in_features=768, out_features=256, bias=True)
  (batchnorm_3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (output): Linear(in_features=256, out_features=1, bias=True)
)

In [69]:
def collate_fn(examples):
    """Batch a sequence of ``(input, target)`` pairs.

    Returns a dict with the stacked inputs under ``"x"`` and the stacked
    targets under ``"targets"``, each with a new leading batch dimension.
    """

    def _stack_field(position):
        # Collect the tensor at `position` from every example and stack them.
        return torch.stack([example[position] for example in examples])

    return {"x": _stack_field(0), "targets": _stack_field(1)}


class MAETrainer(Trainer):
    """Trainer whose loss is the mean absolute error of the model outputs."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=0):
        """Run the model on ``inputs`` and score it against ``inputs["targets"]``.

        Returns the scalar loss, or a ``(loss, outputs)`` tuple when
        ``return_outputs`` is true. ``num_items_in_batch`` is accepted but
        not used.
        """
        predictions = model(**inputs)
        # Mean-reduced L1 loss, i.e. mean absolute error (MAE).
        mae = F.l1_loss(predictions, inputs["targets"])

        if return_outputs:
            return (mae, predictions)
        return mae


In [70]:
training_args = TrainingArguments(
    # --- optimisation ---
    learning_rate=1e-2,
    lr_scheduler_type="cosine_with_restarts",
    weight_decay=0.001,
    num_train_epochs=50,
    fp16=False,
    # --- batching / data loading ---
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    dataloader_num_workers=0,  # since data is in memory
    dataloader_pin_memory=False,
    dataloader_persistent_workers=False,
    # --- evaluation / logging: a float below 1 is a fraction of total steps ---
    eval_strategy="steps",
    eval_steps=0.1,
    logging_strategy="steps",
    logging_steps=0.1,
    # --- checkpoints and labels ---
    save_strategy="no",
    label_names=[],
)

# reinstantiate a clean model
model = RatingsModel().to(device)

trainer = MAETrainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# NOTE(review): presumably forced on so evaluation still reports a loss even
# though label_names is empty — confirm against the Trainer implementation.
trainer.can_return_loss = True

In [71]:
# Run the full training loop. With eval_steps and logging_steps both set to
# 0.1, a training-loss and validation-loss row is reported at every 10% of
# the total step count.
trainer.train()

Step,Training Loss,Validation Loss
355,0.8861,0.58107
710,0.5652,0.616062
1065,0.534,0.586211
1420,0.4739,0.615877
1775,0.4098,0.665594
2130,0.3434,0.653778
2485,0.2748,0.619877
2840,0.2185,0.625056
3195,0.1782,0.626663
3550,0.1614,0.625208


TrainOutput(global_step=3550, training_loss=0.4045102186605964, metrics={'train_runtime': 29.358, 'train_samples_per_second': 30655.99, 'train_steps_per_second': 120.921, 'total_flos': 0.0, 'train_loss': 0.4045102186605964, 'epoch': 50.0})