## PyTorch Training

Uses the Trainer included in Hugging Face `transformers` (backed by `accelerate`) since it mitigates a lot of annoying boilerplate.


In [1]:
import numpy as np
import polars as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from safetensors.torch import save_model
from torch.utils.data import TensorDataset
from tqdm import tqdm
from transformers import (
    AutoModel,
    AutoTokenizer,
    ModernBertConfig,
    Trainer,
    TrainingArguments,
)

In [2]:
# Lazily scan the parquet file and keep just the three columns used below.
ratings_lazy = pl.scan_parquet("movie_data_plus_embeds_all.parquet").select(
    ["tconst", "averageRating", "json"]
)

# Cast the target to float32, materialize the frame, then shuffle every row
# with a fixed seed so the row order is the same on every run.
df = (
    ratings_lazy.with_columns(averageRating=pl.col("averageRating").cast(pl.Float32))
    .collect()
    .sample(fraction=1.0, shuffle=True, seed=42)
)

df

tconst,averageRating,json
str,f32,str
"""tt0173052""",4.1,"""{  ""title"": ""The Prince and t…"
"""tt0266288""",7.4,"""{  ""title"": ""Azhakiya Ravanan…"
"""tt6263490""",4.3,"""{  ""title"": ""Getaway"",  ""gen…"
"""tt10049110""",7.8,"""{  ""title"": ""Die Wiese"",  ""g…"
"""tt5761612""",3.8,"""{  ""title"": ""Woman on the Edg…"
…,…,…
"""tt0079376""",6.2,"""{  ""title"": ""The Proud Twins""…"
"""tt1161064""",3.2,"""{  ""title"": ""Super Capers: Th…"
"""tt0179526""",5.7,"""{  ""title"": ""Who's the Caboos…"
"""tt0188233""",5.7,"""{  ""title"": ""That's Erotic"", …"


## Train a Custom Tokenizer

Use the ModernBERT tokenizer as a base, but retrain it with a much smaller vocabulary tailored to this specific dataset.


In [3]:
# Raw JSON documents, in the same row order as `df`.
json_docs = df["json"].to_list()

# Baseline: the stock ModernBERT tokenizer. Peek at its vocabulary and count
# how many tokens it needs for the first document.
base_tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
print(str(base_tokenizer.vocab)[0:100])
print(len(base_tokenizer(json_docs[0])["input_ids"]))

{'ĠprÃ©': 19856, 'Ġdisclaim': 23464, 'illon': 24632, 'ĠAB': 12056, 'Ġreactor': 22578, 'Ġsalvage': 40
378


In [4]:
vocab_size = 5000
tokenizer_train_docs = 50000  # training on every text would take forever

# Retrain the base tokenizer on a subset of the corpus to obtain a small,
# dataset-specific vocabulary.
tokenizer = base_tokenizer.train_new_from_iterator(
    iter(json_docs[:tokenizer_train_docs]),
    vocab_size=vocab_size,
)

# Peek at the new vocabulary and count the tokens needed for the first document.
print(str(tokenizer.vocab)[0:100])
print(len(tokenizer(json_docs[0])["input_ids"]))




{'Mass': 2476, ',': 18, 'ĠMcDonald': 4206, 'ĠCla': 4298, 'str': 1686, 'Lor': 4188, '¦': 106, 'ĠLip':
302


Preencode all the tokens. A `max_length` of 1024 may be excessive but does not cause a proportionate reduction in model training speed over a 512 max length due to ModernBERT's unpadding + RoPE.

In order to avoid OOMs on the host system, generate in batches, then push to the GPU. (ideally we _could_ push to the GPU for each batch but that will cause GPU memory leaks)


In [5]:
def batch(iterable, n=1):
    """Yield consecutive slices of `iterable`, each holding at most `n` items.

    `iterable` must support `len()` and slicing. The final slice is shorter
    when the length is not a multiple of `n`.
    """
    total = len(iterable)
    for start in range(0, total, n):
        # Slicing already clamps the end index to the sequence length.
        yield iterable[start:start + n]

In [6]:
max_length = 1024
token_batch_size = 2048
device = "cuda:0"

# Ceiling division: the final, partially filled batch counts as well. Floor
# division leaves tqdm's total one short whenever the corpus size is not an
# exact multiple of the batch size.
n_token_batches = (len(json_docs) + token_batch_size - 1) // token_batch_size

# Collect the per-batch tensors in plain lists and concatenate once at the end.
input_ids = []
attention_mask = []

for docs in tqdm(batch(json_docs, token_batch_size), total=n_token_batches):
    tokens = tokenizer(
        docs,
        max_length=max_length,
        padding="max_length",  # every row is padded (or truncated) to max_length
        truncation=True,
        return_tensors="pt",
    )

    input_ids.append(tokens["input_ids"])
    attention_mask.append(tokens["attention_mask"])

# One transfer to the GPU per tensor, after all batches have been tokenized.
input_ids = torch.vstack(input_ids).to(device)
attention_mask = torch.vstack(attention_mask).to(device)

input_ids.size()

119it [03:22,  1.70s/it]                         
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


torch.Size([242552, 1024])

In [7]:
device = "cuda:0"
n_test = 20000  # the last n_test rows are held out as the test set

# Token ids and attention masks: split by position and cast with .int().
X_input_ids_train, X_input_ids_test = (
    part.int().to(device) for part in (input_ids[:-n_test], input_ids[-n_test:])
)
X_attention_train, X_attention_test = (
    part.int().to(device)
    for part in (attention_mask[:-n_test], attention_mask[-n_test:])
)

# Targets come from the same row ranges of `df`. The NumPy array is copied
# before being wrapped (presumably so torch gets its own writable buffer).
y_train, y_test = (
    torch.from_numpy(rows["averageRating"].to_numpy().copy()).to(device)
    for rows in (df[:-n_test], df[-n_test:])
)

y_train

tensor([4.1000, 7.4000, 4.3000,  ..., 6.4000, 6.0000, 6.5000], device='cuda:0')

In [8]:
# Each dataset item is an (input_ids, attention_mask, rating) triple.
train_tensors = (X_input_ids_train, X_attention_train, y_train)
test_tensors = (X_input_ids_test, X_attention_test, y_test)

train_dataset = TensorDataset(*train_tensors)
test_dataset = TensorDataset(*test_tensors)

## Build the Model

Due to the new tokenizer, the special tokens for the fresh ModernBERT model have to be explicitly defined.


In [9]:
# Map each special-token role (e.g. "pad_token") to its id in the retrained
# tokenizer. Every token is looked up by its own string rather than by zipping
# `special_tokens_map` against `all_special_ids`: positional pairing silently
# misaligns if the two sequences ever differ in length or order.
special_token_dict = {
    name: tokenizer.convert_tokens_to_ids(token)
    for name, token in tokenizer.special_tokens_map.items()
}
special_token_dict

{'unk_token': 2,
 'sep_token': 4,
 'pad_token': 5,
 'cls_token': 3,
 'mask_token': 6}

In [10]:
hidden_size = 128
dropout = 0.5

# Special-token ids of the retrained tokenizer, keyed by the matching config
# argument name (e.g. "pad_token" -> "pad_token_id").
special_token_ids = {
    f"{name}_id": special_token_dict[name]
    for name in ("unk_token", "sep_token", "pad_token", "cls_token", "mask_token")
}

# A small ModernBERT configuration; the same dropout rate is applied to the
# attention, embedding and MLP layers.
config = ModernBertConfig(
    vocab_size=vocab_size,
    max_position_embeddings=max_length,
    hidden_size=hidden_size,
    intermediate_size=512,
    num_hidden_layers=6,
    num_attention_heads=4,
    global_attn_every_n_layers=2,
    local_attention=32,
    attention_dropout=dropout,
    embeddings_dropout=dropout,
    mlp_dropout=dropout,
    **special_token_ids,
)

# Built from the config alone, i.e. freshly initialised weights.
transformer_model = AutoModel.from_config(config)
total_params = sum(param.numel() for param in transformer_model.parameters())
total_params

2214528

In [11]:
class RatingsModel(nn.Module):
    """Transformer encoder with a single-output linear regression head.

    The prediction is computed from the hidden state at sequence position 0
    (the [CLS] vector) of the wrapped encoder.
    """

    def __init__(self, model):
        """Wrap `model`, an encoder exposing `config.hidden_size`."""
        super().__init__()
        self.transformer_model = model
        # Size the head from the wrapped model's own config rather than a
        # notebook-level global, so the two can never drift apart.
        self.output = nn.Linear(model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, targets=None):
        """Return one predicted value per input row, shape `(batch,)`.

        `targets` is accepted and ignored so that a batch dict that also
        carries the targets can be unpacked straight into the call.
        """
        # Call the module itself (not .forward) so registered hooks still run.
        # Only last_hidden_state is needed, so the per-layer hidden states are
        # not requested.
        encoded = self.transformer_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        cls_vector = encoded.last_hidden_state[:, 0]  # the [CLS] vector

        # squeeze(-1) drops only the feature axis. A bare squeeze() would also
        # drop the batch axis for a batch of one and return a 0-D tensor.
        return self.output(cls_vector).squeeze(-1)

In [12]:
torch.set_float32_matmul_precision("high")  # perf increase for ModernBERT

# Wrap the encoder with the regression head and move everything to the GPU.
model = RatingsModel(transformer_model).to(device)

Validation loss doesn't play nicely with the `Trainer` out of the box, so it needs [some tweaks](https://discuss.huggingface.co/t/no-log-for-validation-loss-during-training-with-trainer/40094/3).


In [13]:
def collate_fn(examples):
    """Stack a list of (input_ids, attention_mask, target) items into a batch.

    Returns a dict with the keys "input_ids", "attention_mask" and "targets",
    each holding the corresponding items stacked along a new first dimension.
    """

    def _stack_field(position):
        # Gather one field from every example and stack it.
        return torch.stack([example[position] for example in examples])

    return {
        "input_ids": _stack_field(0),
        "attention_mask": _stack_field(1),
        "targets": _stack_field(2),
    }


class RegressionTrainer(Trainer):
    """Trainer whose loss is the mean squared error against `inputs["targets"]`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=0):
        """Run the model on the batch and score it with MSE.

        Returns `(loss, predictions)` when `return_outputs` is true, otherwise
        only the loss. `num_items_in_batch` is accepted but not used.
        """
        predictions = model(**inputs)
        # Use F.l1_loss here instead to optimise MAE rather than MSE.
        loss = F.mse_loss(predictions, inputs["targets"])

        if return_outputs:
            return loss, predictions
        return loss


In [16]:
training_args = TrainingArguments(
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.001,
    save_strategy="no",
    eval_strategy="steps",
    eval_steps=0.05,  # a float < 1 is a fraction of the total training steps
    logging_strategy="steps",
    logging_steps=0.05,
    fp16=True,
    dataloader_num_workers=0,  # since data is in memory, as problem is GPU bound
    dataloader_pin_memory=False,
    dataloader_persistent_workers=False,
)

# reinstantiate a clean model
# Seed before building it: the weights are randomly initialised here, which
# happens before the Trainer below is constructed, so without this line the
# initial weights differ on every run.
torch.manual_seed(training_args.seed)
transformer_model = AutoModel.from_config(config)
model = RatingsModel(transformer_model)
_ = model.to(device)

trainer = RegressionTrainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=collate_fn,
)

# Workaround so that a validation loss is reported during evaluation.
trainer.can_return_loss = True

In [17]:
# Run the training loop configured above; the returned TrainOutput is the cell's output.
trainer.train()

Step,Training Loss,Validation Loss
3478,1.7515,1.127283
6956,1.1823,1.100623
10434,1.1156,1.079834
13912,1.1177,1.066914
17390,1.0798,1.065529
20868,1.0738,1.065454
24346,1.0347,1.06161
27824,1.0453,1.048201
31302,1.0021,1.053628
34780,1.0124,1.048743


TrainOutput(global_step=69550, training_loss=1.0403015373663282, metrics={'train_runtime': 1726.1904, 'train_samples_per_second': 1289.267, 'train_steps_per_second': 40.291, 'total_flos': 0.0, 'train_loss': 1.0403015373663282, 'epoch': 10.0})

In [18]:
# The list of dicts the Trainer recorded for each logging/evaluation event.
logs = trainer.state.log_history

logs[:4]

[{'loss': 1.7515,
  'grad_norm': 4.927058219909668,
  'learning_rate': 4.969317935368992e-05,
  'epoch': 0.5000718907260964,
  'step': 3478},
 {'eval_loss': 1.1272826194763184,
  'eval_runtime': 5.9426,
  'eval_samples_per_second': 3365.521,
  'eval_steps_per_second': 105.173,
  'epoch': 0.5000718907260964,
  'step': 3478},
 {'loss': 1.1823,
  'grad_norm': 4.919924259185791,
  'learning_rate': 4.8778157098802465e-05,
  'epoch': 1.0001437814521927,
  'step': 6956},
 {'eval_loss': 1.1006231307983398,
  'eval_runtime': 5.4397,
  'eval_samples_per_second': 3676.699,
  'eval_steps_per_second': 114.897,
  'epoch': 1.0001437814521927,
  'step': 6956}]

In [19]:
# Merge each training-loss entry with the evaluation entry that directly
# follows it (entries are consumed in pairs; a trailing unpaired entry is
# skipped). New dicts are built instead of calling .update() on the entries,
# so trainer.state.log_history is left untouched.
logs_consolidated = [
    {**logs[i], **logs[i + 1]}
    for i in range(0, len(logs) - 1, 2)
]

df_logs = pl.DataFrame(logs_consolidated).sort("epoch")
df_logs.write_parquet("llm_scratch_train_logs.parquet")
df_logs

loss,grad_norm,learning_rate,epoch,step,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second
f64,f64,f64,f64,i64,f64,f64,f64,f64
1.7515,4.927058,0.00005,0.500072,3478,1.127283,5.9426,3365.521,105.173
1.1823,4.919924,0.000049,1.000144,6956,1.100623,5.4397,3676.699,114.897
1.1156,6.001026,0.000047,1.500216,10434,1.079834,5.3925,3708.867,115.902
1.1177,6.690367,0.000045,2.000288,13912,1.066914,5.4362,3679.061,114.971
1.0798,9.009649,0.000043,2.500359,17390,1.065529,5.4792,3650.18,114.068
…,…,…,…,…,…,…,…,…
0.9317,9.880666,0.000007,7.501078,52170,1.058548,5.3844,3714.437,116.076
0.9328,8.97761,0.000005,8.00115,55648,1.055869,5.4355,3679.487,114.984
0.9217,6.215415,0.000003,8.501222,59126,1.066425,5.4003,3703.501,115.734
0.9177,4.816526,0.000001,9.001294,62604,1.059349,5.3999,3703.802,115.744


In [20]:
# Persist the trained model's weights in safetensors format.
save_model(model, "imdb_embeddings_llm_scratch.safetensors")

## Test Model

In this case, need to evaluate the LLM in batches to avoid going OOM.

In [21]:
# eval() switches the dropout layers off for inference.
_ = model.to(device).eval()

dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,  # keep predictions in the same order as the targets
    pin_memory=False,
)
preds_bucket = []

# The loop variable is `test_batch`, not `batch`: reusing `batch` would
# overwrite the batch() helper that the tokenization cell calls.
with torch.no_grad():
    for test_batch in tqdm(dataloader, smoothing=0):
        output = model(input_ids=test_batch[0], attention_mask=test_batch[1])
        preds_bucket.append(output.detach().cpu().numpy())

# The targets are the third tensor held by the dataset; read it directly
# instead of re-stacking it one item at a time.
actual_values = test_dataset.tensors[2]

test_results = (
    pl.DataFrame(
        {
            "Predicted": np.hstack(preds_bucket),
            "Actual": actual_values.cpu().numpy(),
        }
    )
    .with_columns(
        abs_diff=(pl.col("Predicted") - pl.col("Actual")).abs(),
        square_error=((pl.col("Actual") - pl.col("Predicted")) ** 2),
    )
)

test_results

100%|██████████| 313/313 [00:03<00:00, 100.30it/s]


Predicted,Actual,abs_diff,square_error
f32,f32,f32,f32
7.191406,7.1,0.091406,0.008355
6.472656,6.5,0.027344,0.000748
5.234375,4.1,1.134375,1.286807
6.011719,5.5,0.511719,0.261856
7.683594,7.2,0.483594,0.233863
…,…,…,…
6.421875,6.2,0.221875,0.049229
4.675781,3.2,1.475781,2.17793
5.636719,5.7,0.063281,0.004004
5.65625,5.7,0.04375,0.001914


In [22]:
# Mean Absolute Error (MAE) over the test set
mae = test_results["abs_diff"].mean()
mae

0.7710315263330937

In [23]:
# Mean Square Error (MSE) over the test set
mse = test_results["square_error"].mean()
mse

1.0620494208003317