## PyTorch Training

Uses the `Trainer` included in Hugging Face `transformers` (backed by `accelerate`), since it removes a lot of annoying boilerplate.


In [1]:
import polars as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from transformers import Trainer, TrainingArguments, ModernBertConfig, AutoModel
from tqdm import tqdm

In [2]:
# Build a lazy query over the parquet file that keeps only the id, rating and
# document columns, then materialize it.
df = (
    pl.scan_parquet(
        "movie_data_plus_embeds_all.parquet",
    )
    .select(["tconst", "averageRating", "json"])
    # The rating is the regression target; store it as float32.
    .with_columns(averageRating=pl.col("averageRating").cast(pl.Float32))
    .collect()
    # fraction=1.0 with shuffle=True keeps every row but randomizes the order
    # (repeatably, via the seed). The order matters: a later cell holds out
    # the last rows as the test split.
    .sample(fraction=1.0, shuffle=True, seed=42)
)

df

tconst,averageRating,json
str,f32,str
"""tt0173052""",4.1,"""{  ""title"": ""The Prince and t…"
"""tt0266288""",7.4,"""{  ""title"": ""Azhakiya Ravanan…"
"""tt6263490""",4.3,"""{  ""title"": ""Getaway"",  ""gen…"
"""tt10049110""",7.8,"""{  ""title"": ""Die Wiese"",  ""g…"
"""tt5761612""",3.8,"""{  ""title"": ""Woman on the Edg…"
…,…,…
"""tt0079376""",6.2,"""{  ""title"": ""The Proud Twins""…"
"""tt1161064""",3.2,"""{  ""title"": ""Super Capers: Th…"
"""tt0179526""",5.7,"""{  ""title"": ""Who's the Caboos…"
"""tt0188233""",5.7,"""{  ""title"": ""That's Erotic"", …"


## Train a Custom Tokenizer

Use the ModernBERT tokenizer as a base, but reduce the vocabulary significantly and tailor it to this specific dataset.


In [3]:
from transformers import AutoTokenizer

# One document string per row of df, in df's (shuffled) order.
json_docs = df["json"].to_list()

# Stock ModernBERT tokenizer; it serves as the base for the custom tokenizer.
base_tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
# Baseline: preview the vocabulary and count the tokens of the first document.
print(str(base_tokenizer.vocab)[0:100])
print(len(base_tokenizer(json_docs[0])["input_ids"]))

{'South': 17967, 'Ġgestation': 34422, 'Ġrelevant': 4623, 'Ġcron': 42695, 'Ġweaker': 21076, 'occ': 34
378


In [4]:
# Target vocabulary size for the dataset-specific tokenizer; the model config
# reuses this value.
vocab_size = 5000

# Train on the first 50,000 documents only: using every text would take far
# too long. df was shuffled when it was loaded, so this prefix is a random subset.
tokenizer = base_tokenizer.train_new_from_iterator(
    iter(json_docs[:50000]), vocab_size=vocab_size,
    # (disabled experiment) treat runs of indentation spaces as special tokens:
    # new_special_tokens=["  ", "    ", "      "]
)

# Same two checks as for the base tokenizer, for comparison.
print(str(tokenizer.vocab)[0:100])
print(len(tokenizer(json_docs[0])["input_ids"]))




{'Brendan': 3660, 'gu': 752, 'Ir': 1595, 'ĠRuiz': 3764, 'Ã¼ller': 3184, 'Martin': 958, 'Adri': 2039,
302


Pre-encode all the documents. A `max_length` of 1024 may be excessive, but compared with a max length of 512 it does not slow model training proportionately, thanks to ModernBERT's unpadding + RoPE.

To avoid OOMs on the host system, tokenize in batches, then push the stacked result to the GPU. (Ideally we _could_ push each batch to the GPU as it is produced, but that causes GPU memory leaks.)


In [5]:
def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable`` containing at most ``n`` items.

    ``iterable`` must support ``len()`` and slicing (e.g. a list or a string).
    The final slice is shorter when the length is not a multiple of ``n``.
    """
    total = len(iterable)
    for start in range(0, total, n):
        stop = min(start + n, total)
        yield iterable[start:stop]

In [6]:
max_length = 1024
token_batch_size = 2048
device = "cuda:0"

# Ceiling division, so the final partially filled batch is counted as well.
# Floor division under-reports the total by one whenever len(json_docs) is not
# a multiple of the batch size, and tqdm then cannot draw a proper progress bar.
n_token_batches = (len(json_docs) + token_batch_size - 1) // token_batch_size

# Collect the per-batch tensors in lists and stack them once at the end.
input_ids = []
attention_mask = []

for docs in tqdm(batch(json_docs, token_batch_size), total=n_token_batches):
    # Every document is padded or truncated to exactly max_length tokens, so
    # all batches share the same width and can be stacked.
    tokens = tokenizer(docs,
                       max_length=max_length,
                       padding="max_length",
                       truncation=True,
                       return_tensors="pt")

    input_ids.append(tokens["input_ids"])
    attention_mask.append(tokens["attention_mask"])

# Move to the GPU only once, after stacking.
input_ids = torch.vstack(input_ids).to(device)
attention_mask = torch.vstack(attention_mask).to(device)

input_ids.size()

119it [03:23,  1.71s/it]                         
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


torch.Size([242552, 1024])

In [7]:
device = "cuda:0"
n_test = 20000

# The last n_test rows are the hold-out split; everything before them is used
# for training. Token ids and masks are converted to 32-bit integers.
X_input_ids_train, X_input_ids_test = (
    input_ids[:-n_test].int().to(device),
    input_ids[-n_test:].int().to(device),
)

X_attention_train, X_attention_test = (
    attention_mask[:-n_test].int().to(device),
    attention_mask[-n_test:].int().to(device),
)

# Targets are split at the same row boundary. The numpy array is copied before
# being handed to torch.from_numpy.
y_train, y_test = (
    torch.from_numpy(df[:-n_test]["averageRating"].to_numpy().copy()).to(device),
    torch.from_numpy(df[-n_test:]["averageRating"].to_numpy().copy()).to(device),
)

y_train

tensor([4.1000, 7.4000, 4.3000,  ..., 6.4000, 6.0000, 6.5000], device='cuda:0')

In [8]:
# Each dataset item is an (input_ids, attention_mask, rating) triple;
# collate_fn relies on this field order.
train_dataset = TensorDataset(X_input_ids_train, X_attention_train, y_train)
test_dataset = TensorDataset(X_input_ids_test, X_attention_test, y_test)

## Build the Model

Due to the new tokenizer, the special tokens for the fresh ModernBERT model have to be explicitly defined.


In [9]:
# Map each special-token role (e.g. "pad_token") to its id in the new tokenizer.
# Each id is looked up from the role's own token string. Zipping the role names
# against `all_special_ids` would silently mislabel ids if the two collections
# were ever ordered differently or had different lengths.
special_token_dict = {
    name: tokenizer.convert_tokens_to_ids(token)
    for name, token in tokenizer.special_tokens_map.items()
    if isinstance(token, str)  # skip list-valued entries; only single tokens are needed
}
special_token_dict

{'unk_token': 2,
 'sep_token': 4,
 'pad_token': 5,
 'cls_token': 3,
 'mask_token': 6}

In [16]:
# Width of the transformer's hidden states; the regression head in
# RatingsModel uses the same value as its input width.
hidden_size = 128
dropout = 0.1

# Architecture of the from-scratch model. The vocabulary size and maximum
# sequence length match the custom tokenizer and the pre-encoded inputs.
config = ModernBertConfig(
    vocab_size=vocab_size,
    max_position_embeddings=max_length,
    hidden_size=hidden_size,
    intermediate_size=768,
    num_hidden_layers=12,
    num_attention_heads=8,
    # global_attn_every_n_layers=2,
    local_attention=16,
    # Dropout is applied to attention only; the other dropout options are left disabled.
    attention_dropout=dropout,
    # embeddings_dropout=dropout,
    # mlp_dropout=dropout,
    # The retrained tokenizer has its own special-token ids, so pass them explicitly.
    unk_token_id=special_token_dict["unk_token"],
    sep_token_id=special_token_dict["sep_token"],
    pad_token_id=special_token_dict["pad_token"],
    cls_token_id=special_token_dict["cls_token"],
    mask_token_id=special_token_dict["mask_token"],
)

# Build a fresh model from the config alone.
transformer_model = AutoModel.from_config(config)

In [17]:
class RatingsModel(nn.Module):
    """Transformer encoder followed by a linear head that predicts one rating per input.

    Args:
        model: Encoder module. It must expose ``config.hidden_size`` and, when
            called with ``input_ids`` and ``attention_mask``, return an object
            with a ``last_hidden_state`` attribute.
    """

    def __init__(self, model):
        super().__init__()
        self.transformer_model = model
        # Size the head from the encoder's own config instead of a notebook
        # global, so the head can never disagree with the encoder it wraps.
        self.output = nn.Linear(model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, targets=None):
        """Return a 1D tensor holding one predicted rating per row of ``input_ids``.

        ``targets`` is accepted but not used, so a caller can pass a whole
        batch dict (including its targets) with ``model(**inputs)``.
        """
        # Call the module itself rather than ``.forward`` so registered hooks
        # run. Only the final layer is needed, so per-layer hidden states are
        # not requested.
        encoded = self.transformer_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        cls_vector = encoded.last_hidden_state[:, 0]  # the [CLS] vector

        # squeeze(-1) removes only the feature axis. A bare squeeze() would also
        # drop the batch axis for a batch of one and return a 0-d tensor.
        return self.output(cls_vector).squeeze(-1)

In [18]:
# Instantiate the full model to check its size. The training-setup cell builds
# a fresh instance again before training.
model = RatingsModel(transformer_model)
_ = model.to(device)

# Total parameter count: encoder plus regression head.
total_params = sum(p.numel() for p in model.parameters())
print(total_params)

torch.set_float32_matmul_precision('high')  # perf increase for ModernBERT

4968705


Validation loss doesn't play nice with the `Trainer` out of the box, so it needs [some tweaks](https://discuss.huggingface.co/t/no-log-for-validation-loss-during-training-with-trainer/40094/3).


In [19]:
def collate_fn(examples):
    """Stack a list of (input_ids, attention_mask, target) items into one batch dict.

    Returns a dict with the keys ``"input_ids"``, ``"attention_mask"`` and
    ``"targets"``; each value has the batch as its first dimension.
    """

    def stack_field(position):
        # Gather the same field from every example and add a leading batch axis.
        return torch.stack([example[position] for example in examples])

    return {
        "input_ids": stack_field(0),
        "attention_mask": stack_field(1),
        "targets": stack_field(2),
    }


class RegressionTrainer(Trainer):
    """``Trainer`` whose loss is the mean squared error against ``inputs["targets"]``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=0):
        """Run the model on the batch and score its predictions with MSE.

        Returns the loss alone, or a ``(loss, predictions)`` tuple when
        ``return_outputs`` is true. ``num_items_in_batch`` is accepted but unused.
        """
        predictions = model(**inputs)
        # Swap in F.l1_loss here to optimize MAE instead of MSE.
        mse = F.mse_loss(predictions, inputs["targets"])

        if return_outputs:
            return (mse, predictions)
        return mse


In [20]:
training_args = TrainingArguments(
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    weight_decay=0.001,
    save_strategy="no",
    eval_strategy="steps",
    eval_steps=0.05,  # a value below 1 is a fraction of the total training steps
    logging_strategy="steps",
    logging_steps=0.05,  # log at the same cadence as evaluation
    fp16=True,
    # The dataset tensors already live on the GPU, so loader worker processes
    # and pinned host memory have nothing to speed up.
    dataloader_num_workers=0,
    dataloader_pin_memory=False,
    dataloader_persistent_workers=False,
)

# Seed before building the model so its random weight initialization is
# reproducible. The model is created outside the Trainer, so nothing else
# seeds the RNG before this point.
torch.manual_seed(training_args.seed)

# reinstantiate a clean model
transformer_model = AutoModel.from_config(config)
model = RatingsModel(transformer_model)
_ = model.to(device)

trainer = RegressionTrainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=collate_fn,
)

# Workaround so that the Trainer reports a validation loss for this custom
# model (see the forum thread linked in the text above).
trainer.can_return_loss = True

In [21]:
# Run training; loss is logged and the test split evaluated at the intervals
# set in training_args.
trainer.train()

Step,Training Loss,Validation Loss
870,1.5113,1.173166
1740,1.1608,1.088658
2610,1.0907,1.074188
3480,1.0908,1.047315
4350,1.0226,1.043286
5220,1.0158,1.051057
6090,0.9398,1.046057
6960,0.955,1.02558
7830,0.8633,1.047111
8700,0.8845,1.027024


TrainOutput(global_step=17390, training_loss=0.8742096105205938, metrics={'train_runtime': 2009.5099, 'train_samples_per_second': 1107.494, 'train_steps_per_second': 8.654, 'total_flos': 0.0, 'train_loss': 0.8742096105205938, 'epoch': 10.0})

In [22]:
# History recorded by the Trainer: a list of dicts, with training entries
# ("loss") and evaluation entries ("eval_loss") logged at the same steps.
logs = trainer.state.log_history

# Preview the first four entries.
logs[0:4]

[{'loss': 1.5113,
  'grad_norm': 10.823572158813477,
  'learning_rate': 0.00019878432868078476,
  'epoch': 0.5002875215641173,
  'step': 870},
 {'eval_loss': 1.1731656789779663,
  'eval_runtime': 5.6989,
  'eval_samples_per_second': 3509.442,
  'eval_steps_per_second': 27.549,
  'epoch': 0.5002875215641173,
  'step': 870},
 {'loss': 1.1608,
  'grad_norm': 3.601486921310425,
  'learning_rate': 0.00019513352557918312,
  'epoch': 1.0005750431282345,
  'step': 1740},
 {'eval_loss': 1.0886578559875488,
  'eval_runtime': 5.673,
  'eval_samples_per_second': 3525.466,
  'eval_steps_per_second': 27.675,
  'epoch': 1.0005750431282345,
  'step': 1740}]

In [23]:
# Training and evaluation entries alternate in the log history, so pair entry
# 2k with entry 2k+1; an unpaired trailing entry is ignored. Each row is a new
# dict: calling update() on the originals would modify the entries held by
# trainer.state.log_history. Pairs that are not a (training, evaluation) couple
# are skipped instead of being merged into a malformed row.
logs_consolidated = [
    {**train_log, **eval_log}
    for train_log, eval_log in zip(logs[0::2], logs[1::2])
    if "loss" in train_log and "eval_loss" in eval_log
]

df_logs = pl.DataFrame(logs_consolidated).sort("epoch")
df_logs.write_parquet("llm_scratch_train_logs.parquet")
df_logs

loss,grad_norm,learning_rate,epoch,step,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second
f64,f64,f64,f64,i64,f64,f64,f64,f64
1.5113,10.823572,0.000199,0.500288,870,1.173166,5.6989,3509.442,27.549
1.1608,3.601487,0.000195,1.000575,1740,1.088658,5.673,3525.466,27.675
1.0907,2.74441,0.000189,1.500863,2610,1.074188,5.7019,3507.58,27.535
1.0908,4.296513,0.000181,2.00115,3480,1.047315,5.6973,3510.434,27.557
1.0226,3.782547,0.000171,2.501438,4350,1.043286,5.6781,3522.284,27.65
…,…,…,…,…,…,…,…,…
0.6759,4.424142,0.000029,7.504313,13050,1.135734,5.6571,3535.363,27.753
0.6799,3.291351,0.000019,8.0046,13920,1.118866,5.6781,3522.313,27.65
0.6419,3.996638,0.000011,8.504888,14790,1.145908,5.6604,3533.313,27.737
0.6386,4.407036,0.000005,9.005175,15660,1.145411,5.6848,3518.16,27.618


In [24]:
from safetensors.torch import save_model

# Persist the trained model's weights (encoder and regression head) to disk.
save_model(model, "imdb_embeddings_llm_scratch.safetensors")

## Test Model

In this case, we need to evaluate the LLM in batches to avoid going OOM.

In [25]:
import numpy as np

# eval() puts the model in inference mode, which turns off the attention
# dropout set in the config so that predictions are deterministic.
_ = model.to(device).eval()

dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=64,
                                         shuffle=False,
                                         pin_memory=False)
preds_bucket = []

# The loop variable is named `test_batch` on purpose: calling it `batch` would
# overwrite the batch() helper and break a re-run of the tokenization cell.
with torch.no_grad():
    for test_batch in tqdm(dataloader, smoothing=0):
        output = model(input_ids=test_batch[0],
                       attention_mask=test_batch[1])
        preds_bucket.append(output.detach().cpu().numpy())

# shuffle=False keeps predictions in dataset order, so the dataset's target
# tensor lines up with them directly. Reading it avoids indexing the dataset
# one item at a time.
actual_values = test_dataset.tensors[2]

test_results = (
    pl.DataFrame({
        "Predicted": np.hstack(preds_bucket),
        "Actual": actual_values.cpu().numpy(),
    })
    .with_columns(
        abs_diff=(pl.col("Predicted") - pl.col("Actual")).abs(),
        square_error=((pl.col("Actual") - pl.col("Predicted")) ** 2),
    )
)

test_results

100%|██████████| 313/313 [00:05<00:00, 55.84it/s]


Predicted,Actual,abs_diff,square_error
f32,f32,f32,f32
7.066406,7.1,0.033594,0.001129
6.2890625,6.5,0.2109375,0.044495
6.214844,4.1,2.114844,4.472565
5.214844,5.5,0.285156,0.081314
7.707031,7.2,0.507031,0.257081
…,…,…,…
6.6953125,6.2,0.495313,0.245335
4.644531,3.2,1.444531,2.08667
6.472656,5.7,0.772656,0.596998
6.09375,5.7,0.39375,0.155039


In [26]:
# Mean Absolute Error (MAE) over the held-out test split
test_results["abs_diff"].mean()

0.8016399429321289

In [27]:
# Mean Squared Error (MSE) over the held-out test split
test_results["square_error"].mean()

1.1549042394775857