## PyTorch Training

Uses the Trainer included in Hugging Face `transformers` (backed by `accelerate`) since it mitigates a lot of annoying boilerplate.


In [1]:
import polars as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from transformers import Trainer, TrainingArguments, ModernBertConfig, AutoModel
from tqdm import tqdm

In [2]:
df = (
    pl.scan_parquet(
        "movie_data_plus_embeds_all.parquet",
    )
    .select(["tconst", "averageRating", "json"])
    .with_columns(averageRating=pl.col("averageRating").cast(pl.Float32))
    .collect()
    .sample(fraction=1.0, shuffle=True, seed=42)
)

df

tconst,averageRating,json
str,f32,str
"""tt0173052""",4.1,"""{  ""title"": ""The Prince and t…"
"""tt0266288""",7.4,"""{  ""title"": ""Azhakiya Ravanan…"
"""tt6263490""",4.3,"""{  ""title"": ""Getaway"",  ""gen…"
"""tt10049110""",7.8,"""{  ""title"": ""Die Wiese"",  ""g…"
"""tt5761612""",3.8,"""{  ""title"": ""Woman on the Edg…"
…,…,…
"""tt0079376""",6.2,"""{  ""title"": ""The Proud Twins""…"
"""tt1161064""",3.2,"""{  ""title"": ""Super Capers: Th…"
"""tt0179526""",5.7,"""{  ""title"": ""Who's the Caboos…"
"""tt0188233""",5.7,"""{  ""title"": ""That's Erotic"", …"


## Train a Custom Tokenizer

Use the `modernbert` tokenizer as a base, just reduce the vocabulary significantly and tailor it to this specific dataset.


In [3]:
from transformers import AutoTokenizer

json_docs = df["json"].to_list()

base_tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
print(str(base_tokenizer.vocab)[0:100])
print(len(base_tokenizer(json_docs[0])["input_ids"]))

{'devices': 49242, 'ĠLloyd': 24395, 'ĠR': 416, 'ĠTruck': 42712, 'Ġsatisfies': 12310, 'Follow': 18905
378


In [4]:
vocab_size = 5000

# don't train on all texts because it will take forever
tokenizer = base_tokenizer.train_new_from_iterator(
    iter(json_docs[:50000]), vocab_size=vocab_size
)

print(str(tokenizer.vocab)[0:100])
print(len(tokenizer(json_docs[0])["input_ids"]))




{'lick': 4092, 'yl': 1128, 'ĠAp': 2639, 'ď': 210, 'mith': 856, 'ò': 181, 'Rand': 3915, 'Horror': 665
302


Preencode all the tokens. A `max_length` of 1024 may be excessive but does not cause a proportionate reduction in model training speed over a 512 max length due to ModernBERT's unpadding.

In order to avoid OOMs on the host system, generate in batches, then push to the GPU. (ideally we _could_ push to the GPU for each batch but that will cause GPU memory leaks)


In [5]:
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

In [6]:
max_length = 1024
token_batch_size = 2048
device = "cuda:0"

# input_ids = torch.empty((0, max_length)).to("cpu")
# attention_mask = torch.empty((0, max_length)).to("cpu")

input_ids = []
attention_mask = []

for docs in tqdm(batch(json_docs, token_batch_size), total=len(json_docs) // token_batch_size):
    tokens = tokenizer(docs,
                       max_length=max_length,
                       padding="max_length",
                       truncation=True,
                       return_tensors="pt").to("cpu")
    
    # input_ids = torch.vstack([input_ids, tokens["input_ids"]])
    # attention_mask = torch.vstack([attention_mask, tokens["attention_mask"]])
    
    input_ids.append(tokens["input_ids"])
    attention_mask.append(tokens["attention_mask"])
   
input_ids = torch.vstack(input_ids).to(device)
attention_mask = torch.vstack(attention_mask).to(device)

input_ids.size()

119it [03:21,  1.70s/it]                         
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


torch.Size([242552, 1024])

In [7]:
device = "cuda:0"
n_test = 20000

X_input_ids_train = input_ids[:-n_test].int().to(device)
X_input_ids_test = input_ids[-n_test:].int().to(device)

X_attention_train = attention_mask[:-n_test].int().to(device)
X_attention_test = attention_mask[-n_test:].int().to(device)

y_train = torch.from_numpy(df[:-n_test]["averageRating"].to_numpy().copy()).to(device)
y_test = torch.from_numpy(df[-n_test:]["averageRating"].to_numpy().copy()).to(device)

y_train

tensor([4.1000, 7.4000, 4.3000,  ..., 6.4000, 6.0000, 6.5000], device='cuda:0')

In [8]:
train_dataset = TensorDataset(X_input_ids_train, X_attention_train, y_train)
test_dataset = TensorDataset(X_input_ids_test, X_attention_test, y_test)

## Build the Model

Due to the new tokenizer, the special tokens for the fresh ModernBERT model have to be explicitly defined.


In [9]:
special_token_dict = dict(
    zip(tokenizer.special_tokens_map.keys(), tokenizer.all_special_ids)
)
special_token_dict

{'unk_token': 2,
 'sep_token': 4,
 'pad_token': 5,
 'cls_token': 3,
 'mask_token': 6}

In [37]:
hidden_size = 128

config = ModernBertConfig(
    vocab_size=vocab_size,
    max_position_embeddings=max_length,
    hidden_size=hidden_size,
    intermediate_size=768,
    num_hidden_layers=12,
    num_attention_heads=8,
    unk_token_id=special_token_dict["unk_token"],
    sep_token_id=special_token_dict["sep_token"],
    pad_token_id=special_token_dict["pad_token"],
    cls_token_id=special_token_dict["cls_token"],
    mask_token_id=special_token_dict["mask_token"],
)

transformer_model = AutoModel.from_config(config)
total_params = sum(p.numel() for p in transformer_model.parameters())
total_params

4968576

In [38]:
class RatingsModel(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.transformer_model = model
        self.output = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask, targets=None):
        x = self.transformer_model.forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        x = x.last_hidden_state[:, 0]  # the [CLS] vector
        x = self.output(x)

        return x.squeeze()  # return 1D output

In [39]:
model = RatingsModel(transformer_model)
_ = model.to(device)

torch.set_float32_matmul_precision('high')  # perf increase for ModernBERT

Validation loss doesn't play nice with the `Trainer` out of the boss, so need [some tweaks](https://discuss.huggingface.co/t/no-log-for-validation-loss-during-training-with-trainer/40094/3).


In [40]:
def collate_fn(examples):
    input_ids = torch.stack([f[0] for f in examples])
    attention_masks = torch.stack([f[1] for f in examples])
    targets = torch.stack([f[2] for f in examples])

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "targets": targets,
    }


class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=0):
        outputs = model(**inputs)
        # loss = nn.L1Loss()(outputs, inputs["targets"])  # L1 loss is MAE
        loss = nn.MSELoss()(outputs, inputs["targets"])

        return (loss, outputs) if return_outputs else loss


In [41]:
training_args = TrainingArguments(
    learning_rate=1e-3,
    lr_scheduler_type="cosine_with_restarts",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.001,
    save_strategy="no",
    eval_strategy="steps",
    eval_steps=0.05,
    logging_strategy="steps",
    logging_steps=0.05,
    fp16=True,
    dataloader_num_workers=0,  # since data is in memory, as problem is GPU bound
    dataloader_pin_memory=False,
    dataloader_persistent_workers=False,
)

# reinstantiate a clean model
transformer_model = AutoModel.from_config(config)
model = RatingsModel(transformer_model)
_ = model.to(device)

trainer = RegressionTrainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=collate_fn,
)

trainer.can_return_loss = True

In [42]:
trainer.train()

W0623 01:47:29.129000 15552 site-packages/torch/_inductor/utils.py:1250] [1/4] Not enough SMs to use max_autotune_gemm mode


Step,Training Loss,Validation Loss
1739,1.3567,1.147596
3478,1.2124,1.131216
5217,1.1525,1.124591
6956,1.1506,1.105348
8695,1.1172,1.090652
10434,1.1019,1.113789
12173,1.0612,1.093917
13912,1.0717,1.058538
15651,1.0127,1.06871
17390,1.0255,1.053821


TrainOutput(global_step=34780, training_loss=0.9480032518857371, metrics={'train_runtime': 1883.1121, 'train_samples_per_second': 1181.831, 'train_steps_per_second': 18.469, 'total_flos': 0.0, 'train_loss': 0.9480032518857371, 'epoch': 10.0})

In [43]:
trainer.evaluate(test_dataset)

{'eval_loss': 1.2251088619232178,
 'eval_runtime': 5.1475,
 'eval_samples_per_second': 3885.398,
 'eval_steps_per_second': 60.806,
 'epoch': 10.0}

In [44]:
logs = trainer.state.log_history

logs[0:4]

[{'loss': 1.3567,
  'grad_norm': 7.730082035064697,
  'learning_rate': 0.0009938864888000992,
  'epoch': 0.5,
  'step': 1739},
 {'eval_loss': 1.1475961208343506,
  'eval_runtime': 8.6645,
  'eval_samples_per_second': 2308.258,
  'eval_steps_per_second': 36.124,
  'epoch': 0.5,
  'step': 1739},
 {'loss': 1.2124,
  'grad_norm': 11.11757755279541,
  'learning_rate': 0.0009756119265622743,
  'epoch': 1.0,
  'step': 3478},
 {'eval_loss': 1.1312155723571777,
  'eval_runtime': 5.213,
  'eval_samples_per_second': 3836.568,
  'eval_steps_per_second': 60.042,
  'epoch': 1.0,
  'step': 3478}]

In [45]:
logs_consolidated = []
i = 0
while i < len(logs)-1:
    base_log = logs[i]
    base_log.update(logs[i+1])
    logs_consolidated.append(base_log)
    i += 2
    
df_logs = pl.DataFrame(logs_consolidated).sort("epoch")
df_logs.write_parquet("llm_scratch_train_logs.parquet")
df_logs

loss,grad_norm,learning_rate,epoch,step,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.3567,7.730082,0.000994,0.5,1739,1.147596,8.6645,2308.258,36.124,,,,,
1.2124,11.117578,0.000976,1.0,3478,1.131216,5.213,3836.568,60.042,,,,,
1.1525,3.543578,0.000946,1.5,5217,1.124591,5.2206,3830.961,59.955,,,,,
1.1506,7.620698,0.000905,2.0,6956,1.105348,5.1827,3858.967,60.393,,,,,
1.1172,4.802451,0.000854,2.5,8695,1.090652,5.2121,3837.231,60.053,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.665,1.33698,0.000055,8.5,29563,1.193457,5.1953,3849.634,60.247,,,,,
0.6619,5.775957,0.000025,9.0,31302,1.197769,5.182,3859.506,60.401,,,,,
0.6089,4.370686,0.000006,9.5,33041,1.220957,5.1857,3856.752,60.358,,,,,
0.6102,7.304399,6.6088e-10,10.0,34780,1.225109,5.1692,3869.081,60.551,,,,,


In [46]:
from safetensors.torch import save_model

save_model(model, "imdb_embeddings_llm_scratch.safetensors")

## Test Model


In [49]:
_ = model.to("cuda:0").eval()  # to disable BatchNorm1D

eval_input_ids = torch.stack([f[0] for f in test_dataset])
eval_attention_masks = torch.stack([f[1] for f in test_dataset])
actual_values = torch.stack([f[2] for f in test_dataset])

with torch.no_grad():
    output = model(input_ids=eval_input_ids.to("cuda:0"),
                   attention_mask=eval_attention_masks.to("cuda:0"))
    preds = output.detach().cpu().numpy()

test_results = (pl.DataFrame({"Predicted": preds, "Actual": actual_values.cpu().numpy()})
                .with_columns(
                    abs_diff=(pl.col("Predicted") - pl.col("Actual")).abs(),
                    square_error = ((pl.col("Actual") - pl.col("Predicted")) ** 2)
                )
               )
                
test_results

OutOfMemoryError: CUDA out of memory. Tried to allocate 14.50 GiB. GPU 0 has a total capacity of 21.95 GiB of which 6.93 GiB is free. Including non-PyTorch memory, this process has 15.01 GiB memory in use. Of the allocated memory 13.40 GiB is allocated by PyTorch, and 1.37 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Mean Absolute Error (MAE)
test_results["abs_diff"].mean()

In [None]:
# Mean Square Error (MSE)
test_results["square_error"].mean()