In [21]:
import logging
import os
from pathlib import Path

import colorlog
import torch
from datatrove.utils.dataset import DatatroveFolderDataset
from torch import Tensor
from torch.optim import AdamW, Optimizer
from torch.optim.lr_scheduler import LRScheduler
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments
from transformers.utils.logging import set_verbosity
from torch.utils.data import DataLoader
from src.optim import get_wsd_scheduler
from src.utilities import get_logger

In [3]:
# Configure the logger and configure colorlog
logger = get_logger("training", "info")

In [4]:
tok_path = Path("/home/pl487/rdd/outputs/tokenizers/2024-08-28T16-34-11/tok-vocab32000")
tok = AutoTokenizer.from_pretrained(str(tok_path))



In [14]:
architecure_id = "HuggingFaceTB/SmolLM-135M"
config = AutoConfig.from_pretrained(architecure_id)
config.vocab_size = tok.vocab_size
config.eos_token_id = tok.eos_token_id
config.hidden_size = 128 * 4
config.intermediate_size = config.hidden_size * 2
config.max_position_embeddings = 128 * 4
config.num_attention_heads = 8
config.num_key_value_heads = config.num_attention_heads // 2
config.num_hidden_layers = 8

hub_model_id = f"smollm-135m-vocab{tok.vocab_size}"

model = AutoModelForCausalLM.from_config(
    config, 
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)

# model.push_to_hub(hub_model_id, revision="step0")

logger.info(f"Memory footprint: {model.get_memory_footprint() / 1e6:.2f} MB")
logger.info(f"Num parameters: {model.num_parameters() / 1e6:.1f}M")

[[36m2024-08-29 17:24:38,811[0m][[34mtraining[0m][[32mINFO[0m] - Memory footprint: 70.54 MB[0m
[[36m2024-08-29 17:24:38,813[0m][[34mtraining[0m][[32mINFO[0m] - Num parameters: 35.3M[0m


In [16]:
training_args = TrainingArguments(
    # =======
    # logging
    # =======
    output_dir=f"training_outputs/{hub_model_id}",
    logging_strategy="steps",
    logging_first_step=True,
    log_level="passive", # takes it from global
    include_num_input_tokens_seen=True,
    report_to="tensorboard",
    hub_model_id=hub_model_id,
    hub_strategy="all_checkpoints",
    hub_private_repo=True,
    # =====
    # setup
    # =====
    evaluation_strategy="no",
    seed=42,
    bf16=True,
    bf16_full_eval=True,
    tf32=True,
    torch_compile=True,
    # =============
    # checkpointing
    # =============
    save_strategy="steps",
    save_steps=50,
    save_safetensors=True,
    push_to_hub=True,
    # ============
    # optimisation
    # ============
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    optim="adamw_torch",
    learning_rate=2e-5,
    weight_decay=0.1,
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    # lr_scheduler_type=None,
    lr_scheduler_kwargs=dict(
        final_lr_factor=0.0,
        init_div_factor=100,
        frac_decay=0.1,
        decay_type="sqrt",
    ),  # use to pass 
    warmup_steps=2_000,
    num_train_epochs=1,
    max_steps=100,
    # ===========
    # dataloading
    # ===========
    # dataloader_num_workers=os.cpu_count() - 1,
    # dataloader_pin_memory=True,
)



In [27]:
class LMTrainer(Trainer):
    
    def create_optimizer(self) -> Optimizer:
        # need to set self.optimizer
        
        # Get params that require grad
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        
        # Create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': self.args.weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        
        logger.info(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        logger.info(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        
        # Create AdamW optimizer and use the fused version
        self.optimizer = AdamW(
            optim_groups, 
            lr=self.args.learning_rate, 
            betas=(self.args.adam_beta1, self.args.adam_beta2), 
            eps=self.args.adam_epsilon, 
            fused=True,
        )
        
        return self.optimizer
    
    def create_scheduler(self, num_training_steps: int, optimizer: Optimizer = None) -> LRScheduler:
        if self.args.lr_scheduler_type is None:
            return get_wsd_scheduler(
                optimizer=self.optimizer if optimizer is None else optimizer,
                num_warmup_steps=self.args.warmup_steps,
                num_training_steps=num_training_steps,
                **self.args.lr_scheduler_kwargs,
            )
        
        return super().create_scheduler(num_training_steps, optimizer)

    def get_train_dataloader(self) -> DataLoader:
        target_repo = "hf://datasets/pietrolesci/fineweb-edu-10BT"
        ds = DatatroveFolderDataset(
            folder_path=f"{target_repo}/{tok_path.name}", 
            seq_len=config.max_position_embeddings, 
            shuffle=True,
            seed=42,
            token_size=2 if config.vocab_size < 65_000 else 4,
        )

        dataloader_params = {
            "batch_size": self._train_batch_size,
            "collate_fn": self.data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
            "persistent_workers": self.args.dataloader_persistent_workers,
        }

        return self.accelerator.prepare(DataLoader(ds, **dataloader_params))
    
    def compute_loss(self, model, inputs, return_outputs=False) -> Tensor:
        input_ids = batch["input_ids"]
        labels = input_ids.clone()
        outputs = model(input_ids=input_ids, labels=labels)
        return outputs.loss
                

    

In [25]:
target_repo = "hf://datasets/pietrolesci/fineweb-edu-10BT"
ds = DatatroveFolderDataset(
    folder_path=f"{target_repo}/{tok_path.name}", 
    seq_len=config.max_position_embeddings, 
    shuffle=True,
    seed=42,
    token_size=2 if config.vocab_size < 65_000 else 4,
)

In [26]:
next(iter(ds))["input_ids"]

torch.Size([257])

In [None]:
batch = next(iter(ds))["input_ids"].to("cuda")
model = model.to("cuda")

In [None]:
model.forward(input_ids=batch.unsqueeze(-1)).logits.shape

In [28]:
trainer = Trainer(model, args=training_args)

max_steps is given, it will override any value given in num_train_epochs


In [29]:
trainer.train()

ValueError: Trainer: training requires a train_dataset.