In [1]:
import importlib
import importlib.util
import os
import sys

## Intro
The goal of this notebook is to show how to use an arbitrary model with the LeMa training loop.

In this case, we will adapt [nanoGPT](https://github.com/karpathy/nanoGPT) and train it using both the LeMa and HuggingFace training loops.

## Setup

This notebook assumes that you have already installed the `lema` package. If you haven't, you can install it by running `!pip install lema`.

We then start by cloning the nanoGPT repository and adding it to our Python path.


In [2]:
module_folder = "/tmp/lema/nanoGPT"

# Clone the nanoGPT repo
if not os.path.isdir(module_folder):
    !mkdir -p $module_folder
    !git clone https://github.com/karpathy/nanoGPT $module_folder
else:
    print("nanoGPT already cloned!")

sys.path.append(module_folder)

nanoGPT already cloned!


Next we install the required dependencies. 

In [3]:
if importlib.util.find_spec("tiktoken") is not None:
    print("tiktoken is already installed!")
else:
    !pip install tiktoken

tiktoken is already installed!


## Adapting nanoGPT model

In [4]:
import torch.nn.functional as F
from model import GPT, GPTConfig  # import from ~/nanoGPT/model.py

from lema.core import registry


@registry.register("lema-nanoGPT", registry_type=registry.RegistryType.MODEL)
class LemaNanoGPT(GPT):
    """nanoGPT `GPT` model exposed under the registry name "lema-nanoGPT".

    Wraps the parent `forward` so that it takes `input_ids` / `labels` keyword
    arguments and returns a dictionary instead of a `(logits, loss)` tuple.
    """

    def __init__(self, **kwargs):
        """Initializes the model from a default `GPTConfig` with bias disabled.

        Args:
            **kwargs: Accepted but not used; the model is always built from
                the default `GPTConfig` with `bias = False`.
        """
        gpt_config = GPTConfig()
        gpt_config.bias = False

        super().__init__(gpt_config)

    def forward(self, input_ids, labels=None, attention_mask=None):
        """Performs the forward pass of the model.

        Args:
            input_ids: Passed to the parent `forward` as `idx`.
            labels: Optional; passed to the parent `forward` as `targets`.
            attention_mask: Accepted but not used.

        Returns:
            A dict with key "logits", plus key "loss" when the parent
            `forward` returned a loss that is not None.
        """
        # Update the return format to be compatible with our Trainer.
        logits, loss = super().forward(idx=input_ids, targets=labels)
        outputs = {"logits": logits}
        # Compare against None explicitly: relying on tensor truthiness would
        # drop a loss that is exactly zero and raises for non-scalar tensors.
        if loss is not None:
            outputs["loss"] = loss
        return outputs

    def criterion(self):
        """Returns the criterion used for calculating the loss."""
        return F.cross_entropy

ModuleNotFoundError: No module named 'model'

In [5]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


[2024-08-14 14:50:24,171] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to mps (auto detect)


W0814 14:50:24.322000 8625376256 torch/distributed/elastic/multiprocessing/redirects.py:28] NOTE: Redirects are currently not supported in Windows or MacOs.


In [7]:
# Load a pretrained HuggingFace checkpoint identified by `name`: its config,
# its causal-LM model (requested with torch_dtype="bfloat16") and its tokenizer.
# Alternative checkpoint names are kept commented out for quick switching.
# name = "HuggingFaceFW/ablation-model-fineweb-v1"
name = "meta-llama/Meta-Llama-3.1-8B"
# name = "facebook/opt-125m"
config = transformers.AutoConfig.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name, torch_dtype="bfloat16")
tokenizer = AutoTokenizer.from_pretrained(name)
# model.forward = torch.compile(model.forward)
# NOTE(review): `config` is rebound to a `TrainingConfig` in the Training
# section further down, so this value is only meaningful until then.
# Last expression: display the loaded config as the cell output.
config

Downloading shards: 100%|██████████| 4/4 [11:22<00:00, 170.68s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  8.29it/s]


LlamaConfig {
  "_name_or_path": "meta-llama/Meta-Llama-3.1-8B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.3",
  "use_cache": true,
  "vocab_size": 128256
}

In [8]:
# Display the loaded model's repr as the cell output.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
    (rotary_emb

## Training

OK, now we are ready to train our model! We can start from the default GPT-2 config and edit it as needed.

In [None]:
import lema
from lema.core.types import TrainerType, TrainingConfig

In [None]:
# Load the default GPT-2 training config as the baseline to customise.
config_path = "../configs/lema/gpt2.pt.mac.yaml"
config = TrainingConfig.from_yaml(config_path)

# Model overrides: use the nanoGPT model registered earlier (the name must
# match the registered model name). There is no custom tokenizer, so the
# GPT-2 tokenizer from HuggingFace is used instead.
model_overrides = {
    "model_name": "lema-nanoGPT",
    "tokenizer_name": "gpt2",
}
for field_name, field_value in model_overrides.items():
    setattr(config.model, field_name, field_value)

# Training overrides, applied in the order listed.
training_overrides = {
    "trainer_type": TrainerType.LEMA,
    "max_steps": 10,
    "logging_steps": 1,
    "gradient_accumulation_steps": 1,
    "enable_wandb": False,
    "enable_tensorboard": False,
}
for field_name, field_value in training_overrides.items():
    setattr(config.training, field_name, field_value)

In [None]:
# Start training with the customised `config`.
lema.train(config)