In [1]:
import os

from huggingface_hub import login
from omegaconf import OmegaConf

from lema.builders import (
    build_dataset,
    build_model,
    build_peft_model,
    build_tokenizer,
    build_trainer,
)
from lema.core.types import TrainingConfig
from lema.utils.saver import save_model

%load_ext autoreload
%autoreload 2

In [2]:
# Authenticate against the Hugging Face Hub with the token taken from the
# HF_TOKEN environment variable (None when the variable is not set).
access_token = os.getenv("HF_TOKEN")
login(token=access_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/optas/.cache/huggingface/token
Login successful


In [3]:
config_filename = "../configs/lema/zephyr.7b.sft.yaml"

# Start from the structured TrainingConfig schema, then merge the values read
# from the YAML file on top of it.
base_config = OmegaConf.structured(TrainingConfig)
file_config = TrainingConfig.from_yaml(config_filename)
merged_config = OmegaConf.merge(base_config, file_config)

# Convert the merged OmegaConf container into a TrainingConfig object.
config: TrainingConfig = OmegaConf.to_object(merged_config)

# Show the sections that the following cells tweak.
for section in (config.training, config.peft):
    print(section)

PeftParams(lora_r=16, lora_alpha=16, lora_dropout=0.05, lora_target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'], lora_bias='none', lora_task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, q_lora=False, q_lora_bits=4)


In [4]:
# TODO: finalize these values in the config file instead of patching them here.
# Temporary in-notebook overrides applied on top of the loaded config.
config.training.max_steps = 2  # cap the run at two training steps
config.training.per_device_train_batch_size = 1  # one example per device
config.peft.q_lora = False  # keep QLoRA switched off

In [5]:
# Build the tokenizer from the loaded config. The bare name on the last line
# makes the notebook display the tokenizer's repr as the cell output.
tokenizer = build_tokenizer(config)
tokenizer



LlamaTokenizerFast(name_or_path='mistralai/Mistral-7B-v0.1', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
# # Set reasonable default for models without max length
# if tokenizer.model_max_length > 100_000:
#     tokenizer.model_max_length = 2048
# TODO: should the check above also be applied to the different Zephyr models?
# It currently is not.

# Print the tokenizer settings of interest, one labelled line per entry.
tokenizer_report = (
    ("tokenizer.model_max_length", (tokenizer.model_max_length,)),
    ("tokenizer pad_token/eos_token", (tokenizer.pad_token, tokenizer.eos_token)),
    ("tokenizer.padding_side", (tokenizer.padding_side,)),
    ("tokenizer.chat_template", (tokenizer.chat_template,)),
)
for label, values in tokenizer_report:
    print(label, *values)

tokenizer.model_max_length 2048
tokenizer pad_token/eos_token </s> </s>
tokenizer.padding_side left
tokenizer.chat_template {% for message in messages %}
{% if message['role'] == 'user' %}
{{ '<|user|>
' + message['content'] + eos_token }}
{% elif message['role'] == 'system' %}
{{ '<|system|>
' + message['content'] + eos_token }}
{% elif message['role'] == 'assistant' %}
{{ '<|assistant|>
'  + message['content'] + eos_token }}
{% endif %}
{% if loop.last and add_generation_prompt %}
{{ '<|assistant|>' }}
{% endif %}
{% endfor %}


In [15]:
# Load data & preprocessing
dataset = build_dataset(
    dataset_config=config.data,
    tokenizer=tokenizer,
    **config.data.preprocessing_function_kwargs,
)

# Debug-only subsampling so that a trial run stays small.
# Set SUBSAMPLE_SIZE to None to keep the full dataset.
SUBSAMPLE_SIZE = 1024
SUBSAMPLE_SEED = 1234

if SUBSAMPLE_SIZE is not None:
    import numpy as np  # only needed for the subsampling below

    print(len(dataset))
    # A dedicated RandomState yields the same indices as seeding the global
    # NumPy RNG with this seed, but leaves the global RNG state untouched.
    rng = np.random.RandomState(SUBSAMPLE_SEED)
    # Clamp the sample size: `choice(..., replace=False)` raises ValueError when
    # asked for more items than the dataset contains.
    n_keep = min(SUBSAMPLE_SIZE, len(dataset))
    ridx = rng.choice(len(dataset), n_keep, replace=False)
    dataset = dataset.select(ridx)
    print(len(dataset))

dataset

Map (num_proc=6):   0%|          | 0/207865 [00:00<?, ? examples/s]

207865
1024


Dataset({
    features: ['prompt', 'prompt_id', 'messages', 'text'],
    num_rows: 1024
})

In [9]:
# TODO - update our code base if we use optimum
# Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file. # noqa
# WARNING:auto_gptq.nn_modules.qlinear.qlinear_cuda:CUDA extension not installed. # TODO update in main repo # noqa

In [10]:
# Build the model from the loaded config.
model = build_model(config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# TODO: consider adding the chat role markers as special tokens, e.g.
# tokenizer.additional_special_tokens  # '<|assistant|>', '<|system|>'
# tokenizer.encode("<|system|>")  # We already wrap <bos> and <eos>
# # in the chat template
# # add_special_tokens=
# tokenizer.encode("|system|")

# # For ChatML we need to add special tokens and resize the embedding layer
# if "<|im_start|>" in tokenizer.chat_template and "gemma-tokenizer-chatml" not in tokenizer.name_or_path: # noqa
#     model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs) # noqa
#     model, tokenizer = setup_chat_format(model, tokenizer)
#     model_kwargs = None

In [12]:
training_params = config.training
use_grad_checkpointing = training_params.enable_gradient_checkpointing

# Wrap the model with the PEFT configuration only when the config asks for it.
if training_params.use_peft:
    model = build_peft_model(model, use_grad_checkpointing, config.peft)

# With gradient checkpointing enabled, make the model inputs require gradients.
if use_grad_checkpointing:
    model.enable_input_require_grads()

In [13]:
# TODO update if need be for accelerator
# Resolve the trainer class for the configured trainer type; it is
# instantiated in a later cell.
trainer_cls = build_trainer(config.training.trainer_type)

In [16]:
# Convert the training section of the config via its `to_hf()` helper and
# collect the extra, dataset-specific trainer keyword arguments.
hf_training_args = config.training.to_hf()
extra_trainer_kwargs = config.data.trainer_kwargs

trainer = trainer_cls(
    model=model,
    tokenizer=tokenizer,
    args=hf_training_args,
    train_dataset=dataset,
    **extra_trainer_kwargs,
)

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [17]:
# Sanity check: display the "text" field of the first training example.
dataset[0]["text"]

"<|system|>\n</s>\n<|user|>\nHow many miles of coastline are there in Point Reyes National Seashore?</s>\n<|assistant|>\nThere are approximately 80 miles of coastline in Point Reyes National Seashore.</s>\n<|user|>\nCan you recommend any specific beaches or hiking trails along the coastline in Point Reyes National Seashore?</s>\n<|assistant|>\nYes, here are some recommended beaches and hiking trails along the coastline in Point Reyes National Seashore:\n\nBeaches:\n1. Limantour Beach - a beautiful and wide beach with soft sand perfect for sunbathing, picnicking, and walking.\n2. Drake's Beach - a popular beach for families, with picnic areas, restrooms, and a visitor center nearby.\n3. Point Reyes Beach - a long and wild beach with rolling waves and stunning views, perfect for surfing or fishing.\n4. South Beach - a remote and secluded beach with plenty of wildlife, including harbor seals and sea lions.\n\nHiking Trails:\n1. Tomales Point Trail - a scenic hike along the coast that offe

In [18]:
# Run training; the bare call displays the returned result as the cell output.
trainer.train()

  0%|          | 0/2 [00:00<?, ?it/s]

{'train_runtime': 150.9662, 'train_samples_per_second': 0.013, 'train_steps_per_second': 0.013, 'train_loss': 1.116136908531189, 'epoch': 0.0}


TrainOutput(global_step=2, training_loss=1.116136908531189, metrics={'train_runtime': 150.9662, 'train_samples_per_second': 0.013, 'train_steps_per_second': 0.013, 'total_flos': 109778152931328.0, 'train_loss': 1.116136908531189, 'epoch': 0.001953125})

In [20]:
# Persist the trainer state first, then write out the final model checkpoint.
trainer.save_state()
save_model(config=config, trainer=trainer)

[2024-05-30 18:56:33,607][lema][INFO][saver.py:25] Model has been saved at output/zephyr.7b.sft.
