In [None]:
# TODO: add `accelerate` and `bitsandbytes` as dependencies in the project toml file

In [1]:
import os

from huggingface_hub import login
from omegaconf import OmegaConf

from lema.builders import (
    build_dataset,
    build_model,
    build_peft_model,
    build_tokenizer,
    build_trainer,
)
from lema.core.types import TrainingConfig
from lema.utils.saver import save_model

%load_ext autoreload
%autoreload 2

In [2]:
# Authenticate against the Hugging Face Hub with the token exported as HF_TOKEN.
# `os.getenv` yields None when the variable is unset, in which case None is
# what gets passed to `login`.
access_token = os.getenv("HF_TOKEN")
login(token=access_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/optas/.cache/huggingface/token
Login successful


In [3]:
# Path of the YAML file holding the experiment settings.
config_filename = "../configs/lema/zephyr.7b.sft.yaml"

# Layer the values read from the YAML file on top of a structured config
# derived from the TrainingConfig class.
file_config = TrainingConfig.from_yaml(config_filename)
base_config = OmegaConf.structured(TrainingConfig)
merged_config = OmegaConf.merge(base_config, file_config)

# Turn the merged OmegaConf container into an object (annotated as
# TrainingConfig) that the rest of the notebook reads and mutates.
config: TrainingConfig = OmegaConf.to_object(merged_config)

# Show the two sections that later cells override.
for section in (config.training, config.peft):
    print(section)

PeftParams(lora_r=16, lora_alpha=16, lora_dropout=0.05, lora_target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'], lora_modules_to_save=None, lora_bias='none', lora_task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, q_lora=False, q_lora_bits=4)


In [11]:
# TODO: move these overrides into the config file once they are finalized.

# Training overrides: small batch and only two optimisation steps.
config.training.per_device_train_batch_size = 2
config.training.max_steps = 2

# PEFT overrides: switch `q_lora` on and request 8 bits.
config.peft.q_lora = True
config.peft.q_lora_bits = 8

In [5]:
# Build the tokenizer from the model section of the config. The bare
# expression on the last line makes the notebook display the tokenizer's repr.
tokenizer = build_tokenizer(config.model)
tokenizer



LlamaTokenizerFast(name_or_path='mistralai/Mistral-7B-v0.1', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
# Open question: should we guard on `tokenizer.model_max_length > 100_000`
# for other Zephyr models? At the moment no such check is performed.

# Print the tokenizer settings that matter for SFT: context length, padding
# token/side and the chat template.
print(f"tokenizer.model_max_length {tokenizer.model_max_length}")
print(f"tokenizer pad_token/eos_token {tokenizer.pad_token} {tokenizer.eos_token}")
print(f"tokenizer.padding_side {tokenizer.padding_side}")
print(f"tokenizer.chat_template {tokenizer.chat_template}")

tokenizer.model_max_length 2048
tokenizer pad_token/eos_token </s> </s>
tokenizer.padding_side left
tokenizer.chat_template {% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|' + message['role'] + '|>\n' + message['content'] | trim + eos_token + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}


In [7]:
# Load data & preprocessing
import numpy as np  # only needed for the subsampling below

# Debug-run settings: train on a small, reproducible random subset.
# Set SUBSAMPLE_SIZE to None to keep the full dataset.
SUBSAMPLE_SIZE = 1024
SUBSAMPLE_SEED = 1234

dataset = build_dataset(data_params=config.data, tokenizer=tokenizer)

if SUBSAMPLE_SIZE is not None:
    print(len(dataset))
    # Cap at the dataset size: sampling without replacement raises a
    # ValueError when more rows are requested than are available.
    num_samples = min(SUBSAMPLE_SIZE, len(dataset))
    # A dedicated legacy RandomState draws the same indices as seeding the
    # global generator with this seed, but leaves NumPy's global RNG state
    # untouched for the rest of the notebook.
    rng = np.random.RandomState(SUBSAMPLE_SEED)
    ridx = rng.choice(len(dataset), num_samples, replace=False)
    dataset = dataset.select(ridx)
    print(len(dataset))

dataset

207865
1024


Dataset({
    features: ['prompt', 'prompt_id', 'messages', 'text'],
    num_rows: 1024
})

In [8]:
# TODO - update our code base if we use optimum
# Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file. # noqa
# WARNING:auto_gptq.nn_modules.qlinear.qlinear_cuda:CUDA extension not installed. # TODO update in main repo # noqa

In [12]:
# Build the model from the full training config.
# NOTE: with `q_lora` enabled at 8 bits (set in the override cell above), this
# call raised an ImportError asking for `accelerate` and `bitsandbytes`; see
# the recorded output below and the dependency TODO at the top of the notebook.
model = build_model(config)

[2024-06-06 02:40:42,608][lema][INFO][models.py:55] Building model using device_map: auto...


ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [None]:
# Print the special tokens explicitly: a notebook cell only echoes its last
# expression, so a bare `tokenizer.all_special_tokens` here shows nothing.
print(tokenizer.all_special_tokens)

# Check how a role marker is tokenized.
# NOTE(review): the chat template printed earlier wraps roles as `<|role|>`;
# confirm whether "|system|" (without angle brackets) is the intended probe.
tokenizer.encode("|system|")

# TODO Consider adding special tokens like '<|assistant|>', '<|system|>'
# via tokenizer.additional_special_tokens -- need to check Mistral

# from alignment team:
# tokenizer.encode("<|system|>")  # We already wrap <bos> and <eos>
# # in the chat template

# Future TODO.
# # For ChatML we need to add special tokens and resize the embedding layer
# if "<|im_start|>" in tokenizer.chat_template and "gemma-tokenizer-chatml" not in tokenizer.name_or_path: # noqa
#     model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs) # noqa
#     model, tokenizer = setup_chat_format(model, tokenizer)
#     model_kwargs = None

In [None]:
# Alias the training section once; both branches below read flags from it.
training_params = config.training

# Optionally wrap the model with PEFT. The gradient-checkpointing flag is
# forwarded to the builder together with the PEFT settings.
if training_params.use_peft:
    model = build_peft_model(
        model,
        training_params.enable_gradient_checkpointing,
        config.peft,
    )

# When gradient checkpointing is on, also enable input gradients on the model.
if training_params.enable_gradient_checkpointing:
    model.enable_input_require_grads()

In [None]:
# TODO update if need be for accelerator
# Resolve the trainer from `config.training.trainer_type`; the result is
# called as a constructor in the next cell.
trainer_cls = build_trainer(config.training.trainer_type)

In [None]:
# Convert the training section with `to_hf()` up front; the result is passed
# to the trainer as `args`.
hf_training_args = config.training.to_hf()

# Instantiate the trainer on the (sub-sampled) dataset. Extra keyword
# arguments come from the data section of the config.
trainer = trainer_cls(
    model=model,
    tokenizer=tokenizer,
    args=hf_training_args,
    train_dataset=dataset,
    **config.data.trainer_kwargs,
)

In [None]:
# Sanity check: display the `text` field of the first training example.
dataset[0]["text"]

In [None]:
# Run training. `max_steps` was overridden to 2 earlier in this notebook, so
# this is a short run.
trainer.train()

In [None]:
# Persist the trainer state first, then hand the config and trainer to
# `save_model` to save the final checkpoint.
trainer.save_state()
save_model(config=config, trainer=trainer)