In [1]:
from datasets import load_dataset
import random
import numpy as np
from peft import LoftQConfig, get_peft_model, LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from typing import Optional
from dataclasses import dataclass, field
from transformers import TrainingArguments
import torch

In [2]:
def _create_unsloth_optimizer(
    model,
    optimizer_cls,
    optimizer_kwargs,
    embedding_lr = 5e-5,
):
    lr = optimizer_kwargs["lr"]
    weight_decay = optimizer_kwargs.get("weight_decay", 0.0)

    param_groups = \
    {
        "non_embeddings" : {},
        "embeddings"     : {},
    }

    for name, param in model.named_parameters():
        if not param.requires_grad: continue
        if name.endswith("modules_to_save.default.weight"):
            partial_name = name[:-len(".modules_to_save.default.weight")]
            partial_name = partial_name[partial_name.rfind(".")+1:]
            print(f"Unsloth: Setting lr = {embedding_lr:.2e} instead of {lr:.2e} for {partial_name}.")
            param_groups["embeddings"]    [name] = param
        else:
            param_groups["non_embeddings"][name] = param
        pass
    pass

    optimizer_grouped_parameters = [
        {
            "params"       : list(param_groups["non_embeddings"].values()),
            "weight_decay" : weight_decay,
            "lr"           : lr,
        },
        {
            "params"       : list(param_groups["embeddings"].values()),
            "weight_decay" : weight_decay,
            "lr"           : embedding_lr,
        },
    ]
    optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
    return optimizer

@dataclass
class UnslothTrainingArguments(SFTConfig):
    embedding_learning_rate : Optional[float] = field(
        default = None,
        metadata = {"help" : "Different learning rates for embeddings and lm_head."}
    )

class UnslothTrainer(SFTTrainer):
    def create_optimizer(self):
        embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None)
        if embedding_learning_rate is None: return super().create_optimizer()

        if self.optimizer is None:
            optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args)
            self.optimizer = _create_unsloth_optimizer(
                self.model,
                optimizer_cls,
                optimizer_kwargs,
                embedding_learning_rate,
            )
        
        return self.optimizer

In [3]:
# PARAMS
SEED = 42
model_id =   "meta-llama/Llama-3.1-8B" #"meta-llama/Llama-3.1-8B"
max_seq_length = 1024 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.bfloat16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# Training params
batch_size = 16 # 8
gradient_accumulation_steps = 1 #128
warmup_ratio = 0.05
max_steps = 10
learning_rate = 5e-4 # 5e-4
embedding_learning_rate = learning_rate/2
weight_decay = 0.01 # 0.01
# LoRA params
lora_r = 256 # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
lora_alpha = 1

    # reproducibility
    ## torch
torch.manual_seed(SEED)

    ## python
random.seed(SEED)

    ## numpy
np.random.seed(SEED)

In [4]:
from transformers import BitsAndBytesConfig

#create config
bb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
)


In [5]:
#load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

#load model
#TODO load in 4 bit support?
#float16 set by default (better for gradient accumulation according to deepspeed)
#TODO max_seq_length not here
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bb_config)
#LoftQ config
loftq_config = LoftQConfig(loftq_bits=4)
#init LoRA
#TODO checkpointing not here
#TODO loftq not working? Compatibility issue?
lora_config = LoraConfig(r=lora_r, 
                             lora_alpha=lora_alpha, 
                             target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                                             "gate_proj", "up_proj", "down_proj",
                                             "embed_tokens", "lm_head"],
                             lora_dropout=0,
                             bias="none",
                             use_rslora=True,
                            #  init_lora_weights="loftq",
                            #  loftq_config=loftq_config
                             )

model = get_peft_model(model, lora_config)


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# prepare data
# this dataset has already fixed encoding using ftfy (as is used by me in the preprocessing steps of other datasets)
dataset = load_dataset("HuggingFaceFW/fineweb-2", "ces_Latn", split="train", streaming=True)
#we need only texts
dataset = dataset.remove_columns(["id", "dump", "url", "date", "file_path", "language", "language_score", "language_script", "minhash_cluster_size", "top_langs"])
#shuffle to be sure we select "random sample"
#dataset = dataset.shuffle(seed=42)
dataset = dataset.take(1000)
def preprocess_function(examples):
    return {"text": [example + tokenizer.eos_token for example in examples["text"]]}
dataset = dataset.map(preprocess_function, batched=True)
print(dataset.column_names)

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

None


In [7]:
tokenizer.convert_ids_to_tokens(tokenizer("Ahoj, jak se máš?"+tokenizer.eos_token)["input_ids"])

['<|begin_of_text|>',
 'A',
 'ho',
 'j',
 ',',
 'Ġjak',
 'Ġse',
 'ĠmÃ¡',
 'Å¡',
 '?',
 '<|end_of_text|>']

In [6]:
from datasets import load_from_disk, load_dataset

dataset = load_from_disk("/mnt/personal/mlynatom/data/pretraining/fineweb-2_ces_Latn_19531250_llama_preprocessed")
dataset = dataset.select(range(1953125))
#dataset = dataset.select(range(1000))
#dataset = dataset.to_iterable_dataset()
dataset

Loading dataset from disk:   0%|          | 0/134 [00:00<?, ?it/s]

Dataset({
    features: ['text'],
    num_rows: 1953125
})

In [7]:
RUN_NAME = f"{model_id.split('/')[-1]}-cs"
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    args = UnslothTrainingArguments(
            packing = True,
            max_seq_length = max_seq_length,
            dataset_text_field = "text",
            dataset_num_proc = 8,
            per_device_train_batch_size = batch_size,
            gradient_accumulation_steps = gradient_accumulation_steps,
            warmup_ratio = warmup_ratio,
            #num_train_epochs = 1, # Set this for 1 full training run.
            max_steps = 50,
            learning_rate = learning_rate,
            embedding_learning_rate = embedding_learning_rate,
            #fp16 = True,
            bf16 = True,
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = weight_decay,
            lr_scheduler_type = "cosine",
            seed = SEED,
            output_dir = f"models/cp_{RUN_NAME}",
            report_to = "none", # Use this for WandB etc
            run_name=RUN_NAME,
            gradient_checkpointing = True,
            # eval_strategy = args.eval_strategy,
            # eval_steps = args.eval_steps,
        ),
    )

Loading dataset shards:   0%|          | 0/47 [00:00<?, ?it/s]

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [8]:
#Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")


GPU = NVIDIA A100-SXM4-80GB. Max memory = 79.254 GB.
8.572 GB of memory reserved.


In [9]:
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    profile_memory=True,
    record_shapes=True,
    #with_stack=True
) as prof:
    # Run training step
    trainer_stats = trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,2.556
2,2.6747
3,2.462
4,2.5607
5,2.4946
6,2.3459
7,2.5869
8,2.4356
9,2.3883
10,2.3574




In [10]:
print(prof.key_averages().table(sort_by="cuda_memory_usage"))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                    aten::empty_strided         0.41%        1.615s         0.44%        1.719s       6.459us       0.000us         0.00%       0.000us       0.000us       4.71 Gb       4.71 Gb   42422.50 Gb   42422.50 G

In [14]:
prof.key_averages()

[]

In [12]:
print(prof.key_averages().table(sort_by="cuda_memory_usage"))




In [10]:
#training
trainer_stats = trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU 0 has a total capacity of 79.25 GiB of which 277.44 MiB is free. Process 3668578 has 42.71 GiB memory in use. Including non-PyTorch memory, this process has 36.20 GiB memory in use. Of the allocated memory 28.48 GiB is allocated by PyTorch, and 7.22 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [10]:
#training
trainer_stats = trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,2.6284
2,2.4826
3,2.5386
4,2.4988
5,2.6106
6,2.4956
7,2.4838
8,2.4788
9,2.4329
10,2.5393




In [10]:
#training
trainer_stats = trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,2.6284
2,2.4826
3,2.5389
4,2.4987
5,2.6102
6,2.4959
7,2.4828
8,2.4772
9,2.4326
10,2.5391




In [10]:
#training
trainer_stats = trainer.train()


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,2.3697
2,2.6132
3,3.0689
4,2.4373
5,2.3021
6,2.4707
7,2.4861
8,2.5515
9,2.4843
10,2.4491




In [None]:
#Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")