In [None]:
# !pip install -qq --upgrade pip
# !pip install -qq --upgrade peft transformers accelerate bitsandbytes datasets trl huggingface_hub

In [2]:
# from google.colab import userdata
# from huggingface_hub import login

# login(token=userdata.get('HF_TOKEN'))

In [1]:
import torch
import os

from peft import (
    PeftModel,
    PeftConfig,
    LoraConfig,
    TaskType,
    get_peft_model,
    get_peft_config,
)
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
from trl import SFTTrainer

base_model_id = "meta-llama/Llama-3.2-1B-Instruct"
cache_dir = "./cache"

## Load Base Model

In [2]:
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    cache_dir=cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id, trust_remote_code=True, cache_dir=cache_dir
)

base_model

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [3]:
if tokenizer.pad_token is None or tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f"Pad token: {tokenizer.pad_token}")
print(f"Pad token id: {tokenizer.pad_token_id}")

print(f"EOS token: {tokenizer.eos_token}")
print(f"EOS token id: {tokenizer.eos_token_id}")

Pad token: <|eot_id|>
Pad token id: 128009
EOS token: <|eot_id|>
EOS token id: 128009


## Load and Apply LoRA

In [4]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

In [5]:
peft_model = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()
peft_model

trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_feat

## Load Dataset and Format

In [6]:
# dataset = load_dataset("philschmid/dolly-15k-oai-style")
dataset = load_dataset("MBZUAI/Bactrian-X", "en")

for split in dataset:
    dataset[split] = dataset[split].select(range(10))
    print(f"{split}: {len(dataset[split])}")

# dataset =

dataset, dataset["train"][0]

train: 10


(DatasetDict({
     train: Dataset({
         features: ['instruction', 'input', 'id', 'output'],
         num_rows: 10
     })
 }),
 {'instruction': 'Give three tips for staying healthy.',
  'input': None,
  'id': 'alpaca-1',
  'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.'}

In [7]:
INSTRUCTION_TEMPLATE_WITH_INPUT = """### Instruction:
{instruction}

### Input:
{input}

### Response:
"""

INSTRUCTION_TEMPLATE_WITHOUT_INPUT = """### Instruction:
{instruction}

### Response:
"""


def tokenize_function(examples):
    results = {
        "input_ids": [],
        "labels": [],
        "attention_mask": [],
    }

    for i in range(len(examples["instruction"])):
        cur_instruction = examples["instruction"][i]
        cur_input = examples["input"][i]
        cur_output = examples["output"][i]

        if cur_input:
            cur_prompt = INSTRUCTION_TEMPLATE_WITH_INPUT.format(
                instruction=cur_instruction, input=cur_input
            )
        else:
            cur_prompt = INSTRUCTION_TEMPLATE_WITHOUT_INPUT.format(
                instruction=cur_instruction
            )

        cur_prompt_tokenized = tokenizer(
            cur_prompt, return_tensors="pt", add_special_tokens=True
        )
        cur_output_tokenized = tokenizer(
            cur_output, return_tensors="pt", add_special_tokens=False
        )

        cur_prompt_ids = cur_prompt_tokenized["input_ids"][0]
        cur_output_ids = cur_output_tokenized["input_ids"][0]
        cur_output_ids = torch.cat(
            [cur_output_ids, torch.tensor([tokenizer.eos_token_id])], dim=0
        )

        input_ids = torch.cat([cur_prompt_ids, cur_output_ids], dim=0)
        label = torch.cat(
            [torch.full_like(cur_prompt_ids, fill_value=-100), cur_output_ids], dim=0
        )

        results["input_ids"].append(input_ids)
        results["labels"].append(label)
        results["attention_mask"].append(torch.ones_like(input_ids))

    return results


tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    num_proc=os.cpu_count(),
)

num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.


Map (num_proc=10):   0%|          | 0/10 [00:00<?, ? examples/s]

In [8]:
print(tokenized_dataset["train"][0])
print(
    tokenizer.decode(
        tokenized_dataset["train"][0]["input_ids"], skip_special_tokens=False
    )
)

{'input_ids': [128000, 14711, 30151, 512, 36227, 2380, 10631, 369, 19994, 9498, 382, 14711, 6075, 512, 16, 13, 45614, 264, 24770, 323, 78216, 10173, 25, 7557, 2771, 701, 21644, 527, 29408, 315, 264, 8205, 315, 26390, 323, 24822, 11, 16025, 13128, 11, 4459, 41936, 11, 323, 9498, 50127, 13, 1115, 8779, 311, 3493, 701, 2547, 449, 279, 7718, 37493, 311, 734, 520, 1202, 1888, 323, 649, 1520, 5471, 21249, 19338, 382, 17, 13, 3365, 425, 304, 5912, 7106, 5820, 25, 33918, 374, 16996, 369, 20958, 3831, 25896, 11, 24569, 11, 323, 41713, 2890, 13, 71715, 369, 520, 3325, 220, 3965, 4520, 315, 24070, 91490, 10368, 477, 220, 2075, 4520, 315, 71920, 10368, 1855, 2046, 382, 18, 13, 2175, 3403, 6212, 25, 25531, 3403, 4367, 6212, 374, 16996, 369, 7106, 323, 10723, 1664, 33851, 13, 1102, 8779, 311, 37377, 20247, 11, 7417, 25702, 734, 11, 323, 11815, 9498, 6650, 323, 22852, 734, 13, 71715, 369, 220, 22, 12, 24, 4207, 315, 6212, 1855, 3814, 13, 128009], 'labels': [-100, -100, -100, -100, -100, -100, -100, -

## Training Arguments

In [9]:
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    logging_steps=1,
    save_steps=2,
    max_steps=20,
    save_total_limit=2,
    report_to="none",
    push_to_hub=False,
)

## Custom data collator

In [10]:
from transformers import DataCollatorWithPadding
from typing import Any, Dict, List


class RightPaddingDataCollator(DataCollatorWithPadding):
    """The default data collator pads only inputs, not including the labels."""

    def __init__(self, tokenizer, max_length: int = 1024):
        super().__init__(tokenizer, max_length=max_length)

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        input_ids, labels, attention_mask = [], [], []
        max_batch_len = max(len(f["input_ids"]) for f in features)

        for sample in features:
            # Convert to torch tensors
            cur_input_ids = torch.tensor(sample["input_ids"], dtype=torch.long)
            cur_labels = torch.tensor(sample["labels"], dtype=torch.long)
            cur_attention_mask = torch.ones_like(cur_input_ids)

            # Next, we pad the inputs and labels to the maximum length within the batch
            pad_token_id = self.tokenizer.pad_token_id
            padding_length = max_batch_len - len(cur_input_ids)
            cur_input_ids = torch.cat(
                [
                    cur_input_ids,
                    torch.full(
                        (padding_length,), fill_value=pad_token_id, dtype=torch.long
                    ),
                ]
            )
            cur_labels = torch.cat(
                [
                    cur_labels,
                    torch.full((padding_length,), fill_value=-100, dtype=torch.long),
                ]
            )
            cur_attention_mask = torch.cat(
                [cur_attention_mask, torch.zeros((padding_length,), dtype=torch.long)]
            )

            # Truncate the inputs and labels to the maximum length
            cur_input_ids = cur_input_ids[:max_batch_len]
            cur_labels = cur_labels[:max_batch_len]
            cur_attention_mask = cur_attention_mask[:max_batch_len]

            # Append to the return lists
            input_ids.append(cur_input_ids)
            labels.append(cur_labels)
            attention_mask.append(cur_attention_mask)

        # Return formatted batch.
        return {
            "input_ids": torch.stack(input_ids),
            "labels": torch.stack(labels),
            "attention_mask": torch.stack(attention_mask),
        }


data_collator = RightPaddingDataCollator(tokenizer)

## Train the Model

In [11]:
from trl import SFTTrainer

# trainer = Trainer(
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=None,
    data_collator=data_collator,
)

Converting train dataset to ChatML:   0%|          | 0/10 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [12]:
trainer.train()



Step,Training Loss
1,0.5836
2,0.6802
3,0.6336
4,0.5919
5,0.6182
6,0.5928
7,0.6577
8,0.4386
9,0.6114
10,0.5956




TrainOutput(global_step=20, training_loss=0.6010845243930817, metrics={'train_runtime': 23.3482, 'train_samples_per_second': 6.853, 'train_steps_per_second': 0.857, 'total_flos': 177131290828800.0, 'train_loss': 0.6010845243930817})

In [13]:
peft_model.push_to_hub("tmnam20/peft-lora-causal-lm-1b")

adapter_model.safetensors:   0%|          | 0.00/3.42M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tmnam20/peft-lora-causal-lm-1b/commit/2f1dc64fbc2b36e7c6787d7a2a1f5b0d59aaa7a0', commit_message='Upload model', commit_description='', oid='2f1dc64fbc2b36e7c6787d7a2a1f5b0d59aaa7a0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tmnam20/peft-lora-causal-lm-1b', endpoint='https://huggingface.co', repo_type='model', repo_id='tmnam20/peft-lora-causal-lm-1b'), pr_revision=None, pr_num=None)

In [14]:
tokenizer.push_to_hub("tmnam20/peft-lora-causal-lm-1b")

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/tmnam20/peft-lora-causal-lm-1b/commit/2f1dc64fbc2b36e7c6787d7a2a1f5b0d59aaa7a0', commit_message='Upload tokenizer', commit_description='', oid='2f1dc64fbc2b36e7c6787d7a2a1f5b0d59aaa7a0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tmnam20/peft-lora-causal-lm-1b', endpoint='https://huggingface.co', repo_type='model', repo_id='tmnam20/peft-lora-causal-lm-1b'), pr_revision=None, pr_num=None)