In [82]:
!pip install -U bitsandbytes transformers peft accelerate datasets



In [86]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)

In [88]:
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

DATA_DIR = "/kaggle/input/day2-dataset"
TRAIN_FILE = ("/kaggle/input/day2-dataset/train.jsonl", "train.jsonl")
VAL_FILE = ("/kaggle/input/day2-dataset/val.jsonl", "val.jsonl")

ADAPTER_DIR = "./adapters"
os.makedirs(ADAPTER_DIR, exist_ok=True)

MAX_SEQ_LEN = 512


In [89]:
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [91]:
TRAIN_FILE = "/kaggle/input/day2-dataset/train.jsonl"
VAL_FILE   = "/kaggle/input/day2-dataset/val.jsonl"


from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files={
        "train": TRAIN_FILE,
        "validation": VAL_FILE,
    }
)

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 900
    })
    validation: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 100
    })
})


In [93]:
def format_prompt(example):
    return (
        "### Instruction:\n"
        f"{example['instruction']}\n\n"
        "### Input:\n"
        f"{example['input']}\n\n"
        "### Response:\n"
        f"{example['output']}"
    )

In [94]:
def tokenize_fn(example):
    text = format_prompt(example)
    tokens = tokenizer(
        text,
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding=False,
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_ds = dataset.map(
    tokenize_fn,
    remove_columns=dataset["train"].column_names,
    batched=False,
)

print(tokenized_ds)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 900
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})


In [95]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

model = prepare_model_for_kbit_training(model)

In [96]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)

model = get_peft_model(model, lora_config)

In [97]:
def print_trainable_params(model):
    trainable = 0
    total = 0
    for _, param in model.named_parameters():
        total += param.numel()
        if param.requires_grad:
            trainable += param.numel()
    print(f"Trainable params: {trainable}")
    print(f"Total params: {total}")
    print(f"Trainable %: {100 * trainable / total:.2f}%")

print_trainable_params(model)

Trainable params: 2252800
Total params: 617859072
Trainable %: 0.36%


In [98]:
!pip install -U transformers accelerate peft



In [99]:
training_args = TrainingArguments(
    output_dir="/kaggle/working/outputs",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    bf16=True,
    fp16=False,
    optim="paged_adamw_8bit",
    report_to="none",
    run_name="tinyllama-qlora",
)

In [100]:
from transformers import DataCollatorWithPadding

class MinimalCausalLMCollator(DataCollatorWithPadding):
    def __call__(self, features):
        labels = [f["labels"] for f in features]
        for f in features:
            f.pop("labels")

        batch = super().__call__(features)

        max_len = batch["input_ids"].shape[1]
        batch["labels"] = torch.tensor(
            [l + [-100] * (max_len - len(l)) for l in labels]
        )
        return batch

In [102]:
data_collator = MinimalCausalLMCollator(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    data_collator=data_collator,
)

In [103]:
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
10,6.7603
20,7.7196
30,7.2161
40,7.0237
50,7.2806
60,7.7202
70,6.9493
80,7.0736
90,7.5797
100,7.3641


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=339, training_loss=7.10164290121523, metrics={'train_runtime': 4075.7747, 'train_samples_per_second': 0.662, 'train_steps_per_second': 0.083, 'total_flos': 8532511111348224.0, 'train_loss': 7.10164290121523, 'epoch': 3.0})

In [107]:
model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)

print(f"Adapter saved to {ADAPTER_DIR}")

Adapter saved to ./adapters


In [108]:
!zip -r /kaggle/working/adapter.zip /kaggle/working/adapters

  adding: kaggle/working/adapters/ (stored 0%)
  adding: kaggle/working/adapters/chat_template.jinja (deflated 60%)
  adding: kaggle/working/adapters/special_tokens_map.json (deflated 79%)
  adding: kaggle/working/adapters/adapter_model.safetensors (deflated 8%)
  adding: kaggle/working/adapters/tokenizer.json (deflated 85%)
  adding: kaggle/working/adapters/tokenizer_config.json (deflated 69%)
  adding: kaggle/working/adapters/adapter_config.json (deflated 57%)
  adding: kaggle/working/adapters/README.md (deflated 66%)
  adding: kaggle/working/adapters/tokenizer.model (deflated 55%)
