In [39]:
!pip install -U transformers datasets peft accelerate bitsandbytes




In [40]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model


In [41]:
ds = load_dataset("yahma/alpaca-cleaned")


In [42]:
ds["train"] = ds["train"].shuffle(seed=42).select(range(2000))


In [43]:
def format_prompt(example):
    if example["input"]:
        text = f"""### Instruction:
{example['instruction']}

### Input:
{example['input']}

### Response:
{example['output']}"""
    else:
        text = f"""### Instruction:
{example['instruction']}

### Response:
{example['output']}"""
    return {"text": text}

ds = ds.map(format_prompt)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [44]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)

model.gradient_checkpointing_enable()
model.config.use_cache = False
model.to("cuda")


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rot

In [45]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


In [46]:
MAX_LEN = 256

def tokenize_fn(example):
    tokenized = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_ds = ds.map(
    tokenize_fn,
    batched=True,
    remove_columns=ds["train"].column_names
)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [47]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


In [48]:
training_args = TrainingArguments(
    output_dir="./alpaca-lora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=1,
    fp16=True,
    bf16=False,
    logging_steps=20,
    save_steps=500,
    save_total_limit=1,
    optim="adamw_torch",
    report_to="none",
)


In [49]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    data_collator=data_collator
)


In [50]:
trainer.train()


Step,Training Loss
20,1.466681
40,1.243422
60,1.206615
80,1.229237
100,1.192537
120,1.181069
140,1.222995
160,1.287058
180,1.120814
200,1.212815


TrainOutput(global_step=250, training_loss=1.226088996887207, metrics={'train_runtime': 433.3105, 'train_samples_per_second': 4.616, 'train_steps_per_second': 0.577, 'total_flos': 3181482344448000.0, 'train_loss': 1.226088996887207, 'epoch': 1.0})

In [51]:
trainer.save_model("./alpaca-lora")
tokenizer.save_pretrained("./alpaca-lora")

('./alpaca-lora/tokenizer_config.json',
 './alpaca-lora/chat_template.jinja',
 './alpaca-lora/tokenizer.json')

In [52]:
tests = [
    "Explain polymorphism in OOP",
    "Write a Python function to check prime numbers",
    "What is overfitting in machine learning?"
]

for t in tests:
    prompt = f"### Instruction:\n{t}\n\n### Response:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=120)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
    print("-"*60)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


### Instruction:
Explain polymorphism in OOP

### Response:
Pol ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###
------------------------------------------------------------
### Instruction:
Write a Python function to check prime numbers

### Response:
Here ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###