In [None]:
# Install the fine-tuning stack into the notebook environment.
# bitsandbytes is upgraded to the latest release; optimum is pinned to 1.13.1.
!pip install -q -U bitsandbytes 'optimum==1.13.1'
# transformers, peft and accelerate are installed from their GitHub repositories (no
# tag or commit is given), so the resulting versions depend on when this cell is run.
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
# trl is installed from the `flash-attn-sft` branch rather than the default branch.
!pip install -q -U git+https://github.com/huggingface/trl@flash-attn-sft # to enable flash attention
!pip install -q datasets

In [None]:
# See https://huggingface.co/docs/transformers/perf_train_gpu_one for efficient training tips

In [None]:
# Install a pre-release torch build (>= 2.1.0dev) from the PyTorch nightly cu118 wheel index.
!pip install --index-url https://download.pytorch.org/whl/nightly/cu118 --pre 'torch>=2.1.0dev'

In [None]:
from datasets import load_dataset , Dataset, concatenate_datasets 
import numpy as np
import pandas as pd
import random

# Load the Dolly-15k dataset from the Hugging Face Hub and convert its "train" split
# to a pandas DataFrame for the prompt-formatting steps that follow.
rd_ds = load_dataset("databricks/databricks-dolly-15k")

# Dataset.to_pandas() is the library's own conversion to a DataFrame; it replaces
# passing the Dataset object to the generic pd.DataFrame constructor.
rd_df = rd_ds["train"].to_pandas()
display(rd_df)

In [None]:
def generate_prompt(example):
    """Build the instruction prompt for a single example.

    Args:
        example: Mapping-like record (for instance a DataFrame row) providing an
            "instruction" entry and a "context" entry.

    Returns:
        The prompt string. It contains an "### Input:" section only when the
        example's context is truthy, and always ends with "### Response:".
    """
    context = example["context"]
    instruction = example["instruction"]

    if context:
        preamble = (
            "Below is an instruction that describes a task, "
            "paired with an input that provides further context. "
        )
        sections = [
            f"### Instruction:\n{instruction}",
            f"### Input:\n{context}",
            "### Response:",
        ]
    else:
        preamble = "Below is an instruction that describes a task. "
        sections = [
            f"### Instruction:\n{instruction}",
            "### Response:",
        ]

    # The preamble sentence and every section are separated by one blank line.
    header = preamble + "Write a response that appropriately completes the request."
    return "\n\n".join([header, *sections])


# Derive the two training columns in one non-mutating expression:
#   prompt   - generate_prompt applied to every row
#   response - the original response followed by the "\n### End" marker
# and keep only those two columns.
rd_df = rd_df.assign(
    prompt=rd_df.apply(generate_prompt, axis=1),
    response=rd_df["response"] + "\n### End",
)[["prompt", "response"]]

In [None]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch
from transformers.trainer_callback import TrainerCallback
import os
from transformers import BitsAndBytesConfig
from trl import SFTTrainer
import mlflow

# Collapse each prompt/response pair into a single "text" column, the only column
# kept for training.
# NOTE(review): the prompt ends with "### Response:" and the response is appended
# directly after it with no separator - confirm this is the intended format.
df = (rd_df["prompt"] + rd_df["response"]).to_frame(name="text")

In [None]:
from datasets import load_dataset
from datasets import Dataset
# Convert the text frame to a Hugging Face Dataset and hold out 5% of the rows as a
# test split; the fixed seed keeps the split the same across runs.
dataset = Dataset.from_pandas(df).train_test_split(
    seed=42,
    test_size=0.05,
)

In [None]:
# Module names the LoRA adapter is applied to (passed to LoraConfig below).
# Other candidates, currently not targeted:
#   'k_proj', 'lm_head', 'q_proj', 'v_proj', 'o_proj'
target_modules = [
    "gate_proj",
    "down_proj",
    "up_proj",
]

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "meta-llama/Llama-2-13b-hf"

# bitsandbytes quantization settings: 4-bit weights of type "nf4" with double
# quantization enabled, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the existing EOS token for padding instead of registering a new '[PAD]' token.
# A newly added token gets an id beyond the model's current embedding matrix, which
# is only valid if model.resize_token_embeddings(len(tokenizer)) is called as well -
# and this notebook never does that.
tokenizer.pad_token = tokenizer.eos_token

# device_map={"": 0} maps the whole model to device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
)

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing on the quantized base model.
model.gradient_checkpointing_enable()
# Run PEFT's k-bit preparation helper and rebind `model` to what it returns.
# NOTE(review): the helper presumably also handles gradient checkpointing and dtype
# casting itself - confirm against the installed peft version.
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA adapter settings. The adapter is not attached in this cell: lora_config is
# handed to SFTTrainer through its peft_config argument, so
# get_peft_model(model, lora_config) and model.print_trainable_parameters()
# are deliberately not called here.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Directory passed to TrainingArguments as output_dir.
base_dir = "out"

# Batching: effective batch size = per-device batch * accumulation steps = 4 * 8 = 32.
per_device_train_batch_size = 4
gradient_accumulation_steps = 8

# Optimizer, learning-rate schedule and gradient clipping.
optim = "paged_adamw_8bit"
learning_rate = 4e-4
lr_scheduler_type = "cosine"
warmup_ratio = 0.03
max_grad_norm = 0.3

In [None]:
# Estimated optimizer steps per epoch = training examples / effective batch size.
# With 14260 training examples and 4 * 8 = 32 examples per step this is approx 446.
# The value is derived from the actual split and batch settings so it stays correct
# if either of them changes.
# NOTE(review): the SFTTrainer below is created with packing=True, so the number of
# steps actually run may be lower than this estimate - confirm from the training log.
len(dataset["train"]) / (per_device_train_batch_size * gradient_accumulation_steps)

In [None]:
from transformers import TrainingArguments

# https://huggingface.co/docs/transformers/v4.33.0/en/main_classes/trainer#transformers.TrainingArguments
training_args = TrainingArguments(
    output_dir=base_dir,
    # save_strategy="epoch",
    num_train_epochs=1,
    # max_steps=100,
    # Evaluate and log on the same 100-step cadence.
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    per_device_eval_batch_size=2,
    eval_accumulation_steps=8,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=learning_rate,
    bf16=True,
    max_grad_norm=max_grad_norm,
    # Use the warmup_ratio defined with the other hyperparameters. A hard-coded
    # warmup_steps value would take precedence over it and leave warmup_ratio unused.
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    group_by_length=False,
)

In [None]:
# Build the supervised fine-tuning trainer around the quantized base model.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    # LoRA settings; the trainer receives them instead of a pre-wrapped PEFT model.
    peft_config=lora_config,
    # Both splits carry the concatenated prompt + response in the "text" column.
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    max_seq_length=2048,
    packing=True,
    use_flash_attn=True,
)

In [None]:
# for name, module in trainer.model.named_modules():
#     if "norm" in name:
#         module = module.to(torch.float32)

In [None]:
#mlflow.end_run()

In [None]:
# Start fine-tuning with the trainer configured above; as the cell's last expression,
# the value returned by train() is shown as the cell output.
trainer.train()