In [None]:
#This notebook has been tested with the Conda PyTorch Python 3.10 kernel on an ml.g5.24xlarge instance; a smaller ml.g5.12xlarge instance may also work

#!pip install transformers bitsandbytes git+https://github.com/huggingface/peft sentencepiece datasets

In [None]:
#Do this so that your cache directory does not fill up, there is a Sagemaker limit of 5Gb

!export TRANSFORMERS_CACHE=/home/ec2-user/SageMaker/
!export HF_DATASETS_CACHE=/home/ec2-user/SageMaker/
!export HF_HOME=/home/ec2-user/SageMaker/

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel, LoraModel

def create_model(huggingface_id, tokenizer_id=None, four_bit=True, use_cpu=False):
    """Load a causal language model and its tokenizer.

    Args:
        huggingface_id: Model identifier passed to
            ``AutoModelForCausalLM.from_pretrained``.
        tokenizer_id: Tokenizer identifier; falls back to ``huggingface_id``
            when ``None``.
        four_bit: When true, the model is loaded with a 4-bit NF4
            ``BitsAndBytesConfig`` (double quantization, bfloat16 compute
            dtype). When false, no quantization config is passed.
        use_cpu: When true, the whole model is mapped to ``"cpu"`` instead of
            device ``0``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's pad token is set to
        its EOS token and gradient checkpointing is enabled on the model.
    """
    if tokenizer_id is None:
        tokenizer_id = huggingface_id

    # Build the quantization config only when it is requested, so that a
    # non-quantized load does not construct (and throw away) a 4-bit config.
    bnb_config = None
    if four_bit:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
    # Reuse the EOS token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    # The empty-string key maps the entire model to a single device.
    device_map = {"": "cpu"} if use_cpu else {"": 0}

    model = AutoModelForCausalLM.from_pretrained(
        huggingface_id,
        quantization_config=bnb_config,
        device_map=device_map,
        # Allows custom modelling code shipped with the model repository to
        # run; only load repositories you trust.
        trust_remote_code=True,
    )
    model.gradient_checkpointing_enable()
    return model, tokenizer

def load_lora(model, lora_path):
    """Return ``model`` wrapped with the saved adapter found at ``lora_path``.

    The wrapping is delegated to ``PeftModel.from_pretrained``.
    """
    return PeftModel.from_pretrained(model, lora_path)
    
def loraify_model(model, rank=8, alpha=32, dropout=0.05, target_modules=None):
    """Prepare ``model`` for k-bit training and attach a LoRA adapter.

    Args:
        model: Base model, passed through ``prepare_model_for_kbit_training``.
        rank: LoRA rank (``r``).
        alpha: LoRA scaling factor (``lora_alpha``).
        dropout: Dropout applied in the LoRA layers (``lora_dropout``).
        target_modules: Names of the modules that receive an adapter.
            Defaults to ``["q_proj", "v_proj"]`` (LLaMA); pass e.g.
            ``["query_key_value"]`` for RedPajama-style architectures.

    Returns:
        The model returned by ``get_peft_model``.
    """
    if target_modules is None:
        # LLaMA attention projection names.
        target_modules = ["q_proj", "v_proj"]

    model = prepare_model_for_kbit_training(model)
    config = LoraConfig(
        r=rank,
        lora_alpha=alpha,
        target_modules=target_modules,
        lora_dropout=dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    return get_peft_model(model, config)



def train_model(model, tokenizer, dataset, lora_name, batch_size=4, gradient_accum=4, max_steps=10000, save_steps=200, logging_steps=5, num_train_epochs=10):
    """Train ``model`` on ``dataset`` with a ``Trainer`` and save it to ``lora_name``.

    Args:
        model: Model to train; also the object whose ``save_pretrained`` is called.
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling`` (``mlm=False``).
        dataset: Training dataset passed as ``train_dataset``.
        lora_name: Destination passed to ``model.save_pretrained``.
        batch_size: ``per_device_train_batch_size``.
        gradient_accum: ``gradient_accumulation_steps``.
        max_steps: NOTE(review): accepted but never forwarded to
            ``TrainingArguments``; the run length is set by ``num_train_epochs``.
        save_steps: ``save_steps`` for checkpoints written under ``outputs``.
        logging_steps: ``logging_steps``.
        num_train_epochs: ``num_train_epochs``.

    Returns:
        None.
    """
    training_args = TrainingArguments(
        output_dir="outputs",
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accum,
        learning_rate=2e-4,
        warmup_steps=2,
        optim="paged_adamw_8bit",
        fp16=True,
        logging_steps=logging_steps,
        save_steps=save_steps,
    )
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=collator,
    )

    # Switch off the model's use_cache flag before the training loop starts.
    model.config.use_cache = False
    trainer.train()
    model.save_pretrained(lora_name)
    
def generate_completion(model, tokenizer, prompt, max_length):
    """Generate a continuation of ``prompt`` and return it as decoded text.

    Args:
        model: Model providing ``device`` and ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        prompt: Text to continue.
        max_length: Forwarded to ``generate`` as ``max_new_tokens``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    # Move the encoded prompt to the device the model lives on.
    encoded_prompt = encoded_prompt.to(model.device)
    generated = model.generate(**encoded_prompt, max_new_tokens=max_length)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
# Load the base model (4-bit by default) and wrap it with a LoRA adapter for Q-LoRA training
model, tokenizer = create_model(huggingface_id="openlm-research/open_llama_7b")
model = loraify_model(model=model, rank=8, alpha=16, dropout=0.1)


In [None]:
from datasets import load_dataset


#Path to your instruction dataset (JSON); it is loaded as the "train" split
file_name = "instruction_data.json"
dataset = load_dataset(path="json", data_files=dict(train=file_name))

In [None]:
#Take a peek at the loaded dataset
dataset

In [None]:
#Merge the instruction, input and output columns, then tokenize the merged text and drop the original columns

def tokenize_function(examples):
    """Merge instruction, input and output text per example and tokenize it.

    ``examples`` is either a single record whose fields are strings or, when
    called through ``Dataset.map(..., batched=True)``, a batch whose fields
    are lists. In the batched case the three columns are joined row by row;
    adding the lists directly would chain the columns end to end and tokenize
    every field as its own example.

    Returns:
        The tokenizer output, padded and truncated to 256 tokens.
    """
    def _join(parts):
        # Missing (None) fields are treated as empty text.
        return "".join(part or "" for part in parts)

    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]

    if isinstance(instructions, (list, tuple)):
        merged = [_join(row) for row in zip(instructions, inputs, outputs)]
    else:
        merged = _join((instructions, inputs, outputs))

    return tokenizer(merged, padding="max_length", truncation=True, max_length=256)
# Tokenize in batches and drop the raw text columns
dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['instruction', 'output', 'input'],
)

In [None]:
#Inspect the dataset again after the tokenization step
dataset

In [None]:
#Keep only the "train" split for training
#NOTE(review): `dataset` is rebound here, so this cell is not idempotent - re-running it indexes the split itself rather than the original dataset
dataset = dataset["train"]

In [None]:

#Train the Q-LoRA model and save it under "open_llama_chat"
train_model(model=model, tokenizer=tokenizer, dataset=dataset, lora_name="open_llama_chat")

In [None]:
# Reload the initial model, attach the trained Q-LoRA adapter and print a sample completion
model, tokenizer = create_model(huggingface_id="openlm-research/open_llama_7b")
model = load_lora(model=model, lora_path="open_llama_chat")
completion = generate_completion(
    model=model,
    tokenizer=tokenizer,
    prompt="LoRA is",
    max_length=100,
)
print(completion)