In [None]:
import os, torch
import transformers
import pandas as pd
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModel, AutoConfig, GPTJForCausalLM

# Load Data & Weights

In [None]:
# Tokenizer for the GPT-J 6B checkpoint.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

# Read the local Alpaca JSON file through the `datasets` JSON loader.
alpaca_data = load_dataset("json", data_files="./AlpacaDataCleaned/alpaca_data.json")

In [None]:
# Pull the three text columns out of the train split for a quick preview.
sample_instructions = alpaca_data["train"]["instruction"]
sample_inputs = alpaca_data["train"]["input"]
sample_outputs = alpaca_data["train"]["output"]

# Assemble them into a DataFrame (column order: instruction, input, output).
df_sample = pd.DataFrame(
    dict(
        instruction=sample_instructions,
        input=sample_inputs,
        output=sample_outputs,
    )
)

# Last expression of the cell: display the first rows.
df_sample.head()

In [None]:
def generate_prompt(data_point):
    """Render one Alpaca-style record as a single training prompt string.

    Two templates are used, selected by whether the record carries a
    non-empty ``input`` field:

    * with input    -> instruction + input + response sections
    * without input -> instruction + response sections only

    Args:
        data_point: Mapping with ``"instruction"`` and ``"output"`` keys and
            an optional ``"input"`` key (missing or empty means "no input").

    Returns:
        str: The prompt text. Section headers start at column 0, with no
        leading indentation in the rendered text.

    Raises:
        KeyError: If ``"instruction"`` or ``"output"`` is missing.
    """
    instruction = data_point["instruction"]
    response = data_point["output"]
    # The template choice depends on the *input* field, not the instruction:
    # every record has an instruction, but only some carry extra context.
    context = data_point.get("input")

    if context:
        return (
            "Below is an instruction that describes a task, paired with an input "
            "that provides further context. Write a response that appropriately "
            "completes the request.\n"
            "\n"
            "### Instruction:\n"
            f"{instruction}\n"
            "\n"
            "### Input:\n"
            f"{context}\n"
            "\n"
            "### Response:\n"
            f"{response}"
        )

    return (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request.\n"
        "\n"
        "### Instruction:\n"
        f"{instruction}\n"
        "\n"
        "### Response:\n"
        f"{response}"
    )

# Add a "prompt" column holding the tokenizer output for each rendered prompt.
alpaca_data = alpaca_data.map(
    lambda row: {"prompt": tokenizer(generate_prompt(row))}
)
# Last expression of the cell: display the resulting dataset summary.
alpaca_data

# Hyperparameters

In [None]:
# --- Optimisation schedule ---
EPOCHS = 1
LEARNING_RATE = 2e-5

# --- Batching ---
# BATCH_SIZE is the number of samples accumulated per optimizer step;
# MICRO_BATCH_SIZE is what is actually fed forward at once.
BATCH_SIZE = 32
MICRO_BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE

# --- Sequence length (in tokens) used when tokenizing prompts ---
CUTOFF_LEN = 256

# --- LoRA adapter settings ---
LORA_R = 4
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

# Configure Model

In [None]:
# Re-create the tokenizer, this time passing add_eos_token=True.
# NOTE(review): confirm that the GPT-J tokenizer class actually honors
# `add_eos_token`; some tokenizer classes silently ignore unknown kwargs.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", add_eos_token=True)

# Load the causal-LM checkpoint in 8-bit; placement is left to device_map="auto".
model = GPTJForCausalLM.from_pretrained(
    "EleutherAI/gpt-j-6B",
    load_in_8bit=True,
    device_map="auto",
)

# Prepare the 8-bit model for training, with gradient checkpointing turned on.
model = prepare_model_for_int8_training(model, use_gradient_checkpointing=True)

# Configure LoRA Fine-tuning

In [None]:
# LoRA adapter configuration for a causal-LM task, targeting the
# "q_proj" and "v_proj" modules; no bias parameters are trained.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

# Wrap the base model with the LoRA adapters.
model = get_peft_model(model, config)

# Pad with token id 0 so that padding stays distinct from the EOS token.
# NOTE(review): the original comment called id 0 "unk"; confirm which token
# id 0 maps to in the GPT-J vocabulary.
tokenizer.pad_token_id = 0

# Training data: the cleaned Alpaca JSON file.
data = load_dataset("json", data_files="AlpacaDataCleaned/alpaca_data_cleaned.json")

In [None]:
# Shuffle the dataset, then tokenize every rendered prompt, truncating and
# padding each one to exactly CUTOFF_LEN tokens.
data = data.shuffle().map(
    lambda row: tokenizer(
        generate_prompt(row),
        max_length=CUTOFF_LEN,
        padding="max_length",
        truncation=True,
    )
)

In [None]:
# Build the Hugging Face Trainer over the tokenized train split.
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        output_dir="lora-dolly",
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        warmup_steps=100,
        # MICRO_BATCH_SIZE samples per forward pass, accumulated over
        # GRADIENT_ACCUMULATION_STEPS passes before each optimizer step.
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        fp16=True,
        logging_steps=1,
        save_total_limit=3,
    ),
    # mlm=False: the collator runs in non-masked-LM (causal LM) mode.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Turn off the generation cache on the model config before training.
model.config.use_cache = False

# Train from scratch (no checkpoint resume), then save the adapter locally.
trainer.train(resume_from_checkpoint=False)
model.save_pretrained("models/gptj6b-lora-dolly")

In [None]:
# Authenticate against the Hugging Face Hub from inside the notebook.
notebook_login()

# Upload the model to the "zuu/dolly-lora" repository using the stored token.
model.push_to_hub("zuu/dolly-lora", use_auth_token=True)