Setup development environment

Our first step is to install Hugging Face Libraries and Pytorch, including trl, transformers and datasets. If you haven't heard of trl yet, don't worry. It is a new library on top of transformers and datasets, which makes it easier to fine-tune, rlhf, align open LLMs.

In [1]:
# Install Pytorch & other libraries
%pip install torch tensorboard

# Install Hugging Face libraries
%pip install transformers
%pip install datasets
%pip install accelerate
%pip install evaluate
%pip install bitsandbytes

# install peft & trl from github
%pip install trl
%pip install peft


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'
# install flash-attn
%pip install wheel
%pip install ninja packaging
!MAX_JOBS=4 pip install flash-attn --no-build-isolation

%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


Create and prepare the dataset

In [21]:
import pandas
import json

# Read excel document
excel_data_df = pandas.read_excel('./data/unit-test-train-data.xlsx', sheet_name='Sheet 1')

def create_conversation(sample):
  return {
    "messages": [
      {"role": "system", "content": "Write UnitTest"},
      {"role": "user", "content": sample['prompt']},
      {"role": "assistant", "content": sample['completion']}
    ]
  }

updated_data = list(map(create_conversation, excel_data_df.to_dict(orient='records')))

# Print out the result
print('Excel Sheet to JSON:\n', json.dumps(updated_data))


# Define file to write to and 'w' for write option -> json.dump() 
# defining the list to write from and file to write to
with open('retrain/train_dataset.json', 'w') as json_file:
    json.dump(updated_data, json_file)

Excel Sheet to JSON:
 [{"messages": [{"role": "system", "content": "Write UnitTest"}, {"role": "user", "content": "Review the shared code context and generate a suite of multiple unit tests for the functions in shared code context using the detected test framework and libraries. Code context: //go:generate go-enum --sql --marshal --file $GOFILE package img import ( \t\"bytes\" \t\"context\" \t\"errors\" \t\"fmt\" \t\"image\" \t\"io\" \t\"github.com/disintegration/imaging\" \t\"github.com/dsoprea/go-exif/v3\" \t\"github.com/marusama/semaphore/v2\" \texifcommon \"github.com/dsoprea/go-exif/v3/common\" ) // ErrUnsupportedFormat means the given image format is not supported. var ErrUnsupportedFormat = errors.New(\"unsupported image format\") // Service type Service struct { \tsem semaphore.Semaphore } func New(workers int) *Service { \treturn &Service{ \t\tsem: semaphore.New(workers), \t} } // Format is an image file format. /* ENUM( jpeg png gif tiff bmp ) */ type Format int func (x Forma

In [3]:
from datasets import load_dataset

# Convert dataset to OAI messages
system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""

def create_conversation(sample):
  return {
    "messages": [
      {"role": "system", "content": system_message.format(schema=sample["context"])},
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["answer"]}
    ]
  }

# Load dataset from the hub
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle().select(range(12500)) #12500

# Convert dataset to OAI messages
dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)

dataset = dataset.train_test_split(test_size=2500/12500)

print(dataset["train"][345]["messages"])

# print(dataset["train"][0]["messages"]) #345

# save datasets to disk
dataset["train"].to_json("retrain/train_dataset.json", orient="records")
dataset["test"].to_json("retrain/test_dataset.json", orient="records")



  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 12500/12500 [00:01<00:00, 7736.46 examples/s]


[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_name_21 (attendance VARCHAR, game_site VARCHAR)', 'role': 'system'}, {'content': 'How many people were at the game that took place at the Kingdome?', 'role': 'user'}, {'content': 'SELECT attendance FROM table_name_21 WHERE game_site = "the kingdome"', 'role': 'assistant'}]


Creating json from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 48.20ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 73.58ba/s]


1194474

Fine-tune LLM using trl and the SFTTrainer

In [4]:
from datasets import load_dataset

# Load jsonl data from disk
dataset = load_dataset("json", data_files="retrain/train_dataset.json", split="train")

# print(dataset)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

# Hugging Face model id
model_id = "codellama/CodeLlama-7b-Instruct-hf" # "codellama/CodeLlama-7b-hf" # or `mistralai/Mistral-7B-v0.1`

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# bnb_config = BitsAndBytesConfig(
#    load_in_4bit=True,
#    bnb_4bit_compute_dtype=torch.bfloat16
# )

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right' # to prevent warnings

# # set chat template to OAI chatML, remove if you start from a fine-tuned model
model, tokenizer = setup_chat_format(model, tokenizer)



from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)
# peft_config = LoraConfig(
#     r=16,  # dimension of the updated matrices
#     lora_alpha=64,  # parameter for scaling
#     target_modules=[
#     "q_proj",
#     "up_proj",
#     "o_proj",
#     "k_proj",
#     "down_proj",
#     "gate_proj",
#     "v_proj"],
#     lora_dropout=0.1,  # dropout probability for layers
#     bias="none",
#     task_type="CAUSAL_LM",
# )



from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./models/codeLlama-7b-instruct-hf-text-to-sql", # directory to save and repository id
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=3,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision -- bf16=True,   
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=False,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)




from trl import SFTTrainer

max_seq_length = 3072 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)




# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

# save model
trainer.save_model()

# free the memory again
del model
del trainer
torch.cuda.empty_cache()




Generating train split: 10000 examples [00:00, 422429.65 examples/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.76s/it]
Generating train split: 399 examples [00:02, 198.63 examples/s]
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss
10,1.1726
20,0.6141
30,0.5504
40,0.5165
50,0.5081
60,0.5024
70,0.4837
80,0.449




Merge LoRA adapter in to the original model

In [None]:
#### COMMENT IN TO MERGE PEFT AND BASE MODEL ####
import torch
from peft import AutoPeftModelForCausalLM

output_dir = './models/codeLlama-7b-instruct-hf-text-to-sql'

# Load PEFT model on CPU
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
# Merge LoRA and base model and save
merged_model = model.merge_and_unload()
merged_model.save_pretrained(output_dir,safe_serialization=True, max_shard_size="10GB")