Setup development environment

Our first step is to install PyTorch and the Hugging Face libraries, including trl, transformers and datasets. If you haven't heard of trl yet, don't worry. It is a new library built on top of transformers and datasets that makes it easier to fine-tune open LLMs, apply RLHF to them, and align them.

In [1]:
# Install Pytorch & other libraries
%pip install "torch==2.1.2" tensorboard

# Install Hugging Face libraries
%pip install  --upgrade \
  "transformers==4.36.2" \
  "datasets==2.16.1" \
  "accelerate==0.26.1" \
  "evaluate==0.4.1" \
  "bitsandbytes==0.42.0" \
  # "trl==0.7.10" # \
  # "peft==0.7.1" \

# install peft & trl from github
%pip install git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e --upgrade
%pip install git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e
  Cloning https://github.com/huggingface/trl (to revision a3c5b7178ac4f65569975efadc97db2f3749c65e) to /tmp/pip-req-build-3rsg_a8_
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/trl /tmp/pip-req-build-3rsg_a8_
  Running command git rev-parse -q --verify 'sha^a3c5b7178ac4f65569975efadc97db2f3749c65e'
  Running command git fetch -q https://github.com/huggingface/trl a3c5b7178ac4f65569975efadc97db2f3749c65e
  Running command git checkout -q a3c5b7178ac4f65569975efadc97db2f3749c65e
  Resolved https://github.com/huggingface/trl to commit a3c5b7178ac4f65569975efadc97db2f3749c65e
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?2

In [2]:
import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'
# install flash-attn
%pip install wheel
%pip install ninja packaging
!MAX_JOBS=4 pip install flash-attn --no-build-isolation

%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


Create and prepare the dataset

In [12]:
from datasets import load_dataset

# System-prompt template; `{schema}` is filled in per sample with the table definition.
system_message = (
  "You are an text to SQL query translator. "
  "Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\n"
  "SCHEMA:\n"
  "{schema}"
)

def create_conversation(sample):
  """Convert one dataset row into an OAI-style chat record.

  Args:
    sample: mapping with the keys "context" (schema text), "question" and "answer".

  Returns:
    A dict with a single "messages" key holding the system, user and
    assistant turns, in that order.
  """
  system_turn = {"role": "system", "content": system_message.format(schema=sample["context"])}
  user_turn = {"role": "user", "content": sample["question"]}
  assistant_turn = {"role": "assistant", "content": sample["answer"]}
  return {"messages": [system_turn, user_turn, assistant_turn]}

# Fixed seed so the sampled subset and the train/test split are identical on
# every Restart & Run All (both calls were unseeded before).
SEED = 42
N_SAMPLES = 12500  # rows drawn from the full dataset
N_TEST = 100       # rows held out as the test split

# Load dataset from the hub
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle(seed=SEED).select(range(N_SAMPLES))

# Convert each row to OAI messages and drop the original columns
dataset = dataset.map(create_conversation, remove_columns=dataset.column_names, batched=False)

# An integer test_size is an absolute number of rows, which avoids relying on
# float rounding of 100/12500.
dataset = dataset.train_test_split(test_size=N_TEST, seed=SEED)

# Save datasets to disk. Only the first 100 training rows are written.
dataset["train"].select(range(100)).to_json("retrain/train_dataset.json", orient="records")
dataset["test"].to_json("retrain/test_dataset.json", orient="records")



Map: 100%|██████████| 12500/12500 [00:01<00:00, 7021.89 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 275.34ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 351.46ba/s]


48217

Fine-tune LLM using trl and the SFTTrainer

In [13]:
from datasets import load_dataset

# Read back the training records that were written to disk earlier
dataset = load_dataset(
    "json",
    data_files="retrain/train_dataset.json",
    split="train",
)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

# Hugging Face model id
# (alternatives noted by the author: "codellama/CodeLlama-7b-hf" or `mistralai/Mistral-7B-v0.1`)
model_id = "codellama/CodeLlama-7b-Instruct-hf"

# BitsAndBytes 4-bit config: NF4 quantization with double quantization,
# bfloat16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized model with Flash Attention 2, then its tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right' # to prevent warnings

# Set the chat template to OAI chatML; remove this if you start from a
# model that was already fine-tuned with a chat template
model, tokenizer = setup_chat_format(model, tokenizer)



from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=256,
    lora_alpha=128,
    lora_dropout=0.05,
    bias="none",
)
# Lighter alternative kept for reference (not active):
#   r=16, lora_alpha=64, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM",
#   target_modules=["q_proj", "up_proj", "o_proj", "k_proj",
#                   "down_proj", "gate_proj", "v_proj"]



from transformers import TrainingArguments

args = TrainingArguments(
    # --- output and checkpointing ---
    output_dir="./models/codeLlama-7b-text-to-sql",  # directory checkpoints and the final model are written to
    save_strategy="epoch",                  # save a checkpoint at the end of every epoch
    push_to_hub=False,                      # keep everything local; nothing is pushed to the hub
    # --- logging ---
    logging_steps=10,                       # log every 10 steps
    report_to="tensorboard",                # report metrics to tensorboard
    # --- epochs and batch size ---
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=3,          # batch size per device during training
    gradient_accumulation_steps=2,          # batches accumulated before each optimizer update
    # --- optimizer and schedule ---
    optim="adamw_torch_fused",              # fused AdamW optimizer
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    lr_scheduler_type="constant",           # constant learning rate scheduler
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
                                            # NOTE(review): the plain "constant" scheduler may ignore warmup -- confirm
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    # --- memory and precision ---
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    bf16=True,                              # train in bfloat16 precision
    tf32=True,                              # use tf32 precision
)




from trl import SFTTrainer

# max sequence length for the model and for packing of the dataset
max_seq_length = 3072

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    peft_config=peft_config,
    train_dataset=dataset,
    packing=True,
    max_seq_length=max_seq_length,
    dataset_kwargs=dict(
        add_special_tokens=False,   # We template with special tokens
        append_concat_token=False,  # No need to add additional separator token
    ),
)




# Run fine-tuning. Results are written to the trainer's output directory
# only; push_to_hub is False in the training arguments, so nothing is uploaded.
trainer.train()

# Persist the final model
trainer.save_model()

# Drop the references and release cached GPU memory
del model, trainer
torch.cuda.empty_cache()




Generating train split: 100 examples [00:00, 17690.77 examples/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.41s/it]
Generating train split: 3 examples [00:00, 54.04 examples/s]
You're using a CodeLlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss




Merge the LoRA adapter into the original model

In [14]:
#### MERGE PEFT ADAPTER AND BASE MODEL (comment this cell out to keep only the adapter) ####
import torch
from peft import AutoPeftModelForCausalLM

output_dir = './models/codeLlama-7b-text-to-sql'

# Load the saved PEFT model in float16; no device_map is passed here
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)

# Fold the LoRA weights into the base model, then write the merged model
# back to the same directory as safetensors shards of at most 2GB
merged_model = model.merge_and_unload()
merged_model.save_pretrained(
    output_dir,
    safe_serialization=True,
    max_shard_size="2GB",
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.69s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
