In [None]:
!pip install -q accelerate==0.21.0 --progress-bar off
!pip install -q peft==0.4.0 --progress-bar off
!pip install -q bitsandbytes==0.40.2 --progress-bar off
!pip install -q transformers==4.31.0 --progress-bar off
!pip install -q trl==0.4.7 --progress-bar off

In [None]:
# Run the Hugging Face CLI login command in a shell so that later Hub calls can authenticate
!huggingface-cli login

In [1]:
import os
from random import randrange
from functools import partial
import torch
from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM
from trl import SFTTrainer

# Prepare the dataset

In [2]:
# Load the text-to-SQL dataset by name; later cells read its 'train' split and the
# 'question', 'context' and 'answer' fields of each record
dataset = load_dataset("b-mc2/sql-create-context")

# Prepare the model 

In [3]:
def create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype):
    """
    Builds the bitsandbytes quantization configuration handed to the model loader

    :param load_in_4bit: Load model in 4-bit precision mode
    :param bnb_4bit_use_double_quant: Nested quantization for 4-bit model
    :param bnb_4bit_quant_type: Quantization data type for 4-bit model
    :param bnb_4bit_compute_dtype: Computation data type for 4-bit model
    :return: BitsAndBytesConfig built from the four settings above
    """

    # Each argument is forwarded unchanged under the keyword of the same name
    quantization_settings = {
        "load_in_4bit": load_in_4bit,
        "bnb_4bit_use_double_quant": bnb_4bit_use_double_quant,
        "bnb_4bit_quant_type": bnb_4bit_quant_type,
        "bnb_4bit_compute_dtype": bnb_4bit_compute_dtype,
    }

    return BitsAndBytesConfig(**quantization_settings)

In [4]:
def load_model(model_name, bnb_config, max_memory_mb = 40960):
    """
    Loads model and model tokenizer

    :param model_name: Hugging Face model name
    :param bnb_config: Bitsandbytes configuration
    :param max_memory_mb: Memory budget in MB applied to every visible GPU
        (defaults to the previously hard-coded 40960)
    :return: Tuple of (model, tokenizer)
    """

    # Give every visible GPU the same memory budget
    n_gpus = torch.cuda.device_count()
    max_memory = {i: f"{max_memory_mb}MB" for i in range(n_gpus)}
    if not max_memory:
        # No GPU visible: pass None instead of an empty mapping so the loader
        # applies its own defaults rather than planning with no devices at all
        max_memory = None

    # Load model with the user authentication token (same as the tokenizer below)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config = bnb_config,
        device_map = "auto", # dispatch the model efficiently on the available resources
        max_memory = max_memory,
        use_auth_token = True,
    )

    # Load model tokenizer with the user authentication token
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token = True)

    # Set padding token as EOS token
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [5]:
# ---------------------------- transformers settings ----------------------------

# Hub name of the pre-trained model that is loaded and fine-tuned
model_name = "meta-llama/Llama-2-7b-hf"

# ---------------------------- bitsandbytes settings ----------------------------

load_in_4bit = True                      # load the base model in 4-bit precision
bnb_4bit_use_double_quant = True         # nested (double) quantization for the 4-bit base model
bnb_4bit_quant_type = "nf4"              # quantization type: "fp4" or "nf4"
bnb_4bit_compute_dtype = torch.bfloat16  # compute data type for the 4-bit base model

In [7]:
# Build the bitsandbytes configuration from the settings defined earlier, then load
# the model and its tokenizer from the Hugging Face Hub with that configuration
bnb_config = create_bnb_config(
    load_in_4bit = load_in_4bit,
    bnb_4bit_use_double_quant = bnb_4bit_use_double_quant,
    bnb_4bit_quant_type = bnb_4bit_quant_type,
    bnb_4bit_compute_dtype = bnb_4bit_compute_dtype,
)

model, tokenizer = load_model(model_name = model_name, bnb_config = bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [8]:
def create_prompt_formats(sample):
    """
    Renders one dataset record into the instruction prompt used for fine-tuning

    The rendered prompt is written to the "text" key of the record itself, so the
    mapping passed in is modified in place and then returned.

    :param sample: Record providing the 'question', 'context' and 'answer' fields
    :return: The same record, now also carrying the "text" field
    """

    # Prompt layout with {question}, {context} and {response} placeholders.
    # The leading spaces of the continuation lines sit inside the string literal,
    # so they are part of every rendered prompt.
    prompt_layout = """### Instructions:
    Your task is to convert a question into a SQL query, given a SQLlite database schema.
    Adhere to these rules:
    - **Deliberately go through the question and database schema word by word** to appropriately answer the question
    - **Use Table Aliases** to prevent ambiguity. For example, `SELECT table1.col1, table2.col1 FROM table1 JOIN table2 ON table1.id = table2.id`.
    - When creating a ratio, always cast the numerator as float
    - Just reponse the SQL query only, no need for other words.
    ### Input:
    Generate a SQL query that answers the question `{question}`.
    This query will run on a database whose schema is represented in this string:
    {context}

    ### Response:
    {response}
    """

    # Map the record's fields onto the placeholder names (the answer fills {response})
    placeholder_values = {
        "question": sample['question'],
        "context": sample['context'],
        "response": sample['answer'],
    }
    sample["text"] = prompt_layout.format(**placeholder_values)

    return sample

In [9]:
# Show one randomly chosen formatted training example.
# The index is drawn from the number of rows in the 'train' split; len(dataset)
# would count the splits of the dataset dictionary instead of its rows.
print(create_prompt_formats(dataset['train'][randrange(len(dataset['train']))]))

{'question': 'How many heads of the departments are older than 56 ?', 'answer': 'SELECT COUNT(*) FROM head WHERE age > 56', 'context': 'CREATE TABLE head (age INTEGER)', 'text': '### Instructions:\n    Your task is to convert a question into a SQL query, given a SQLlite database schema.\n    Adhere to these rules:\n    - **Deliberately go through the question and database schema word by word** to appropriately answer the question\n    - **Use Table Aliases** to prevent ambiguity. For example, `SELECT table1.col1, table2.col1 FROM table1 JOIN table2 ON table1.id = table2.id`.\n    - When creating a ratio, always cast the numerator as float\n    - Just reponse the SQL query only, no need for other words.\n    ### Input:\n    Generate a SQL query that answers the question `How many heads of the departments are older than 56 ?`.\n    This query will run on a database whose schema is represented in this string:\n    CREATE TABLE head (age INTEGER)\n\n    ### Response:\n    SELECT COUNT(*) F

In [10]:
def get_max_length(model):
    """
    Extracts maximum token length from the model configuration

    :param model: Hugging Face model (only its "config" attribute is read)
    :return: The first truthy value among the configuration attributes
        "n_positions", "max_position_embeddings" and "seq_length" (checked in
        that order), or 1024 when none of them is set
    """

    # Pull model configuration once and probe it for each candidate attribute
    conf = model.config

    # Candidate attribute names, in priority order; missing or falsy values are skipped
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(conf, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            return max_length

    # No usable setting found in the configuration: fall back to the default value
    max_length = 1024
    print(f"Using default max length: {max_length}")
    return max_length

In [11]:
def preprocess_batch(batch, tokenizer, max_length):
    """
    Runs the tokenizer over the "text" field of a dataset batch

    :param batch: Dataset batch; its "text" entry is what gets tokenized
    :param tokenizer: Model tokenizer (called directly on the text)
    :param max_length: Maximum number of tokens to emit from the tokenizer
    :return: Whatever the tokenizer call returns
    """

    # Truncation is always enabled, with max_length as the limit
    tokenizer_options = {"max_length": max_length, "truncation": True}
    texts = batch["text"]

    return tokenizer(texts, **tokenizer_options)

# Preprocess the dataset

In [12]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed: int, dataset: "Dataset | DatasetDict"):
    """
    Tokenizes dataset for fine-tuning

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from the tokenizer
    :param seed (int): Random seed used for the final shuffle
    :param dataset: Instruction dataset object (not a string): it must provide
        map(), filter() and shuffle() and carry the "context", "question" and
        "answer" columns
    :return: The tokenized, filtered and shuffled dataset, with the original text
        columns removed
    """

    # Add the formatted prompt to each sample under the "text" column
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Tokenize batch by batch, then drop the raw "context", "question", "answer" and "text" columns
    _preprocessing_function = partial(preprocess_batch, max_length = max_length, tokenizer = tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched = True,
        remove_columns = ["context", "question", "answer", "text"],
    )

    # Keep only samples strictly shorter than "max_length"; because the tokenizer
    # truncates at "max_length", this also drops every sample that hit the limit
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed = seed)

    return dataset

In [13]:
# Seed forwarded to the dataset shuffle
seed = 33

# Use the model's own maximum sequence length as the tokenization limit
max_length = get_max_length(model)
preprocessed_dataset = preprocess_dataset(
    tokenizer = tokenizer,
    max_length = max_length,
    seed = seed,
    dataset = dataset,
)

# Inspect the resulting dataset and its first tokenized training sample
print(preprocessed_dataset)

print(preprocessed_dataset['train'][0])

Found max lenth: 4096
Preprocessing dataset...
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 78577
    })
})
{'input_ids': [1, 835, 2799, 582, 1953, 29901, 13, 1678, 3575, 3414, 338, 304, 3588, 263, 1139, 964, 263, 3758, 2346, 29892, 2183, 263, 3758, 29880, 568, 2566, 10938, 29889, 13, 1678, 2087, 4150, 304, 1438, 6865, 29901, 13, 1678, 448, 3579, 29928, 5037, 495, 2486, 748, 1549, 278, 1139, 322, 2566, 10938, 1734, 491, 1734, 1068, 304, 7128, 2486, 1234, 278, 1139, 13, 1678, 448, 3579, 11403, 6137, 10785, 2129, 1068, 304, 5557, 22363, 537, 29889, 1152, 1342, 29892, 421, 6404, 1591, 29896, 29889, 1054, 29896, 29892, 1591, 29906, 29889, 1054, 29896, 3895, 1591, 29896, 8780, 1591, 29906, 6732, 1591, 29896, 29889, 333, 353, 1591, 29906, 29889, 333, 1412, 13, 1678, 448, 1932, 4969, 263, 11959, 29892, 2337, 4320, 278, 4825, 1061, 408, 5785, 13, 1678, 448, 3387, 337, 1713, 278, 3758, 2346, 871, 29892, 694, 817, 363, 916, 3838, 29889, 

# Creating PEFT Configuration

In [14]:
def create_peft_config(r, lora_alpha, target_modules, lora_dropout, bias, task_type):
    """
    Builds the LoRA (Parameter-Efficient Fine-Tuning) configuration for the model

    :param r: LoRA attention dimension
    :param lora_alpha: Alpha parameter for LoRA scaling
    :param target_modules: Names of the modules to apply LoRA to
    :param lora_dropout: Dropout Probability for LoRA layers
    :param bias: Specifies if the bias parameters should be trained
    :param task_type: Task type forwarded to the configuration
    :return: LoraConfig built from the arguments above
    """

    # Every argument is forwarded unchanged under the keyword of the same name
    lora_settings = {
        "r": r,
        "lora_alpha": lora_alpha,
        "target_modules": target_modules,
        "lora_dropout": lora_dropout,
        "bias": bias,
        "task_type": task_type,
    }

    return LoraConfig(**lora_settings)

# Finding Modules for LoRA Application

In [15]:
def find_all_linear_names(model):
    """
    Find modules to apply LoRA to.

    Scans the model's named modules, keeps those that are instances of
    bnb.nn.Linear4bit and collects the last component of their dotted names.
    'lm_head' is excluded from the result.

    :param model: Model whose named_modules() are scanned
    :return: Sorted list of the unique module names found
    """

    cls = bnb.nn.Linear4bit

    # The last dotted component is the layer's own name; for an undotted name
    # that is simply the whole name
    lora_module_names = {
        name.rsplit('.', 1)[-1]
        for name, module in model.named_modules()
        if isinstance(module, cls)
    }

    # discard() is a no-op when 'lm_head' is absent
    lora_module_names.discard('lm_head')

    # Sort so the printed and returned order does not depend on set iteration order
    target_names = sorted(lora_module_names)
    print(f"LoRA module names: {target_names}")
    return target_names

# Calculating Trainable Parameters

In [16]:
def print_trainable_parameters(model, use_4bit = False):
    """
    Prints the number of trainable parameters in the model.

    :param model: Model whose named_parameters() are counted
    :param use_4bit: When True, the trainable parameter count is halved
        (integer division) before it is reported
    """

    trainable_params = 0
    all_param = 0

    for _, param in model.named_parameters():
        num_params = param.numel()
        # Fall back to the "ds_numel" attribute when numel() reports 0 and the attribute exists
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Floor division keeps the count an int; true division would produce a
        # float, which the ",d" format below rejects with a ValueError
        trainable_params //= 2

    # Report 0.0 instead of dividing by zero for a model without parameters
    trainable_percent = 100 * trainable_params / all_param if all_param else 0.0

    print(
        f"All Parameters: {all_param:,d} || Trainable Parameters: {trainable_params:,d} || Trainable Parameters %: {trainable_percent}"
    )

In [17]:
# Fine tune function

In [18]:
def fine_tune(model,
              tokenizer,
              dataset,
              lora_r,
              lora_alpha,
              lora_dropout,
              bias,
              task_type,
              per_device_train_batch_size,
              gradient_accumulation_steps,
              warmup_steps,
              max_steps,
              learning_rate,
              fp16,
              logging_steps,
              output_dir,
              optim):
    """
    Prepares and fine-tune the pre-trained model.

    Wraps the model with LoRA adapters, trains it with the Hugging Face Trainer,
    logs and saves the training metrics, and saves the trained model to
    "output_dir". Nothing is returned.

    :param model: Pre-trained Hugging Face model
    :param tokenizer: Model tokenizer (used by the data collator)
    :param dataset: Preprocessed training dataset
    :param lora_r: LoRA attention dimension
    :param lora_alpha: Alpha parameter for LoRA scaling
    :param lora_dropout: Dropout probability for LoRA layers
    :param bias: Bias setting forwarded to the LoRA configuration
    :param task_type: Task type forwarded to the LoRA configuration
    :param per_device_train_batch_size: Forwarded to TrainingArguments
    :param gradient_accumulation_steps: Forwarded to TrainingArguments
    :param warmup_steps: Forwarded to TrainingArguments
    :param max_steps: Forwarded to TrainingArguments
    :param learning_rate: Forwarded to TrainingArguments
    :param fp16: Forwarded to TrainingArguments
    :param logging_steps: Forwarded to TrainingArguments
    :param output_dir: Directory for Trainer output and for the saved model
    :param optim: Optimizer name forwarded to TrainingArguments
    """

    # Enable gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # Prepare the model for training
    model = prepare_model_for_kbit_training(model)

    # Get LoRA module names
    target_modules = find_all_linear_names(model)

    # Create PEFT configuration for these modules and wrap the model to PEFT
    peft_config = create_peft_config(lora_r, lora_alpha, target_modules, lora_dropout, bias, task_type)
    model = get_peft_model(model, peft_config)

    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)

    # Training parameters
    training_args = TrainingArguments(
        per_device_train_batch_size = per_device_train_batch_size,
        gradient_accumulation_steps = gradient_accumulation_steps,
        warmup_steps = warmup_steps,
        max_steps = max_steps,
        learning_rate = learning_rate,
        fp16 = fp16,
        logging_steps = logging_steps,
        output_dir = output_dir,
        optim = optim,
    )
    trainer = Trainer(
        model = model,
        train_dataset = dataset,
        args = training_args,
        # mlm = False selects the collator's non-masked language-modeling mode
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False),
    )

    # Turn off the model's generation cache for the training run
    model.config.use_cache = False

    # Launch training and log metrics
    print("Training...")
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

    # Save model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok = True)
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights.
    # NOTE: "del" only drops this function's own references; objects that the
    # caller still references stay alive until the caller releases them too.
    del model
    del trainer
    torch.cuda.empty_cache()

In [19]:
# ------------------------------- QLoRA settings --------------------------------

lora_r = 16              # LoRA attention dimension
lora_alpha = 64          # alpha parameter for LoRA scaling
lora_dropout = 0.1       # dropout probability for LoRA layers
bias = "none"            # bias setting
task_type = "CAUSAL_LM"  # task type

# -------------------------- TrainingArguments settings --------------------------

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

per_device_train_batch_size = 1   # batch size per GPU for training
gradient_accumulation_steps = 4   # number of update steps to accumulate the gradients for
learning_rate = 2e-4              # initial learning rate (AdamW optimizer)
optim = "paged_adamw_32bit"       # optimizer to use
max_steps = 20                    # number of training steps (overrides num_train_epochs)
warmup_steps = 2                  # linear warmup steps from 0 to learning_rate

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True

logging_steps = 1                 # log every X update steps

In [20]:
# Run the fine-tuning on the 'train' split; every setting is passed by keyword so
# each value is visibly bound to the parameter of the same name
fine_tune(
    model = model,
    tokenizer = tokenizer,
    dataset = preprocessed_dataset['train'],
    lora_r = lora_r,
    lora_alpha = lora_alpha,
    lora_dropout = lora_dropout,
    bias = bias,
    task_type = task_type,
    per_device_train_batch_size = per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    warmup_steps = warmup_steps,
    max_steps = max_steps,
    learning_rate = learning_rate,
    fp16 = fp16,
    logging_steps = logging_steps,
    output_dir = output_dir,
    optim = optim,
)


LoRA module names: ['k_proj', 'up_proj', 'v_proj', 'gate_proj', 'down_proj', 'o_proj', 'q_proj']


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


All Parameters: 3,540,389,888 || Trainable Parameters: 39,976,960 || Trainable Parameters %: 1.1291682911958425
Training...


Step,Training Loss
1,2.2217
2,2.3144
3,2.0878
4,1.59
5,1.3118
6,0.9991
7,0.8355
8,0.6137
9,0.5274
10,0.5273


***** train metrics *****
  epoch                    =        0.0
  total_flos               =   391118GF
  train_loss               =     0.8228
  train_runtime            = 0:00:50.77
  train_samples_per_second =      1.576
  train_steps_per_second   =      0.394
{'train_runtime': 50.777, 'train_samples_per_second': 1.576, 'train_steps_per_second': 0.394, 'total_flos': 419959777443840.0, 'train_loss': 0.8228124812245369, 'epoch': 0.0}
Saving last checkpoint of the model...


In [21]:
# Load fine-tuned weights
model = AutoPeftModelForCausalLM.from_pretrained(output_dir,
                                                 device_map = "auto",
                                                 torch_dtype = torch.bfloat16)
# Merge the LoRA layers with the base model
model = model.merge_and_unload()

# Save fine-tuned model at a new location
output_merged_dir = "results/llama2_7b_finetuned_sql_context/final_merged_checkpoint"
os.makedirs(output_merged_dir, exist_ok = True)
model.save_pretrained(output_merged_dir, safe_serialization = True)

# Save tokenizer for easy inference.
# The access token is read from the HF_TOKEN environment variable instead of being
# written into the notebook; when the variable is unset, None is passed and the
# library's default credential lookup applies (presumably the earlier CLI login).
# NOTE: any access token that was ever committed in this cell must be revoked.
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          token = os.environ.get("HF_TOKEN"))
tokenizer.save_pretrained(output_merged_dir)

# Fine-tuned model name on Hugging Face Hub
new_model = "NamTrinh/llama2_7b_finetuned_sql_context"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:

# Push fine-tuned model and tokenizer to Hugging Face Hub, each as a pull request.
# The access token (it needs write permission) is read from the HF_TOKEN environment
# variable instead of being written into the notebook; when the variable is unset,
# None is passed and the library's default credential lookup applies.
# NOTE: any access token that was ever committed in this cell must be revoked.
model.push_to_hub(new_model, create_pr = True,
                  token = os.environ.get("HF_TOKEN"))
tokenizer.push_to_hub(new_model, create_pr = True,
                      token = os.environ.get("HF_TOKEN"))

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NamTrinh/llama2_7b_finetuned_sql_context/commit/73211d2031fcdb7d44129a557b9a116651d194ec', commit_message='Upload tokenizer', commit_description='', oid='73211d2031fcdb7d44129a557b9a116651d194ec', pr_url='https://huggingface.co/NamTrinh/llama2_7b_finetuned_sql_context/discussions/4', pr_revision='refs/pr/4', pr_num=4)