In [13]:
import sys
import site
from pathlib import Path
import logging
import os
import sys
from math import ceil
from typing import Optional, Tuple
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
import transformers
from transformers import (
    DataCollatorForSeq2Seq,
    LlamaTokenizer,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    AutoModelForCausalLM, 
    AutoTokenizer
)

In [10]:
# Select the compute device: 'cuda' when torch reports CUDA as available,
# otherwise fall back to 'cpu'.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
import config

In [3]:
# Set the transformers library's log verbosity to the ERROR level.
transformers.logging.set_verbosity_error()

In [4]:
# Build the on-disk location of the base model: every "/" in the model id is
# swapped for "--" so the id becomes a single directory name, which is then
# joined onto the configured model cache directory.
local_model_id = "--".join(config.BASE_MODEL.split("/"))
local_model_path = os.path.join(config.MODEL_CACHE_PATH, local_model_id)
print(f"local model path is: {local_model_path}")

local model path is: /scratch/umeleti/code/LLM/Text2SQL/MODEL_CACHE/NousResearch--CodeLlama-7b-hf


In [41]:
import wandb

# Both W&B environment variables are set before logging in: the notebook name,
# and a project name derived from the base model id ("/" replaced by "_").
os.environ.update(
    WANDB_NOTEBOOK_NAME='finetuning',
    WANDB_PROJECT=f"text-to-sql-finetune-model-name_{config.BASE_MODEL.replace('/', '_')}",
)
wandb.login()

# Flag read further down when TrainingArguments chooses its `report_to` value.
ENABLE_WANDB = True

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/umeleti/.netrc


In [14]:
# 4-bit loading with the "nf4" quantization type and float16 as the compute dtype.
# NOTE(review): the TrainingArguments further down set bf16=True while the
# compute dtype here is float16 - confirm the mismatch is intended.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# Load the locally cached base model with that quantization config and hand it
# straight to PEFT's k-bit training preparation.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        local_model_path,
        quantization_config=quantization_config,
    )
)

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.41s/it]


In [29]:
def generate_prompt_sql(input_question, context, output=""):
    """
    Build the instruction prompt used for text-to-SQL fine-tuning.

    The prompt is a fixed instruction followed by three labelled sections
    ("### Input:", "### Context:", "### Response:"), each filled with the
    corresponding argument.

    Parameters:
        input_question (str): The natural-language question to turn into SQL.
        context (str): The schema / table context the query runs against.
        output (str, optional): Text placed under the "### Response:" header.
            Defaults to "", which yields a prompt that ends right after the
            header line.

    Returns:
        str: The assembled prompt.
    """
    # The instruction line deliberately keeps its trailing space so the text
    # matches the established prompt format character for character.
    instruction = (
        "You are a powerful text-to-SQL model. "
        "Your job is to answer questions about a database. "
        "You are given a question and context regarding one or more tables. "
    )
    prompt_lines = [
        instruction,
        "",
        "You must output the SQL query that answers the question.",
        "",
        "### Input:",
        f"{input_question}",
        "",
        "### Context:",
        f"{context}",
        "",
        "### Response:",
        f"{output}",
    ]
    return "\n".join(prompt_lines)

In [15]:
# LoRA adapter settings for causal language modelling; adapters are attached
# to the query, key and value projection modules only.
LORA_CONFIG = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=16,  # rank
    lora_alpha=32,  # scaling factor
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared model with the adapters, then place the result on `device`.
model = get_peft_model(model, LORA_CONFIG)
model = model.to(device)

In [61]:
# Load the tokenizer from the same local directory as the model weights.
tokenizer = LlamaTokenizer.from_pretrained(local_model_path)
# Use token id 0 for padding; a later cell's output shows this resolves to '<unk>'.
tokenizer.pad_token_id = 0
# Pad on the left side of each sequence.
tokenizer.padding_side = "left"

In [50]:
# Dataset identifier passed to `load_dataset`.
DATA_PATH = "b-mc2/sql-create-context"
# Directory the model is written to by `model.save_pretrained` after training.
MODEL_PATH = "./final_model"
# NOTE(review): not referenced by name in the cells visible here; the
# TrainingArguments cell hard-codes the same "./lora_adapters" string as its
# output_dir - confirm whether it should use this constant.
ADAPTER_PATH = "./lora_adapters"

In [20]:
# Load the dataset named by DATA_PATH; later cells read its "train" split.
data = load_dataset(DATA_PATH)

In [37]:
def tokenize_data(data_points, add_eos_token=True, train_on_inputs=False, cutoff_len=512):
    """
    Tokenize one text-to-SQL example and attach labels equal to the inputs.

    The question, context and answer are rendered into a single prompt with
    `generate_prompt_sql`, tokenized with the module-level `tokenizer`
    (truncated to `cutoff_len`, no padding), optionally terminated with the
    EOS token, and the resulting ids are copied into "labels".

    Parameters:
        data_points (dict): Mapping with "question", "context" and "answer".
        add_eos_token (bool): Append the EOS id when the sequence does not
            already end with it and is still shorter than `cutoff_len`.
        train_on_inputs (bool): Accepted for signature compatibility; this
            implementation never reads it.
        cutoff_len (int): Maximum number of tokens kept by the tokenizer.

    Returns:
        The tokenizer output with an added "labels" entry (a copy of
        "input_ids").

    NOTE(review): a second `tokenize_data` is defined later in this file and
    replaces this one once that cell has run.
    """
    prompt_text = generate_prompt_sql(
        data_points["question"],
        data_points["context"],
        data_points["answer"],
    )
    encoded = tokenizer(
        prompt_text,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )

    token_ids = encoded["input_ids"]
    # The last id is inspected first, exactly as before, whatever the flags are.
    ends_with_eos = token_ids[-1] == tokenizer.eos_token_id
    has_room = len(token_ids) < cutoff_len
    if add_eos_token and has_room and not ends_with_eos:
        token_ids.append(tokenizer.eos_token_id)
        encoded["attention_mask"].append(1)

    # Labels are an independent copy of the (possibly EOS-extended) input ids.
    encoded["labels"] = token_ids.copy()
    return encoded

In [52]:
# Display the tokenizer's EOS token id (the id appended by tokenize_data).
tokenizer.eos_token_id

2

In [62]:
# Hold out a fixed-size validation set from the "train" split. The split is
# seeded so membership of each side is the same on every run.
val_set_size = 100
SPLIT_SEED = 42
train_val_split = data["train"].train_test_split(
    test_size=val_set_size, shuffle=True, seed=SPLIT_SEED
)

# The follow-up shuffles are seeded too. Without a seed the row order differs
# on every run, which makes runs non-reproducible and is expected to prevent
# the tokenization `.map` from reusing its cached result.
train_data = train_val_split["train"].shuffle(seed=SPLIT_SEED).map(tokenize_data)
val_data = train_val_split["test"].shuffle(seed=SPLIT_SEED).map(tokenize_data)


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78477/78477 [00:47<00:00, 1642.79 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1506.37 examples/s]


In [63]:
# Batch geometry.
per_device_batch_size = 4
gradient_accum_steps = 4

# Optimisation schedule.
learning_rate = 2e-5
warmup_steps = 20
max_steps = 200
# Defined but not forwarded: the matching argument below is commented out.
max_grad_norm = 0.3

# Checkpoint / evaluation / logging cadence, in steps.
save_steps = 20
eval_steps = 20
logging_steps = 20
save_total_limit = 3

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./lora_adapters",
    logging_dir="./logs",
    report_to="wandb" if ENABLE_WANDB else [],
    # Batching.
    per_device_train_batch_size=per_device_batch_size,
    gradient_accumulation_steps=gradient_accum_steps,
    group_by_length=True,
    # Optimiser and schedule.
    optim="adamw_hf",
    learning_rate=learning_rate,
    warmup_steps=warmup_steps,
    max_steps=max_steps,
    #max_grad_norm=max_grad_norm,
    #lr_scheduler_type="cosine",
    bf16=True,
    ddp_find_unused_parameters=False,
    # Saving, evaluation and logging all run on the same step-based cadence.
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    load_best_model_at_end=True,
    logging_steps=logging_steps,
)

In [64]:
# Collator that pads each batch dynamically (to a multiple of 8) and returns
# PyTorch tensors.
seq2seq_collator = DataCollatorForSeq2Seq(
    tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=seq2seq_collator,
)

# Turn off the model's generation cache for training.
model.config.use_cache = False

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [65]:
# Display which token string the configured pad_token_id (0) maps to.
tokenizer.pad_token

'<unk>'

In [66]:
# Run the training loop configured by `training_args`.
trainer.train()
# Write the trained model to MODEL_PATH.
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this
# presumably stores the LoRA adapter weights rather than a merged model - confirm.
model.save_pretrained(MODEL_PATH)



{'loss': 2.3016, 'learning_rate': 2e-05, 'epoch': 0.0}
{'eval_loss': 2.350579261779785, 'eval_runtime': 3.1337, 'eval_samples_per_second': 31.911, 'eval_steps_per_second': 4.148, 'epoch': 0.0}




{'loss': 2.1803, 'learning_rate': 1.7777777777777777e-05, 'epoch': 0.01}
{'eval_loss': 1.9797790050506592, 'eval_runtime': 3.1496, 'eval_samples_per_second': 31.75, 'eval_steps_per_second': 4.127, 'epoch': 0.01}




{'loss': 1.7741, 'learning_rate': 1.555555555555556e-05, 'epoch': 0.01}
{'eval_loss': 1.608078956604004, 'eval_runtime': 3.1587, 'eval_samples_per_second': 31.659, 'eval_steps_per_second': 4.116, 'epoch': 0.01}




{'loss': 1.4055, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.02}
{'eval_loss': 1.253481149673462, 'eval_runtime': 3.1671, 'eval_samples_per_second': 31.575, 'eval_steps_per_second': 4.105, 'epoch': 0.02}




{'loss': 1.0596, 'learning_rate': 1.1111111111111113e-05, 'epoch': 0.02}
{'eval_loss': 1.1013339757919312, 'eval_runtime': 3.1697, 'eval_samples_per_second': 31.549, 'eval_steps_per_second': 4.101, 'epoch': 0.02}




{'loss': 1.0919, 'learning_rate': 8.888888888888888e-06, 'epoch': 0.02}
{'eval_loss': 1.0054242610931396, 'eval_runtime': 3.1699, 'eval_samples_per_second': 31.547, 'eval_steps_per_second': 4.101, 'epoch': 0.02}




{'loss': 0.8893, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.03}
{'eval_loss': 0.9247572422027588, 'eval_runtime': 3.17, 'eval_samples_per_second': 31.545, 'eval_steps_per_second': 4.101, 'epoch': 0.03}




{'loss': 0.8559, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.03}
{'eval_loss': 0.8862056732177734, 'eval_runtime': 3.1705, 'eval_samples_per_second': 31.541, 'eval_steps_per_second': 4.1, 'epoch': 0.03}




{'loss': 0.8649, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.04}
{'eval_loss': 0.8640143871307373, 'eval_runtime': 3.1718, 'eval_samples_per_second': 31.528, 'eval_steps_per_second': 4.099, 'epoch': 0.04}




{'loss': 0.7085, 'learning_rate': 0.0, 'epoch': 0.04}
{'eval_loss': 0.8583202958106995, 'eval_runtime': 3.1722, 'eval_samples_per_second': 31.524, 'eval_steps_per_second': 4.098, 'epoch': 0.04}




{'train_runtime': 424.0208, 'train_samples_per_second': 7.547, 'train_steps_per_second': 0.472, 'train_loss': 1.3131573629379272, 'epoch': 0.04}




In [55]:
def tokenize_data(
    data_points, add_eos_token=True, train_on_inputs=False, cutoff_len=512
) -> dict:
    """
    Tokenize one text-to-SQL example into model inputs and training labels.

    The full prompt (question + context + answer) is rendered with
    `generate_prompt_sql`, tokenized with the module-level `tokenizer`
    (truncated to `cutoff_len`, no padding) and optionally terminated with the
    EOS token. Labels start as a copy of the input ids; unless
    `train_on_inputs` is set, the positions that belong to the prompt (the
    text up to and including the "### Response:" header) are replaced by -100
    so that only the answer tokens contribute to the loss.

    Parameters:
        data_points (dict): A single dataset row with 'question', 'context'
            and 'answer' (the notebook maps this function without batching).
        add_eos_token (bool): Append the EOS id when the sequence does not
            already end with it and is shorter than `cutoff_len`.
        train_on_inputs (bool): When True, keep labels for the prompt tokens
            as well; when False (default), mask them with -100.
        cutoff_len (int): The maximum length for the tokenized sequence.

    Returns:
        dict: The tokenizer output containing 'input_ids', 'attention_mask'
        and the added 'labels'.

    Raises:
        Exception: Any error raised while building or tokenizing the prompt is
            logged and re-raised unchanged.
    """
    try:
        question = data_points["question"]
        context = data_points["context"]
        answer = data_points["answer"]

        combined_text = generate_prompt_sql(question, context, answer)
        tokenized = tokenizer(
            combined_text,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )

        input_ids = tokenized["input_ids"]
        # An empty encoding is treated as "does not end with EOS" instead of
        # failing on input_ids[-1].
        ends_with_eos = bool(input_ids) and input_ids[-1] == tokenizer.eos_token_id
        if add_eos_token and not ends_with_eos and len(input_ids) < cutoff_len:
            input_ids.append(tokenizer.eos_token_id)
            tokenized["attention_mask"].append(1)

        labels = list(input_ids)

        if not train_on_inputs:
            # Length of the prompt alone (same template with an empty answer).
            prompt_ids = tokenizer(
                generate_prompt_sql(question, context),
                truncation=True,
                max_length=cutoff_len,
                padding=False,
                return_tensors=None,
            )["input_ids"]
            user_prompt_len = len(prompt_ids)
            # Only discount an EOS the tokenizer itself put on the prompt; no
            # EOS is appended to the prompt here, so an unconditional "-1"
            # would leave the last prompt token unmasked.
            if prompt_ids and prompt_ids[-1] == tokenizer.eos_token_id:
                user_prompt_len -= 1
            user_prompt_len = min(user_prompt_len, len(labels))
            # -100 marks positions to be ignored by the loss.
            labels[:user_prompt_len] = [-100] * user_prompt_len

        tokenized["labels"] = labels
        return tokenized

    except Exception as e:
        logging.error(
            f"Error in batch tokenization: {e}, Line: {e.__traceback__.tb_lineno}"
        )
        # Bare raise keeps the original exception and traceback intact.
        raise