In [1]:
%pip install accelerate peft bitsandbytes transformers trl jsonlines pandas numpy python-dotenv numba



In [2]:
import os
import torch
import pandas as pd
import numpy as np
from accelerate import Accelerator
from datasets import load_dataset, Dataset, DatasetDict, load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    Trainer
)
from peft import LoraConfig, PeftModel, PeftConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import time
from dotenv import load_dotenv
import random
from utils import *
random.seed(42)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
load_dotenv()
from numba import cuda
device = cuda.get_current_device()
device.reset()

In [3]:
from google.colab import userdata
my_token = userdata.get('token')

# Preparing Data for fine-tune

<br>
<br>
Using huggingface dataset "Text-to-sql-v1", we have the instruction, input and response. Next step is to create a prompt dataset to fine-tune llama2 open source model.
<br>
<br>

In [4]:
file_name = 'text-to-sql-dataset.hf'
if os.path.isdir(file_name):
    print(f"{file_name} exists.")
    train_test_valid_dataset = load_from_disk(file_name)
else:
    print(f"{file_name} does not exist.")
    dataset = load_dataset("Clinton/Text-to-sql-v1")
    train_testvalid = dataset['train'].train_test_split(test_size=0.4)
    # Split the 10% test + valid in half test, half valid
    test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
    # gather everyone if you want to have a single DatasetDict
    train_test_valid_dataset = DatasetDict({
        'train': train_testvalid['train'],
        'test': test_valid['test'],
        'valid': test_valid['train']})
    train_test_valid_dataset.save_to_disk('text-to-sql-dataset.hf')

text-to-sql-dataset.hf exists.


<br>
<br>
Doing a train, validation and test split with ratio 6:2:2
<br>
<br>

In [5]:
train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'response', 'source', 'text'],
        num_rows: 157324
    })
    test: Dataset({
        features: ['instruction', 'input', 'response', 'source', 'text'],
        num_rows: 52442
    })
    valid: Dataset({
        features: ['instruction', 'input', 'response', 'source', 'text'],
        num_rows: 52442
    })
})

# Fine-tune (include Quantization)

<br>
<br>
Since I work on a mac and the the way to do quantization on mac is different than others, I will do both.
<br>
<br>

<br>
<br>
Load model from huggingface and do quantization
<br>
<br>

In [6]:
#Standard way to do quantization using GPU
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_quant_type="nf4",
    bnb_8bit_compute_dtype=compute_dtype,
    bnb_8bit_use_double_quant=False,
)

base_model = "meta-llama/Llama-2-7b-chat-hf"
new_model = "Llama-2-7b-chat-hf-text-to-sql"
token = os.getenv('token')

tokenizer = AutoTokenizer.from_pretrained(base_model, token = my_token)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model = AutoModelForCausalLM.from_pretrained(base_model,
                                             quantization_config=quant_config,
                                             token = my_token)
model.config.use_cache = False
model.config.pretraining_tp = 1
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



<br>
<br>
Follow the llama2 prompt template to include system instruction, the input query and output response in the training, validation and testing data.
<br>
<br>

In [7]:
def tokenize_function(input, max_length = 2000):
    prompt = [text for text in input['text']]
    input['input_ids'] = tokenizer(prompt, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt").input_ids
    #input['labels'] = tokenizer(input["response"], padding='max_length', truncation=True, max_length=max_length, return_tensors="pt").input_ids
    input['labels'] = tokenizer(prompt, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt").input_ids
    return input

In [8]:
train_test_valid_dataset = train_test_valid_dataset.filter(lambda example, index: index % 100 == 0, with_indices=True)

In [9]:
tokenized_datasets = train_test_valid_dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['instruction', 'input', 'response', 'source', 'text'])

Map:   0%|          | 0/1574 [00:00<?, ? examples/s]

Map:   0%|          | 0/525 [00:00<?, ? examples/s]

Map:   0%|          | 0/525 [00:00<?, ? examples/s]

In [10]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['valid'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (1574, 2)
Validation: (525, 2)
Test: (525, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1574
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 525
    })
    valid: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 525
    })
})


<br>
<br>
We use lora method to fine-tune our model.
<br>
<br>

In [11]:
output_dir = 'results'
lora_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
)

peft_params = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=5
)

In [12]:
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 0
all model parameters: 6738415616
percentage of trainable model parameters: 0.00%


In [13]:
model = get_peft_model(model, lora_config)
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 16777216
all model parameters: 6755192832
percentage of trainable model parameters: 0.25%


In [14]:
peft_trainer = Trainer(
    model=model,
    args=peft_params,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["valid"]
)
model.config.use_cache = False

In [15]:
peft_trainer.train()



Step,Training Loss
5,1.599
10,0.2786
15,0.2253
20,0.1565
25,0.1632
30,0.1602
35,0.1193
40,0.1537
45,0.0806
50,0.1272


TrainOutput(global_step=394, training_loss=0.10072037044364184, metrics={'train_runtime': 4649.439, 'train_samples_per_second': 0.339, 'train_steps_per_second': 0.085, 'total_flos': 1.25116394274816e+17, 'train_loss': 0.10072037044364184, 'epoch': 1.0})

In [16]:
peft_trainer.evaluate()

{'eval_loss': 0.059149619191884995,
 'eval_runtime': 273.7937,
 'eval_samples_per_second': 1.918,
 'eval_steps_per_second': 0.241,
 'epoch': 1.0}

<br>
<br>
Now the fine-tuned model is saved locally.
<br>
<br>

In [17]:
peft_trainer.model.save_pretrained(new_model, token = my_token)
tokenizer.save_pretrained(new_model, token = my_token)

('Llama-2-7b-chat-hf-text-to-sql/tokenizer_config.json',
 'Llama-2-7b-chat-hf-text-to-sql/special_tokens_map.json',
 'Llama-2-7b-chat-hf-text-to-sql/tokenizer.model',
 'Llama-2-7b-chat-hf-text-to-sql/added_tokens.json',
 'Llama-2-7b-chat-hf-text-to-sql/tokenizer.json')

In [19]:
pipe = pipeline("text-generation", model=peft_trainer.model, tokenizer=tokenizer, max_new_tokens=2000)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PLBartFo

In [20]:
prompt = "[INST]<<SYS>>\nBelow are sql tables schemas paired with instruction that describes a task. \n                Using valid SQLite, write a response that appropriately completes the request \n                for the provided tables. <</SYS>>\n\nWhat was round 7's lowest overall?CREATE TABLE table_name_72 (\n    overall INTEGER,\n    round VARCHAR\n)[/INST]"
result = pipe(f"{prompt}")
print(result[0]['generated_text'])



[INST]<<SYS>>
Below are sql tables schemas paired with instruction that describes a task. 
                Using valid SQLite, write a response that appropriately completes the request 
                for the provided tables. <</SYS>>

What was round 7's lowest overall?CREATE TABLE table_name_72 (
    overall INTEGER,
    round VARCHAR
)[/INST]  SELECT MIN(overall) FROM table_name_72 WHERE round = 7
