# 1. Fine-tuning LLaMA2 With Adapters / QA Text2SQL


In [1]:
!pip install fsspec==2024.10.0
!pip install -q -U datasets
!pip install -q -U torch auto-gptq transformers optimum
!pip install -q -U peft trl einops accelerate xformers bitsandbytes
! pip install -q -U rouge_score

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.0 MB/s[0m eta [3

### Imports

In [2]:
import pandas as pd
import json
import torch
import os
import time

# In case Login Required For Model
# from huggingface_hub import login
# from dotenv import load_dotenv

from datasets import load_dataset, Dataset, load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, AutoPeftModelForCausalLM
from transformers import GPTQConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer
from time import perf_counter
from rich import print

In [None]:
# If Login Required for Model Access
# load_dotenv("/notebooks/.env")
# os.environ["TOKENIZERS_PARALLELISM"]="false"
# login(token=os.getenv("HUGGINGFACE_TOKEN"))

### First, let's load the dataset, the model, and its tokenizer

In [3]:
# Base GPTQ checkpoint to fine-tune (the chat variant is kept as an alternative).
model_id = "TheBloke/Llama-2-7B-GPTQ"
# model_id = "TheBloke/Llama-2-7b-Chat-GPTQ"

# Output location on Google Drive; both path fragments carry their trailing "/".
MODEL_PATH = "/content/drive/MyDrive/Text2SQL/FTClean/"
MODEL_DIR = "sql_gptq_training_30/"

# The timestamp suffix gives every run its own output directory name.
checkpoint_name = f"SQL_llama2_gptq_7b_peftv1_{time.strftime('%Y%m%d_%H%M%S')}"
OUT_DIR = "".join((MODEL_PATH, MODEL_DIR, checkpoint_name))
print(OUT_DIR)
#OUT_DIR ="/content/drive/MyDrive/sql_gptq_training"  # Save to Google Drive

In [4]:
# Echo the timestamped run name that forms the last component of OUT_DIR.
print(checkpoint_name)

In [5]:
# Google Drive locations of the train/test data.
# DATA_PATH + DS_DIR is the saved dataset directory read with load_from_disk;
# DATA_PATH + PKL_DIR + PKL_FILE is the pickled test DataFrame.
DATA_PATH ="/content/drive/MyDrive/Text2SQL/Data/"
DS_DIR = "sql_train_test"
PKL_DIR = "test/"
PKL_FILE ="sql_test.pkl"

In [6]:
def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the notebook's tokenizer.

    Uses the notebook-level ``tokenizer``, which has to exist by the time this
    function is called. Truncation is enabled and padding is set to
    ``"max_length"`` with ``max_length=512``.

    Args:
        example: Mapping (or batch) exposing a ``"text"`` entry.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    tokenizer_options = dict(truncation=True, padding="max_length", max_length=512)
    return tokenizer(example["text"], **tokenizer_options)

In [8]:
def parse(text):
    """Extract the answer that follows the first ``'### Response:'`` marker.

    Args:
        text: Decoded model output (prompt plus generation).

    Returns:
        The stripped text between the first ``'### Response:'`` and the next
        ``'### End'``; the stripped remainder of ``text`` when no ``'### End'``
        follows; ``None`` when ``'### Response:'`` does not occur at all.
    """
    start_marker = '### Response:'
    end_marker = '### End'

    _, marker_found, after_start = text.partition(start_marker)
    if not marker_found:
        return None

    # partition() yields the whole remainder when the end marker is missing,
    # which is exactly the "no end marker" case.
    answer, _, _ = after_start.partition(end_marker)
    return answer.strip()

### Load and Check Data

In [9]:
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths
# configured in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Load the dataset saved at DATA_PATH + DS_DIR; later cells read its
# 'train' and 'test' splits.
dataset = load_from_disk(DATA_PATH + DS_DIR)

In [11]:
# Test examples as a pandas DataFrame, used by the inference cells further down.
# NOTE(review): read_pickle can execute arbitrary code while unpickling —
# only load this file if its origin is trusted.
test_df = pd.read_pickle(DATA_PATH + PKL_DIR + PKL_FILE)

In [12]:
# Summaries (features and row counts) of the training and test splits.
display(dataset['train'], dataset['test'])

Dataset({
    features: ['response', 'question', 'context', 'text', '__index_level_0__'],
    num_rows: 4086
})

Dataset({
    features: ['response', 'question', 'context', 'text', '__index_level_0__'],
    num_rows: 454
})

In [13]:
# First three rows of the test DataFrame, followed by its (rows, columns) shape.
display(test_df.head(3), test_df.shape)

Unnamed: 0,response,question,context,__index_level_0__,text
0,SELECT College FROM match_season GROUP BY Coll...,Show the name of colleges that have at least t...,CREATE TABLE match_season (College VARCHAR),611,### Instruction:\n You are a powerful text-...
1,SELECT T1.name FROM Person AS T1 JOIN PersonFr...,Find the name of the person who has friends wi...,"CREATE TABLE Person (name VARCHAR, age INTEGER...",2556,### Instruction:\n You are a powerful text-...
2,"SELECT AVG(T1.price), T2.name FROM products AS...",Find the average prices of all products from e...,"CREATE TABLE products (price INTEGER, Manufact...",3026,### Instruction:\n You are a powerful text-...


(454, 5)

### Let's load the model with quantization and the tokenizer

In [None]:
# 4-bit GPTQ loading configuration. `use_exllama=False` is the current
# spelling of the deprecated `disable_exllama=True` flag.
# NOTE(review): the ExLlama kernels are presumably disabled because the model
# is fine-tuned afterwards — confirm.
quantization_config_loading = GPTQConfig(bits=4, use_exllama=False)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# LLM GPTQ Model
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=quantization_config_loading,
                                             device_map="auto")

In [15]:
# Get Model Memory Footprint = ~4GB
# The value is divided by 1e9 to report it in GB.
print(model.get_memory_footprint()/1e9) # GB

### Let's define the LoRA and other configuration

In [16]:
# Turn off the generation cache on the model config before training
# (set again at the end of this cell).
model.config.use_cache = False
# NOTE(review): presumably disables the tensor-parallel code path used during
# pretraining — confirm.
model.config.pretraining_tp = 1
# %%
# Enable gradient checkpointing, then run PEFT's k-bit training preparation
# on the quantized model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings: rank 8, alpha 32, 5% dropout, attached to the
# q/k/v/o projection modules, with bias="none".
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["k_proj","o_proj","q_proj","v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the model with the LoRA adapters and report the trainable share.
# NOTE(review): `config` is also handed to SFTTrainer as `peft_config` in the
# trainer cell — confirm the adapters are not meant to be applied twice.
model = get_peft_model(model, config)
model.print_trainable_parameters()

# needed for llama 2 tokenizer
# (reuses the EOS token as the padding token and pads on the right)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model.config.use_cache = False # silence the warnings. Please re-enable for inference!

trainable params: 8,388,608 || all params: 270,798,848 || trainable%: 3.0977


In [17]:
# Training-only configuration consumed by the SFTTrainer cell.
# Per device, each optimizer step sees 4 samples x 4 accumulation steps.
args=TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=100, # Change this if T4 Timesout
        learning_rate=2e-4,
        fp16=True, #use mixed precision training
        logging_steps=1,
        output_dir=OUT_DIR,
        overwrite_output_dir=True,
        # PyTorch's AdamW; the "adamw_hf" implementation is deprecated in transformers.
        optim="adamw_torch",
        save_strategy="epoch",
        report_to="none")

In [None]:
# Training + evaluation configuration (alternative to `args`; the trainer cell
# only uses it when its `args=training_args` line is enabled). Feel free to adapt it.
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    # It matches `save_strategy` below, alongside `load_best_model_at_end=True`.
    eval_strategy="epoch",
    do_train=True,
    do_eval=True,
    warmup_steps=2,
    max_steps=5, #100, # Change this if T4 Timesout
    # PyTorch's AdamW; the "adamw_hf" implementation is deprecated in transformers.
    optim="adamw_torch",
    learning_rate=2e-4,
    fp16=True, #use mixed precision training
    # predict_with_generate=True, # Needed for Seq2Seq models only
    logging_steps=1, #500,
    save_strategy="epoch",
    #save_steps=1000,
    #eval_steps=1000,
    save_total_limit=3,
    load_best_model_at_end=True,
    push_to_hub=False, #True
    report_to="none"
)

In [None]:
# Build the supervised fine-tuning trainer in its training-only setup; the
# commented-out arguments switch it to the evaluation setup (`training_args`
# plus `dataset['test']`).
# NOTE(review): `model` was already wrapped by get_peft_model(...) in the LoRA
# cell and `peft_config=config` is passed here as well — confirm this is intended.
trainer = SFTTrainer(
    model=model,
    args=args, # Training Only
    # args=training_args, # Evaluation
    # compute_metrics=compute_metrics, # uncomment for ROUGE metrics (compute_metrics is not defined in the visible cells)
    train_dataset=dataset['train'],
    # eval_dataset = dataset['test'], # Evaluation Only
    peft_config=config,
    #dataset_text_field="text",
    tokenizer=tokenizer,
    #packing=False,
    #max_seq_length=512
    )

In [20]:
# Takes ~20 mins to finetune
# The return value of train() is kept in `train_result` for later inspection.
train_result = trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
1,2.1254
2,2.1029
3,2.107
4,1.8198
5,1.7039
6,1.5856
7,1.399
8,1.2297
9,1.0561
10,0.8953


In [21]:
# Earlier variant, kept for reference (joined checkpoint_name onto args.output_dir):
#output_dir = os.path.join(args.output_dir, checkpoint_name)
#trainer.model.save_pretrained(output_dir)
output_dir = OUT_DIR  # OUT_DIR already includes the full path
print("Saving model to:", output_dir)

# Save the model
# (`output_dir` is reused below when the model is loaded back for inference)
trainer.model.save_pretrained(output_dir)

### Now let's perform inference on an Example of the Test dataset

In [None]:
# To perform inference on the test dataset example load the model from the checkpoint
# written to `output_dir` by the save cell; it is loaded in float16 on the CUDA device.
persisted_model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda",
)
# Re-enable the generation cache that was switched off for training.
persisted_model.config.use_cache = True

In [70]:
ID = 26
# Show the chosen test example: its question, schema context and reference SQL.
example = dataset['test'][ID]
for heading, column in (('Question:', 'question'),
                        ('Context:', 'context'),
                        ('Response:', 'response')):
    print(heading)
    display(example[column])

'How much did the the player with first name Len and last name Barker earn between 1985 to 1990 in total?'

'CREATE TABLE salary (salary INTEGER, player_id VARCHAR, year VARCHAR); CREATE TABLE player (player_id VARCHAR, name_first VARCHAR, name_last VARCHAR)'

"SELECT SUM(T1.salary) FROM salary AS T1 JOIN player AS T2 ON T1.player_id = T2.player_id WHERE T2.name_first = 'Len' AND T2.name_last = 'Barker' AND T1.year BETWEEN 1985 AND 1990"

In [71]:
# Full prompt text of the selected test example, looked up by row label with
# .loc (the same access pattern the comparison cell uses for 'response')
# instead of chained indexing.
text = test_df.loc[ID, 'text']
print(text)

In [72]:

# Prompt for the selected test example, looked up by row label with .loc.
# NOTE(review): if this `text` already contains the reference answer after
# '### Response:', parse() below will return that answer from the prompt part
# of the decoded output rather than the model's generation — confirm what
# `text` holds.
text = test_df.loc[ID, 'text']
inputs = tokenizer(text, return_tensors="pt").to('cuda')
generation_config = GenerationConfig(
    penalty_alpha=0.5,
    # do_sample = True,
    top_k=1,
    # temperature=0.1,
    repetition_penalty=1.2,
    max_new_tokens=180
)
start_time = perf_counter()
outputs = persisted_model.generate(**inputs, generation_config=generation_config)
# Stop the clock right after generation so decoding and printing are not
# counted in the reported inference time.
end_time = perf_counter()
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# Keep only the text between '### Response:' and '### End'
# (None when the response marker is missing).
output = parse(tokenizer.decode(outputs[0]))
result = {'response': output}
print(json.dumps(result))
output_time = end_time - start_time
print(f"Time taken for inference: {round(output_time,2)} seconds")

In [73]:
# display(test_df.head(2))
# Reference SQL next to the model's answer. f-strings keep this cell from
# raising TypeError when parse() found no '### Response:' marker and
# `output` is None.
print(f"Data:  {test_df.loc[ID, 'response']}")
print(f"LLM_:  {output}")

### Download the model locally

In [27]:
import shutil

from google.colab import files

# Zip the folder containing the model. make_archive returns the path of the
# archive it wrote, so the download below cannot drift from the archive name.
archive_path = shutil.make_archive("sql_gptq_training", 'zip', OUT_DIR)

# Download the zip file
files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>