In [None]:
!pip install -q accelerate peft bitsandbytes transformers trl datasets

In [2]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, PeftModel, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, AutoConfig, set_seed
from trl import SFTTrainer
import bitsandbytes as bnb
import transformers

import os
import numpy as np
import pandas as pd
import sqlparse
import re
import json

from huggingface_hub import hf_hub_download
from huggingface_hub import HfFileSystem

In [5]:
# Hub id of the base model; passed to AutoTokenizer / AutoModelForCausalLM.from_pretrained below.
model_name = '' 
# Hub repo of the fine-tuned adapter: used as hub_model_id when pushing and as the
# repo download_checkpoint reads from when resuming.
out_name = '' 
# Name of the checkpoint folder to resume from (presumably something like
# "checkpoint-250" — confirm); None starts a fresh run.
prev_checkpoint = None

In [None]:
# Hugging Face access tokens. They are taken from the environment so that real tokens
# never get saved inside the notebook; when the variables are unset both fall back to "".
READ_TOKEN = os.environ.get("HF_READ_TOKEN", "")    # used to load the tokenizer and base model
WRITE_TOKEN = os.environ.get("HF_WRITE_TOKEN", "")  # used to log in to the Hub before training

In [None]:
#!huggingface-cli login

### Load Data


In [None]:
spider_id="NESPED-GEN/spider_selector_schemaReduzido"

In [None]:
spider = load_dataset(spider_id, split="train")
spider

In [8]:
df = spider.to_pandas()

### Load Base Model

In [9]:
def download_checkpoint(adapter_model_id, checkpoint):
    """Download one checkpoint folder and the training logs from a Hub repo into ``out/``.

    Args:
        adapter_model_id: Hub repo id that holds the checkpoints.
        checkpoint: Name of the checkpoint folder inside that repo.

    The checkpoint folder is fetched first, then ``logs``. Every entry listed directly
    under each folder is requested with ``hf_hub_download(..., local_dir='out')`` using
    the repo-relative name ``<folder>/<file name>``.
    """
    fs = HfFileSystem()

    for folder in (checkpoint, 'logs'):
        for path in fs.ls(f'{adapter_model_id}/{folder}', detail=False):
            # Take the last path component as the file name. Splitting the listed path
            # on the checkpoint name (as was done before, also for the logs) truncates
            # any file whose own name happens to contain that string.
            file_name = path.rsplit('/', 1)[-1]

            hf_hub_download(repo_id=adapter_model_id, filename=f'{folder}/{file_name}', local_dir='out')

In [None]:
# download_checkpoint(out_name, prev_checkpoint)

In [None]:
seed=14

# Fetching the previous checkpoint is best-effort: the run continues when it fails,
# but the reason is reported instead of being silently discarded.
if prev_checkpoint is not None:
    try:
        download_checkpoint(out_name, prev_checkpoint)
    except Exception as exc:
        print(f"Could not download checkpoint {prev_checkpoint!r} from {out_name!r}: {exc}")

# NOTE: the former `map_device="auto"` keyword was dropped here; it looks like a typo of
# the model-loading argument `device_map`, which is passed to the model below.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    token=READ_TOKEN,
    add_eos_token=True,
    use_fast=True,
)

# Pick bfloat16 when torch reports support for it, float16 otherwise.
# attn_implementation is only recorded: the argument is commented out in the model load below.
if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
    attn_implementation = 'flash_attention_2'
else:
    compute_dtype = torch.float16
    attn_implementation = 'sdpa'

# Reuse the EOS token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 4-bit NF4 quantisation, no double quantisation, computing in compute_dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=compute_dtype,
    device_map="auto",
    quantization_config=bnb_config,

    trust_remote_code=True,
    token=READ_TOKEN,
    # attn_implementation=attn_implementation
)

#### Chat Template - Generate SQL

In [None]:
# tokenizer.chat_template = """
# {% if messages[0]['role'] == 'system' %}
#     {% set loop_messages = messages[1:] %}
#     {% set system_message = messages[0]['content'] %}
# {% else %}
#     {% set loop_messages = messages %}
#     {% set system_message = 'Given a user question and the schema of a database, your task is to generate an SQL query that accurately answers the question based on the provided schema.' %}
# {% endif %}
# {{ '# <|system|>/n/' + system_message + '/n//n/' }}
# {% if messages|selectattr("role", "equalto", "example")|list %}
#     Below are some examples of question and their corresponding SQL queries:/n//n/
# {% else %}
#     /n/
# {% endif %}
# {% for message in loop_messages %}
#     {% if message['role'] == 'example' %}
#         {{ message['content'] }}/n//n/
#     {% elif message['role'] == 'schema' %}
#         # <|schema|>/n/The query will run on a database with the following schema:/n/{{ message['content'] }}/n//n/
#     {% elif message['role'] == 'user' %}
#         # <|user|>/n/[QUESTION]{{ message['content'] }}[/QUESTION]/n//n/
#     {% elif message['role'] == 'assistant' %}
#         # <|assistant|>/n/[SQL]{{ message['content'] }}[/SQL]
#     {% endif %}
# {% endfor %}
# {% if add_generation_prompt %}
#     # <|assistant|>/n/[SQL]
# {% endif %}
# """.replace("\n","").replace("  ", "").replace("/n/", "\n")

In [13]:
import re

def replace_alias(query):
    """Rewrite a SQL query so it uses full table names instead of table aliases.

    Declarations of the form ``FROM <table> AS <alias>`` / ``JOIN <table> AS <alias>``
    are reduced to ``FROM <table>`` / ``JOIN <table>``, and every remaining whole-word
    occurrence of ``<alias>`` (matched case-insensitively) is replaced by ``<table>``.

    Only table aliases declared that way are touched. Other uses of ``AS`` — column
    aliases such as ``count(*) AS cnt`` and type casts such as ``CAST(x AS INT)`` — are
    kept, because stripping them corrupts the query.

    Args:
        query: SQL text.

    Returns:
        The rewritten SQL text; ``query`` unchanged when it declares no table alias.

    Limitations: aliases declared without ``AS`` and aliases of sub-queries are not
    recognised; when one alias is declared more than once the last declaration wins;
    matching is purely textual, so a string literal equal to an alias is rewritten too.
    """
    alias_pattern = re.compile(r'(\bFROM\b|\bJOIN\b)\s+(\w+)\s+AS\s+(\w+)', re.IGNORECASE)

    # alias -> table; keyed in lower case so references written in another case still resolve.
    aliases = {match.group(3).lower(): match.group(2) for match in alias_pattern.finditer(query)}
    if not aliases:
        return query

    # Remove the declarations first, keeping the keyword and the table name.
    query = alias_pattern.sub(r'\1 \2', query)

    # Replace every reference in a single pass so that one substitution can never be
    # rewritten again by a later alias.
    reference_pattern = re.compile(
        r'\b(' + '|'.join(re.escape(alias) for alias in aliases) + r')\b',
        re.IGNORECASE,
    )
    return reference_pattern.sub(lambda match: aliases[match.group(1).lower()], query)

In [24]:
def to_sql(query):
  """Expand table aliases with `replace_alias`, then let sqlparse re-indent the
  statement and upper-case its keywords."""
  dealiased = replace_alias(query)
  return sqlparse.format(dealiased, keyword_case='upper', reindent=True)

def apply_template(row, tokenizer=tokenizer, n_examplea=0):
    """Attach the chat-formatted SQL-generation training sample to ``row['text']``.

    Reads ``question_en``, ``schema_SQLDatabase`` and ``query`` from ``row``, formats the
    query with ``to_sql``, builds a system / user / assistant conversation and renders it
    with the tokenizer's chat template (as text, without a generation prompt).

    Args:
        row: Record supporting item access and assignment (a DataFrame row when used
            through ``df.apply(..., axis=1)``).
        tokenizer: Object providing ``apply_chat_template``; defaults to the notebook's
            ``tokenizer`` as it existed when this cell was run.
        n_examplea: Not used by this function.

    Returns:
        The same ``row`` with the ``text`` entry set.

    Note: a later cell defines another function with this same name (the schema-linking
    variant); once that cell has run, it replaces this definition.
    """
    user_question = row['question_en']
    db_schema = row['schema_SQLDatabase']
    gold_sql = to_sql(row['query'])

    system_message = {
        'role': 'system',
        'content': "\nGiven a user question and the schema of a database, your task is to generate an SQL query that accurately answers the question based on the provided schema.\n",
    }
    user_message = {
        'role': 'user',
        'content': f"\n# Schema:\n```sql\n{db_schema}\n```\n\n# Question: {user_question}\n",
    }
    assistant_message = {
        'role': 'assistant',
        'content': f"\n```sql\n{gold_sql}\n```\n",
    }

    row['text'] = tokenizer.apply_chat_template(
        [system_message, user_message, assistant_message],
        add_generation_prompt=False,
        tokenize=False,
    )
    return row

In [25]:
df = df.apply(apply_template, axis=1)

In [None]:
#df.head()

In [None]:
print(df['text'][df.index[50]])

In [27]:
# Shuffle the rows reproducibly, keep only the rendered chat text and wrap it in a Dataset.
_df = Dataset.from_pandas(
    df.sample(frac=1, random_state=14).reset_index(drop=True)[['text']]
)
# Hold out 1% of the samples for evaluation.
_df = _df.train_test_split(test_size=0.01, shuffle=True, seed=14)
train_dataset = _df["train"]
valid_dataset = _df["test"]

#### Chat Template - Schema Linking

In [None]:
def apply_template(row, tokenizer=tokenizer, n_examplea=0):
    """Attach the chat-formatted schema-linking training sample to ``row['text']``.

    Reads ``question_en``, ``schema_SQLDatabase_min`` and ``selector_correct`` from
    ``row``, builds a system / user / assistant conversation whose answer is the
    selector wrapped in a ```` ```json ```` fence, and renders it with the tokenizer's
    chat template (as text, without a generation prompt).

    Args:
        row: Record supporting item access and assignment (a DataFrame row when used
            through ``df.apply(..., axis=1)``).
        tokenizer: Object providing ``apply_chat_template``; defaults to the notebook's
            ``tokenizer`` as it existed when this cell was run.
        n_examplea: Not used by this function.

    Returns:
        The same ``row`` with the ``text`` entry set.

    Note: this shares its name with the SQL-generation ``apply_template`` defined in an
    earlier cell and replaces it once this cell has run.
    """
    user_question = row['question_en']
    reduced_schema = row['schema_SQLDatabase_min']
    linked_items = row['selector_correct']

    system_message = {
        'role': 'system',
        'content': "Given a user question and the schema of a database, your task is to generate an JSON with the the names of tables and columns of the schema that the question is referring to.",
    }
    user_message = {
        'role': 'user',
        'content': f"# Schema:\n```sql\n{reduced_schema}\n```\n\n# Question: {user_question}",
    }
    assistant_message = {
        'role': 'assistant',
        'content': f"```json\n{linked_items}\n```",
    }

    row['text'] = tokenizer.apply_chat_template(
        [system_message, user_message, assistant_message],
        add_generation_prompt=False,
        tokenize=False,
    )
    return row

In [None]:
df = df.apply(apply_template, axis=1)

In [None]:
print(df['text'][df.index[70]])

In [None]:
# Shuffle the rows reproducibly, keep only the rendered chat text and wrap it in a Dataset.
_df = Dataset.from_pandas(
    df.sample(frac=1, random_state=14).reset_index(drop=True)[['text']]
)
# Hold out 1% of the samples for evaluation.
_df = _df.train_test_split(test_size=0.01, shuffle=True, seed=14)
train_dataset = _df["train"]
valid_dataset = _df["test"]

### Finetuning

In [28]:
from huggingface_hub import login, create_repo
import wandb
import os

# Log in to the Hugging Face Hub with the write token.
token = WRITE_TOKEN
login(token=token)
# NOTE(review): this seeds with 1234 while the SFTConfig further down is given
# seed=14 — confirm that the two values are meant to differ.
set_seed(1234)

In [29]:
def find_all_linear_names(model, new_tokens=False):
    """List the distinct leaf names of the model's bitsandbytes linear layers.

    Args:
        model: Object exposing ``named_modules()`` that yields ``(name, module)`` pairs.
        new_tokens: When true, ``"lm_head"`` is added to the result as well.

    Returns:
        A list (in no particular order) with the last dotted component of the name of
        every module that is a ``bnb.nn.Linear4bit`` or ``bnb.nn.Linear8bitLt``.
    """
    leaf_names = {
        qualified_name.rsplit(".", 1)[-1]
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt))
    }
    if new_tokens:
        leaf_names.add("lm_head")
    return list(leaf_names)

In [30]:
# Names of the quantised linear layers; they are passed to LoraConfig as target_modules below.
modules = find_all_linear_names(model)
# NOTE: despite the wording of the message, this list is what LoRA will target.
print(f"Found {len(modules)} modules to quantize: {modules}")

Found 7 modules to quantize: ['down_proj', 'v_proj', 'gate_proj', 'q_proj', 'k_proj', 'o_proj', 'up_proj']


In [32]:
# LoRA adapter settings: rank 64 with alpha 128 and 10% dropout on the adapter inputs,
# applied to the linear layers collected in `modules`.
# task_type is stated explicitly so PEFT wraps the model in its causal-LM class and
# records the task in the saved adapter config.
peft_config = LoraConfig(
    r=64,
    lora_alpha=128,
    lora_dropout=0.1,
    target_modules=modules,
    task_type="CAUSAL_LM",
)

In [33]:
torch.cuda.empty_cache()

In [None]:
from trl.trainer.sft_config import SFTConfig

# Saving, logging and evaluation all happen on the same cadence.
steps=250
strategy="steps"

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    peft_config=peft_config,
    args=SFTConfig(
        # --- data ---
        dataset_text_field="text",
        max_seq_length=2048,
        group_by_length=True,
        label_names=["labels"],

        # --- output, checkpoints, logs, evaluation ---
        output_dir="out",
        save_strategy=strategy,
        save_steps=steps,
        logging_strategy=strategy,
        logging_steps=steps,
        logging_dir="out/logs",
        eval_strategy=strategy,
        eval_steps=steps,
        report_to=["tensorboard"],

        # --- schedule and batching (1 sample x 8 accumulation steps per update) ---
        num_train_epochs=1,
        max_steps=-1,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,
        gradient_checkpointing=True,

        # --- precision ---
        # NOTE(review): fixed to fp16 regardless of the compute_dtype chosen when the
        # model was loaded — confirm this is intended on bf16-capable GPUs.
        fp16=True,
        bf16=False,

        # --- optimisation ---
        optim="paged_adamw_8bit",
        learning_rate=1e-4,
        weight_decay=0.001,
        max_grad_norm=0.3,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        seed=seed,

        # --- Hub upload ---
        push_to_hub=True,
        hub_strategy="all_checkpoints",
        hub_model_id=out_name,
    ),
)

# Start fresh unless a previous checkpoint was configured, in which case resume from
# its copy under out/.
if prev_checkpoint is None:
    trainer.train()
else:
    trainer.train(f"out/{prev_checkpoint}")

In [None]:
trainer.push_to_hub()