## Requirements

In [None]:
# Settings for autoreloading.
#
# Load the IPython `autoreload` extension (the `%reload_ext` line covers the
# case where this cell is re-run in a live kernel) and switch it to mode 2 so
# that edits to imported project modules are picked up without restarting.

%load_ext autoreload
%reload_ext autoreload
%autoreload 2

In [None]:
!pip install -q -U bitsandbytes
!pip install -q xformers
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install -q einops
!pip install -q wandb
!pip install -q scipy

In [None]:
!apt-get install git-lfs

In [None]:
# Make the project's `src` directory the working directory.
# NOTE(review): presumably this is what lets the later `data`, `model` and
# `utils` imports resolve — confirm the package layout under `src`.
%cd /workspace/falcon-7b-sql/src

In [None]:
import wandb
# Log in to Weights & Biases so that later wandb calls are authenticated.
wandb.login()

In [None]:
from huggingface_hub import login

# Log in to the Hugging Face Hub before any model/adapter is requested from it.
login()

## Dataset

In [None]:
# Identifier of the base model handed to `get_pretrained_model_and_tokenizer`.
model_id = "tiiuae/falcon-7b"

# Identifier of the dataset and location of its schema file; both are passed
# to `get_dataset`.
dataset_id = "spider"
spider_schema = "/workspace/falcon-7b-sql/data/tables.json"

In [None]:
from data.dataset_handler import get_dataset
# Build the dataset from the configured dataset id and schema file. The result
# is indexed by split name further down (`dataset['validation']`).
# NOTE(review): the effect of `use_fields=True` is defined in
# `data.dataset_handler` and is not visible here — confirm before changing it.
dataset = get_dataset(dataset_id, spider_schema, use_fields=True)

In [None]:
from data.inference_dataset import InferenceDataset

from torch.utils.data import DataLoader

# Number of validation examples handed to the model per step.
eval_batch_size = 1

# Wrap the validation split for inference and iterate over it in its stored
# order (no shuffling).
inference_ds = InferenceDataset(dataset['validation'])
inference_dataloader = DataLoader(
    inference_ds,
    shuffle=False,
    batch_size=eval_batch_size,
)

## Inference

In [None]:
import torch
from transformers import BitsAndBytesConfig
from model import get_pretrained_model_and_tokenizer

# Identifier of the LoRA weights passed to the model loader.
# NOTE(review): presumably a Hugging Face Hub repository id — confirm.
lora_id = 'maidacundo/falcon_qlora_sql'

# bitsandbytes settings: 4-bit loading with the "nf4" quantisation type,
# double quantisation enabled, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Project helper that returns the model together with its tokenizer.
model, tokenizer = get_pretrained_model_and_tokenizer(model_id, bnb_config, lora_id)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def preprocess_function(examples):
    """Tokenize the text that follows the last ``<|sql|>`` marker.

    Args:
        examples: Mapping with an ``input_text`` string. If the marker is
            absent, the whole string is tokenized.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that trailing text.
    """
    _, _, text_after_marker = examples['input_text'].rpartition('<|sql|>')
    return tokenizer(text_after_marker)

# Tokenize every validation example; removing the original columns leaves only
# what `preprocess_function` returns.
validation_split = dataset['validation']
tokenized_ds = validation_split.map(
    preprocess_function,
    remove_columns=validation_split.column_names,
)

# Number of token ids produced for each example.
input_ids_lengths = [len(example['input_ids']) for example in tokenized_ds]

# Deciles (0th, 10th, ..., 100th percentile) of those lengths.
percentile_grid = range(0, 101, 10)
percentiles = np.percentile(input_ids_lengths, percentile_grid)

# Plot length against percentile to see where the distribution's tail starts.
plt.plot(percentile_grid, percentiles)
plt.title("Percentile Distribution of Length")
plt.xlabel("Percentile")
plt.ylabel("Input IDs Length")
plt.grid(True)
plt.show()

In [None]:
from utils.inference_utils import get_pipeline, generate_pipeline
# Build the generation pipeline from the loaded model and tokenizer.
pipeline = get_pipeline(model, tokenizer)

# Generate over the whole inference dataloader.
# max new tokens is 160 because the 90th percentile of the input ids length
# is 160.
results = generate_pipeline(
    pipeline,
    inference_dataloader,
    tokenizer.eos_token_id,
    tokenizer.pad_token_id,
    max_new_tokens=160,
)