# Notebook for fine tuning flan-t5

In [1]:
!pip install pytesseract transformers datasets rouge-score nltk tensorboard py7zr





In [2]:
from functools import partial
from datasets import load_dataset

### Log in huggingface hub to push models

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Helper functions

In [4]:
import re

def transform_sql_schema_to_list(sql_schema):
    # Initialize an empty dictionary to store the table and column information
    schema_dict = {}

    # Split the SQL schema into individual CREATE TABLE statements
    create_statements = sql_schema.split(";")

    # Regular expression pattern to extract table and column names
    pattern = r"CREATE TABLE (\w+) \((.*?)\)"

    # Iterate through each CREATE TABLE statement
    for statement in create_statements:
        match = re.match(pattern, statement.strip())
        if match:
            table_name = match.group(1)
            column_definitions = match.group(2)
            columns = [column.strip().split()[0] for column in column_definitions.split(",")]
            schema_dict[table_name] = columns

    # Convert the schema_dict into the desired format
    result_list = [{table_name: columns} for table_name, columns in schema_dict.items()]

    return result_list

# Example usage:
sql_schema = """
CREATE TABLE head (age INTEGER);
CREATE TABLE body (height FLOAT, weight FLOAT);
"""

transformed_schema = transform_sql_schema_to_list(sql_schema)
print(transformed_schema)

[{'head': ['age']}, {'body': ['height', 'weight']}]


# Inference with model

In [5]:
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("juierror/flan-t5-text2sql-with-schema-v2")
model = AutoModelForSeq2SeqLM.from_pretrained("juierror/flan-t5-text2sql-with-schema-v2")

In [6]:
def get_prompt(tables, question):
    prompt = f"""convert question and table into SQL query. tables: {tables}. question: {question}"""
    return prompt

def prepare_input(question: str, tables: Dict[str, List[str]]):
    tables = [f"""{table_name}({",".join(tables[table_name])})""" for table_name in tables]
    tables = ", ".join(tables)
    prompt = get_prompt(tables, question)
    input_ids = tokenizer(prompt, max_length=512, return_tensors="pt").input_ids
    return input_ids

def inference(question: str, tables: Dict[str, List[str]]) -> str:
    input_data = prepare_input(question=question, tables=tables)
    input_data = input_data.to(model.device)
    outputs = model.generate(inputs=input_data, num_beams=10, top_k=10, max_length=512)
    result = tokenizer.decode(token_ids=outputs[0], skip_special_tokens=True)
    return result

print(inference("how many people with name jui and age less than 25", {
    "people_name": ["id", "name"],
    "people_age": ["people_id", "age"]
}))

print(inference("what is id with name jui and age less than 25", {
    "people_name": ["id", "name", "age"]
}))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


SELECT count(*) FROM people_age AS T1 JOIN people_name AS T2 ON T1.people_id = T2.people_id WHERE T2.name = 'jui' AND T1.age < 25
SELECT id FROM people_name WHERE name = 'jui' AND age < 25


### Preparing the dataset

Text2SQL is a text2text-generation task. This means our model will take a text as input and generate an SQL as output. For this we want to understand how long our input and output will be to be able to efficiently batch our data.

In [7]:
dataset_id = "b-mc2/sql-create-context"
train_ds = load_dataset(dataset_id, split='train[:80%]')
valid_ds = load_dataset(dataset_id, split='train[80%:]')

In [8]:
train_ds.shape

(62862, 3)

In [9]:
valid_ds.shape

(15715, 3)

In [10]:
train_ds[0]

{'question': 'How many heads of the departments are older than 56 ?',
 'answer': 'SELECT COUNT(*) FROM head WHERE age > 56',
 'context': 'CREATE TABLE head (age INTEGER)'}

In [11]:
def get_max_length(model):
    """
    Extracts maximum token length from the model configuration

    :param model: Hugging Face model
    """

    # Pull model configuration
    conf = model.config
    # Initialize a "max_length" variable to store maximum sequence length as null
    max_length = None
    # Find maximum sequence length in the model configuration and save it in "max_length" if found
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    # Set "max_length" to 1024 (default value) if maximum sequence length is not found in the model configuration
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length

In [12]:
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizes dataset batch

    :param batch: Dataset batch
    :param tokenizer: Model tokenizer
    :param max_length: Maximum number of tokens to emit from the tokenizer
    """

    return tokenizer(
        batch["text"],
        max_length = max_length,
        truncation = True,
    )

In [13]:
from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets(
    [train_ds, valid_ds]).map(lambda x: 
                                              tokenizer(x["question"]+x["context"], truncation=True), 
                                              batched=True, 
                                              remove_columns=['question', 'answer', 'context' ])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets(
    [train_ds, valid_ds]).map(lambda x: 
                                              tokenizer(x["answer"], truncation=True), 
                                              batched=True, 
                                              remove_columns=['question', 'answer', 'context'])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Max source length: 265
Max target length: 261


In [14]:
train_ds[0]

{'question': 'How many heads of the departments are older than 56 ?',
 'answer': 'SELECT COUNT(*) FROM head WHERE age > 56',
 'context': 'CREATE TABLE head (age INTEGER)'}

In [15]:
import numpy as np
def preprocess_function(sample, padding="max_length"):
    # add prefix to the input for t5
    inputs = [f"""convert question and table into SQL query. tables: \
             {sample['context'][i]}. question: {sample['question'][i]}""" 
              for i, _ in enumerate(sample['context'])]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # print(model_inputs)
    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["answer"], 
                       max_length=max_target_length, 
                       padding=padding, 
                       truncation=True,
                      )

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_tokenized_dataset = train_ds.map(preprocess_function, batched=True, remove_columns=['question', 'answer', 'context'])
valid_tokenized_dataset = valid_ds.map(preprocess_function, batched=True, remove_columns=['question', 'answer', 'context'])

# tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['question', 'answer', 'context'])
print(f"Keys of tokenized dataset: {list(train_tokenized_dataset.features)}")
print(f"Keys of tokenized dataset: {list(valid_tokenized_dataset.features)}")

Map:   0%|          | 0/15715 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']
Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [16]:
valid_tokenized_dataset.shape

(15715, 3)

# Evaluate and fine tune t5

In [17]:
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_id = "juierror/flan-t5-text2sql-with-schema-v2"

model.train()

T5ForConditionalGeneration(
  (shared): Embedding(32101, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32101, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [18]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /home/namtrinh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)


### Start the finetune

In [20]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# Hugging Face repository id
repository_id = f"{model_id.split('/')[1]}-{dataset_id}"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=5,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized_dataset,
    eval_dataset=valid_tokenized_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
# Start training
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
