# Import Libraries

In [1]:
import sys
import os

notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))

if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

In [2]:
from text_to_sql import (
    TextToSQL,
    Config,
    LLMConfig,
    SLConfig,
    ContextConfig,
    QueryConfig,
)
from dotenv import load_dotenv
from datetime import datetime

import pandas as pd
import os




# Constants

In [3]:
DATABASE = "sakila"
MODEL = "gemini-1.5-pro"
PROVIDER = "gemini"
# TOTAL_TABLES = 14
TOTAL_TABLES = 15
# TOTAL_TABLES = 18
# TOTAL_TABLES = 22

# Load Environment

In [4]:
load_dotenv()

True

# Set Timestamp Experiment

In [5]:
timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M")
output_dir = f"../files/experiment_result/{timestamp}"
os.makedirs(output_dir, exist_ok=True)

# Config

In [6]:
db_key = DATABASE.upper().replace("-", "_")
provider_key = PROVIDER.upper().replace("-", "_")

config = Config(
    max_retry_attempt=5,
    rewriter_config=LLMConfig(
        type="api",
        model=MODEL,
        provider=PROVIDER,
        api_key=os.getenv(f"API_KEY_{provider_key}"),
    ),
    query_generator_config=LLMConfig(
        type="api",
        model=MODEL,
        provider=PROVIDER,
        api_key=os.getenv(f"API_KEY_{provider_key}"),
    ),
    schema_linker_config=SLConfig(
        type="api",
        model=MODEL,
        provider=PROVIDER,
        api_key=os.getenv(f"API_KEY_{provider_key}"),
        schema_path=f"../files/schema/{DATABASE}.txt",
        metadata_path=f"../files/metadata/{DATABASE}.json",
    ),
    retrieve_context_config=ContextConfig(data_path=f"../files/dataset/dataset_{DATABASE}.csv"),
    query_executor_config = QueryConfig(
        host=os.getenv(f"DB_HOST_{db_key}"),
        database=os.getenv(f"DB_DATABASE_{db_key}"),
        user=os.getenv(f"DB_USER_{db_key}"),
        password=os.getenv(f"DB_PASSWORD_{db_key}"),
        port=os.getenv(f"DB_PORT_{db_key}"),
    ),
)

# Model

In [7]:
text_to_sql_model = TextToSQL(config=config)

Initializing API client for gemini using model gemini-1.5-pro.
Initializing API client for gemini using model gemini-1.5-pro.
Initializing API client for gemini using model gemini-1.5-pro.


  from google.protobuf import service as _service


# Import Dataset

In [8]:
dataset = pd.read_csv(f"../files/dataset/dataset_schema_linker_{DATABASE}.csv")

In [9]:
dataset["tables_used"] = dataset["tables_used"].apply(eval)

In [10]:
dataset

Unnamed: 0,prompt,tables_used
0,"Can you show me the actor_id, first_name, and ...",[actor]
1,How many different last_name values are there ...,[actor]
2,Who’s the top 1 actor that’s been in the most ...,"[film_actor, actor]"
3,What are the top 5 film genres that brought in...,"[category, rental, payment, film_category, inv..."
4,"How much revenue did each store make, looking ...","[staff, payment, store]"
...,...,...
95,Film mana yang disewa oleh pelanggan dari lebi...,"[rental, customer, city, address, movie_city_r..."
96,Film mana yang paling banyak disewa oleh pelan...,"[inventory, rental, customer, film]"
97,Film top 1 mana yang pendapatannya tinggi tapi...,"[rental, film_revenue, payment, film, inventor..."
98,Ada pelanggan yang nyewa dari semua kategori t...,"[customer_no_pg13, rental, customer, category,..."


# Experiment

In [11]:
def extract_table_names_from_schema(schema: dict) -> list:
    """
    Given a schema dictionary (from predict_schema_only),
    return a list of lowercase table names.
    """
    return schema.get("potential_related_tables", [])

In [12]:
def calculate_reduction(predicted_tables):
    return 1 - (len(predicted_tables) / TOTAL_TABLES)

In [13]:
accuracies = []
predicted_tables_list = []

for _, row in dataset.iterrows():
    prompt = row["prompt"]
    true_tables = set(row["tables_used"])

    predicted_schema = text_to_sql_model.predict_schema_only(prompt)
    predicted_tables = set(extract_table_names_from_schema(predicted_schema))

    # Save predicted tables
    predicted_tables_list.append(list(predicted_tables))

    # Calculate accuracy
    intersection_count = len(true_tables.intersection(predicted_tables))
    total_true_tables = len(true_tables)

    if total_true_tables > 0:
        accuracy = intersection_count / total_true_tables
    else:
        accuracy = 1.0 if not predicted_tables else 0.0

    accuracies.append(accuracy)

In [14]:
dataset["predicted_tables"] = predicted_tables_list
dataset["schema_accuracy"] = accuracies
dataset['schema_reduction'] = dataset['predicted_tables'].apply(calculate_reduction)

In [15]:
final_accuracy = sum(accuracies) / len(accuracies)
print(f"Schema Prediction Accuracy (Intersection-Based): {final_accuracy:.2%}")

average_reduction = dataset['schema_reduction'].mean()
print(f"Average Schema Reduction: {average_reduction:.2%}")

Schema Prediction Accuracy (Intersection-Based): 81.68%
Average Schema Reduction: 28.53%


# Save

In [16]:
dataset.to_csv(f"{output_dir}/{MODEL}_{DATABASE}_schema_linker.csv", index=False)

In [17]:
dataset

Unnamed: 0,prompt,tables_used,predicted_tables,schema_accuracy,schema_reduction
0,"Can you show me the actor_id, first_name, and ...",[actor],"[film_category, film_actor, actor, payment, ad...",1.000000,0.333333
1,How many different last_name values are there ...,[actor],"[film_actor, actor, film]",1.000000,0.800000
2,Who’s the top 1 actor that’s been in the most ...,"[film_actor, actor]","[film_category, film_actor, actor, inventory, ...",1.000000,0.533333
3,What are the top 5 film genres that brought in...,"[category, rental, payment, film_category, inv...","[film_category, film_actor, staff, actor, paym...",1.000000,0.200000
4,"How much revenue did each store make, looking ...","[staff, payment, store]","[film_category, film_actor, staff, payment, ad...",1.000000,0.266667
...,...,...,...,...,...
95,Film mana yang disewa oleh pelanggan dari lebi...,"[rental, customer, city, address, movie_city_r...","[film_category, film_actor, actor, inventory, ...",0.428571,0.400000
96,Film mana yang paling banyak disewa oleh pelan...,"[inventory, rental, customer, film]","[film_category, film_actor, staff, actor, paym...",1.000000,0.200000
97,Film top 1 mana yang pendapatannya tinggi tapi...,"[rental, film_revenue, payment, film, inventor...","[film_category, film_actor, staff, actor, paym...",0.666667,0.200000
98,Ada pelanggan yang nyewa dari semua kategori t...,"[customer_no_pg13, rental, customer, category,...","[film_category, film_actor, actor, payment, ad...",0.750000,0.200000


In [18]:
dataset_1 = dataset[dataset["schema_accuracy"] == 1]

In [19]:
dataset_1

Unnamed: 0,prompt,tables_used,predicted_tables,schema_accuracy,schema_reduction
0,"Can you show me the actor_id, first_name, and ...",[actor],"[film_category, film_actor, actor, payment, ad...",1.0,0.333333
1,How many different last_name values are there ...,[actor],"[film_actor, actor, film]",1.0,0.8
2,Who’s the top 1 actor that’s been in the most ...,"[film_actor, actor]","[film_category, film_actor, actor, inventory, ...",1.0,0.533333
3,What are the top 5 film genres that brought in...,"[category, rental, payment, film_category, inv...","[film_category, film_actor, staff, actor, paym...",1.0,0.2
4,"How much revenue did each store make, looking ...","[staff, payment, store]","[film_category, film_actor, staff, payment, ad...",1.0,0.266667
5,Which language shows up the most in our movie ...,"[language, film]","[film_category, film_actor, actor, inventory, ...",1.0,0.6
6,Who are the top 5 customers based on how many ...,"[rental, customer]","[staff, city, payment, address, customer, inve...",1.0,0.466667
8,Are there any movies in our collection that ha...,"[inventory, rental, film]","[film_category, film_actor, staff, actor, paym...",1.0,0.2
9,How much total revenue has each actor generate...,"[rental, payment, film_actor, inventory, actor]","[film_category, film_actor, staff, actor, paym...",1.0,0.133333
10,Which staff member handled the highest total p...,"[staff, payment]","[film_category, film_actor, staff, payment, ad...",1.0,0.266667


In [20]:
new_dataset = pd.read_csv(f"../files/dataset/dataset_{DATABASE}.csv")

In [21]:
new_dataset

Unnamed: 0,Question,Answer,Summary,Alternative Prompt 1 (English),Alternative Prompt 2 (Bahasa Indonesia),Expected Result,Checked
0,Which actors have the first name ‘Scarlett’? S...,"SELECT actor_id, first_name, last_name\r\nFROM...",This SQL query retrieves actor details for tho...,"Can you show me the actor_id, first_name, and ...","Bisa kasih daftar actor_id, first_name, dan la...","['actor_id', 'first_name', 'last_name']",True
1,How many distinct actors last names are there?...,SELECT COUNT(DISTINCT last_name) FROM actor;,This SQL query calculates the count of distinc...,How many different last_name values are there ...,Total ada berapa nama belakang (last_name) akt...,['count'],True
2,Who is the top 1 actor who has appeared in the...,"SELECT \n a.actor_id, \n a.first_name, \...",This SQL query identifies the actor who has ap...,Who’s the top 1 actor that’s been in the most ...,Siapa aktor top 1 yang paling sering main film...,"['actor_id', 'first_name', 'last_name']",True
3,List the top 5 film genres by total gross reve...,"SELECT \n c.name AS genre, \n SUM(p.amou...",This SQL query calculates the top 5 highest-gr...,What are the top 5 film genres that brought in...,Apa aja 5 genre film dengan pemasukan tertingg...,['genre'],True
4,How much total revenue did each store generate...,"SELECT s.store_id, SUM(p.amount) AS total_reve...",This SQL query calculates the total revenue ge...,"How much revenue did each store make, looking ...",Tiap toko dapet pemasukan berapa kalau dilihat...,"['store_id', 'total_revenue']",True
5,Which language is the top 1 most used in films...,SELECT \n l.name\nFROM \n language l\n ...,This SQL query identifies the most frequently ...,Which language shows up the most in our movie ...,Bahasa apa yang paling sering muncul di daftar...,['name'],True
6,List the top five customers by number of rente...,"SELECT\n c.customer_id,\n c.first_name,\...",This SQL query identifies the top 5 customers ...,Who are the top 5 customers based on how many ...,Siapa sih 5 pelanggan yang paling sering nyewa...,"['customer_id', 'first_name', 'last_name', 'to...",True
7,Which customers have rented films from more th...,"SELECT \r\n c.customer_id, \r\n c.first_...",This SQL query identifies customers who have r...,Which customers have rented from more than 3 c...,Pelanggan mana aja yang udah nyewa film dari l...,"['customer_id', 'first_name', 'last_name', 'ca...",True
8,Which films have never been rented across any ...,"SELECT \r\n f.film_id, \r\n f.title, \r\...",This SQL query identifies films that have neve...,Are there any movies in our collection that ha...,Ada nggak film di koleksi kita yang belum pern...,"['film_id', 'title', 'release_year']",True
9,What is the total revenue generated by each ac...,"SELECT \n a.actor_id, \n a.first_name, \...",This SQL query calculates and ranks actors by ...,How much total revenue has each actor generate...,Berapa total pemasukan yang dihasilkan tiap ak...,"['actor_id', 'first_name', 'last_name', 'total...",True


In [22]:
prompt_set = set(dataset_1['prompt'])

def match_prompt(row):
    alt1 = row.get("Alternative Prompt 1 (English)", "")
    alt2 = row.get("Alternative Prompt 2 (Bahasa Indonesia)", "")
    return alt1 in prompt_set and alt2 in prompt_set

filtered_df = new_dataset[new_dataset.apply(match_prompt, axis=1)]
filtered_df

Unnamed: 0,Question,Answer,Summary,Alternative Prompt 1 (English),Alternative Prompt 2 (Bahasa Indonesia),Expected Result,Checked
0,Which actors have the first name ‘Scarlett’? S...,"SELECT actor_id, first_name, last_name\r\nFROM...",This SQL query retrieves actor details for tho...,"Can you show me the actor_id, first_name, and ...","Bisa kasih daftar actor_id, first_name, dan la...","['actor_id', 'first_name', 'last_name']",True
1,How many distinct actors last names are there?...,SELECT COUNT(DISTINCT last_name) FROM actor;,This SQL query calculates the count of distinc...,How many different last_name values are there ...,Total ada berapa nama belakang (last_name) akt...,['count'],True
2,Who is the top 1 actor who has appeared in the...,"SELECT \n a.actor_id, \n a.first_name, \...",This SQL query identifies the actor who has ap...,Who’s the top 1 actor that’s been in the most ...,Siapa aktor top 1 yang paling sering main film...,"['actor_id', 'first_name', 'last_name']",True
3,List the top 5 film genres by total gross reve...,"SELECT \n c.name AS genre, \n SUM(p.amou...",This SQL query calculates the top 5 highest-gr...,What are the top 5 film genres that brought in...,Apa aja 5 genre film dengan pemasukan tertingg...,['genre'],True
4,How much total revenue did each store generate...,"SELECT s.store_id, SUM(p.amount) AS total_reve...",This SQL query calculates the total revenue ge...,"How much revenue did each store make, looking ...",Tiap toko dapet pemasukan berapa kalau dilihat...,"['store_id', 'total_revenue']",True
5,Which language is the top 1 most used in films...,SELECT \n l.name\nFROM \n language l\n ...,This SQL query identifies the most frequently ...,Which language shows up the most in our movie ...,Bahasa apa yang paling sering muncul di daftar...,['name'],True
6,List the top five customers by number of rente...,"SELECT\n c.customer_id,\n c.first_name,\...",This SQL query identifies the top 5 customers ...,Who are the top 5 customers based on how many ...,Siapa sih 5 pelanggan yang paling sering nyewa...,"['customer_id', 'first_name', 'last_name', 'to...",True
8,Which films have never been rented across any ...,"SELECT \r\n f.film_id, \r\n f.title, \r\...",This SQL query identifies films that have neve...,Are there any movies in our collection that ha...,Ada nggak film di koleksi kita yang belum pern...,"['film_id', 'title', 'release_year']",True
9,What is the total revenue generated by each ac...,"SELECT \n a.actor_id, \n a.first_name, \...",This SQL query calculates and ranks actors by ...,How much total revenue has each actor generate...,Berapa total pemasukan yang dihasilkan tiap ak...,"['actor_id', 'first_name', 'last_name', 'total...",True
10,Which staff member has processed the highest t...,"SELECT \r\n st.staff_id,\r\n SUM(p.amoun...",This SQL query identifies the staff member who...,Which staff member handled the highest total p...,Siapa staf yang memproses pembayaran dengan to...,"['staff_id', 'total_amount_processed']",True


In [23]:
len(filtered_df)

18

In [24]:
MAX_RETRIES = 5
RETRY_DELAY = 2

In [25]:
import time
import ast

EA = 0
total_questions = len(filtered_df) * 2
results_list = []

for idx, row in filtered_df.iterrows():
    question_1 = row["Alternative Prompt 1 (English)"]
    question_2 = row["Alternative Prompt 2 (Bahasa Indonesia)"]
    answer = row["Answer"]
    expected_columns = ast.literal_eval(row["Expected Result"])

    for prompt_id, question in enumerate([question_1, question_2], start=1):
        print(f"\nProcessing Question {idx + 1}.{prompt_id}: {question}")
        result = None

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                result = text_to_sql_model.generate_baseline(user_prompt=question)
                break
            except Exception as e:
                print(f"[Attempt {attempt}] Failed to generate SQL: {e}")
                if attempt < MAX_RETRIES:
                    time.sleep(RETRY_DELAY)
                else:
                    print("Max retries reached. Setting result as 'ERROR'")
                    result = "ERROR"

        print(f"Generated SQL Query: {result}")

        try:
            acc = text_to_sql_model.evaluate(query=result, true_query=answer, expected_columns=expected_columns)
        except Exception as e:
            print(f"Evaluation failed: {e}")
            acc = 0.0

        print(f"Execution Accuracy: {acc:.4f}")

        results_list.append({
            "Question ID": f"{idx + 1}.{prompt_id}",
            "Question": question,
            "Generated SQL Query": result,
            "Expected SQL Query": answer,
            "Execution Accuracy": acc
        })

        EA += acc

# Calculate final execution accuracy
final_accuracy = EA / total_questions if total_questions > 0 else 0
print(f"\nFinal Execution Accuracy: {final_accuracy:.4f}")


Processing Question 1.1: Can you show me the actor_id, first_name, and last_name of actors with the first name Scarlett?
Generated SQL Query: SELECT actor_id, first_name, last_name FROM actor WHERE first_name ILIKE 'Scarlett';
Execution Accuracy: 1.0000

Processing Question 1.2: Bisa kasih daftar actor_id, first_name, dan last_name dari aktor yang nama depannya Scarlett?
Generated SQL Query: SELECT actor_id, first_name, last_name FROM actor WHERE first_name = 'Scarlett';
Execution Accuracy: 0.0000

Processing Question 2.1: How many different last_name values are there among all the actors?
Generated SQL Query: SELECT COUNT(DISTINCT last_name) FROM actor
Execution Accuracy: 1.0000

Processing Question 2.2: Total ada berapa nama belakang (last_name) aktor yang beda-beda?
Generated SQL Query: SELECT COUNT(DISTINCT last_name) FROM actor
Execution Accuracy: 1.0000

Processing Question 3.1: Who’s the top 1 actor that’s been in the most films? Just show me their actor_id, first_name, and las

In [26]:
df_results_partial_baseline = pd.DataFrame(results_list)
df_results_partial_baseline.to_csv(f"{output_dir}/{MODEL}_{DATABASE}_partial_baseline.csv", index=False)

In [27]:
import time
import ast

EA = 0
total_questions = len(filtered_df) * 2
results_list = []

for idx, row in filtered_df.iterrows():
    question_1 = row["Alternative Prompt 1 (English)"]
    question_2 = row["Alternative Prompt 2 (Bahasa Indonesia)"]
    answer = row["Answer"]
    expected_columns = ast.literal_eval(row["Expected Result"])

    for prompt_id, question in enumerate([question_1, question_2], start=1):
        print(f"\nProcessing Question {idx + 1}.{prompt_id}: {question}")
        result = None

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                result = text_to_sql_model.predict_sql_schema_only(user_prompt=question)
                break
            except Exception as e:
                print(f"[Attempt {attempt}] Failed to generate SQL: {e}")
                if attempt < MAX_RETRIES:
                    time.sleep(RETRY_DELAY)
                else:
                    print("Max retries reached. Setting result as 'ERROR'")
                    result = "ERROR"

        print(f"Generated SQL Query: {result}")

        try:
            acc = text_to_sql_model.evaluate(query=result, true_query=answer, expected_columns=expected_columns)
        except Exception as e:
            print(f"Evaluation failed: {e}")
            acc = 0.0

        print(f"Execution Accuracy: {acc:.4f}")

        results_list.append({
            "Question ID": f"{idx + 1}.{prompt_id}",
            "Question": question,
            "Generated SQL Query": result,
            "Expected SQL Query": answer,
            "Execution Accuracy": acc
        })

        EA += acc

# Calculate final execution accuracy
final_accuracy = EA / total_questions if total_questions > 0 else 0
print(f"\nFinal Execution Accuracy: {final_accuracy:.4f}")


Processing Question 1.1: Can you show me the actor_id, first_name, and last_name of actors with the first name Scarlett?
Generated SQL Query: SELECT actor_id, first_name, last_name FROM actor WHERE first_name ILIKE 'Scarlett';
Execution Accuracy: 1.0000

Processing Question 1.2: Bisa kasih daftar actor_id, first_name, dan last_name dari aktor yang nama depannya Scarlett?
Generated SQL Query: SELECT actor_id, first_name, last_name FROM actor WHERE first_name = 'Scarlett';
Execution Accuracy: 0.0000

Processing Question 2.1: How many different last_name values are there among all the actors?
Generated SQL Query: SELECT COUNT(DISTINCT last_name) FROM actor
Execution Accuracy: 1.0000

Processing Question 2.2: Total ada berapa nama belakang (last_name) aktor yang beda-beda?
Generated SQL Query: SELECT COUNT(DISTINCT last_name) FROM actor
Execution Accuracy: 1.0000

Processing Question 3.1: Who’s the top 1 actor that’s been in the most films? Just show me their actor_id, first_name, and las

In [28]:
df_results_partial_schema_linker = pd.DataFrame(results_list)
df_results_partial_schema_linker.to_csv(f"{output_dir}/{MODEL}_{DATABASE}_partial_schema_linker.csv", index=False)