# Import Libraries

In [1]:
import ast
import os
import time
from datetime import datetime

import pandas as pd
from dotenv import load_dotenv

from text_to_sql import (
    TextToSQL,
    Config,
    LLMConfig,
    SLConfig,
    ContextConfig,
    QueryConfig,
)




# Constants

In [2]:
# Retry policy used by the experiment loops around SQL generation.
MAX_RETRIES: int = 5  # generation attempts per question before "ERROR" is recorded
RETRY_DELAY: int = 2  # value handed to time.sleep() between failed attempts (seconds)

# Load Environment

In [3]:
# Load variables from a .env file (python-dotenv) so that the os.getenv()
# lookups in the Config cell (API_KEY, DB_HOST, DB_DATABASE, DB_USER,
# DB_PASSWORD, DB_PORT) can find their values. The call's return value is
# shown as the cell output.
load_dotenv()

True

# Set Experiment Timestamp

In [4]:
# Each run writes into its own results folder, named after the current time
# at minute resolution (year_month_day_hour_minute).
timestamp = f"{datetime.now():%Y_%m_%d_%H_%M}"
output_dir = f"./files/experiment_result/{timestamp}"

# exist_ok=True: re-running this cell within the same minute reuses the folder.
os.makedirs(output_dir, exist_ok=True)

# Config

In [5]:
# The rewriter, the query generator and the schema linker all talk to the same
# Gemini model with the same key (API_KEY from the environment), so the shared
# keyword arguments are written once and unpacked into each config object.
gemini_llm_kwargs = {
    "type": "api",
    "model": "gemini-1.5-flash",
    "provider": "gemini",
    "api_key": os.getenv("API_KEY"),
}

# Database connection settings, all read from DB_* environment variables.
# NOTE(review): os.getenv returns str (or None when unset), so `port` reaches
# QueryConfig as a string - confirm that is what QueryConfig expects.
db_connection_kwargs = {
    "host": os.getenv("DB_HOST"),
    "database": os.getenv("DB_DATABASE"),
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "port": os.getenv("DB_PORT"),
}

config = Config(
    max_retry_attempt=5,
    rewriter_config=LLMConfig(**gemini_llm_kwargs),
    query_generator_config=LLMConfig(**gemini_llm_kwargs),
    # The schema linker additionally needs the Sakila schema description file.
    schema_linker_config=SLConfig(
        **gemini_llm_kwargs,
        schema_path="./files/schema/sakila.txt",
    ),
    retrieve_context_config=ContextConfig(
        data_path="./files/dataset/dataset_sakila.csv",
    ),
    query_executor_config=QueryConfig(**db_connection_kwargs),
)

# Model

In [6]:
# Build the TextToSQL object from the config defined above. Every experiment
# cell below calls its generate_baseline / generate_v1 and evaluate methods.
text_to_sql_model = TextToSQL(config=config)

Initializing API client for gemini using model gemini-1.5-flash.
Initializing API client for gemini using model gemini-1.5-flash.
Initializing API client for gemini using model gemini-1.5-flash.


  from google.protobuf import service as _service


# Import Dataset

In [16]:
# Load the Sakila question set and keep only rows 40-49 (positional slice)
# for this run of the experiments.
dataset = pd.read_csv("./files/dataset/dataset_sakila.csv").iloc[40:50]

In [17]:
# Display the selected rows for a visual check before running the experiments.
dataset

Unnamed: 0,Question,Answer,Summary,Alternative Prompt 1 (English),Alternative Prompt 2 (Bahasa Indonesia),Expected Result
40,Which movies have been rented by customers fro...,WITH movie_country_rentals AS (\r\n SELECT ...,This SQL query identifies movies that have bee...,Which movies have been rented by customers fro...,Film mana yang disewa oleh pelanggan dari semu...,"['title', 'country_count']"
41,Which movies have been rented the most during ...,"SELECT \r\n f.title, \r\n COUNT(r.rental...",This SQL query identifies the top 10 most rent...,Which movies have been rented the most during ...,Film mana yang paling sering disewa saat libur...,"['title', 'rentals_count']"
42,Which customers have rented movies from every ...,WITH customer_categories AS (\r\n SELECT \r...,This SQL query identifies customers who have r...,Which customers have rented movies from every ...,Ada pelanggan yang nyewa dari semua kategori t...,['customer_name']
43,Which movies have the highest number of rental...,WITH film_rentals AS (\r\n SELECT \r\n ...,This SQL query retrieves the top 10 most rente...,Which movies have the highest number of rental...,Film apa yang paling sering disewa tapi biaya ...,"['title', 'replacement_cost', 'rentals_count']"
44,Which customers have rented movies from every ...,WITH customer_stores AS (\n SELECT \n ...,This SQL query identifies customers who have r...,Which customers have rented movies from every ...,Siapa pelanggan yang nyewa dari semua toko tap...,['customer_name']
45,Which movies have been rented by customers fro...,WITH movie_city_rentals AS (\r\n SELECT \r\...,This SQL query identifies movies that have bee...,Which movies have been rented by customers fro...,Film mana yang disewa oleh pelanggan dari lebi...,"['title', 'city_count']"
46,Which movies have been rented the most by cust...,WITH young_customer_rentals AS (\r\n SELECT...,This SQL query identifies the top 10 most rent...,Which movies have been rented the most by cust...,Film mana yang paling banyak disewa oleh pelan...,"['title', 'rentals_count']"
47,Which movies have the highest revenue but have...,WITH film_revenue AS (\r\n SELECT \r\n ...,This SQL query identifies the top 10 highest-g...,Which movies have the highest revenue but have...,Film mana yang pendapatannya tinggi tapi tidak...,"['title', 'total_revenue']"
48,Which customers have rented movies from every ...,WITH customer_categories AS (\r\n SELECT \r...,This SQL query identifies customers who have r...,Which customers have rented movies from every ...,Ada pelanggan yang nyewa dari semua kategori t...,['customer_name']
49,Which movies have been rented by customers fro...,WITH movie_country_rentals AS (\r\n SELECT ...,This SQL query identifies movies that have bee...,Which movies have been rented by customers fro...,Film mana yang disewa oleh pelanggan dari lebi...,"['title', 'country_count']"


# Experiment Baseline Multistage

In [18]:
# Experiment: baseline pipeline with method="Multistage".
#
# Every dataset row is asked twice (English prompt, then Bahasa Indonesia
# prompt). Each generated query is scored by text_to_sql_model.evaluate against
# the reference query in the "Answer" column, and one record per prompt is
# appended to `results_list` for the CSV export in the next cell.
# `ast` and `time` are imported in the notebook's import cell.

EA = 0  # running sum of per-prompt execution accuracy
total_questions = len(dataset) * 2  # two prompts per dataset row
results_list = []

for idx, row in dataset.iterrows():
    prompts = [
        row["Alternative Prompt 1 (English)"],
        row["Alternative Prompt 2 (Bahasa Indonesia)"],
    ]
    answer = row["Answer"]
    # "Expected Result" stores the string form of a Python list of column names.
    expected_columns = ast.literal_eval(row["Expected Result"])

    for prompt_id, question in enumerate(prompts, start=1):
        question_id = f"{idx + 1}.{prompt_id}"
        print(f"\nProcessing Question {question_id}: {question}")

        result = None
        generation_failed = False

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                result = text_to_sql_model.generate_baseline(user_prompt=question, method="Multistage")
                break
            except Exception as e:
                print(f"[Attempt {attempt}] Failed to generate SQL: {e}")
                # Only wait when another attempt will follow.
                if attempt < MAX_RETRIES:
                    time.sleep(RETRY_DELAY)
        else:
            # Loop finished without `break`: no attempt produced a query
            # (this also covers MAX_RETRIES == 0).
            print("Max retries reached. Setting result as 'ERROR'")
            result = "ERROR"
            generation_failed = True

        print(f"Generated SQL Query: {result}")

        if generation_failed:
            # There is no query to score; do not hand the "ERROR" placeholder
            # to evaluate() as if it were SQL.
            acc = 0.0
        else:
            try:
                acc = text_to_sql_model.evaluate(query=result, true_query=answer, expected_columns=expected_columns)
            except Exception as e:
                print(f"Evaluation failed: {e}")
                acc = 0.0

        print(f"Execution Accuracy: {acc:.4f}")

        results_list.append({
            "Question ID": question_id,
            "Question": question,
            "Generated SQL Query": result,
            "Expected SQL Query": answer,
            "Execution Accuracy": acc
        })

        EA += acc

# Mean execution accuracy over all prompts (guarded against an empty dataset).
final_accuracy = EA / total_questions if total_questions > 0 else 0
print(f"\nFinal Execution Accuracy: {final_accuracy:.4f}")


Processing Question 41.1: Which movies have been rented by customers from every country? Show the movie title and the number of countries.
Generated SQL Query: SELECT f.title, COUNT(DISTINCT c.country) AS num_countries
FROM film f
JOIN inventory i ON f.film_id = i.film_id
JOIN rental r ON i.inventory_id = r.inventory_id
JOIN customer cu ON r.customer_id = cu.customer_id
JOIN address a ON cu.address_id = a.address_id
JOIN city ci ON a.city_id = ci.city_id
JOIN country c ON ci.country_id = c.country_id
GROUP BY f.title
HAVING COUNT(DISTINCT c.country) = (SELECT COUNT(*) FROM country);
Execution Accuracy: 1.0000

Processing Question 41.2: Film mana yang disewa oleh pelanggan dari semua negara? Tampilkan judul film dan jumlah negara.
Generated SQL Query: SELECT f.title, COUNT(DISTINCT c.country) AS number_of_countries
FROM film f
JOIN inventory i ON f.film_id = i.film_id
JOIN rental r ON i.inventory_id = r.inventory_id
JOIN customer cu ON r.customer_id = cu.customer_id
JOIN address a ON c

In [10]:
# Write the per-question records currently held in `results_list` to this
# run's output folder. `results_list` is reassigned by every experiment cell,
# so run this cell right after the Baseline Multistage loop.
baseline_multistage_csv = f"{output_dir}/sql_execution_results_baseline_multistage.csv"
df_results_baseline_multistage = pd.DataFrame(results_list)
df_results_baseline_multistage.to_csv(baseline_multistage_csv, index=False)

# Experiment Baseline Incremental

In [11]:
# Experiment: baseline pipeline with method="Incremental".
#
# Every dataset row is asked twice (English prompt, then Bahasa Indonesia
# prompt). Each generated query is scored by text_to_sql_model.evaluate against
# the reference query in the "Answer" column, and one record per prompt is
# appended to `results_list` for the CSV export in the next cell.
# `ast` and `time` are imported in the notebook's import cell.

EA = 0  # running sum of per-prompt execution accuracy
total_questions = len(dataset) * 2  # two prompts per dataset row
results_list = []

for idx, row in dataset.iterrows():
    prompts = [
        row["Alternative Prompt 1 (English)"],
        row["Alternative Prompt 2 (Bahasa Indonesia)"],
    ]
    answer = row["Answer"]
    # "Expected Result" stores the string form of a Python list of column names.
    expected_columns = ast.literal_eval(row["Expected Result"])

    for prompt_id, question in enumerate(prompts, start=1):
        question_id = f"{idx + 1}.{prompt_id}"
        print(f"\nProcessing Question {question_id}: {question}")

        result = None
        generation_failed = False

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                result = text_to_sql_model.generate_baseline(user_prompt=question, method="Incremental")
                break
            except Exception as e:
                print(f"[Attempt {attempt}] Failed to generate SQL: {e}")
                # Only wait when another attempt will follow.
                if attempt < MAX_RETRIES:
                    time.sleep(RETRY_DELAY)
        else:
            # Loop finished without `break`: no attempt produced a query
            # (this also covers MAX_RETRIES == 0).
            print("Max retries reached. Setting result as 'ERROR'")
            result = "ERROR"
            generation_failed = True

        print(f"Generated SQL Query: {result}")

        if generation_failed:
            # There is no query to score; do not hand the "ERROR" placeholder
            # to evaluate() as if it were SQL.
            acc = 0.0
        else:
            try:
                acc = text_to_sql_model.evaluate(query=result, true_query=answer, expected_columns=expected_columns)
            except Exception as e:
                print(f"Evaluation failed: {e}")
                acc = 0.0

        print(f"Execution Accuracy: {acc:.4f}")

        results_list.append({
            "Question ID": question_id,
            "Question": question,
            "Generated SQL Query": result,
            "Expected SQL Query": answer,
            "Execution Accuracy": acc
        })

        EA += acc

# Mean execution accuracy over all prompts (guarded against an empty dataset).
final_accuracy = EA / total_questions if total_questions > 0 else 0
print(f"\nFinal Execution Accuracy: {final_accuracy:.4f}")


Processing Question 1.1: Can you give me actors whose first name is Scarlett?
Related Tables: {'staff', 'store', 'film', 'category', 'film_category', 'film_actor', 'address', 'country', 'inventory', 'city', 'actor', 'language'}
Generated SQL Query: SELECT last_name FROM actor WHERE first_name = 'Scarlett';
Execution Accuracy: 1.0000

Processing Question 1.2: Aktor siapa aja yang nama depannya 'Scarlett'?
Related Tables: {'staff', 'store', 'film_category', 'film_actor', 'customer', 'inventory', 'payment', 'film_text', 'actor', 'language', 'category', 'film', 'address', 'country', 'city', 'rental'}
Generated SQL Query: SELECT last_name FROM actor WHERE first_name = 'Scarlett';
Execution Accuracy: 1.0000

Processing Question 2.1: Hey, how many unique last names do we have among actors?
Related Tables: {'staff', 'store', 'film', 'category', 'film_category', 'film_actor', 'address', 'country', 'inventory', 'city', 'actor', 'language'}
Generated SQL Query: SELECT COUNT(DISTINCT last_name) F

In [15]:
# Write the per-question records currently held in `results_list` to this
# run's output folder. `results_list` is reassigned by every experiment cell,
# so run this cell right after the Baseline Incremental loop.
baseline_incremental_csv = f"{output_dir}/sql_execution_results_baseline_incremental.csv"
df_results_baseline_incremental = pd.DataFrame(results_list)
df_results_baseline_incremental.to_csv(baseline_incremental_csv, index=False)

# Experiment V1 Multistage

In [10]:
# Experiment: V1 pipeline (generate_v1) with method="Multistage".
#
# Every dataset row is asked twice (English prompt, then Bahasa Indonesia
# prompt). Each generated query is scored by text_to_sql_model.evaluate against
# the reference query in the "Answer" column, and one record per prompt is
# appended to `results_list` for the CSV export in the next cell.
# `ast` and `time` are imported in the notebook's import cell.

EA = 0  # running sum of per-prompt execution accuracy
total_questions = len(dataset) * 2  # two prompts per dataset row
results_list = []

for idx, row in dataset.iterrows():
    prompts = [
        row["Alternative Prompt 1 (English)"],
        row["Alternative Prompt 2 (Bahasa Indonesia)"],
    ]
    answer = row["Answer"]
    # "Expected Result" stores the string form of a Python list of column names.
    expected_columns = ast.literal_eval(row["Expected Result"])

    for prompt_id, question in enumerate(prompts, start=1):
        question_id = f"{idx + 1}.{prompt_id}"
        print(f"\nProcessing Question {question_id}: {question}")

        result = None
        generation_failed = False

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                result = text_to_sql_model.generate_v1(user_prompt=question, method="Multistage")
                break
            except Exception as e:
                print(f"[Attempt {attempt}] Failed to generate SQL: {e}")
                # Only wait when another attempt will follow.
                if attempt < MAX_RETRIES:
                    time.sleep(RETRY_DELAY)
        else:
            # Loop finished without `break`: no attempt produced a query
            # (this also covers MAX_RETRIES == 0).
            print("Max retries reached. Setting result as 'ERROR'")
            result = "ERROR"
            generation_failed = True

        print(f"Generated SQL Query: {result}")

        if generation_failed:
            # There is no query to score; do not hand the "ERROR" placeholder
            # to evaluate() as if it were SQL.
            acc = 0.0
        else:
            try:
                acc = text_to_sql_model.evaluate(query=result, true_query=answer, expected_columns=expected_columns)
            except Exception as e:
                print(f"Evaluation failed: {e}")
                acc = 0.0

        print(f"Execution Accuracy: {acc:.4f}")

        results_list.append({
            "Question ID": question_id,
            "Question": question,
            "Generated SQL Query": result,
            "Expected SQL Query": answer,
            "Execution Accuracy": acc
        })

        EA += acc

# Mean execution accuracy over all prompts (guarded against an empty dataset).
final_accuracy = EA / total_questions if total_questions > 0 else 0
print(f"\nFinal Execution Accuracy: {final_accuracy:.4f}")


Processing Question 1.1: How much money did we make in 1997 total? Show it as '1997 Total Revenues'.
Rewritten Prompt: Calculate the total revenue for 1997, labeling the result '1997 Total Revenues'.

Generated SQL Query: SELECT SUM(od.unit_price * od.quantity * (1.0 - od.discount)) AS "1997 Total Revenues"
FROM order_details od
INNER JOIN orders o ON od.order_id = o.order_id
WHERE EXTRACT(YEAR FROM o.order_date) = 1997;
Execution Accuracy: 1.0000

Processing Question 1.2: Total pendapatan kita tahun 1997 berapa sih? Tampilkan sebagai '1997 Total Revenues'.
Rewritten Prompt: Show total revenue for the year 1997 as '1997 Total Revenues'.

Generated SQL Query: SELECT SUM(od.unit_price * od.quantity * (1.0 - od.discount)) AS "1997 Total Revenues"
FROM order_details od
INNER JOIN orders o ON od.order_id = o.order_id
WHERE EXTRACT(YEAR FROM o.order_date) = 1997;
Execution Accuracy: 1.0000

Processing Question 2.1: How much has each customer paid overall? Show 'CompanyName' and total as 'To

In [17]:
# Write the per-question records currently held in `results_list` to this
# run's output folder. `results_list` is reassigned by every experiment cell,
# so run this cell right after the V1 Multistage loop.
v1_multistage_csv = f"{output_dir}/sql_execution_results_v1_multistage.csv"
df_results_v1_multistage = pd.DataFrame(results_list)
df_results_v1_multistage.to_csv(v1_multistage_csv, index=False)

# Experiment V1 Incremental

In [11]:
# Experiment: V1 pipeline (generate_v1) with method="Incremental".
#
# Every dataset row is asked twice (English prompt, then Bahasa Indonesia
# prompt). Each generated query is scored by text_to_sql_model.evaluate against
# the reference query in the "Answer" column, and one record per prompt is
# appended to `results_list` for the CSV export in the next cell.
# `ast` and `time` are imported in the notebook's import cell.

EA = 0  # running sum of per-prompt execution accuracy
total_questions = len(dataset) * 2  # two prompts per dataset row
results_list = []

for idx, row in dataset.iterrows():
    prompts = [
        row["Alternative Prompt 1 (English)"],
        row["Alternative Prompt 2 (Bahasa Indonesia)"],
    ]
    answer = row["Answer"]
    # "Expected Result" stores the string form of a Python list of column names.
    expected_columns = ast.literal_eval(row["Expected Result"])

    for prompt_id, question in enumerate(prompts, start=1):
        question_id = f"{idx + 1}.{prompt_id}"
        print(f"\nProcessing Question {question_id}: {question}")

        result = None
        generation_failed = False

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                result = text_to_sql_model.generate_v1(user_prompt=question, method="Incremental")
                break
            except Exception as e:
                print(f"[Attempt {attempt}] Failed to generate SQL: {e}")
                # Only wait when another attempt will follow.
                if attempt < MAX_RETRIES:
                    time.sleep(RETRY_DELAY)
        else:
            # Loop finished without `break`: no attempt produced a query
            # (this also covers MAX_RETRIES == 0).
            print("Max retries reached. Setting result as 'ERROR'")
            result = "ERROR"
            generation_failed = True

        print(f"Generated SQL Query: {result}")

        if generation_failed:
            # There is no query to score; do not hand the "ERROR" placeholder
            # to evaluate() as if it were SQL.
            acc = 0.0
        else:
            try:
                acc = text_to_sql_model.evaluate(query=result, true_query=answer, expected_columns=expected_columns)
            except Exception as e:
                print(f"Evaluation failed: {e}")
                acc = 0.0

        print(f"Execution Accuracy: {acc:.4f}")

        results_list.append({
            "Question ID": question_id,
            "Question": question,
            "Generated SQL Query": result,
            "Expected SQL Query": answer,
            "Execution Accuracy": acc
        })

        EA += acc

# Mean execution accuracy over all prompts (guarded against an empty dataset).
final_accuracy = EA / total_questions if total_questions > 0 else 0
print(f"\nFinal Execution Accuracy: {final_accuracy:.4f}")


Processing Question 1.1: How much money did we make in 1997 total? Show it as '1997 Total Revenues'.
Rewritten Prompt: Retrieve the total revenue for 1997, labeled as '1997 Total Revenues'.

Generated SQL Query: SELECT '1997 Total Revenues: ' || CAST(SUM(od.unit_price * od.quantity * (1.0 - od.discount)) AS VARCHAR) FROM order_details od INNER JOIN orders o ON od.order_id = o.order_id WHERE EXTRACT(YEAR FROM o.order_date) = 1997;
Execution Accuracy: 0.0000

Processing Question 1.2: Total pendapatan kita tahun 1997 berapa sih? Tampilkan sebagai '1997 Total Revenues'.
Rewritten Prompt: Show total revenue for the year 1997 as '1997 Total Revenues'.

Generated SQL Query: SELECT SUM(od.unit_price * od.quantity * (1.0 - od.discount)) AS "1997 Total Revenues" FROM order_details od INNER JOIN orders o ON od.order_id = o.order_id WHERE EXTRACT(YEAR FROM o.order_date) = 1997;
Execution Accuracy: 1.0000

Processing Question 2.1: How much has each customer paid overall? Show 'CompanyName' and tot

KeyboardInterrupt: 

In [10]:
# Write the per-question records currently held in `results_list` to this
# run's output folder. `results_list` is reassigned by every experiment cell,
# so run this cell right after the V1 Incremental loop.
v1_incremental_csv = f"{output_dir}/sql_execution_results_v1_incremental.csv"
df_results_v1_incremental = pd.DataFrame(results_list)
df_results_v1_incremental.to_csv(v1_incremental_csv, index=False)