# Import Libraries

In [57]:
import sys
import os

# Put the parent of the current working directory at the front of the import
# path (presumably so the project package imported below resolves — confirm).
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))

# Insert only when absent, so re-running this cell does not grow sys.path.
if sys.path.count(parent_dir) == 0:
    sys.path.insert(0, parent_dir)

In [58]:
from text_to_sql import (
    TextToSQL,
    Config,
    LLMConfig,
    SLConfig,
    ContextConfig,
    QueryConfig,
)
from dotenv import load_dotenv
from datetime import datetime

import pandas as pd
import os

# Constants

In [59]:
# Experiment selection. DATABASE is used to build the dataset/schema/metadata
# paths and the DB_* environment variable names; MODEL and PROVIDER configure
# the LLM clients and are part of the output file names.
DATABASE = "soccer"
MODEL = "deepseek-chat"
PROVIDER = "deepseek"
# TOTAL_TABLES is the denominator used by calculate_reduction (size of the full
# schema). The commented-out values are alternative counts, presumably for
# other databases or schema versions — TODO confirm which belongs to which.
# TOTAL_TABLES = 14
# TOTAL_TABLES = 15
TOTAL_TABLES = 18
# TOTAL_TABLES = 22

# Load Environment

In [60]:
# Load environment variables via python-dotenv; the API key and database
# settings used in the Config cell are read with os.getenv afterwards.
load_dotenv()

True

# Set Experiment Timestamp

In [61]:
# Each run writes into its own folder named after the current time at minute
# resolution; runs started within the same minute share a folder.
timestamp = f"{datetime.now():%Y_%m_%d_%H_%M}"
output_dir = f"../files/experiment_result/{timestamp}"
os.makedirs(name=output_dir, exist_ok=True)

# Config

In [62]:
# Suffixes used in environment variable names: upper-cased, with "-" replaced
# by "_" (e.g. "soccer" -> "SOCCER").
db_key = DATABASE.upper().replace("-", "_")
provider_key = PROVIDER.upper().replace("-", "_")

# The rewriter, query generator and schema linker all use the same API model,
# provider and key, so the shared keyword arguments are built once.
api_key = os.getenv(f"API_KEY_{provider_key}")
llm_settings = {
    "type": "api",
    "model": MODEL,
    "provider": PROVIDER,
    "api_key": api_key,
}

rewriter_config = LLMConfig(**llm_settings)
query_generator_config = LLMConfig(**llm_settings)

# The schema linker additionally needs the schema text and metadata files.
schema_linker_config = SLConfig(
    **llm_settings,
    schema_path=f"../files/schema/{DATABASE}.txt",
    metadata_path=f"../files/metadata/{DATABASE}.json",
)

retrieve_context_config = ContextConfig(
    data_path=f"../files/dataset/dataset_{DATABASE}.csv"
)

# Database connection settings come from DB_*_<DATABASE> environment variables;
# os.getenv yields None for any variable that is not set.
query_executor_config = QueryConfig(
    host=os.getenv(f"DB_HOST_{db_key}"),
    database=os.getenv(f"DB_DATABASE_{db_key}"),
    user=os.getenv(f"DB_USER_{db_key}"),
    password=os.getenv(f"DB_PASSWORD_{db_key}"),
    port=os.getenv(f"DB_PORT_{db_key}"),
)

config = Config(
    max_retry_attempt=5,
    rewriter_config=rewriter_config,
    query_generator_config=query_generator_config,
    schema_linker_config=schema_linker_config,
    retrieve_context_config=retrieve_context_config,
    query_executor_config=query_executor_config,
)

# Model

In [63]:
# Build the pipeline object from the config above. The output below shows three
# API clients being initialised, presumably one per LLM-backed component
# (rewriter, query generator, schema linker) — confirm in TextToSQL.
text_to_sql_model = TextToSQL(config=config)

Initializing API client for deepseek using model deepseek-chat.
Initializing API client for deepseek using model deepseek-chat.
Initializing API client for deepseek using model deepseek-chat.


# Import Dataset

In [64]:
# Schema-linking evaluation set for the selected database: per the preview
# below, one row per prompt with a `tables_used` column of table names.
dataset = pd.read_csv(f"../files/dataset/dataset_schema_linker_{DATABASE}.csv")

In [65]:
import ast  # imported here so this cell runs on its own

# `tables_used` is read from CSV as text and must be turned back into Python
# lists. ast.literal_eval is used instead of eval: eval would execute any code
# embedded in the data file, whereas literal_eval only accepts plain literals
# (the same approach this notebook uses for the "Expected Result" column).
# Values that are no longer strings are passed through unchanged, so running
# this cell a second time does not raise.
dataset["tables_used"] = dataset["tables_used"].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

In [66]:
# Preview the evaluation set after parsing `tables_used`.
dataset

Unnamed: 0,prompt,tables_used
0,Can you show me the players who scored more th...,"[goal_details, player_mast]"
1,Show all matches where the final score was a d...,[match_details]
2,Which teams managed to win all their group sta...,[match_details]
3,Can you list referees who led more than 3 matc...,"[match_mast, referee_mast]"
4,Show me which players were both substituted in...,"[player_mast, player_in_out]"
...,...,...
95,Pemain mana yang pernah main di lebih dari sat...,[player_mast]
96,Pelatih mana yang timnya nggak pernah menang? ...,"[soccer_team, team_coaches]"
97,Siapa 3 wasit dengan total penonton terbanyak ...,[match_mast]
98,"Di tiap pertandingan, berapa banyak pemain yan...","[goal_details, player_booked, player_in_out]"


# Experiment

In [67]:
def extract_table_names_from_schema(schema: dict) -> list:
    """
    Return the table names listed in a schema-linking prediction.

    Args:
        schema: Dictionary returned by ``predict_schema_only``. The names are
            read from its ``"potential_related_tables"`` entry.

    Returns:
        A new list of table names, lower-cased and stripped of surrounding
        whitespace, in the order they appear in ``schema``. An empty list is
        returned when ``schema`` is empty or ``None``, or when the entry is
        missing or ``None``.
    """
    if not schema:
        return []

    # `or []` also covers a key that is present but holds None, which would
    # otherwise reach the caller's set(...) call and raise a TypeError.
    tables = schema.get("potential_related_tables") or []

    # The documented contract is "lowercase table names"; normalising here keeps
    # the comparison with the ground-truth names independent of the casing or
    # stray whitespace in the model's answer.
    return [str(name).strip().lower() for name in tables]

In [68]:
def calculate_reduction(predicted_tables, total_tables=None):
    """
    Compute how much of the full schema a prediction leaves out.

    Args:
        predicted_tables: Sized collection of the tables kept by the
            prediction; only its length is used.
        total_tables: Number of tables in the full schema. When omitted, the
            notebook-level ``TOTAL_TABLES`` constant is used, so existing
            single-argument calls behave as before.

    Returns:
        ``1 - len(predicted_tables) / total_tables`` as a float: 1.0 when no
        table is kept, 0.0 when as many tables are kept as the schema has.

    Raises:
        ValueError: If the table count is zero or negative.
    """
    if total_tables is None:
        # Resolved at call time so changing the constant and re-running only
        # the calling cells picks up the new value.
        total_tables = TOTAL_TABLES
    if total_tables <= 0:
        raise ValueError(f"total_tables must be positive, got {total_tables}")
    return 1 - (len(predicted_tables) / total_tables)

In [69]:
# Per-prompt results, collected in the row order of `dataset`.
accuracies = []
predicted_tables_list = []

for prompt, tables_used in zip(dataset["prompt"], dataset["tables_used"]):
    true_tables = set(tables_used)

    # Ask the model for its schema prediction and pull out the table names.
    predicted_schema = text_to_sql_model.predict_schema_only(prompt)
    predicted_tables = set(extract_table_names_from_schema(predicted_schema))
    predicted_tables_list.append(list(predicted_tables))

    # Score: share of the ground-truth tables that appear in the prediction.
    # Extra predicted tables are not penalised here. With no ground-truth
    # tables, only an empty prediction counts as correct.
    if true_tables:
        accuracy = len(true_tables & predicted_tables) / len(true_tables)
    elif predicted_tables:
        accuracy = 0.0
    else:
        accuracy = 1.0

    accuracies.append(accuracy)

In [70]:
# Attach the per-prompt results; both lists are in the row order of `dataset`.
dataset["predicted_tables"] = predicted_tables_list
dataset["schema_accuracy"] = accuracies

# Fraction of the full schema left out by each prediction.
dataset["schema_reduction"] = dataset["predicted_tables"].map(calculate_reduction)

In [71]:
# Aggregate both metrics as plain means over all prompts, then report them.
final_accuracy = sum(accuracies) / len(accuracies)
average_reduction = dataset['schema_reduction'].mean()

print(f"Schema Prediction Accuracy (Intersection-Based): {final_accuracy:.2%}")
print(f"Average Schema Reduction: {average_reduction:.2%}")

Schema Prediction Accuracy (Intersection-Based): 90.33%
Average Schema Reduction: 58.67%


# Save

In [72]:
# Write the per-prompt schema-linking results (without the index column) into
# this run's timestamped output folder.
dataset.to_csv(f"{output_dir}/{MODEL}_{DATABASE}_schema_linker.csv", index=False)

In [73]:
# Preview the evaluation set with the prediction, accuracy and reduction columns added.
dataset

Unnamed: 0,prompt,tables_used,predicted_tables,schema_accuracy,schema_reduction
0,Can you show me the players who scored more th...,"[goal_details, player_mast]","[soccer_team, playing_position, goal_details, ...",1.000000,0.611111
1,Show all matches where the final score was a d...,[match_details],"[soccer_team, asst_referee_mast, goal_details,...",1.000000,0.500000
2,Which teams managed to win all their group sta...,[match_details],"[soccer_team, team_coaches, asst_referee_mast,...",1.000000,0.555556
3,Can you list referees who led more than 3 matc...,"[match_mast, referee_mast]","[soccer_team, asst_referee_mast, match_captain...",1.000000,0.500000
4,Show me which players were both substituted in...,"[player_mast, player_in_out]","[soccer_team, playing_position, match_captain,...",1.000000,0.611111
...,...,...,...,...,...
95,Pemain mana yang pernah main di lebih dari sat...,[player_mast],"[soccer_team, playing_position, match_captain,...",1.000000,0.722222
96,Pelatih mana yang timnya nggak pernah menang? ...,"[soccer_team, team_coaches]","[soccer_team, team_coaches, playing_position, ...",1.000000,0.555556
97,Siapa 3 wasit dengan total penonton terbanyak ...,[match_mast],"[soccer_team, asst_referee_mast, match_captain...",1.000000,0.555556
98,"Di tiap pertandingan, berapa banyak pemain yan...","[goal_details, player_booked, player_in_out]","[soccer_team, playing_position, goal_details, ...",0.666667,0.611111


In [74]:
# Keep only the prompts whose ground-truth tables were all predicted
# (schema_accuracy of exactly 1).
fully_linked = dataset["schema_accuracy"] == 1
dataset_1 = dataset[fully_linked]

In [75]:
# Preview the prompts with a perfect schema-linking score.
dataset_1

Unnamed: 0,prompt,tables_used,predicted_tables,schema_accuracy,schema_reduction
0,Can you show me the players who scored more th...,"[goal_details, player_mast]","[soccer_team, playing_position, goal_details, ...",1.0,0.611111
1,Show all matches where the final score was a d...,[match_details],"[soccer_team, asst_referee_mast, goal_details,...",1.0,0.500000
2,Which teams managed to win all their group sta...,[match_details],"[soccer_team, team_coaches, asst_referee_mast,...",1.0,0.555556
3,Can you list referees who led more than 3 matc...,"[match_mast, referee_mast]","[soccer_team, asst_referee_mast, match_captain...",1.0,0.500000
4,Show me which players were both substituted in...,"[player_mast, player_in_out]","[soccer_team, playing_position, match_captain,...",1.0,0.611111
...,...,...,...,...,...
94,Kota mana yang punya pertandingan dengan lebih...,"[goal_details, match_mast, soccer_venue, socce...","[soccer_team, asst_referee_mast, goal_details,...",1.0,0.500000
95,Pemain mana yang pernah main di lebih dari sat...,[player_mast],"[soccer_team, playing_position, match_captain,...",1.0,0.722222
96,Pelatih mana yang timnya nggak pernah menang? ...,"[soccer_team, team_coaches]","[soccer_team, team_coaches, playing_position, ...",1.0,0.555556
97,Siapa 3 wasit dengan total penonton terbanyak ...,[match_mast],"[soccer_team, asst_referee_mast, match_captain...",1.0,0.555556


In [76]:
# Text-to-SQL dataset for the same database. Per the preview below it holds a
# question, the reference SQL (`Answer`), two alternative prompts (English and
# Bahasa Indonesia) and the expected result columns.
new_dataset = pd.read_csv(f"../files/dataset/dataset_{DATABASE}.csv")

In [77]:
# Preview the text-to-SQL dataset.
new_dataset

Unnamed: 0,Question,Answer,Summary,Alternative Prompt 1 (English),Alternative Prompt 2 (Bahasa Indonesia),Expected Result
0,Which players have scored more than 2 goals in...,"SELECT gd.player_id, pm.player_name, gd.match_...",This SQL query identifies players who scored m...,Can you show me the players who scored more th...,Tolong tampilkan pemain yang mencetak lebih da...,"['player_id', 'player_name', 'match_no', 'tota..."
1,Show all matches where the final score was a d...,SELECT \r\n md1.match_no\r\nFROM \r\n ma...,This SQL query identifies football matches whe...,Show all matches where the final score was a d...,Tampilkan semua pertandingan yang berakhir imb...,['match_no']
2,Which teams have won all matches in the group ...,"SELECT team_id, COUNT(*) AS total_wins\n FROM...",This SQL query identifies teams that won all o...,Which teams managed to win all their group sta...,Tim mana saja yang menang terus di babak grup?...,"['team_id', 'total_wins']"
3,Find referees who officiated more than 3 match...,"SELECT rm.referee_id, rm.referee_name, COUNT(*...",This SQL query identifies the most active refe...,Can you list referees who led more than 3 matc...,Saya ingin tahu wasit yang memimpin lebih dari...,"['referee_id', 'referee_name', 'total_matches']"
4,Which players were substituted in and out in t...,"SELECT DISTINCT pio.player_id, pm.player_name,...",This SQL query identifies players who were bot...,Show me which players were both substituted in...,Tampilkan pemain yang pernah diganti masuk dan...,"['player_id', 'player_name', 'match_no']"
5,Identify players who have been booked and also...,"SELECT DISTINCT pb.player_id, pm.player_name, ...",This SQL query identifies players who were bot...,Which players got a card and also scored a goa...,Siapa saja pemain yang dapat kartu dan cetak g...,"['player_id', 'player_name', 'match_no']"
6,Show teams with more goals for than against in...,"SELECT team_id, goal_for, goal_agnst, goal_dif...",This SQL query identifies teams with a positiv...,Can you list teams that scored more goals than...,Tampilkan tim-tim yang jumlah gol masuknya leb...,"['team_id', 'goal_for', 'goal_agnst', 'goal_di..."
7,Which goalkeepers (player_gk) appeared in the ...,"SELECT player_gk AS player_id, COUNT(*) AS mat...",This SQL query analyzes goalkeeper appearances...,Who are the goalkeepers that played the most m...,Kiper mana saja yang paling sering tampil seba...,"['player_id', 'match_count']"
8,List venues with the highest audience attendan...,"SELECT sv.venue_name, sc.city, MAX(mm.audence)...",This SQL query identifies the maximum audience...,Show me the venues where audience attendance r...,Venue mana saja yang penontonnya banyak? Saya ...,"['venue_name', 'city', 'max_audience']"
9,Find players who were never substituted out. S...,"SELECT DISTINCT pm.player_id, pm.player_name\n...",This SQL query identifies players who were nev...,Find the players who were never substituted ou...,Tampilkan pemain yang tidak pernah diganti kel...,"['player_id', 'player_name']"


In [78]:
# Prompts for which schema linking found every ground-truth table.
prompt_set = set(dataset_1["prompt"])


def match_prompt(row):
    """
    Tell whether both alternative prompts of a dataset row are in `prompt_set`.

    A missing column is treated as the empty string, which only matches if the
    empty string itself is in `prompt_set`.
    """
    prompt_columns = (
        "Alternative Prompt 1 (English)",
        "Alternative Prompt 2 (Bahasa Indonesia)",
    )
    return all(row.get(column, "") in prompt_set for column in prompt_columns)


# A question is kept only when both of its prompts are in `prompt_set`.
keep_mask = new_dataset.apply(match_prompt, axis=1)
filtered_df = new_dataset[keep_mask]
filtered_df

Unnamed: 0,Question,Answer,Summary,Alternative Prompt 1 (English),Alternative Prompt 2 (Bahasa Indonesia),Expected Result
0,Which players have scored more than 2 goals in...,"SELECT gd.player_id, pm.player_name, gd.match_...",This SQL query identifies players who scored m...,Can you show me the players who scored more th...,Tolong tampilkan pemain yang mencetak lebih da...,"['player_id', 'player_name', 'match_no', 'tota..."
1,Show all matches where the final score was a d...,SELECT \r\n md1.match_no\r\nFROM \r\n ma...,This SQL query identifies football matches whe...,Show all matches where the final score was a d...,Tampilkan semua pertandingan yang berakhir imb...,['match_no']
3,Find referees who officiated more than 3 match...,"SELECT rm.referee_id, rm.referee_name, COUNT(*...",This SQL query identifies the most active refe...,Can you list referees who led more than 3 matc...,Saya ingin tahu wasit yang memimpin lebih dari...,"['referee_id', 'referee_name', 'total_matches']"
4,Which players were substituted in and out in t...,"SELECT DISTINCT pio.player_id, pm.player_name,...",This SQL query identifies players who were bot...,Show me which players were both substituted in...,Tampilkan pemain yang pernah diganti masuk dan...,"['player_id', 'player_name', 'match_no']"
5,Identify players who have been booked and also...,"SELECT DISTINCT pb.player_id, pm.player_name, ...",This SQL query identifies players who were bot...,Which players got a card and also scored a goa...,Siapa saja pemain yang dapat kartu dan cetak g...,"['player_id', 'player_name', 'match_no']"
6,Show teams with more goals for than against in...,"SELECT team_id, goal_for, goal_agnst, goal_dif...",This SQL query identifies teams with a positiv...,Can you list teams that scored more goals than...,Tampilkan tim-tim yang jumlah gol masuknya leb...,"['team_id', 'goal_for', 'goal_agnst', 'goal_di..."
8,List venues with the highest audience attendan...,"SELECT sv.venue_name, sc.city, MAX(mm.audence)...",This SQL query identifies the maximum audience...,Show me the venues where audience attendance r...,Venue mana saja yang penontonnya banyak? Saya ...,"['venue_name', 'city', 'max_audience']"
9,Find players who were never substituted out. S...,"SELECT DISTINCT pm.player_id, pm.player_name\n...",This SQL query identifies players who were nev...,Find the players who were never substituted ou...,Tampilkan pemain yang tidak pernah diganti kel...,"['player_id', 'player_name']"
10,Which matches had more than 2 players from the...,"SELECT match_no, team_id, COUNT(player_id) AS ...",This SQL query identifies teams with disciplin...,Which matches had 3 or more players from the s...,Pertandingan mana yang punya 3 atau lebih pema...,"['match_no', 'team_id', 'booked_count']"
11,Find players who scored goals in both halves o...,"SELECT DISTINCT gd.player_id, pm.player_name, ...",This SQL query identifies players who scored g...,Can you show players who scored in both the fi...,Tampilkan pemain yang cetak gol di babak perta...,"['player_id', 'player_name', 'match_no']"


In [79]:
# Number of questions kept; each contributes two prompts to the runs below.
len(filtered_df)

40

In [80]:
# Retry policy for the SQL generation calls in the experiment loops below.
# Maximum generation attempts per prompt before the result is set to "ERROR".
MAX_RETRIES = 5
# Seconds passed to time.sleep between failed attempts.
RETRY_DELAY = 2

In [81]:
import time
import ast

# Baseline run: for every kept question, generate SQL with `generate_baseline`
# for both alternative prompts and evaluate it against the reference query.
EA = 0  # running sum of the per-prompt execution accuracy
results_list = []
total_questions = 2 * len(filtered_df)

for idx, row in filtered_df.iterrows():
    question_1 = row["Alternative Prompt 1 (English)"]
    question_2 = row["Alternative Prompt 2 (Bahasa Indonesia)"]
    answer = row["Answer"]
    # "Expected Result" holds a list literal stored as text.
    expected_columns = ast.literal_eval(row["Expected Result"])

    for prompt_id, question in enumerate((question_1, question_2), start=1):
        print(f"\nProcessing Question {idx + 1}.{prompt_id}: {question}")

        # Try generation up to MAX_RETRIES times, pausing between failures;
        # after the final failure the sentinel "ERROR" is recorded instead.
        result = None
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                result = text_to_sql_model.generate_baseline(user_prompt=question)
            except Exception as e:
                print(f"[Attempt {attempt}] Failed to generate SQL: {e}")
                if attempt == MAX_RETRIES:
                    print("Max retries reached. Setting result as 'ERROR'")
                    result = "ERROR"
                else:
                    time.sleep(RETRY_DELAY)
            else:
                break

        print(f"Generated SQL Query: {result}")

        # A failing evaluation scores 0.0 for this prompt instead of aborting the run.
        try:
            acc = text_to_sql_model.evaluate(
                query=result,
                true_query=answer,
                expected_columns=expected_columns,
            )
        except Exception as e:
            print(f"Evaluation failed: {e}")
            acc = 0.0

        print(f"Execution Accuracy: {acc:.4f}")

        record = {
            "Question ID": f"{idx + 1}.{prompt_id}",
            "Question": question,
            "Generated SQL Query": result,
            "Expected SQL Query": answer,
            "Execution Accuracy": acc
        }
        results_list.append(record)

        EA = EA + acc

# Calculate final execution accuracy
if total_questions > 0:
    final_accuracy = EA / total_questions
else:
    final_accuracy = 0
print(f"\nFinal Execution Accuracy: {final_accuracy:.4f}")


Processing Question 1.1: Can you show me the players who scored more than 2 goals in one game? I’d like to see their player_id, name, match number, and how many goals they scored.
Generated SQL Query: SELECT g.player_id, p.player_name, g.match_no, COUNT(g.goal_id) AS goals_scored
FROM goal_details g
JOIN player_mast p ON g.player_id = p.player_id
GROUP BY g.player_id, p.player_name, g.match_no
HAVING COUNT(g.goal_id) > 2
ORDER BY goals_scored DESC;
Execution Accuracy: 1.0000

Processing Question 1.2: Tolong tampilkan pemain yang mencetak lebih dari 2 gol dalam satu pertandingan. Saya mau lihat player_id, nama pemain, match_no, dan total golnya.
Generated SQL Query: SELECT 
    g.player_id, 
    p.player_name, 
    g.match_no, 
    COUNT(g.goal_id) AS total_goals
FROM 
    goal_details g
JOIN 
    player_mast p ON g.player_id = p.player_id
GROUP BY 
    g.player_id, p.player_name, g.match_no
HAVING 
    COUNT(g.goal_id) > 2
ORDER BY 
    total_goals DESC;
Execution Accuracy: 1.0000

Pr

In [82]:
# Collect the baseline run's per-prompt records into a frame and save it
# (without the index column) into this run's output folder.
df_results_partial_baseline = pd.DataFrame(results_list)
df_results_partial_baseline.to_csv(f"{output_dir}/{MODEL}_{DATABASE}_partial_baseline.csv", index=False)

In [83]:
import time
import ast

# Schema-linker run: for every kept question, generate SQL with
# `predict_sql_schema_only` for both alternative prompts and evaluate it
# against the reference query.
EA = 0  # running sum of the per-prompt execution accuracy
results_list = []
total_questions = 2 * len(filtered_df)

for idx, row in filtered_df.iterrows():
    question_1 = row["Alternative Prompt 1 (English)"]
    question_2 = row["Alternative Prompt 2 (Bahasa Indonesia)"]
    answer = row["Answer"]
    # "Expected Result" holds a list literal stored as text.
    expected_columns = ast.literal_eval(row["Expected Result"])

    for prompt_id, question in enumerate((question_1, question_2), start=1):
        print(f"\nProcessing Question {idx + 1}.{prompt_id}: {question}")

        # Try generation up to MAX_RETRIES times, pausing between failures;
        # after the final failure the sentinel "ERROR" is recorded instead.
        result = None
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                result = text_to_sql_model.predict_sql_schema_only(user_prompt=question)
            except Exception as e:
                print(f"[Attempt {attempt}] Failed to generate SQL: {e}")
                if attempt == MAX_RETRIES:
                    print("Max retries reached. Setting result as 'ERROR'")
                    result = "ERROR"
                else:
                    time.sleep(RETRY_DELAY)
            else:
                break

        print(f"Generated SQL Query: {result}")

        # A failing evaluation scores 0.0 for this prompt instead of aborting the run.
        try:
            acc = text_to_sql_model.evaluate(
                query=result,
                true_query=answer,
                expected_columns=expected_columns,
            )
        except Exception as e:
            print(f"Evaluation failed: {e}")
            acc = 0.0

        print(f"Execution Accuracy: {acc:.4f}")

        record = {
            "Question ID": f"{idx + 1}.{prompt_id}",
            "Question": question,
            "Generated SQL Query": result,
            "Expected SQL Query": answer,
            "Execution Accuracy": acc
        }
        results_list.append(record)

        EA = EA + acc

# Calculate final execution accuracy
if total_questions > 0:
    final_accuracy = EA / total_questions
else:
    final_accuracy = 0
print(f"\nFinal Execution Accuracy: {final_accuracy:.4f}")


Processing Question 1.1: Can you show me the players who scored more than 2 goals in one game? I’d like to see their player_id, name, match number, and how many goals they scored.
Generated SQL Query: SELECT g.player_id, p.player_name, g.match_no, COUNT(g.goal_id) AS goals_scored
FROM goal_details g
JOIN player_mast p ON g.player_id = p.player_id
GROUP BY g.player_id, p.player_name, g.match_no
HAVING COUNT(g.goal_id) > 2
ORDER BY goals_scored DESC;
Execution Accuracy: 1.0000

Processing Question 1.2: Tolong tampilkan pemain yang mencetak lebih dari 2 gol dalam satu pertandingan. Saya mau lihat player_id, nama pemain, match_no, dan total golnya.
Generated SQL Query: SELECT 
    g.player_id,
    p.player_name,
    g.match_no,
    COUNT(g.goal_id) AS total_gol
FROM 
    goal_details g
JOIN 
    player_mast p ON g.player_id = p.player_id
GROUP BY 
    g.player_id, p.player_name, g.match_no
HAVING 
    COUNT(g.goal_id) > 2
ORDER BY 
    total_gol DESC;
Execution Accuracy: 1.0000

Processin

In [84]:
# Collect the schema-linker run's per-prompt records into a frame and save it
# (without the index column) into this run's output folder.
df_results_partial_schema_linker = pd.DataFrame(results_list)
df_results_partial_schema_linker.to_csv(f"{output_dir}/{MODEL}_{DATABASE}_partial_schema_linker.csv", index=False)