In [128]:
!pip install -U openai sqlalchemy psycopg2 pandas tabulate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
!psql -U postgres -h localhost -c "DROP DATABASE IF EXISTS padel_league_local;"
!psql -U postgres -h localhost -c "CREATE DATABASE padel_league_local OWNER padel_user_local;"

DROP DATABASE
CREATE DATABASE


In [3]:
!pg_restore -h localhost -U padel_user_local -d padel_league_local -v padel_league_backup.dump

pg_restore: connecting to database for restore
pg_restore: creating TYPE "public.prefered_hand_enum"
pg_restore: while PROCESSING TOC:
pg_restore: from TOC entry 735; 1247 16656 TYPE prefered_hand_enum padel_user
pg_restore: error: could not execute query: ERROR:  must be member of role "padel_user"
Command was: ALTER TYPE public.prefered_hand_enum OWNER TO padel_user;

pg_restore: creating TYPE "public.prefered_position_enum"
pg_restore: from TOC entry 738; 1247 16662 TYPE prefered_position_enum padel_user
pg_restore: error: could not execute query: ERROR:  must be member of role "padel_user"
Command was: ALTER TYPE public.prefered_position_enum OWNER TO padel_user;

pg_restore: creating TYPE "public.teams"
pg_restore: from TOC entry 741; 1247 16670 TYPE teams padel_user
pg_restore: error: could not execute query: ERROR:  must be member of role "padel_user"
Command was: ALTER TYPE public.teams OWNER TO padel_user;

pg_restore: creating TABLE "public.alembic_version"
pg_restore: from T

In [None]:
from sqlalchemy import create_engine, text
import pandas as pd
from openai import OpenAI
import psycopg2
import json
import re
import time


# --- OpenAI setup ---
client = OpenAI(api_key=None)

# --- Postgres setup ---
user = "padel_user_local"
password = "localpassword"
host = "localhost"
port = "5432"
database = "padel_league_local"

pg_engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
CONN_STR = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"


In [65]:
with open('sql_agent_prompt.txt', 'r') as file:
    SCHEMA_DESCRIPTION = file.read().rstrip()

In [66]:
def generate_sql(user_question: str) -> str:
    prompt = f"""
    You are an expert in SQL and padel league data.

    {SCHEMA_DESCRIPTION}

    Write a valid PostgreSQL query that answers the following question.
    Return **only** a fenced markdown code block. No extra explanation.

    Question: {user_question}
    Answer:
    """
    response = client.chat.completions.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
        reasoning_effort='minimal',
        verbosity='low'
    )
    return response.choices[0].message.content.strip()

In [67]:
def run_sql(query: str):
    conn = psycopg2.connect(
        dbname=database, user=user, password=password, host=host, port=port
    )
    try:
        return pd.read_sql_query(query, conn)
    finally:
        conn.close()

In [68]:
def summarize_answer(user_question: str, df: pd.DataFrame) -> str:
    if df is None or df.empty:
        return "Não encontrei dados que correspondam a essa pergunta."

    result_str = df.head(10).to_json(orient="records")
    prompt = f"""
És um assistente um bocado bruto que responde a perguntas sobre uma liga de padel amadora.

A tua tarefa é escrever uma resposta curta e coerente em português de Portugal, com base nos dados fornecidos.
Podes ser cínico, bruto e sarcástico. 

Pergunta: {user_question}
Resultado da consulta: {result_str}

Resposta (em português de Portugal):
"""
    response = client.chat.completions.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
        reasoning_effort='minimal'
    )
    return response.choices[0].message.content.strip()

In [69]:
with open("queries.json", "r", encoding="utf-8") as f:
    questions = json.load(f)

print('-------- Questions ----------')
for question in questions:
    try:
        df = run_sql(question['sql_query'])
        print(f"Question: {question['question']}")
        #print(f"Query: {question['sql_query']}")
        print(f'Answer: {df}')
        print('--------------------------------')
    except:
        print(f"Couldnt answer question: {question['question']}")

-------- Questions ----------
Question: Quem são os 5 jogadores com mais pontos de ranking?
Answer:    player_id        name            full_name  ranking_points  \
0         33        Fred     Frederico Beirão            7089   
1         28  Bernardo C     Bernardo Castro             5641   
2         12   Miguel SG  Miguel Sousa Guedes            4675   
3         27    Malafaya       Diogo Malafaya            3951   
4         13       Dudas   Dinis Brito e Faro            3543   

   ranking_position  
0                 1  
1                 2  
2                 3  
3                 4  
4                 5  
--------------------------------
Question: Quantos jogadores existem na liga?
Answer:    total_jogadores
0               72
--------------------------------
Question: Quais são os nomes das divisões atuais?
Answer:                                       name
0  2º Masters de Outono 2025 - 1ª Divisão 
1   2º Masters de Outono 2025 - 2ª Divisão
2   2º Masters de Outono 2025 - 3

  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)


In [None]:
def extract_sql_block(text: str) -> str:
    """
    Extracts the SQL inside ```sql ... ``` from an LLM response.
    Raises a clean exception if no SQL is found.
    """
    match = re.search(r"```sql\s*(.*?)\s*```", text, re.DOTALL)
    if not match:
        raise ValueError("No SQL code block found in LLM output.")
    return match.group(1).strip()

def retry_call(fn, retries=3, delay=0.5, backoff=2):
    last_err = None
    for attempt in range(retries):
        try:
            return fn()
        except Exception as e:
            last_err = e
            if attempt == retries - 1:
                raise
            time.sleep(delay)
            delay *= backoff
    raise last_err

def llm_judge(question: str, expected_df: pd.DataFrame, predicted_df: pd.DataFrame) -> str:

    expected_str = expected_df.to_markdown(index=False)
    predicted_str = predicted_df.to_markdown(index=False)

    prompt = f"""
You are an expert evaluator of SQL query correctness.

Your task: Determine whether the predicted SQL query result correctly answers the user's question.

You will receive:
1. The question.
2. The ground-truth correct result (as a table).
3. The model-generated result (as a table).

### IMPORTANT RULES ###
- The answer is CORRECT even if the columns differ, as long as the **content matches semantically**.
- Extra columns are allowed.
- Column name mismatches should be ignored.
- Ordering only matters if the question implies an order (e.g., "top", "ordered by").
- Missing irrelevant columns (e.g., ranking_position) does NOT make the answer wrong.
- Compare only the meaningful content needed to answer the question.

Respond with a JSON object like:
{{
  "verdict": "CORRECT" or "WRONG",
  "reason": "short 1–2 sentence explanation"
}}

### QUESTION
{question}

### GROUND TRUTH RESULT
{expected_str}

### MODEL PREDICTED RESULT
{predicted_str}

Evaluate now.
"""
    response = client.chat.completions.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
        reasoning_effort='minimal',
        verbosity='low'
    )

    # Attempt to parse JSON safely
    try:
        decision = json.loads(response.choices[0].message.content)
        return decision
    except:
        return {"verdict": "ERROR", "reason": "Judge output not JSON parsable"}

def evaluate_generated_sql(questions, retries=2):
    """
    For each question:
      - runs the ground-truth SQL
      - generates SQL via generate_sql + extract_sql_block (with retries)
      - executes the generated SQL (with retries)
      - uses llm_judge() to decide CORRECT/WRONG
    Returns a DataFrame with rich metadata per question.
    """
    results = []

    for q in questions:
        user_question = q["question"]

        # 1) Ground truth
        try:
            ground_truth = retry_call(lambda: run_sql(q["sql_query"]), retries=retries)
        except Exception as e:
            # If ground-truth fails, the eval is meaningless for this question
            results.append({
                "question": user_question,
                "status": "GROUND_TRUTH_ERROR",
                "sql_query": None,
                "raw_llm_output": None,
                "result": None,
                "ground_truth": None,
                "pass": False,
                "judge_reason": "",
                "error": f"Ground truth failed: {e}",
            })
            continue

        # 2) Generate SQL via LLM
        try:
            llm_output = retry_call(lambda: generate_sql(user_question), retries=retries)
            sql_query = extract_sql_block(llm_output)
        except Exception as e:
            results.append({
                "question": user_question,
                "status": "LLM_ERROR",
                "sql_query": None,
                "raw_llm_output": llm_output if "llm_output" in locals() else None,
                "result": None,
                "ground_truth": ground_truth,
                "pass": False,
                "judge_reason": "",
                "error": str(e),
            })
            continue

        # 3) Execute generated SQL
        try:
            gen_df = retry_call(lambda: run_sql(sql_query), retries=retries)
        except Exception as e:
            results.append({
                "question": user_question,
                "status": "SQL_ERROR",
                "sql_query": sql_query,
                "raw_llm_output": llm_output,
                "result": None,
                "ground_truth": ground_truth,
                "pass": False,
                "judge_reason": "",
                "error": str(e),
            })
            continue

        # 4) Judge with LLM
        judge = llm_judge(user_question, ground_truth, gen_df)
        verdict = judge.get("verdict", "ERROR")
        reason = judge.get("reason", "")

        results.append({
            "question": user_question,
            "status": verdict,                  # "CORRECT", "WRONG", or "ERROR"
            "sql_query": sql_query,             # extracted SQL
            "raw_llm_output": llm_output,       # full text from generate_sql
            "result": gen_df,                   # model-generated dataframe
            "ground_truth": ground_truth,       # reference dataframe
            "pass": verdict == "CORRECT",
            "judge_reason": reason,
            "error": "",
        })

    return pd.DataFrame(results)

def dataframes_equal(df1, df2):
    """
    Robust comparison:
    - ignore column order
    - ignore row order
    - ignore index
    - compare values as strings (safer for numeric/text mismatches)
    """
    if not isinstance(df1, pd.DataFrame) or not isinstance(df2, pd.DataFrame):
        return False

    try:
        df1_norm = df1.sort_index(axis=1).sort_values(by=list(df1.columns), ignore_index=True).astype(str)
        df2_norm = df2.sort_index(axis=1).sort_values(by=list(df2.columns), ignore_index=True).astype(str)
        return df1_norm.equals(df2_norm)
    except:
        return False
    
def compute_precision_score(evaluation_df: pd.DataFrame):
    """
    precision = (# of CORRECT queries) / (total queries evaluated)
    """
    total = len(evaluation_df)
    passed = evaluation_df["pass"].sum() if "pass" in evaluation_df.columns else 0

    precision = passed / total if total > 0 else 0.0

    return {
        "total_queries": total,
        "passed": int(passed),
        "failed": int(total - passed),
        "precision": round(precision, 4),
    }

In [71]:
def logically_equal(df_pred, df_gt):
    """
    Flexible DataFrame comparison:
      - Normalize column names
      - Compare only shared columns
      - Ignore column order
      - Sort rows
      - Ignore dtype differences
    Returns True if logically equal.
    """

    if df_pred is None or df_gt is None:
        return False

    # Normalize column names
    pred = df_pred.copy()
    gt = df_gt.copy()

    pred.columns = [c.lower().strip() for c in pred.columns]
    gt.columns = [c.lower().strip() for c in gt.columns]

    # Fix common synonym columns
    rename_map = {
        "id": "player_id",
        "playerid": "player_id",
        "fullname": "full_name"
    }
    pred.rename(columns=rename_map, inplace=True)

    # Determine shared columns
    common_cols = sorted(set(pred.columns).intersection(gt.columns))

    if not common_cols:
        return False

    # Subset and sort
    pred_sub = pred[common_cols].sort_values(by=common_cols).reset_index(drop=True)
    gt_sub = gt[common_cols].sort_values(by=common_cols).reset_index(drop=True)

    # Convert both to string for robust comparison
    pred_norm = pred_sub.astype(str)
    gt_norm = gt_sub.astype(str)

    # Compare
    return pred_norm.equals(gt_norm)

def annotate_failures(df):
    """
    Adds a unified failure_reason column based on:
      - LLM_ERROR
      - SQL_ERROR
      - CORRECT
      - WRONG
    """

    reasons = []

    for _, row in df.iterrows():
        status = row.get("status", None)

        if status in ("LLM_ERROR", "SQL_ERROR"):
            reasons.append(status)
            continue

        if status == "CORRECT":
            reasons.append("PASS")
            continue

        if status == "WRONG":
            reasons.append("WRONG_RESULT")
            continue

        # unexpected / fallback
        reasons.append("UNKNOWN")

    df["failure_reason"] = reasons
    return df

def inspect_failure(df, question_text):
    """
    Prints a detailed inspection for a given question.
    """
    row = df[df["question"] == question_text].iloc[0]

    print("\n==========================")
    print("QUESTION:", row["question"])
    print("STATUS:", row["status"])
    print("REASON:", row["failure_reason"])
    print("==========================")

    # For LLM extraction errors
    if row["status"] == "LLM_ERROR":
        print("\n❌ LLM could not generate SQL.")
        print("ERROR:", row.get("error", ""))
        return

    # For SQL execution errors
    if row["status"] == "SQL_ERROR":
        print("\n--- SQL Generated ---")
        print(row.get("sql_query", "(none)"))
        print("\n❌ SQL Execution Error:")
        print(row.get("error", ""))
        return

    # For CORRECT/WRONG cases
    print("\n--- SQL Generated ---")
    print(row.get("sql_query"))

    print("\n--- Judge Reason ---")
    print(row.get("judge_reason", ""))

    print("\n--- Expected Result ---")
    print(row.get("ground_truth"))

    print("\n--- Actual Result ---")
    print(row.get("generated_result"))

    # Optional: try to show diff
    try:
        diff = row["ground_truth"].compare(row["generated_result"])
        print("\n--- DataFrame Differences ---")
        print(diff)
    except Exception:
        print("\n--- DataFrame Differences ---")
        print("Could not compute a clean diff.")
        
def summary_report(df):
    print("\n===== FAILURE SUMMARY =====")
    print(df["failure_reason"].value_counts())

    print("\n===== FAILURE DETAILS =====")
    for reason in df["failure_reason"].unique():
        subset = df[df["failure_reason"] == reason]
        print(f"\n### {reason} ({len(subset)})")
        for q in subset["question"]:
            print("-", q)

In [72]:
eval_df = evaluate_generated_sql(questions)
eval_df = annotate_failures(eval_df)

print(compute_precision_score(eval_df))
summary_report(eval_df)

  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)


{'total_queries': 25, 'passed': 7, 'failed': 18, 'precision': np.float64(0.28)}

===== FAILURE SUMMARY =====
failure_reason
WRONG_RESULT    13
PASS             7
SQL_ERROR        5
Name: count, dtype: int64

===== FAILURE DETAILS =====

### PASS (7)
- Quem são os 5 jogadores com mais pontos de ranking?
- Quantos jogadores existem na liga?
- Quais são os nomes das divisões atuais?
- Que campos foram usados nos jogos realizados?
- Quais são os jogadores da Divisão 1 e os respetivos pontos?
- Quem é o líder atual de cada divisão?
- Mostra a classificação média (pontos) dos jogadores por divisão.

### WRONG_RESULT (13)
- Quando começou a edição mais recente da liga?
- Quantos jogos já foram jogados?
- Que jogadores têm mais de 100 pontos e jogaram pelo menos 5 partidas?
- Quantos jogos foram jogados em cada divisao?
- Quais foram os três jogadores com mais vitórias na última edição da liga?
- Que jogador melhorou mais a sua classificação entre duas edições consecutivas?
- Mostra as próxima

In [63]:
eval_df = evaluate_generated_sql(questions)
eval_df = annotate_failures(eval_df)

print(compute_precision_score(eval_df))
summary_report(eval_df)

  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)


{'total_queries': 25, 'passed': 8, 'failed': 17, 'precision': np.float64(0.32)}

===== FAILURE SUMMARY =====
failure_reason
WRONG_RESULT    13
PASS             8
SQL_ERROR        4
Name: count, dtype: int64

===== FAILURE DETAILS =====

### PASS (8)
- Quem são os 5 jogadores com mais pontos de ranking?
- Quantos jogadores existem na liga?
- Quais são os nomes das divisões atuais?
- Quantos jogos já foram jogados?
- Quais são os jogadores da Divisão 1 e os respetivos pontos?
- Quantas vitórias tem cada jogador da Divisão 3?
- Quais foram os três jogadores com mais vitórias na última edição da liga?
- Mostra as próximas partidas (não jogadas) ordenadas por data e divisão.

### WRONG_RESULT (13)
- Quando começou a edição mais recente da liga?
- Que campos foram usados nos jogos realizados?
- Quem ganhou o último jogo da Divisão 2?
- Qual é o jogador com mais jogos jogados?
- Que jogadores têm mais de 100 pontos e jogaram pelo menos 5 partidas?
- Quantos jogos foram jogados em cada divisao

In [54]:
eval_df = evaluate_generated_sql(questions)
eval_df = annotate_failures(eval_df)

print(compute_precision_score(eval_df))
summary_report(eval_df)

  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)


{'total_queries': 25, 'passed': 11, 'failed': 14, 'precision': np.float64(0.44)}

===== FAILURE SUMMARY =====
failure_reason
WRONG_RESULT    12
PASS            11
SQL_ERROR        2
Name: count, dtype: int64

===== FAILURE DETAILS =====

### PASS (11)
- Quem são os 5 jogadores com mais pontos de ranking?
- Quantos jogadores existem na liga?
- Quais são os nomes das divisões atuais?
- Quando começou a edição mais recente da liga?
- Que campos foram usados nos jogos realizados?
- Quais são os jogadores da Divisão 1 e os respetivos pontos?
- Quantas vitórias tem cada jogador da Divisão 3?
- Qual é o jogador com mais jogos jogados?
- Quem é o líder atual de cada divisão?
- Quais foram os três jogadores com mais vitórias na última edição da liga?
- Quem tem melhor percentagem de vitórias nesta edição?

### WRONG_RESULT (12)
- Quantos jogos já foram jogados?
- Quem ganhou o último jogo da Divisão 2?
- Que jogadores têm mais de 100 pontos e jogaram pelo menos 5 partidas?
- Quantos jogos foram

In [16]:
inspect_failure(eval_df, "Quando começou a edição mais recente da liga?")


QUESTION: Quando começou a edição mais recente da liga?
STATUS: LLM_ERROR
REASON: LLM_ERROR

❌ LLM could not generate SQL.
ERROR: Completions.create() got an unexpected keyword argument 'reasoning_verbosity'


In [17]:
eval_df

Unnamed: 0,question,status,sql_query,raw_llm_output,result,ground_truth,pass,judge_reason,error,failure_reason
0,Quem são os 5 jogadores com mais pontos de ran...,LLM_ERROR,,,,player_id name full_name ...,False,,Completions.create() got an unexpected keyword...,LLM_ERROR
1,Quantos jogadores existem na liga?,LLM_ERROR,,,,total_jogadores 0 72,False,,Completions.create() got an unexpected keyword...,LLM_ERROR
2,Quais são os nomes das divisões atuais?,LLM_ERROR,,,,name 0 ...,False,,Completions.create() got an unexpected keyword...,LLM_ERROR
3,Quando começou a edição mais recente da liga?,LLM_ERROR,,,,edicao data_ini...,False,,Completions.create() got an unexpected keyword...,LLM_ERROR
4,Que campos foram usados nos jogos realizados?,LLM_ERROR,,,,field 0 Campo 2 1 Campo 1,False,,Completions.create() got an unexpected keyword...,LLM_ERROR
5,Quantos jogos já foram jogados?,LLM_ERROR,,,,jogos_jogados 0 2356,False,,Completions.create() got an unexpected keyword...,LLM_ERROR
6,Quais são os jogadores da Divisão 1 e os respe...,LLM_ERROR,,,,jogador pontos 0 Joao perneta 21....,False,,Completions.create() got an unexpected keyword...,LLM_ERROR
7,Quem ganhou o último jogo da Divisão 2?,LLM_ERROR,,,,jogador_vencedor date_hour 0 Talin...,False,,Completions.create() got an unexpected keyword...,LLM_ERROR
8,Quantas vitórias tem cada jogador da Divisão 3?,LLM_ERROR,,,,name vitorias 0 Afonso Xavie...,False,,Completions.create() got an unexpected keyword...,LLM_ERROR
9,Qual é o jogador com mais jogos jogados?,LLM_ERROR,,,,name jogos 0 Talinho 345,False,,Completions.create() got an unexpected keyword...,LLM_ERROR


In [141]:
run_sql("""
WITH current_edition AS (
    SELECT e.id, e.name
    FROM editions e
    JOIN divisions d ON d.edition_id = e.id
    WHERE d.has_ended = FALSE
    GROUP BY e.id, e.name
    ORDER BY e.id DESC
    LIMIT 1
),
start_date AS (
    SELECT MIN(d.beginning_datetime) AS edition_start
    FROM divisions d
    WHERE d.edition_id = (SELECT id FROM current_edition)
)
SELECT edition_start
FROM start_date;
""")

  return pd.read_sql_query(query, conn)


Unnamed: 0,edition_start
0,2025-10-21


In [151]:
d = eval_df.to_dict()
for i in range(25):
    print('--------------------------')
    print(d['question'][i])
    try:
        print(run_sql(d['sql_query'][i]))
    except:
        print(d['error'][i])
    print('--------------------------')

--------------------------
Quem são os 5 jogadores com mais pontos de ranking?
   player_id player_name  ranking_points
0         33        Fred            7089
1         28  Bernardo C            5641
2         12   Miguel SG            4675
3         27    Malafaya            3951
4         13       Dudas            3543
--------------------------
--------------------------
Quantos jogadores existem na liga?
   total_players
0             72
--------------------------
--------------------------
Quais são os nomes das divisões atuais?
                                      name
0   2º Masters de Outono 2025 - 5ª Divisão
1   2º Masters de Outono 2025 - 3ª Divisão
2   2º Masters de Outono 2025 - 4ª Divisão
3   2º Masters de Outono 2025 - 2ª Divisão
4  2º Masters de Outono 2025 - 1ª Divisão 
--------------------------
--------------------------
Quando começou a edição mais recente da liga?
  edition_start
0    2025-10-21
--------------------------
--------------------------
Que campos for

  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)


In [150]:
d.keys()

dict_keys(['question', 'status', 'sql_query', 'raw_llm_output', 'result', 'ground_truth', 'pass', 'judge_reason', 'error', 'failure_reason'])