# Import Libraries

In [None]:
import sys
import os

# Put the parent of the current working directory at the front of the import
# path (presumably where the project packages imported below live - confirm).
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, ".."))

# Re-running this cell must not add the same entry twice.
parent_already_on_path = parent_dir in sys.path
if not parent_already_on_path:
    sys.path.insert(0, parent_dir)

In [None]:
from text_to_sql import LLMConfig
from core import GeneralLLM
from dotenv import load_dotenv
import pandas as pd

In [None]:
# Load variables from a .env file into the process environment; the provider
# API key is read further down with os.getenv.
load_dotenv()

# Constants

In [None]:
# Model identifier handed to LLMConfig; also used as the prefix of the result CSV file name.
MODEL = "deepseek-chat"
# Provider name handed to LLMConfig; also used to derive the API_KEY_<PROVIDER> environment variable name.
PROVIDER = "deepseek"

# Import Dataset

In [None]:
# Load the intent dataset; later cells read its "prompt" and "is_related" columns.
# The path is relative to the current working directory.
dataset = pd.read_csv("../files/dataset/dataset_intent.csv")

In [None]:
# Show the loaded DataFrame as the cell output for a quick visual check.
dataset

# Initialize LLM

In [None]:
# Configurations
# The API key is looked up in the environment under API_KEY_<PROVIDER>, where
# the provider name is upper-cased and dashes are replaced by underscores.
provider_key = PROVIDER.upper().replace("-", "_")
api_key_env_var = f"API_KEY_{provider_key}"
general_config = LLMConfig(
    type="api",
    model=MODEL,
    provider=PROVIDER,
    api_key=os.environ.get(api_key_env_var),  # None when the variable is not set
)

# Initialize agents
llm_agent = GeneralLLM(config=general_config)

# Define Functions

In [None]:
def detect_intent_tool_baseline(query, llm_agent):
    """
    Baseline: directly ask the model if the query is about data retrieval.

    Args:
        query: The user query text to classify.
        llm_agent: Object exposing ``generate(system_prompt=..., user_prompt=...)``
            that returns the model's reply.

    Returns:
        int: 1 for 'data' intent, 0 for 'other'. A reply that cannot be read
        as a bare 0 or 1 (including an empty or missing reply) falls back to 1.
    """

    system_prompt = "Classify this query as '1' if it's about retrieving data from a database, or '0' if not."
    user_prompt = f"Query: {query}\nAnswer:"

    result = llm_agent.generate(system_prompt=system_prompt.strip(), user_prompt=user_prompt.strip())

    # The prompt itself writes the labels in quotes, so the model may echo them
    # back as '0' / "1" / `1` or add a trailing period. Peel that decoration
    # off so a valid answer is not silently replaced by the fallback.
    reply = "" if result is None else str(result)
    final = reply.strip().strip("'\"`.").strip()

    if final not in ("0", "1"):
        # Unparseable reply: keep the original policy of defaulting to 'data'.
        final = "1"

    return int(final)

In [None]:
def detect_intent_tool(query, llm_agent):
    """
    Detect whether the query is about retrieving data from a database (for SQL generation).

    The whole instruction, the few-shot examples and the query itself are sent
    as the system prompt; the user prompt is left empty.

    Args:
        query: The user query text to classify.
        llm_agent: Object exposing ``generate(system_prompt=..., user_prompt=...)``
            that returns the model's reply.

    Returns:
        int: 1 when the reply is the ``data`` label, 0 when it is ``other``.
        A reply in which neither label can be recognised (including an empty
        or missing reply) falls back to 1.
    """

    system_prompt = f"""
    You are an expert assistant that decides if a user query is asking to retrieve data from a database.
    Return ONLY one of the following labels:

    - data: if the query is asking about facts, entities, statistics, filters, aggregations, rankings, comparisons, counts, sums, averages, or any structured information that could be stored in tables
    - other: if the query is not about retrieving data (e.g., definitions, instructions, general advice, or how-to questions)

    THINK STEP BY STEP:
    1. Does the query ask for specific information that could be stored in database tables?
    2. Would answering this require looking up records or performing calculations on stored data?
    3. If yes to either, it's 'data'

    Here are some EXAMPLES:

    Query: Which actors have the first name 'Scarlett'?  
    → data (lookup in actor table)

    Query: How many distinct actor last names are there?  
    → data (count operation)

    Query: What is SQL?  
    → other (definition)

    Query: What does SELECT do in SQL?  
    → other (explanation)

    Query: Which customers rented more than 3 categories?  
    → data (filtering and counting)

    Query: Compare sales between 2022 and 2023  
    → data (comparison of stored data)

    Query: How do I write a JOIN query?  
    → other (instruction)

    Query: What's the average salary by department?  
    → data (aggregation)

    Now, based on the query below, return only `data` or `other` (lowercase only — no explanation).

    Query:
    \"\"\"{query}\"\"\"
    """

    result = llm_agent.generate(system_prompt=system_prompt.strip(), user_prompt="")

    # The prompt shows the labels in backticks and the examples as
    # "→ data (reason)", so the model may mirror that formatting. Drop leading
    # decoration and accept a label that is followed by punctuation or an
    # explanation, instead of counting e.g. "`other`" as the fallback.
    reply = ("" if result is None else str(result)).strip().lower()
    reply = reply.lstrip("`'\"*→->: \t")

    final = "data"  # fallback when no label can be recognised
    for label in ("data", "other"):
        next_char = reply[len(label):len(label) + 1]
        # Require a word boundary so "otherwise"/"database" do not match.
        if reply.startswith(label) and not next_char.isalnum():
            final = label
            break

    return 1 if final == "data" else 0

# Experiment

## Baseline

In [None]:
# Classify every prompt with the baseline prompt: one LLM call per dataset row.
predictions = []
for prompt in dataset["prompt"]:
    predictions.append(detect_intent_tool_baseline(prompt, llm_agent))

dataset["predicted_is_related_baseline"] = predictions

# Fraction of rows whose prediction equals the "is_related" column
# (presumably the ground-truth label - confirm against the dataset).
baseline_matches = dataset["predicted_is_related_baseline"] == dataset["is_related"]
accuracy = baseline_matches.mean()
print(f"Baseline Accuracy: {accuracy:.2%}")

## With Prompt Engineering

In [None]:
# Classify every prompt with the engineered (few-shot) prompt: one LLM call per dataset row.
predictions = []
for prompt in dataset["prompt"]:
    predictions.append(detect_intent_tool(prompt, llm_agent))

dataset["predicted_is_related"] = predictions

# Fraction of rows whose prediction equals the "is_related" column
# (presumably the ground-truth label - confirm against the dataset).
engineered_matches = dataset["predicted_is_related"] == dataset["is_related"]
accuracy = engineered_matches.mean()

print(f"Prompt Engineering Accuracy: {accuracy:.2%}")

# Save Predictions

In [None]:
# Persist the dataset together with the prediction columns added above.
# to_csv does not create missing parent directories, so make sure the target
# directory exists first; otherwise the save would fail only after every LLM
# call had already been made.
result_dir = "../files/experiment_result/intent_detection"
os.makedirs(result_dir, exist_ok=True)
dataset.to_csv(f"{result_dir}/{MODEL}-result.csv", index=False)