In [83]:
SELECT_TABLES_PROMPT = """
You are given table descriptions. Pick the tables most relevant to answer the
question and provide a reason for each table selected.
Return your output as a **valid JSON array only**.
Do not include ```json or ``` or any other formatting.
Just return the JSON.
Return JSON as a list of objects with "table" and "reason" keys.
Question: {question}
Descriptions:\n{descriptions}
"""

SELECT_COLUMNS_PROMPT = """
Given the selected tables and their columns (with types), pick only the columns
needed and provide a reason for each column selected.
Return your output as a **valid JSON array only**.
Do not include ```json or ``` or any other formatting.
Just return the JSON.

Return JSON as {{table: [{{column: column_name, reason: reason}}...]}}.
Question: {question}
Columns:\n{columns}
"""

EXTRACT_DETAILS_PROMPT = """
Extract concrete filter values and constraints from the question, aligned to the
selected tables/columns.
Return your output as a **valid JSON array only**.
Do not include ```json or ``` or any other formatting.
Just return the JSON.
Return JSON with keys: filters (list of {{table, column, value}}).
Question: {question}
Selected: {selected}
"""

VERIFY_DETAILS_PROMPT = """
You will be given candidate filters and a list of actual distinct values from
the DB for each referenced column. Remove or correct any impossible values.
Return your output as a **valid JSON array only**.
Do not include ```json or ``` or any other formatting.
Just return the JSON.
Return the same JSON structure with only verified/adjusted filters.
Candidate: {candidate}
Catalog values: {catalog}
"""
GENERATE_SQL_PROMPT = """
You are an expert SQL generator. Using only the verified details and available
tables/columns, write a single ANSI SQL query (SQLite dialect) to answer the
question.
- Use table aliases.
- Only use verified values.
- If join is required, infer foreign keys by name.
- Include ORDER BY / LIMIT if specified.
- Return **only** the SQL, no backticks.
Verified details: {details}
Available columns: {columns}
"""

In [84]:
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv

load_dotenv('.env')

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

In [89]:
from __future__ import annotations
import json
from typing import Dict, Any, List
from langchain_core.tools import BaseTool
from langchain.pydantic_v1 import Extra
import db


class SelectRelevantTables(BaseTool):
    name: str = "select_relevant_tables"
    description: str = "Select relevant tables based on table descriptions. Input: question string. Output: JSON list of objects with table names and reasons."
    
    class Config:
        extra = Extra.allow
    
    def __init__(self, llm):
        super().__init__(llm=llm)
        self.llm = llm 

    def _run(self, question: str) -> str:
        descs = db.get_table_descriptions()
        body = SELECT_TABLES_PROMPT.format(
            question=question,
            descriptions=json.dumps(descs, indent=2)
        )
        out = self.llm.invoke(body).content
        return out

    async def _arun(self, question: str) -> str:
        return self._run(question)

class SelectRelevantColumns(BaseTool):
    name: str = "select_relevant_columns"
    description: str = "Pick relevant columns from selected tables. Input: JSON with question and tables(only names list). Output: JSON {table: [{{column, reason}}...]}"

    class Config:
        extra = Extra.allow
    
    def __init__(self, llm):
        super().__init__(llm=llm)
        self.llm = llm 

    def _run(self, payload: str) -> str:
        obj = json.loads(payload)
        question: str = obj["question"]
        tables: List[str] = obj["tables"]
        cols = []
        for t in tables:
            cols += {t :db.get_column_descriptions(t)}
        body = SELECT_COLUMNS_PROMPT.format(
            question=question,
            columns=json.dumps(cols, indent=2)
        )
        out = self.llm.invoke(body).content
        return out

    async def _arun(self, payload: str) -> str:
        return self._run(payload)

class ExtractDetails(BaseTool):
    name: str = "extract_details"
    description: str = "Extract filters from the question aligned to selected columns. Input: JSON with question and selected_columns. Output: JSON details."

    class Config:
      extra = Extra.allow
    
    def __init__(self, llm):
      super().__init__(llm = llm)
      self.llm = llm 

    def _run(self, payload: str) -> str:
        obj = json.loads(payload)
        question = obj["question"]
        selected = obj["selected_columns"]
        body = EXTRACT_DETAILS_PROMPT.format(
            question=question,
            selected=json.dumps(selected, indent=2)
        )
        return self.llm.invoke(body).content

    async def _arun(self, payload: str) -> str:
        return self._run(payload)

class VerifyDetails(BaseTool):
    name: str = "verify_details"
    description: str = "Verify candidate details with actual DB values. Input: JSON {candidate, columns}. Output: JSON (verified details)."

    class Config:
      extra = Extra.allow
    
    def __init__(self, llm):
      super().__init__(llm = llm)
      self.llm = llm 

    def _run(self, payload: str) -> str:
        obj = json.loads(payload)
        candidate = obj["candidate"]

        # Build catalog of distinct values for each filter column
        catalog = []
        for f in candidate.get("filters", []):
            table = f.get("table")
            column = f.get("column")
            try:
                values = db.get_distinct_values(table, column, limit=100)
            except Exception:
                values = []
            catalog.append({"table": table, "column": column, "values": values})

        body = VERIFY_DETAILS_PROMPT.format(
            candidate=json.dumps(candidate, indent=2),
            catalog=json.dumps(catalog, indent=2)
        )
        return self.llm.invoke(body).content

    async def _arun(self, payload: str) -> str:
        return self._run(payload)

class GenerateSQL(BaseTool):
    name: str = "generate_sql"
    description: str = "Generate SQL using verified details and available columns. Input: JSON {details, columns}. Output: SQL string."

    class Config:
      extra = Extra.allow
    
    def __init__(self, llm):
      super().__init__(llm = llm)
      self.llm = llm 

    def _run(self, payload: str) -> str:
        obj = json.loads(payload)
        details = obj["details"]
        columns = obj["columns"]
        body = GENERATE_SQL_PROMPT.format(
            details=json.dumps(details, indent=2),
            columns=json.dumps(columns, indent=2)
        )
        sql = self.llm.invoke(body).content.strip()
        return sql

    async def _arun(self, payload: str) -> str:
        return self._run(payload)
    

class ExecuteSQL(BaseTool):
    name: str = "execute_sql"
    description: str = "Execute SQL and return results. Input: SQL string. Output: JSON results."

    class Config:
      extra = Extra.allow
    
    def __init__(self, llm):
      super().__init__(llm = llm)
      self.llm = llm 

    def _run(self, sql: str) -> str:
        results = db.execute_query(sql)
        return json.dumps(results, indent=2)

    async def _arun(self, sql: str) -> str:
        return self._run(sql)

In [75]:

select_relevant_tables = SelectRelevantTables(llm=llm)

select_relevant_columns = SelectRelevantColumns(llm=llm)

select_relevant_columns.invoke('{"question": "List all employees in the Sales department.", "tables": [{"table":"employees"}, {"table":departments"}]')

JSONDecodeError: Expecting value: line 1 column 104 (char 103)

In [76]:


print(select_relevant_tables.name)
print(select_relevant_tables.description)
print(select_relevant_tables.args)
print(select_relevant_tables.return_direct)

select_relevant_tables
Select relevant tables based on table descriptions. Input: question string. Output: JSON list of objects with table names and reasons.
{'question': {'title': 'Question', 'type': 'string'}}
False


In [46]:
select_relevant_tables.invoke({"question": "List the names of all employees in the Sales department."})

'[\n  {\n    "table": "Departments",\n    "reason": "This table contains department names, which is necessary to filter for the \'Sales department\'."\n  },\n  {\n    "table": "Professors",\n    "reason": "This table stores the names of academic faculty, who are considered employees in this context, and links them to departments, allowing us to retrieve the names of employees in the specified department."\n  }\n]'

In [86]:
system_prompt = """You are an intelligent SQL generation agent that strictly follows a structured reasoning workflow to answer natural language questions using a relational database.  

Your workflow is as follows:  
1. **Identify Relevant Tables** – Start by analyzing the database table descriptions to select only the tables relevant to the user’s question.  
2. **Select Relevant Columns** – From the chosen tables, identify which columns are needed to answer the question.  
3. **Extract Details from User Query** – Extract entities, values, and filters (e.g., customer name, product category, date range) that are necessary to form the SQL query.  
4. **Verify Extracted Details** – Cross-check the extracted details with actual values in the database (using sample values or validation tools) to ensure correctness and avoid hallucinations.  
5. **Generate SQL Query** – Construct a valid SQL query using the relevant tables, columns, and verified details. Always ensure joins, conditions, and aggregations are logically correct.  
6. **Execute SQL Query** – Run the query against the database and return the results in a clear, structured format.  

### Rules:
- Always respect this workflow — never skip a step.  
- Never assume values; always validate details against real database values before using them in SQL.  
- Provide explanations of your reasoning when moving between steps.  
- If information is missing or ambiguous, ask clarifying questions before generating SQL."""


In [91]:
from langchain_ollama.llms import OllamaLLM


llm = OllamaLLM(model="qwen3:1.7b")

In [92]:
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent


from langchain_core.prompts import PromptTemplate

template = system_prompt + """
You have access to the following tools:
{tools}
Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of {tool_names}
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}"""

prompt = PromptTemplate.from_template(template)


tools = [
    SelectRelevantTables(llm=llm),
    SelectRelevantColumns(llm=llm),
    ExtractDetails(llm=llm),
    VerifyDetails(llm=llm),
    GenerateSQL(llm=llm)]

agent = create_react_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools , verbose=True)

agent_executor.invoke({"input": "List the names of all profeessors in Computer Science"})

# Use with chat history
# from langchain_core.messages import AIMessage, HumanMessage
# agent_executor.invoke(
#     {
#         "input": "what's my name?",
#         # Notice that chat_history is a string
#         # since this prompt is aimed at LLMs, not chat models
#         "chat_history": "Human: My name is Bob\nAI: Hello Bob!",
#     }
# )



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m<think>
Okay, let's tackle this question step by step. The user wants to list all the names of professors in Computer Science. 

First, I need to figure out which tables are relevant. The question mentions professors and Computer Science, so I should check the database tables related to professors. Typically, there might be a "professors" table with columns like name, department, etc. But since the exact table structure isn't provided, I'll use the select_relevant_tables tool to determine the correct tables.

Next, I need to select the relevant columns. If the professors table has a department column, that's important. Also, the name column is obvious. But I need to make sure the columns are correct based on the tables selected.

Then, extract the details. The user is asking for professors in Computer Science, so the filter would be department = 'Computer Science'. But I need to verify this with the actual database to ensure 

AttributeError: 'str' object has no attribute 'content'

In [63]:
prompt = hub.pull("hwchase17/react")

prompt



PromptTemplate(input_variables=['agent_scratchpad', 'input', 'tool_names', 'tools'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'hwchase17', 'lc_hub_repo': 'react', 'lc_hub_commit_hash': 'd15fe3c426f1c4b3f37c9198853e4a86e20c425ca7f4752ec0c9b0e97ca7ea4d'}, template='Answer the following questions as best you can. You have access to the following tools:\n\n{tools}\n\nUse the following format:\n\nQuestion: the input question you must answer\nThought: you should always think about what to do\nAction: the action to take, should be one of [{tool_names}]\nAction Input: the input to the action\nObservation: the result of the action\n... (this Thought/Action/Action Input/Observation can repeat N times)\nThought: I now know the final answer\nFinal Answer: the final answer to the original input question\n\nBegin!\n\nQuestion: {input}\nThought:{agent_scratchpad}')

In [None]:


prompt


PromptTemplate(input_variables=['agent_scratchpad', 'input', 'tool_names', 'tools'], input_types={}, partial_variables={}, template='Answer the following questions as best you can. You have access to the following tools:\n\n{tools}\n\nUse the following format:\n\nQuestion: the input question you must answer\nThought: you should always think about what to do\nAction: the action to take, should be one of [{tool_names}]\nAction Input: the input to the action\nObservation: the result of the action\n... (this Thought/Action/Action Input/Observation can repeat N times)\nThought: I now know the final answer\nFinal Answer: the final answer to the original input question\n\nBegin!\n\nQuestion: {input}\nThought:{agent_scratchpad}')