In [3]:
# !pip install google-genai

# JSON

In [4]:
import json

# Load the JSON file describing the database schema.
json_file_path = "metadata/paglia_metadata.json"

# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# result does not depend on the platform's default locale encoding.
with open(json_file_path, "r", encoding="utf-8") as file:
    pagila_metadata = json.load(file)

# Display the loaded metadata dictionary.
pagila_metadata


{'database_name': 'paglia',
 'description': 'A database for managing a DVD rental store.',
 'tables': [{'table_name': 'actor',
   'description': 'Stores information about actors.',
   'columns': [{'column_name': 'actor_id',
     'data_type': 'integer',
     'is_primary_key': True,
     'description': 'Unique identifier for the actor.'},
    {'column_name': 'first_name',
     'data_type': 'text',
     'description': "Actor's first name."},
    {'column_name': 'last_name',
     'data_type': 'text',
     'description': "Actor's last name."},
    {'column_name': 'last_update',
     'data_type': 'timestamp with time zone',
     'description': 'Timestamp of the last update to the record.'}]},
  {'table_name': 'address',
   'description': 'Stores address information.',
   'columns': [{'column_name': 'address_id',
     'data_type': 'integer',
     'is_primary_key': True,
     'description': 'Unique identifier for the address.'},
    {'column_name': 'address',
     'data_type': 'text',
     'de

In [9]:
# Top-level keys of the metadata document, as strings.
names = [str(key) for key in pagila_metadata]
names

['database_name',
 'description',
 'tables',
 'views',
 'materialized_views',
 'functions',
 'relationships']

In [23]:
# Serialize the metadata dict back to a JSON string; this string is what
# get_prompt() substitutes into the prompt template.
metadata = json.dumps(pagila_metadata)
# Character count of the serialized metadata.
len(metadata)

23036

# Gemini 

In [93]:
import os
from google import genai
from google.genai import types
from dotenv import load_dotenv
load_dotenv()

True

In [137]:
def get_client():
    """Create a `genai.Client` using the key stored in the `API_KEY` environment variable.

    `os.getenv` yields `None` when the variable is unset; that value is passed
    to the client constructor unchanged.
    """
    api_key = os.getenv('API_KEY')
    return genai.Client(api_key=api_key)
def get_response(contents, client, model = 'gemini-2.0-flash-001', temperature = 0.1):
    """Send `contents` to the model and return the text of its reply.

    Args:
        contents: Prompt to send; forwarded unchanged as `contents=`.
        client: Object exposing `client.models.generate_content(...)`.
        model: Model identifier forwarded as `model=`.
        temperature: Sampling temperature placed in the request config.
            Defaults to 0.1, the value that was previously hard-coded.

    Returns:
        The `.text` attribute of the object returned by `generate_content`.
    """
    result = client.models.generate_content(
        model=model,
        contents=contents,
        config={'temperature': temperature},
    )
    return result.text
def get_prompt(question, schema_metadata=None):
    """Build the text-to-SQL prompt for `question`.

    Args:
        question: Natural-language question to be placed under "===Question".
        schema_metadata: String describing the database schema, inserted
            under "===Tables metadata". When omitted, the notebook-level
            `metadata` variable is used, so existing `get_prompt(question)`
            calls behave as before.

    Returns:
        The prompt template with the metadata and the question filled in.
    """
    if schema_metadata is None:
        # Fall back to the notebook global for backward compatibility.
        schema_metadata = metadata

    # Doubled braces ({{ }}) are literal braces in the rendered prompt;
    # single-brace names are str.format placeholders.
    prompt_template = """
    You are a PostgreSQL expert.

    Please help to generate a PostgreSQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions.

    ===Tables metadata in JSON like format
    {metadata}


    ===Response Guidelines
    1. If the provided context is sufficient, please generate a valid query without any explanations for the question. The query should start with a comment containing the question being asked.
    2. If the provided context is insufficient, please explain why it can't be generated.
    3. Please use the most relevant table(s).
    5. Please format the query before responding.
    6. Please always respond with a valid well-formed JSON object with the following format

    ===Response Format if Question made sense and could be converted to a valid query.
    {{  
        "query": "SELECT first_name, last_name FROM actor;",
        "explanation": "An explanation of query."
    }}
    ===Response Format if Question was unclear/invalid and valid/correct query generation is difficult.
    {{  
        "query": null,
        "explanation": "An explanation of failing to generate the query."
    }}

    ===Question
    {question}
    """

    values = {
        "metadata": schema_metadata,
        "question": question
    }

    return prompt_template.format(**values)


In [138]:
# Create the client once; later cells pass it to get_response().
client = get_client()


In [151]:
# Build the prompt for a sample question, send it to the model, and show the
# raw reply text returned by get_response().
question = 'List all customers who are from the city of New York.'
response = get_response(get_prompt(question), client)
response

'```json\n{\n  "query": "SELECT c.first_name, c.last_name\\nFROM customer AS c\\nJOIN address AS a ON c.address_id = a.address_id\\nJOIN city AS ci ON a.city_id = ci.city_id\\nWHERE ci.city = \'New York\';",\n  "explanation": "The query joins the customer, address, and city tables to find customers located in New York. It selects the first and last names of customers from the customer table, joins it with the address table on address_id, and then joins the address table with the city table on city_id. Finally, it filters the results to include only customers where the city is \'New York\'."\n}\n```'

In [152]:
print(response)

```json
{
  "query": "SELECT c.first_name, c.last_name\nFROM customer AS c\nJOIN address AS a ON c.address_id = a.address_id\nJOIN city AS ci ON a.city_id = ci.city_id\nWHERE ci.city = 'New York';",
  "explanation": "The query joins the customer, address, and city tables to find customers located in New York. It selects the first and last names of customers from the customer table, joins it with the address table on address_id, and then joins the address table with the city table on city_id. Finally, it filters the results to include only customers where the city is 'New York'."
}
```


In [153]:
# The reply wraps the JSON object in a Markdown code fence, so slice from the
# first '{' to the LAST '}' before parsing. Using rindex (not index) keeps the
# whole object when a string value or nested object contains a '}'.
response_dict = json.loads(response[response.index('{'):response.rindex('}') + 1])
response_dict

{'query': "SELECT c.first_name, c.last_name\nFROM customer AS c\nJOIN address AS a ON c.address_id = a.address_id\nJOIN city AS ci ON a.city_id = ci.city_id\nWHERE ci.city = 'New York';",
 'explanation': "The query joins the customer, address, and city tables to find customers located in New York. It selects the first and last names of customers from the customer table, joins it with the address table on address_id, and then joins the address table with the city table on city_id. Finally, it filters the results to include only customers where the city is 'New York'."}

In [154]:
# Show the value stored under the 'query' key of the parsed reply.
response_dict['query']

"SELECT c.first_name, c.last_name\nFROM customer AS c\nJOIN address AS a ON c.address_id = a.address_id\nJOIN city AS ci ON a.city_id = ci.city_id\nWHERE ci.city = 'New York';"