# Functions

In [7]:
from generator import *


ModuleNotFoundError: No module named 'generator'

# Test

In [None]:
client = get_client()
connection = connect_to_db(connection_parameters)

In [None]:
question = "Show all rentals made in the last 7 days"

## Giving Metadata

In [None]:
response = get_response(get_prompt(question, metadata=schema_metadata), client)
response

{'question': 'Show all rentals made in the last 7 days',
 'query': "SELECT * FROM rental WHERE rental_date >= NOW() - INTERVAL '7 days';",
 'explanation': "This query selects all columns from the 'rental' table where the 'rental_date' is within the last 7 days.  NOW() provides the current timestamp, and INTERVAL '7 days' subtracts 7 days from it."}

In [None]:
print(response['query'])

SELECT * FROM rental WHERE rental_date >= NOW() - INTERVAL '7 days';


In [None]:
fetch_query_as_df(connection, response['query'])

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update


## Giving Definition

In [None]:
response = get_response(get_prompt(question, definition=schema_definition), client)
response

{'question': 'Show all rentals made in the last 7 days',
 'query': "SELECT * FROM rental WHERE rental_date >= NOW() - INTERVAL '7 days';",
 'explanation': "This query selects all columns from the rental table where the rental_date is within the last 7 days. NOW() - INTERVAL '7 days' calculates the date 7 days ago, and the query filters for rentals with a rental_date greater than or equal to that date."}

In [None]:
print(response['query'])

SELECT * FROM rental WHERE rental_date >= NOW() - INTERVAL '7 days';


In [None]:
fetch_query_as_df(connection, response['query'])

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update


## Giving Both

In [None]:
response = get_response(get_prompt(question, definition=schema_definition, metadata=schema_metadata), client)
response

{'question': 'Show all rentals made in the last 7 days',
 'query': "SELECT * FROM rental WHERE rental_date >= NOW() - INTERVAL '7 days';",
 'explanation': "This query selects all columns from the 'rental' table where the 'rental_date' is within the last 7 days.  NOW() provides the current timestamp, and INTERVAL '7 days' subtracts 7 days from it."}

In [None]:
print(response['query'])

SELECT * FROM rental WHERE rental_date >= NOW() - INTERVAL '7 days';


In [None]:
fetch_query_as_df(connection, response['query'])

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update


# Giving Text Summary

In [None]:
response = get_response(get_prompt(question, metadata=everything), client)
response

NameError: name 'everything' is not defined

In [None]:
print(response['query'])

SELECT * FROM rental WHERE rental_date >= NOW() - INTERVAL '7 days';


In [None]:
fetch_query_as_df(connection, response['query'])

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update


In [None]:
everything = schema_definition + '\n' + schema_metadata + '\n'  + summary

# Running All

In [None]:
questions = list(pd.read_csv(r"C:\Users\Anush\Desktop\Projects\text-to-sql\evaluation.csv")['Natural Language Query', 'Difficulty'])
questions[:3]

NameError: name 'pd' is not defined

In [None]:
from time import sleep
answers = {}
for question in questions:
    sleep(1)
    answers[question] = get_response(get_prompt(question, metadata=everything), client)

In [None]:
for question, response in answers.items():
    print(response)
    print('-' * 50)
    try:
        print(fetch_query_as_df(connection, response['query']).iloc[:5])
    except Exception as e:
        print(e)
    print('-' * 50)
    print('\n' * 3)

{'question': "List all actors' first and last names.", 'query': 'SELECT first_name, last_name FROM actor;', 'explanation': "This query selects the first_name and last_name columns from the actor table, effectively listing all actors' first and last names."}
--------------------------------------------------
  first_name     last_name
0   PENELOPE       GUINESS
1       NICK      WAHLBERG
2         ED         CHASE
3   JENNIFER         DAVIS
4     JOHNNY  LOLLOBRIGIDA
--------------------------------------------------




{'question': 'Show the titles of all films in the database.', 'query': 'SELECT title FROM film;', 'explanation': "This query selects the 'title' column from the 'film' table, which contains the titles of all films in the database."}
--------------------------------------------------
              title
0  ACADEMY DINOSAUR
1    ACE GOLDFINGER
2  ADAPTATION HOLES
3  AFFAIR PREJUDICE
4       AFRICAN EGG
--------------------------------------------------




{'question': 'G

# Testing Agents


In [None]:
from app.backend.agents import *

In [2]:
easy_question = "Find all actors who have appeared in more than 10 films."
hard_question = "List all actors who have appeared in at least one film in each category."

In [3]:
rewriter = RewriterAgent()
question = rewriter.run({'question' : hard_question})
print(rewriter.get_dictionary(question))

{'question': 'List all actors who have appeared in at least one film in each category.', 'rewritten': 'Find the first name and last name of all actors who have acted in at least one film in every category present in the film_category table. For each actor, ensure that for every category_id in the film_category table, there exists a film_id that the actor has acted in, as recorded in the film_actor table.'}


In [4]:
generator = GeneratorAgent()
generator_response = generator.run(rewriter.get_dictionary(question))
print(generator_response)

{
  "question": "List all actors who have appeared in at least one film in each category.",
  "rewritten": "Find the first name and last name of all actors who have acted in at least one film in every category present in the film_category table. For each actor, ensure that for every category_id in the film_category table, there exists a film_id that the actor has acted in, as recorded in the film_actor table.",
  "query": "SELECT\n  a.first_name,\n  a.last_name\nFROM actor AS a\nWHERE NOT EXISTS(\n  SELECT\n    c.category_id\n  FROM category AS c\n  WHERE NOT EXISTS(\n    SELECT\n      1\n    FROM film_actor AS fa\n    JOIN film_category AS fc\n      ON fa.film_id = fc.film_id\n    WHERE\n      fa.actor_id = a.actor_id AND fc.category_id = c.category_id\n  )\n);",
  "explanation": "The query identifies actors who have appeared in at least one film in each category. It uses a double negative logic with nested NOT EXISTS clauses. The outer NOT EXISTS checks if there is any category for w

In [5]:
validator = ValidatorAgent()
validator_response = validated = validator.run(generator.get_dictionary(generator_response))
print(validator_response)

{
  "question": "List all actors who have appeared in at least one film in each category.",
  "rewritten": "Find the first name and last name of all actors who have acted in at least one film in every category present in the film_category table. For each actor, ensure that for every category_id in the film_category table, there exists a film_id that the actor has acted in, as recorded in the film_actor table.",
  "query": "SELECT\n  a.first_name,\n  a.last_name\nFROM actor AS a\nWHERE NOT EXISTS(\n  SELECT\n    c.category_id\n  FROM category AS c\n  WHERE NOT EXISTS(\n    SELECT\n      1\n    FROM film_actor AS fa\n    JOIN film_category AS fc\n      ON fa.film_id = fc.film_id\n    WHERE\n      fa.actor_id = a.actor_id AND fc.category_id = c.category_id\n  )\n);",
  "explanation": "The generated query is correct and efficient. It uses a double negative with `NOT EXISTS` to find actors who have appeared in at least one film in every category. The outer `NOT EXISTS` checks if there is an

## Running Query

In [7]:
db = Database()

In [8]:
df = db.fetch_query(validator.get_dictionary(validator_response)['query'])
df

Unnamed: 0,first_name,last_name
0,ED,CHASE
1,JOHNNY,LOLLOBRIGIDA
2,GRACE,MOSTEL
3,MATTHEW,JOHANSSON
4,JOE,SWANK
...,...,...
154,MERYL,ALLEN
155,JAYNE,SILVERSTONE
156,BELA,WALKEN
157,REESE,WEST
