In [5]:
import pandas as pd
import openai
import json
import os
from openai import OpenAI
import re

# Shared API client used by the request helpers in this notebook.
# The key is read from the OPENAI_KEY environment variable; os.getenv returns
# None when that variable is not set.
# NOTE(review): the SDK's documented default variable is OPENAI_API_KEY, not
# OPENAI_KEY — confirm before dropping this explicit argument.
client = OpenAI(api_key=os.getenv('OPENAI_KEY'))

In [6]:
# Load the SQL statements to process.
# The file content is split on ';' (one statement per chunk), blank chunks are
# dropped, and every run of whitespace inside a statement is collapsed to a
# single space.
# NOTE(review): the plain split also cuts at any ';' that appears inside a
# string literal or comment — confirm the input file never contains one.
with open('data/queries-txts/chatgpt_queries2.txt', 'r') as queries_file:
    raw_statements = queries_file.read().strip().split(';')

# r'\s+' already matches newlines and tabs, so no separate replace() is needed.
sql_queries = [
    re.sub(r'\s+', ' ', statement.strip())
    for statement in raw_statements
    if statement.strip()
]

sql_queries

["CREATE VIEW CUSTOMER_ORDER AS SELECT c.customer_id, c.first_name, c.last_name, c.email, recent_orders.order_id, recent_orders.order_date, recent_orders.total_order_amount, recent_payments.total_payment_amount, recent_order_items.product_name, recent_order_items.category_name, recent_order_items.quantity, recent_order_items.unit_price, recent_order_items.total_item_amount, CAST(avg_product_rating.avg_rating AS DECIMAL(3,2)) AS avg_product_rating, CAST(total_customer_spend.total_spent AS DECIMAL(10,2)) AS total_customer_spent FROM customers c JOIN ( SELECT o.order_id, o.customer_id, o.order_date, o.total_amount AS total_order_amount FROM orders o WHERE o.order_date >= DATEADD(month, -1, GETDATE()) AND o.status = 'Completed' ) recent_orders ON c.customer_id = recent_orders.customer_id JOIN ( SELECT p.order_id, SUM(p.payment_amount) AS total_payment_amount FROM payments p GROUP BY p.order_id ) recent_payments ON recent_orders.order_id = recent_payments.order_id JOIN ( SELECT oi.order_id,

In [3]:

def get_gpt_response(prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` to the chat completions endpoint and return the reply text.

    Args:
        prompt: Text sent as the user message.
        model: Model name passed to the API; defaults to "gpt-3.5-turbo".

    Returns:
        The ``content`` of the first choice's message in the API response.
    """
    system_message = {
        "role": "system",
        "content": "You are an SQL expert. Help with the best of your capabilities whatever problem is presented.",
    }
    user_message = {"role": "user", "content": prompt}

    # Every request is issued with temperature 0 through the module-level client.
    completion = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[system_message, user_message],
    )
    return completion.choices[0].message.content

def parse_sql_to_json(response):
    """Decode the JSON document held in ``response`` into Python objects.

    Args:
        response: JSON text (``str``, ``bytes`` or ``bytearray``).

    Returns:
        The decoded value, or ``None`` when ``response`` is not valid JSON or
        is not a type ``json.loads`` accepts (for example ``None``). In that
        case a message describing the failure is printed.
    """
    try:
        return json.loads(response)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers non-text input such as None, so callers get the
        # same "None on failure" result instead of an uncaught exception.
        print(f"Failed to parse the response as JSON. Error: {e}")
        return None


In [4]:
# Prompt drafts for the GPT calls.
# This cell assigns `prompt` three times; only the last assignment is still
# bound when the cell finishes, and that is the text printed at the end.

# Draft 1: ask for a JSON tree describing the whole query.
# This is a plain (non-f) string, so `{sql_query}` stays a literal placeholder,
# and the value is overwritten by the next assignment before anything reads it.
# NOTE(review): the text ends with a stray `"` right after `{sql_query}`.
prompt ="""
        I need to convert an SQL query into a structured JSON file. The JSON file should capture all the details contained in the SQL query, including but not limited to:
        1. SQL Query Structure: Tables, columns, joins, conditions, grouping, ordering, subqueries and any other SQL constructs.
        2. Metadata: Aliases, table names, column names, subquery names and any other relevant metadata.
        3. Query Details: SELECT statements, WHERE clauses, GROUP BY clauses, ORDER BY clauses, and any subqueries.
        The JSON format should be precise and error-free. Your response should only contain the json tree.
        The query is:\n\n{sql_query}"
"""

# Draft 2: ask for the subqueries to be extracted into a dictionary.
# Also a plain string (placeholder not substituted) and also overwritten below.
# NOTE(review): contains the typo "shoulf" and the same stray trailing `"`.

prompt = """From the SQL query, extract all the subqueries and nested subqueries, 
replace them with a temporary table name (e.g.: subquery1, subquery2...).
Save the content of the subqueries in a separate dictionary (e.g.: {subquery1: 'select * from A', subquery2: 'select c from B', ...}).
Your response shoulf therefore contain two objects: the modified SQL query and the dictionary containing the subqueries content.
The query is:\n\n{sql_query}"
"""
# Placeholder value so the f-string below can be evaluated and previewed.
sql_query = '1'
# Draft 3: f-string version of draft 2. The doubled braces `{{ }}` produce
# literal braces in the example dictionary; `{sql_query}` is substituted.
# NOTE(review): still carries the "shoulf" typo and the stray trailing `"`.
prompt = f"""In the SQL query, extract all the subqueries and nested subqueries, 
    replace them with a temporary table name (e.g.: subquery1, subquery2...).
    Save the content of the subqueries in a separate dictionary (e.g.: {{'subquery1': 'select * from A', 'subquery2': 'select c from B', ...}}).
    Your response shoulf therefore contain two objects: the modified SQL query and the dictionary containing the subqueries content.
    The query is:\n\n{sql_query}"
"""
# Preview the rendered prompt.
print(prompt)

In the SQL query, extract all the subqueries and nested subqueries, 
    replace them with a temporary table name (e.g.: subquery1, subquery2...).
    Save the content of the subqueries in a separate dictionary (e.g.: {'subquery1': 'select * from A', 'subquery2': 'select c from B', ...}).
    Your response shoulf therefore contain two objects: the modified SQL query and the dictionary containing the subqueries content.
    The query is:

1"



In [5]:
if __name__ == "__main__":
    # Ask the model to pull the subqueries out of every loaded SQL statement.
    # The raw reply texts are collected in `responses`, in the same order as
    # `sql_queries`.
    responses = []
    for sql_query in sql_queries:
        # Doubled braces `{{ }}` render as literal braces in the example
        # dictionary; `{sql_query}` is replaced with the current statement.
        prompt = f"""In the SQL query, follow the steps:
            1. extract all the subqueries, 
            2. replace them with a temporary table name (e.g.: subquery1_1, subquery1_2...).
            3. Save the content of the subqueries in a separate dictionary (e.g.: subquery_dictionary = {{'subquery1_1': 'select * from A', 'subquery1_2': 'select c from B'...}}).
            4. Iterate through the subquery_dictionary values and, if the subquery is not in an EXISTS() function, extract the nested subqueries in the subqueries,
            5. Replace the nested subqueries with a temporary table name (e.g.: subquery2_1, subquery2_2...) in the subquery_dictionary
            6. Add the content to the subquery_dictionary.
            7. Repeat steps 4, 5 and 6 until no more nested subqueries are present in the subquery_dictionary values.

            Your response should therefore contain a dictionary with two objects: the modified SQL_query which is a string and the dictionary containing the subqueries content. Your response should only contain the requested data, no extra words needed.
            The query is:\n\n{sql_query}
        """
        response = get_gpt_response(prompt, 'gpt-4o')
        responses.append(response)


In [7]:
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one.

    Only the opening fence (optionally tagged ``json``) and the closing fence
    are removed; the payload between them is left untouched. Text that is not
    fenced is returned with leading/trailing whitespace stripped.
    """
    fenced = re.match(
        r'\s*```(?:json)?\s*(.*?)\s*```\s*$',
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    return fenced.group(1) if fenced else text.strip()


# Decode each model reply and store it as its own JSON file.
output_dir = 'gpt-sql-queries-extracted3'
# open(..., 'w') does not create missing directories, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

for i, response in enumerate(responses):
    json_data = parse_sql_to_json(_strip_code_fence(response))
    print(json_data)

    if json_data is None:
        # parse_sql_to_json returns None when decoding fails; skip the reply
        # instead of writing a file that only contains "null".
        print(f"Skipping response {i}: it could not be parsed as JSON")
        continue

    # The index keeps file names aligned with the position in `responses`.
    filename = f'{output_dir}/json_data{i}.json'

    # Save the dictionary as a JSON file
    with open(filename, 'w') as json_file:
        json.dump(json_data, json_file, indent=4)

    print(f"Dictionary saved as {filename}")

{'modified_SQL_query': 'CREATE VIEW CUSTOMER_ORDER AS SELECT c.customer_id, c.first_name, c.last_name, c.email, recent_orders.order_id, recent_orders.order_date, recent_orders.total_order_amount, recent_payments.total_payment_amount, recent_order_items.product_name, recent_order_items.category_name, recent_order_items.quantity, recent_order_items.unit_price, recent_order_items.total_item_amount, CAST(avg_product_rating.avg_rating AS DECIMAL(3,2)) AS avg_product_rating, CAST(total_customer_spend.total_spent AS DECIMAL(10,2)) AS total_customer_spent FROM customers c JOIN subquery1_1 recent_orders ON c.customer_id = recent_orders.customer_id JOIN subquery1_2 recent_payments ON recent_orders.order_id = recent_payments.order_id JOIN subquery1_3 recent_order_items ON recent_orders.order_id = recent_order_items.order_id JOIN subquery1_4 avg_product_rating ON recent_order_items.product_id = avg_product_rating.product_id JOIN subquery1_5 total_customer_spend ON c.customer_id = total_customer_sp