In [2]:
import re

In [5]:
def extract_subqueries(sql_query):
    """Replace each parenthesised subquery in ``sql_query`` with a placeholder.

    A parenthesised group counts as a subquery when the whole words ``SELECT``
    and, later, ``FROM`` (case-insensitive) occur directly inside it, i.e. not
    only inside a deeper pair of parentheses.  Groups are found by matching
    parentheses instead of a single regular expression, so a subquery that
    contains function calls such as ``COUNT(x)`` is still extracted, and
    identifiers that merely contain the keywords (``is_selected``,
    ``from_date``) do not trigger a match.

    Placeholders are named ``subquery1``, ``subquery2``, ...  Innermost
    subqueries are numbered first (left to right), then the subqueries that
    enclose them, and so on.  A stored subquery refers to the subqueries
    nested inside it by their placeholders.

    Limitations: parentheses inside string literals or SQL comments are not
    treated specially, and an unmatched parenthesis is treated as plain text.

    Args:
        sql_query: SQL text to scan.

    Returns:
        dict with two keys: ``'modified_SQL_query'`` (the query with every
        subquery replaced by its placeholder) and ``'subquery_dictionary'``
        (placeholder -> subquery text, including its outer parentheses).
    """
    keyword_pattern = re.compile(r'\bSELECT\b.*?\bFROM\b', re.IGNORECASE | re.DOTALL)

    # Step 1: build the tree of balanced parenthesis groups.
    root = {'start': 0, 'end': len(sql_query), 'children': []}
    open_groups = [root]
    closed_groups = []  # in closing order, so children always precede parents
    for position, char in enumerate(sql_query):
        if char == '(':
            open_groups.append({'start': position, 'children': []})
        elif char == ')' and len(open_groups) > 1:
            group = open_groups.pop()
            group['end'] = position + 1
            open_groups[-1]['children'].append(group)
            closed_groups.append(group)
    # A "(" that is never closed is not a group: hand its children to the
    # enclosing scope so they are still rendered.
    while len(open_groups) > 1:
        unclosed = open_groups.pop()
        open_groups[-1]['children'].extend(unclosed['children'])

    # Step 2: classify each group using only the text at its own nesting level
    # and record how many layers of subqueries it contains ("rank").
    for group in closed_groups:
        own_text = []
        cursor = group['start'] + 1
        nested_rank = 0
        for child in group['children']:
            own_text.append(sql_query[cursor:child['start']])
            cursor = child['end']
            nested_rank = max(nested_rank, child['rank'])
        own_text.append(sql_query[cursor:group['end'] - 1])
        group['is_subquery'] = keyword_pattern.search(' '.join(own_text)) is not None
        group['rank'] = nested_rank + 1 if group['is_subquery'] else nested_rank

    # Step 3: number innermost subqueries first, left to right within a rank.
    subquery_groups = sorted(
        (group for group in closed_groups if group['is_subquery']),
        key=lambda group: (group['rank'], group['start']),
    )
    for number, group in enumerate(subquery_groups, start=1):
        group['label'] = f"subquery{number}"

    # Step 4: render bottom-up, swapping nested subqueries for their labels.
    def render(group):
        pieces = []
        cursor = group['start']
        for child in group['children']:
            pieces.append(sql_query[cursor:child['start']])
            pieces.append(child['label'] if child['is_subquery'] else child['text'])
            cursor = child['end']
        pieces.append(sql_query[cursor:group['end']])
        return ''.join(pieces)

    for group in closed_groups:
        group['text'] = render(group)

    return {
        'modified_SQL_query': render(root),
        'subquery_dictionary': {group['label']: group['text'] for group in subquery_groups},
    }

# Sample statement for trying out extract_subqueries: an INSERT ... SELECT
# whose select list contains five parenthesised scalar subqueries. The last
# two of them (most/least active investor) also contain a COUNT(...) call.
input_sql_query = '''INSERT INTO PROCESS_DB.market_data_analysis (
    stock_id, ticker, sector, total_volume_last_year, 
    avg_closing_price_last_year, max_closing_price_last_year, 
    min_closing_price_last_year, total_trades, avg_trade_value, 
    max_trade_value, min_trade_value, total_dividends_paid, 
    avg_dividend_yield, highest_volume_day, lowest_volume_day, 
    most_recent_closing_price, total_investors, total_portfolios, 
    most_active_investor, least_active_investor
)
SELECT 
    md.stock_id, s.ticker, s.sector, 
    SUM(md.volume) FILTER (WHERE md.market_date >= DATEADD(year, -1, GETDATE())) AS total_volume_last_year,
    AVG(md.closing_price) FILTER (WHERE md.market_date >= DATEADD(year, -1, GETDATE())) AS avg_closing_price_last_year,
    MAX(md.closing_price) FILTER (WHERE md.market_date >= DATEADD(year, -1, GETDATE())) AS max_closing_price_last_year,
    MIN(md.closing_price) FILTER (WHERE md.market_date >= DATEADD(year, -1, GETDATE())) AS min_closing_price_last_year,
    COUNT(t.trade_id) AS total_trades,
    AVG(t.quantity * t.price_per_share) AS avg_trade_value,
    MAX(t.quantity * t.price_per_share) AS max_trade_value,
    MIN(t.quantity * t.price_per_share) AS min_trade_value,
    SUM(d.dividend_amount * t.quantity) AS total_dividends_paid,
    AVG(CASE WHEN md.closing_price > 0 THEN d.dividend_amount / md.closing_price ELSE 0 END) AS avg_dividend_yield,
    (SELECT md1.market_date FROM SOURCE_DB.market_data md1 WHERE md1.stock_id = md.stock_id ORDER BY md1.volume DESC LIMIT 1) AS highest_volume_day,
    (SELECT md1.market_date FROM SOURCE_DB.market_data md1 WHERE md1.stock_id = md.stock_id ORDER BY md1.volume ASC LIMIT 1) AS lowest_volume_day,
    (SELECT md1.closing_price FROM SOURCE_DB.market_data md1 WHERE md1.stock_id = md.stock_id ORDER BY md1.market_date DESC LIMIT 1) AS most_recent_closing_price,
    COUNT(DISTINCT i.investor_id) AS total_investors,
    COUNT(DISTINCT p.portfolio_id) AS total_portfolios,
    (SELECT i.investor_id FROM SOURCE_DB.trades t1 JOIN SOURCE_DB.portfolios p ON t1.portfolio_id = p.portfolio_id JOIN SOURCE_DB.investors i ON p.investor_id = i.investor_id WHERE t1.stock_id = s.stock_id GROUP BY i.investor_id ORDER BY COUNT(t1.trade_id) DESC LIMIT 1) AS most_active_investor,
    (SELECT i.investor_id FROM SOURCE_DB.trades t1 JOIN SOURCE_DB.portfolios p ON t1.portfolio_id = p.portfolio_id JOIN SOURCE_DB.investors i ON p.investor_id = i.investor_id WHERE t1.stock_id = s.stock_id GROUP BY i.investor_id ORDER BY COUNT(t1.trade_id) ASC LIMIT 1) AS least_active_investor
FROM 
    SOURCE_DB.market_data md
JOIN 
    SOURCE_DB.stocks s ON md.stock_id = s.stock_id
JOIN 
    SOURCE_DB.trades t ON s.stock_id = t.stock_id
JOIN 
    SOURCE_DB.portfolios p ON t.portfolio_id = p.portfolio_id
JOIN 
    SOURCE_DB.investors i ON p.investor_id = i.investor_id
JOIN 
    SOURCE_DB.dividends d ON s.stock_id = d.stock_id
GROUP BY 
    md.stock_id, s.ticker, s.sector;'''

# Run the extraction on the sample and print the resulting dictionary
# (keys 'modified_SQL_query' and 'subquery_dictionary').
output = extract_subqueries(input_sql_query)
print(output)

{'modified_SQL_query': 'INSERT INTO PROCESS_DB.market_data_analysis (\n    stock_id, ticker, sector, total_volume_last_year, \n    avg_closing_price_last_year, max_closing_price_last_year, \n    min_closing_price_last_year, total_trades, avg_trade_value, \n    max_trade_value, min_trade_value, total_dividends_paid, \n    avg_dividend_yield, highest_volume_day, lowest_volume_day, \n    most_recent_closing_price, total_investors, total_portfolios, \n    most_active_investor, least_active_investor\n)\nSELECT \n    md.stock_id, s.ticker, s.sector, \n    SUM(md.volume) FILTER (WHERE md.market_date >= DATEADD(year, -1, GETDATE())) AS total_volume_last_year,\n    AVG(md.closing_price) FILTER (WHERE md.market_date >= DATEADD(year, -1, GETDATE())) AS avg_closing_price_last_year,\n    MAX(md.closing_price) FILTER (WHERE md.market_date >= DATEADD(year, -1, GETDATE())) AS max_closing_price_last_year,\n    MIN(md.closing_price) FILTER (WHERE md.market_date >= DATEADD(year, -1, GETDATE())) AS min_cl

In [6]:
# Echo the dictionary produced by the previous cell as this cell's output value.
output

{'modified_SQL_query': 'INSERT INTO PROCESS_DB.market_data_analysis (\n    stock_id, ticker, sector, total_volume_last_year, \n    avg_closing_price_last_year, max_closing_price_last_year, \n    min_closing_price_last_year, total_trades, avg_trade_value, \n    max_trade_value, min_trade_value, total_dividends_paid, \n    avg_dividend_yield, highest_volume_day, lowest_volume_day, \n    most_recent_closing_price, total_investors, total_portfolios, \n    most_active_investor, least_active_investor\n)\nSELECT \n    md.stock_id, s.ticker, s.sector, \n    SUM(md.volume) FILTER (WHERE md.market_date >= DATEADD(year, -1, GETDATE())) AS total_volume_last_year,\n    AVG(md.closing_price) FILTER (WHERE md.market_date >= DATEADD(year, -1, GETDATE())) AS avg_closing_price_last_year,\n    MAX(md.closing_price) FILTER (WHERE md.market_date >= DATEADD(year, -1, GETDATE())) AS max_closing_price_last_year,\n    MIN(md.closing_price) FILTER (WHERE md.market_date >= DATEADD(year, -1, GETDATE())) AS min_cl

In [4]:
import re

def extract_subqueries(sql_query):
    """Replace each parenthesised subquery in ``sql_query`` with a placeholder.

    A parenthesised group counts as a subquery when the whole words ``SELECT``
    and, later, ``FROM`` (case-insensitive) occur directly inside it, i.e. not
    only inside a deeper pair of parentheses.  Groups are found by matching
    parentheses instead of a single regular expression, so a subquery that
    contains function calls such as ``SUM(x)`` is still extracted, and
    identifiers that merely contain the keywords (``is_selected``,
    ``from_date``) do not trigger a match.

    Placeholders are named ``subquery1_1``, ``subquery1_2``, ...  Innermost
    subqueries are numbered first (left to right), then the subqueries that
    enclose them, and so on.  A stored subquery refers to the subqueries
    nested inside it by their placeholders.

    Limitations: parentheses inside string literals or SQL comments are not
    treated specially, and an unmatched parenthesis is treated as plain text.

    Args:
        sql_query: SQL text to scan.

    Returns:
        dict with two keys: ``'modified_SQL_query'`` (the query with every
        subquery replaced by its placeholder) and ``'subquery_dictionary'``
        (placeholder -> subquery text, including its outer parentheses).
    """
    keyword_pattern = re.compile(r'\bSELECT\b.*?\bFROM\b', re.IGNORECASE | re.DOTALL)
    # NOTE(review): the first number in a label is a "level" slot, but this
    # version has always numbered every subquery at level 1; kept unchanged so
    # existing labels stay stable.
    level = 1

    # Step 1: build the tree of balanced parenthesis groups.
    root = {'start': 0, 'end': len(sql_query), 'children': []}
    open_groups = [root]
    closed_groups = []  # in closing order, so children always precede parents
    for position, char in enumerate(sql_query):
        if char == '(':
            open_groups.append({'start': position, 'children': []})
        elif char == ')' and len(open_groups) > 1:
            group = open_groups.pop()
            group['end'] = position + 1
            open_groups[-1]['children'].append(group)
            closed_groups.append(group)
    # A "(" that is never closed is not a group: hand its children to the
    # enclosing scope so they are still rendered.
    while len(open_groups) > 1:
        unclosed = open_groups.pop()
        open_groups[-1]['children'].extend(unclosed['children'])

    # Step 2: classify each group using only the text at its own nesting level
    # and record how many layers of subqueries it contains ("rank").
    for group in closed_groups:
        own_text = []
        cursor = group['start'] + 1
        nested_rank = 0
        for child in group['children']:
            own_text.append(sql_query[cursor:child['start']])
            cursor = child['end']
            nested_rank = max(nested_rank, child['rank'])
        own_text.append(sql_query[cursor:group['end'] - 1])
        group['is_subquery'] = keyword_pattern.search(' '.join(own_text)) is not None
        group['rank'] = nested_rank + 1 if group['is_subquery'] else nested_rank

    # Step 3: number innermost subqueries first, left to right within a rank.
    subquery_groups = sorted(
        (group for group in closed_groups if group['is_subquery']),
        key=lambda group: (group['rank'], group['start']),
    )
    for number, group in enumerate(subquery_groups, start=1):
        group['label'] = f"subquery{level}_{number}"

    # Step 4: render bottom-up, swapping nested subqueries for their labels.
    def render(group):
        pieces = []
        cursor = group['start']
        for child in group['children']:
            pieces.append(sql_query[cursor:child['start']])
            pieces.append(child['label'] if child['is_subquery'] else child['text'])
            cursor = child['end']
        pieces.append(sql_query[cursor:group['end']])
        return ''.join(pieces)

    for group in closed_groups:
        group['text'] = render(group)

    result_dict = {
        'modified_SQL_query': render(root),
        'subquery_dictionary': {group['label']: group['text'] for group in subquery_groups},
    }

    return result_dict

# Sample statement: a CREATE VIEW whose FROM clause is a parenthesised derived
# table (a UNION of two SELECTs); the first SELECT contains an EXISTS subquery.
input_sql_query = '''CREATE VIEW ORDER_RECONCILIATION AS
SELECT A.CustomerId, C.CustomerName,  COUNT( DISTINCT A.OrderId) TotalNBOrders, COUNT( DISTINCT A.InvoiceId) TotalNBInvoices,
       SUM(A.UnitPrice*A.Quantity)AS OrdersTotalValue,  SUM(A.UnitPriceI * A.QuantityI) AS InvoicesTotalValue,
       ABS(SUM(A.UnitPrice * A.Quantity) -  SUM(A.UnitPriceI*A.QuantityI)) AS AbsoluteValueDifference
FROM 
(
    SELECT O.CustomerID, O.OrderId, NULL AS InvoiceID, OL.UnitPrice, OL.Quantity, 0 AS UnitPriceI, 0 AS QuantityI, OL.OrderLineID, NULL AS InvoiceLineID 
    FROM Sales.Orders As O, Sales.OrderLines AS OL
    WHERE O.OrderId = OL.OrderID AND EXISTS
    (   SELECT II.OrderId
        FROM Sales.Invoices AS II
        WHERE II.OrderID = O.OrderID
    )
    UNION
    SELECT I.CustomerID, NULL AS OrderId, I.InvoiceID, 0 AS UnitPriceO, 0 AS QuantityO, IL.UnitPrice, IL.Quantity, NULL AS OrderLineID, InvoiceLineID
    FROM Sales.Invoices AS I, Sales.InvoiceLines AS IL
    WHERE I.InvoiceID = IL.InvoiceID
) AS A, Sales.Customers As C
WHERE A.CustomerID = C.CustomerID
GROUP BY A.CustomerID, C.CustomerName
ORDER BY AbsoluteValueDifference DESC, TotalNBOrders, CustomerName;'''

# Run the extraction; the trailing bare `output` makes the result dictionary
# this cell's displayed value.
output = extract_subqueries(input_sql_query)
output


{'modified_SQL_query': 'CREATE VIEW ORDER_RECONCILIATION AS\nSELECT A.CustomerId, C.CustomerName,  COUNT( DISTINCT A.OrderId) TotalNBOrders, COUNT( DISTINCT A.InvoiceId) TotalNBInvoices,\n       SUM(A.UnitPrice*A.Quantity)AS OrdersTotalValue,  SUM(A.UnitPriceI * A.QuantityI) AS InvoicesTotalValue,\n       ABS(SUM(A.UnitPrice * A.Quantity) -  SUM(A.UnitPriceI*A.QuantityI)) AS AbsoluteValueDifference\nFROM \nsubquery1_2 AS A, Sales.Customers As C\nWHERE A.CustomerID = C.CustomerID\nGROUP BY A.CustomerID, C.CustomerName\nORDER BY AbsoluteValueDifference DESC, TotalNBOrders, CustomerName;',
 'subquery_dictionary': {'subquery1_1': '(   SELECT II.OrderId\n        FROM Sales.Invoices AS II\n        WHERE II.OrderID = O.OrderID\n    )',
  'subquery1_2': '(\n    SELECT O.CustomerID, O.OrderId, NULL AS InvoiceID, OL.UnitPrice, OL.Quantity, 0 AS UnitPriceI, 0 AS QuantityI, OL.OrderLineID, NULL AS InvoiceLineID \n    FROM Sales.Orders As O, Sales.OrderLines AS OL\n    WHERE O.OrderId = OL.OrderID 

In [5]:
import re

def extract_subqueries(sql_query):
    """Replace parenthesised subqueries with level-aware placeholders.

    A parenthesised group counts as a subquery when the whole words ``SELECT``
    and, later, ``FROM`` (case-insensitive) occur directly inside it, i.e. not
    only inside a deeper pair of parentheses.  Groups are found by matching
    parentheses, so subqueries that contain function calls or further
    subqueries are handled.

    Placeholders are named ``subquery{level}_{n}``: ``level`` is 1 for a
    subquery that is not inside another subquery, 2 for a subquery nested in
    a level-1 subquery, and so on; ``n`` counts the subqueries of that level
    from left to right across the whole statement.  In the returned query
    only level-1 placeholders appear; each stored subquery refers to the
    subqueries directly nested in it by their placeholders.

    The previous implementation re-ran the same pattern on the text it had
    just matched, which matched again in full and recursed until
    ``RecursionError``; this version never recurses.

    Limitations: parentheses inside string literals or SQL comments are not
    treated specially, and an unmatched parenthesis is treated as plain text.

    Args:
        sql_query: SQL text to scan.

    Returns:
        dict with two keys: ``'modified_SQL_query'`` (the query with every
        top-level subquery replaced by its placeholder) and
        ``'subquery_dictionary'`` (placeholder -> subquery text, including its
        outer parentheses; outer subqueries are listed before the ones nested
        in them).
    """
    keyword_pattern = re.compile(r'\bSELECT\b.*?\bFROM\b', re.IGNORECASE | re.DOTALL)

    # Step 1: build the tree of balanced parenthesis groups.
    root = {'start': 0, 'end': len(sql_query), 'children': []}
    open_groups = [root]
    closed_groups = []  # in closing order, so children always precede parents
    for position, char in enumerate(sql_query):
        if char == '(':
            open_groups.append({'start': position, 'children': []})
        elif char == ')' and len(open_groups) > 1:
            group = open_groups.pop()
            group['end'] = position + 1
            open_groups[-1]['children'].append(group)
            closed_groups.append(group)
    # A "(" that is never closed is not a group: hand its children to the
    # enclosing scope so they are still rendered.
    while len(open_groups) > 1:
        unclosed = open_groups.pop()
        open_groups[-1]['children'].extend(unclosed['children'])

    # Step 2: classify each group using only the text at its own nesting level.
    for group in closed_groups:
        own_text = []
        cursor = group['start'] + 1
        for child in group['children']:
            own_text.append(sql_query[cursor:child['start']])
            cursor = child['end']
        own_text.append(sql_query[cursor:group['end'] - 1])
        group['is_subquery'] = keyword_pattern.search(' '.join(own_text)) is not None

    # Step 3: walk the tree top-down (explicit stack, left to right) and label
    # each subquery with its subquery-nesting level and a per-level counter.
    subquery_counter = {}
    labelled_groups = []
    pending = [(child, 1) for child in reversed(root['children'])]
    while pending:
        group, level = pending.pop()
        child_level = level
        if group['is_subquery']:
            subquery_counter[level] = subquery_counter.get(level, 0) + 1
            group['label'] = f"subquery{level}_{subquery_counter[level]}"
            labelled_groups.append(group)
            child_level = level + 1
        pending.extend((child, child_level) for child in reversed(group['children']))

    # Step 4: render bottom-up, swapping nested subqueries for their labels.
    def render(group):
        pieces = []
        cursor = group['start']
        for child in group['children']:
            pieces.append(sql_query[cursor:child['start']])
            pieces.append(child['label'] if child['is_subquery'] else child['text'])
            cursor = child['end']
        pieces.append(sql_query[cursor:group['end']])
        return ''.join(pieces)

    for group in closed_groups:
        group['text'] = render(group)

    result_dict = {
        'modified_SQL_query': render(root),
        'subquery_dictionary': {group['label']: group['text'] for group in labelled_groups},
    }

    return result_dict

# Sample statement: a CREATE VIEW whose FROM clause is a parenthesised derived
# table (a UNION of two SELECTs); the first SELECT contains an EXISTS subquery.
input_sql_query = '''CREATE VIEW ORDER_RECONCILIATION AS
SELECT A.CustomerId, C.CustomerName,  COUNT( DISTINCT A.OrderId) TotalNBOrders, COUNT( DISTINCT A.InvoiceId) TotalNBInvoices,
       SUM(A.UnitPrice*A.Quantity)AS OrdersTotalValue,  SUM(A.UnitPriceI * A.QuantityI) AS InvoicesTotalValue,
       ABS(SUM(A.UnitPrice * A.Quantity) -  SUM(A.UnitPriceI*A.QuantityI)) AS AbsoluteValueDifference
FROM 
(
    SELECT O.CustomerID, O.OrderId, NULL AS InvoiceID, OL.UnitPrice, OL.Quantity, 0 AS UnitPriceI, 0 AS QuantityI, OL.OrderLineID, NULL AS InvoiceLineID 
    FROM Sales.Orders As O, Sales.OrderLines AS OL
    WHERE O.OrderId = OL.OrderID AND EXISTS
    (   SELECT II.OrderId
        FROM Sales.Invoices AS II
        WHERE II.OrderID = O.OrderID
    )
    UNION
    SELECT I.CustomerID, NULL AS OrderId, I.InvoiceID, 0 AS UnitPriceO, 0 AS QuantityO, IL.UnitPrice, IL.Quantity, NULL AS OrderLineID, InvoiceLineID
    FROM Sales.Invoices AS I, Sales.InvoiceLines AS IL
    WHERE I.InvoiceID = IL.InvoiceID
) AS A, Sales.Customers As C
WHERE A.CustomerID = C.CustomerID
GROUP BY A.CustomerID, C.CustomerName
ORDER BY AbsoluteValueDifference DESC, TotalNBOrders, CustomerName;'''

# Run the extraction on the sample and print the resulting dictionary
# (keys 'modified_SQL_query' and 'subquery_dictionary').
output = extract_subqueries(input_sql_query)
print(output)


: 