In [1]:

from sqlglot import parse_one, exp
from sqlglot.dialects.tsql import TSQL
import pypyodbc as odbc
import configparser
import copy
from collections import defaultdict
from collections import OrderedDict 
import pandas as pd
import pypyodbc as odbc
import configparser
import os
import json
import re
from modules.sql_parser.parse_nodes import *



In [2]:
# load data
#
# Read every preprocessed query file (JSON) from data/preprocessed-queries/.
# `files` and `queries` are filled in the same iteration, so files[k] is the
# name (text before the first '.') of the file whose content is queries[k].

queries = [] # queries content list
files = [] # file names list
# os.listdir returns entries in arbitrary order; sort so that index 0 (shown
# below and used by the next cell) is the same file on every run.
for file in sorted(os.listdir("data/preprocessed-queries/")):
    files.append(file.split('.')[0])
    # Use a separate name for the handle so the loop variable keeps holding the file name.
    with open(f'data/preprocessed-queries/{file}', 'r') as json_file:
        data = json.load(json_file)

    queries.append(data)

print(files[0])
queries[0]


json_data0


{'modified_SQL_query': 'CREATE VIEW INVESTOR_ANALYSIS AS SELECT *, IFTHENELSE(EQ(Discount, 0),\'C\',EQ(Discount, 0.1),\'B\', \'A\') AS DiscountCategory FROM "dbo"."Investors_Extract"',
 'subquery_dictionary': {}}

In [3]:
# Build the processing order for the first loaded query: its subqueries in
# reverse insertion order (deepest to shallowest), followed by the main query.

query_subqueries = {
    **dict(list(queries[0]['subquery_dictionary'].items())[::-1]),
    'main_query': queries[0]['modified_SQL_query'],
}
query_subqueries


{'main_query': 'CREATE VIEW INVESTOR_ANALYSIS AS SELECT *, IFTHENELSE(EQ(Discount, 0),\'C\',EQ(Discount, 0.1),\'B\', \'A\') AS DiscountCategory FROM "dbo"."Investors_Extract"'}

In [4]:
# create nodes
#
# For every preprocessed query file: parse the main query and each of its
# subqueries, and collect one record per graph node (query, target, subquery,
# data source). The per-file records are then concatenated, de-duplicated,
# given an ID and written to data/output-tables/nodes.csv.


def _strip_outer_parens(sql):
    """Remove parentheses that enclose the whole statement, and only those.

    `str.strip("()")` removes *every* leading/trailing parenthesis character,
    which corrupts statements such as "(SELECT a FROM t WHERE b IN (1, 2))"
    by also eating the ")" that closes the IN list. This helper strips a
    leading "(" only when its matching ")" is the last character of the
    statement. Parentheses inside quoted text are ignored when matching.
    Unbalanced input is returned unchanged.
    """
    while sql.startswith("(") and sql.endswith(")"):
        depth = 0
        quote = None
        for pos, char in enumerate(sql):
            if quote:
                if char == quote:
                    quote = None
            elif char in "'\"":
                quote = char
            elif char == "(":
                depth += 1
            elif char == ")":
                depth -= 1
                if depth == 0:
                    break
        else:
            return sql  # the opening parenthesis is never closed
        if pos != len(sql) - 1:
            return sql  # e.g. "(a) UNION (b)": first "(" does not wrap everything
        sql = sql[1:-1].strip()
    return sql


nodes_dfs = []

# Rebuild both lists from scratch so they stay aligned index-for-index;
# appending to an already populated `files` would leave duplicate names in it.
queries = []
files = []
# os.listdir returns entries in arbitrary order; sort so that the node IDs
# written to the CSV are the same on every run.
for file in sorted(os.listdir("data/preprocessed-queries/")):
    filename = file.split('.')[0]

    files.append(filename)
    # Separate name for the handle so the loop variable keeps holding the file name.
    with open(f'data/preprocessed-queries/{file}', 'r') as json_file:
        data = json.load(json_file)

    queries.append(data)

    # subqueries in reverse insertion order (deepest to shallowest), main query last
    query_subqueries = dict(reversed(list(data['subquery_dictionary'].items())))
    query_subqueries['main_query'] = data['modified_SQL_query']

    file_nodes = []
    for name_query, query in query_subqueries.items():
        print(query)
        query = _strip_outer_parens(query) # remove opening and closing parentheses from subqueries

        ast = parse_query(query) # get parsed tree

        tree = replace_aliases(query) # get transformed tree without table aliases

        # Defaults for a statement without any SELECT; without them the code
        # below would raise NameError or silently reuse the previous query's values.
        source_tables = []
        where_exp = None
        on_condition = None
        # for every select statement in query, extract the source tables, where expressions and on conditions
        for select in list(tree.find_all(exp.Select)):
            source_table, where_exp, on_condition = get_statements(select)
            source_tables += source_table
        # NOTE(review): only the where/on values of the LAST select survive the
        # loop and are attached to the nodes below - confirm this is intended.

        if 'subquery' in name_query: # if the query is a subquery then the name is the dict key, else the name is the target table
            print(where_exp)

            target_node = name_query
            file_nodes.append({'NAME_NODE': f"query_{target_node}", 'LABEL_NODE': f'{filename}@query_{target_node}', 'FILTER': where_exp, 'FUNCTION': 'subquery', 'ON': on_condition})

            file_nodes.append({'NAME_NODE': target_node, 'LABEL_NODE': f'{filename}@{target_node}', 'FILTER': None, 'FUNCTION': 'target', 'ON': None})
        else:
            print(where_exp)

            # the target table comes from the CREATE statement, or from the INSERT statement if there is no CREATE
            create_statements = list(ast.find_all(exp.Create))
            if create_statements:
                target_node = create_statements[0].this.this.this

                file_nodes.append({'NAME_NODE': f"query_{target_node}",'LABEL_NODE': f"query_{target_node}", 'FILTER': where_exp, 'FUNCTION': 'query', 'ON': on_condition})
                file_nodes.append({'NAME_NODE': target_node,'LABEL_NODE': target_node, 'FILTER': None, 'FUNCTION': 'target', 'ON': None})
            else:
                insert_statements = list(ast.find_all(exp.Insert))
                if not insert_statements:
                    raise ValueError(f"{file}: '{name_query}' contains neither a CREATE nor an INSERT statement")
                target_node = insert_statements[0].this.this
                file_nodes.append({'NAME_NODE': f"query_{target_node}",'LABEL_NODE': f"query_{target_node}", 'FILTER': where_exp, 'FUNCTION': 'query', 'ON': on_condition})
                # NOTE(review): unlike the CREATE branch, the target node here
                # carries FILTER/ON - looks like a copy-paste leftover; confirm.
                file_nodes.append({'NAME_NODE': target_node,'LABEL_NODE': target_node, 'FILTER': where_exp, 'FUNCTION': 'target', 'ON': on_condition})

        # add a node for every source table that this file has not produced yet
        for table in source_tables:
            if table not in [node['NAME_NODE'] for node in file_nodes]:
                if 'subquery' in table:
                    file_nodes.append({'NAME_NODE': table,'LABEL_NODE': f'{filename}@{table}', 'FILTER': None, 'FUNCTION': 'subquery', 'ON': on_condition})
                else:
                    file_nodes.append({'NAME_NODE': table,'LABEL_NODE': table, 'FILTER': None, 'FUNCTION': 'DataSources', 'ON': None})

    file_nodes_df = pd.DataFrame(file_nodes)
    # nodes that carry a filter get a different colour from nodes without one
    file_nodes_df['COLOR'] = file_nodes_df['FILTER'].apply(lambda x: '#db59a5' if x is not None else '#42d6a4')

    nodes_dfs.append(file_nodes_df)


# create and save dataframe
nodes = pd.concat(nodes_dfs).reset_index(drop=True)
nodes['ON'] = nodes['ON'].apply(lambda x: None if x == [] else x) # remove empty lists
nodes = nodes.drop_duplicates(subset=['NAME_NODE', 'LABEL_NODE', 'FILTER', 'FUNCTION']).reset_index(drop=True)
nodes['ID'] = nodes.index

nodes.to_csv('data/output-tables/nodes.csv')

nodes


CREATE VIEW INVESTOR_ANALYSIS AS SELECT *, IFTHENELSE(EQ(Discount, 0),'C',EQ(Discount, 0.1),'B', 'A') AS DiscountCategory FROM "dbo"."Investors_Extract"
None
CREATE VIEW SUPPLIERS_ANALYSIS AS SELECT id, Name, Entity, Discount, IFTHENELSE(EQ(Discount, 0),'C',EQ(Discount, 0.1),'B', 'A') AS DiscountCategory FROM "dbo"."Suppliers"
None


Unnamed: 0,NAME_NODE,LABEL_NODE,FILTER,FUNCTION,ON,COLOR,ID
0,query_INVESTOR_ANALYSIS,query_INVESTOR_ANALYSIS,,query,,#42d6a4,0
1,INVESTOR_ANALYSIS,INVESTOR_ANALYSIS,,target,,#42d6a4,1
2,"""dbo"".""Investors_Extract""","""dbo"".""Investors_Extract""",,DataSources,,#42d6a4,2
3,query_SUPPLIERS_ANALYSIS,query_SUPPLIERS_ANALYSIS,,query,,#42d6a4,3
4,SUPPLIERS_ANALYSIS,SUPPLIERS_ANALYSIS,,target,,#42d6a4,4
5,"""dbo"".""Suppliers""","""dbo"".""Suppliers""",,DataSources,,#42d6a4,5


In [5]:
# Display the final nodes table (the de-duplicated DataFrame built and saved by the previous cell).
nodes

Unnamed: 0,NAME_NODE,LABEL_NODE,FILTER,FUNCTION,ON,COLOR,ID
0,query_INVESTOR_ANALYSIS,query_INVESTOR_ANALYSIS,,query,,#42d6a4,0
1,INVESTOR_ANALYSIS,INVESTOR_ANALYSIS,,target,,#42d6a4,1
2,"""dbo"".""Investors_Extract""","""dbo"".""Investors_Extract""",,DataSources,,#42d6a4,2
3,query_SUPPLIERS_ANALYSIS,query_SUPPLIERS_ANALYSIS,,query,,#42d6a4,3
4,SUPPLIERS_ANALYSIS,SUPPLIERS_ANALYSIS,,target,,#42d6a4,4
5,"""dbo"".""Suppliers""","""dbo"".""Suppliers""",,DataSources,,#42d6a4,5
