In [1]:
# Required packages: openpyxl, pypyodbc, pandas, ipykernel, sqlglot (forked).
import pypyodbc as odbc
from sqlglot import parse_one, exp
from sqlglot.dialects.tsql import TSQL
from sqlglot.dialects.ma import MA
import configparser
import copy
from collections import defaultdict
from collections import OrderedDict 
import pandas as pd
import pypyodbc as odbc
import configparser
import os
import json
import re
from modules.sql_parser.parse_lineages import *
from modules.sql_parser.parse_nodes import *



In [2]:
# load data
# Reads every preprocessed query (JSON) and the nodes table produced earlier.

queries = [] # queries content list
files = [] # file names list
# os.listdir returns entries in arbitrary order; sort so `files`/`queries` are
# built in the same order on every run and on every platform.
for query_file in sorted(os.listdir("data/preprocessed-queries/")):
    files.append(query_file.split('.')[0])
    # Use a separate name for the handle so the loop variable is not rebound
    # to a (closed) file object.
    with open(f'data/preprocessed-queries/{query_file}', 'r') as handle:
        data = json.load(handle)
    queries.append(data)

# load nodes dataset
nodes = pd.read_csv('data/output-tables/nodes.csv')
nodes.head()

Unnamed: 0.1,Unnamed: 0,NAME_NODE,LABEL_NODE,FILTER,FUNCTION,ON,COLOR,ID
0,0,query_INVESTOR_ANALYSIS,query_INVESTOR_ANALYSIS,,query,,#42d6a4,0
1,1,INVESTOR_ANALYSIS,INVESTOR_ANALYSIS,,target,,#42d6a4,1
2,2,"""dbo"".""Investors_Extract""","""dbo"".""Investors_Extract""",,DataSources,,#42d6a4,2
3,3,query_SUPPLIERS_ANALYSIS,query_SUPPLIERS_ANALYSIS,,query,,#42d6a4,3
4,4,SUPPLIERS_ANALYSIS,SUPPLIERS_ANALYSIS,,target,,#42d6a4,4


In [3]:
def SQL_transf(query, inputl = "tsql", outputl = "ma"):
    """
    Render the outermost recognised SQL function of `query` in another dialect.

    Steps:
      1. Quoted placeholders of the form '$$name$$' are rewritten to the quoted
         identifier "name".
      2. The query is parsed with sqlglot using the `inputl` dialect, and "$"
         characters are removed from column names.
      3. Every expression whose sqlglot class name appears in the
         "Parser Keyword" column of data/functions.xlsx is rendered with the
         `outputl` dialect.
      4. Renderings that are contained in a longer, different rendering are
         dropped, so only the outermost ("mother") expressions remain. "If"
         expressions that (ignoring spaces and parentheses) are contained in
         a remaining "Case" expression are dropped as well.

    Parameters
    ----------
    query : str
        SQL text to translate.
    inputl : str
        sqlglot dialect used to parse `query`.
    outputl : str
        sqlglot dialect used to render the recognised expressions.

    Returns
    -------
    str
        The rendering of the first remaining mother expression, or "" when no
        expression listed in data/functions.xlsx is found in the query.

    Notes
    -----
    data/functions.xlsx is read on every call.
    """
    # Replace quoted placeholders ('$$var$$') by quoted identifiers ("var").
    # A literal str.replace is used so that special characters in the variable
    # name are never interpreted as regex replacement escapes.
    for var_name in re.findall(r'\$\$([^$]+?)\$\$', query):
        query = query.replace(f"'$${var_name}$$'", f'"{var_name}"')

    df_tf = pd.read_excel('data/functions.xlsx')
    flookup = list(df_tf["Parser Keyword"])
    ast = parse_one(query, dialect = inputl)

    # Map every column name containing "$" to the same name without it.
    cleaned_names = {}
    for column in ast.find_all(exp.Column):
        if "$" in column.name:
            cleaned_names[column.name] = column.name.replace("$", "")

    def transformer_column(node):
        # Swap an affected column for a freshly parsed quoted identifier.
        if isinstance(node, exp.Column) and node.name in cleaned_names:
            return parse_one('"' + cleaned_names[node.name] + '"')
        return node

    cleaned_tree = ast.transform(transformer_column)

    # (keyword, rendered SQL) for every occurrence of every looked-up class.
    matched = []
    for keyword in flookup:
        for element in cleaned_tree.find_all(getattr(exp, keyword)):
            matched.append((keyword, element.sql(dialect = outputl)))

    # Keep only renderings that are not a proper part of another rendering.
    mother_expressions = [
        candidate
        for candidate in matched
        if not any(
            candidate[1] != other[1] and candidate[1] in other[1]
            for other in matched
        )
    ]

    def _normalise(rendered):
        # NOTE(review): str.strip treats "IFTHENELSE" as a *set of characters*,
        # so any leading/trailing I, F, T, H, E, N, L or S is trimmed, not just
        # a literal "IFTHENELSE" prefix. Kept unchanged because both sides of
        # the containment test below are trimmed the same way - confirm intent.
        return rendered.replace(" ","").replace("(", "").replace(")", "").strip("IFTHENELSE")

    case_texts = [
        _normalise(rendered)
        for keyword, rendered in mother_expressions
        if keyword == "Case"
    ]

    # Positions of "If" expressions that are already covered by a "Case".
    covered_ifs = {
        position
        for position, (keyword, rendered) in enumerate(mother_expressions)
        if keyword == "If"
        and any(_normalise(rendered) in case_text for case_text in case_texts)
    }
    mother_expressions = [
        expression
        for position, expression in enumerate(mother_expressions)
        if position not in covered_ifs
    ]

    # No recognised function: return "" (the value this notebook uses for
    # "no transformation") instead of failing with an IndexError.
    if not mother_expressions:
        return ""
    return mother_expressions[0][1]



# Quick check using the default dialects (inputl="tsql", outputl="ma").
SQL_transf('SUM(total_amount)')

'SUM(total_amount)'

In [4]:
# extract lineages
# For every preprocessed query file: parse its subqueries (deepest first) and
# its main query, collect column-level lineages, resolve node ids against
# `nodes`, and write one CSV per file.

def _strip_enclosing_parens(sql):
    """
    Remove parentheses that wrap the whole statement, one balanced pair at a time.

    Unlike str.strip("()"), this never removes a parenthesis that belongs to the
    statement itself: "(SELECT a FROM t WHERE x IN (1, 2))" keeps the ")" that
    closes the IN list. Text whose first "(" closes before the end, or whose
    parentheses are unbalanced, is returned unchanged.
    Parentheses inside string literals are not treated specially.
    """
    while sql.startswith("(") and sql.endswith(")"):
        depth = 0
        for position, char in enumerate(sql):
            if char == "(":
                depth += 1
            elif char == ")":
                depth -= 1
                if depth == 0:
                    break
        else:
            return sql  # unbalanced: the opening parenthesis never closes
        if position != len(sql) - 1:
            return sql  # the first "(" closes early, so it is not a wrapper
        sql = sql[1:-1]
    return sql


lineages_dfs = []
trees = []
queries = []
files = []  # rebuilt here so re-running this cell does not append duplicates
# Sorted so that lineages_dfs[k] refers to the same file on every run.
for i, query_file in enumerate(sorted(os.listdir("data/preprocessed-queries/"))):
    filename = query_file.split('.')[0]
    print(filename)
    files.append(filename)
    with open(f'data/preprocessed-queries/{query_file}', 'r') as handle:
        data = json.load(handle)

    queries.append(data)

    # reverse subqueries dict to start from deepest level
    query_subqueries = dict(reversed(list(data['subquery_dictionary'].items())))
    query_subqueries['main_query'] = data['modified_SQL_query']

    lineages = [] # list of dictionaries with the nodes

    for name_query, query in query_subqueries.items():

        if query.startswith("("):
            query = _strip_enclosing_parens(query)
        ast = parse_query(query) # get parsed tree

        target_columns = []
        if 'subquery' in name_query:
            # a subquery is named after its dictionary key
            target_node = name_query
        else:
            # the main query is named after its target table:
            # first try a CREATE TABLE statement, otherwise an INSERT INTO
            create_statements = list(ast.find_all(exp.Create))
            if create_statements:
                target_node = create_statements[0].this.this.this
            else:
                insert_obj = list(ast.find_all(exp.Insert))[0]
                target_node = insert_obj.this.this
                target_columns = [[column] for column in insert_obj.find_all(exp.Column)]

        # tables whose names contain spaces (sqlglot can't parse them), as unique
        # (name without spaces, original name) pairs, e.g. (OrderDetails, Order Details)
        space_table = list(set(find_table_w_spaces(ast)))

        alias_table = get_tables(ast) # parse table name + table alias

        tree = replace_aliases(query) # transform query by removing table aliases

        if target_columns == []:
            select_statement, target_columns = extract_target_columns(tree) # extract target columns
        else:
            # INSERT INTO already provided the target columns; keep them
            select_statement, x = extract_target_columns(tree)

        replaced_trees = [statement.transform(transformer_functions) for statement in select_statement] # replace columns aliases
        trees.append(replaced_trees)

        # add possible transformation to columns
        transformations = extract_transformation(replaced_trees)
        target_columns = list(zip(target_columns, transformations))

        query_node = f'query_{target_node}'

        # append lineages of node to list
        lineages = extract_source_target_transformation(target_columns, lineages, space_table, query_node, target_node)

    lineages = pd.DataFrame(lineages)

    # one row per source column
    lineages = lineages.explode('SOURCE_COLUMNS').reset_index()

    lineages['FILE_NAME'] = filename
    lineages['ROW_ID'] = 0
    lineages['LINK_VALUE'] = 1

    # "<node>.<field>" -> node (everything before the last dot) and field (after it)
    lineages['SOURCE_NODE'] = lineages['SOURCE_COLUMNS'].apply(lambda x:".".join(x.split('.')[0:-1]))
    lineages['TARGET_NODE'] = lineages['TARGET_COLUMN'].apply(lambda x:".".join(x.split('.')[0:-1]))

    lineages['SOURCE_FIELD'] = lineages['SOURCE_COLUMNS'].apply(lambda x:x.split('.')[-1])
    lineages['TARGET_FIELD'] = lineages['TARGET_COLUMN'].apply(lambda x:x.split('.')[-1])

    # subquery nodes are prefixed with the file name
    lineages['SOURCE_NODE'] = [f'{filename}@{node}' if 'subquery' in node else node for node in lineages['SOURCE_NODE']]
    lineages['TARGET_NODE'] = [f'{filename}@{node}' if 'subquery' in node else node for node in lineages['TARGET_NODE']]

    lineages['COLOR'] = ["aliceblue" if transformation == "" else "orangered" for transformation in lineages['TRANSFORMATION']]

    # merge source id
    lineages = pd.merge(lineages, nodes[['ID', 'LABEL_NODE']], left_on='SOURCE_NODE', right_on = 'LABEL_NODE', how='left')
    lineages['SOURCE_NODE'] = lineages['ID']
    lineages = lineages.drop(columns=['ID', 'LABEL_NODE'])

    # merge target id
    lineages = pd.merge(lineages, nodes[['ID', 'LABEL_NODE']], left_on='TARGET_NODE', right_on = 'LABEL_NODE', how='left')
    lineages['TARGET_NODE'] = lineages['ID']
    lineages = lineages.drop(columns=['ID', 'LABEL_NODE'])

    lineages = lineages.drop_duplicates(subset =['SOURCE_COLUMNS', 'TARGET_COLUMN', 'TRANSFORMATION']).reset_index(drop=True)

    # target_node is the one of the main query, which is processed last
    lineages.to_csv(f"data/output-tables/lineages/lineage-{target_node}.csv")
    lineages_dfs.append(lineages)

json_data0
[]
['INVESTOR_ANALYSIS.Discount', 'INVESTOR_ANALYSIS.Discount']
json_data1
['SUPPLIERS_ANALYSIS.id']
['SUPPLIERS_ANALYSIS.Name']
['SUPPLIERS_ANALYSIS.Entity']
['SUPPLIERS_ANALYSIS.Discount']
['SUPPLIERS_ANALYSIS.Discount', 'SUPPLIERS_ANALYSIS.Discount']


In [5]:
# Lineage table built for the first processed query file.
lineages_dfs[0]

Unnamed: 0,index,SOURCE_COLUMNS,TARGET_COLUMN,TRANSFORMATION,FILE_NAME,ROW_ID,LINK_VALUE,SOURCE_NODE,TARGET_NODE,SOURCE_FIELD,TARGET_FIELD,COLOR
0,0,INVESTOR_ANALYSIS.Discount,query_INVESTOR_ANALYSIS.DiscountCategory,"IFTHENELSE(EQ(Discount, 0), 'C', EQ(Discount, ...",json_data0,0,1,1,0,Discount,DiscountCategory,orangered
1,1,query_INVESTOR_ANALYSIS.DiscountCategory,INVESTOR_ANALYSIS.DiscountCategory,,json_data0,0,1,0,1,DiscountCategory,DiscountCategory,aliceblue


In [6]:
# Number of lineage tables collected (one per processed query file).
len(lineages_dfs)

2

In [7]:
# Transformation recorded for the row labelled 1 in the first lineage table.
lineages_dfs[0]['TRANSFORMATION'][1] 

# TODO: for INSERT INTO statements, the target node should also include the new columns.
# TODO: multiplying two columns is currently not counted as a transformation; fix that.

''