In [1]:
import pandas as pd
from sqlglot import parse_one, exp
from sqlglot.dialects.ma import MA
from sqlglot.dialects.tsql import TSQL
import sqlglot
import re

In [2]:
def open_query(dir:str) -> list:
    """
    Open TSQL queries from one text file
    """ 
    with open(dir, 'r') as file: 
        file = file.read().strip().split(';')
        sql_queries = [query for query in file if query.strip()]
    return sql_queries

preprocessed_queries = []
sql_queries = open_query('data/queries-txts/queries_rabo_qrm.txt')


In [3]:
def split_sql_queries(file_content:str) -> str:
    """
    Splits SQL file content into individual queries based on common starting keywords.

    Parameters:
        file_content (str): The full SQL script as a single string.

    Returns:
        list: A list of individual SQL queries.
    """
    # Regex pattern to identify starting words of a SQL query
    keywords = [
        r"SELECT", r"INSERT", r"UPDATE", r"DELETE", r"MERGE", r"CREATE", r"ALTER",
        r"DROP", r"TRUNCATE", r"BEGIN", r"DECLARE", r"SET", r"EXEC(?:UTE)?", r"WITH",
        r"COMMIT", r"ROLLBACK", r"SAVEPOINT", r"USE", r"SHOW", r"DESCRIBE", r"EXPLAIN"
    ]
    
    # Combine into a single regex pattern
    pattern = r"(?i)\\b(" + "|".join(keywords) + r")\\b"
    
    # Split content based on starting keywords while retaining them
    queries = re.split(pattern, file_content, flags=re.IGNORECASE)
    
    # Recombine split parts with starting keywords attached
    queries = ["".join(queries[i:i+2]).strip() for i in range(0, len(queries), 2)]
    
    # Remove empty queries and trim whitespace
    return [q for q in queries if q.strip()]

# Example usage
sql_file_content = """
SELECT * FROM Users
INSERT INTO Orders (OrderID, CustomerID) VALUES (1, 101)
UPDATE Products SET Price = Price * 1.1 WHERE Category = 'Electronics'
"""
queries = split_sql_queries(sql_file_content)
for i, query in enumerate(queries, start=1):
    print(f"Query {i}:\n{query}\n")


Query 1:
SELECT * FROM Users
INSERT INTO Orders (OrderID, CustomerID) VALUES (1, 101)
UPDATE Products SET Price = Price * 1.1 WHERE Category = 'Electronics'



In [19]:
def replace_spaces_in_brackets(input_string: str, replacement: str = "_space_" ) -> str:
    """
    Replaces spaces within square brackets [] in a string with replacement using regex.

    :param input_string: The input string.
    :return: A new string with spaces replaced within square brackets.
    """
    def replace_space(match):
        # Replace spaces within the matched brackets with other sequence
        return match.group(0).replace(' ', replacement)
    
    # Regex pattern to find text within square brackets, including the brackets
    pattern = r'\[.*?\]'
    
    # Use re.sub with a replacement function
    return re.sub(pattern, replace_space, input_string)

In [20]:
# parse declared variables

for i, query in enumerate(sql_queries):
    for line in query.split("\n"):
        line=line.strip()
        if 'declare' in line.lower():
            #if line[-1] == '?':
            variable = [i for i in re.findall(r"@\w+", line)][0]
            print(variable)
      
            # if there is an equal not followed by ? then extract hardcoded value
            #print(line)
            #print()
            


@PortfolioCurrent
@Market
@CompanyName
@RunId
@RunId1


In [21]:
# parse simple select

for i, query in enumerate(sql_queries):
    if 'select' in query.lower():
        try:
            print(repr(parse_one(query)))
            print()
            print(list([str(i.sql('tsql')) for i in parse_one(query).find_all(exp.Where)]))
            print()
        except:
            pass


Select(
  expressions=[
    EQ(
      this=Parameter(
        this=Var(this=RunId)),
      expression=Column(
        this=Identifier(this=RUNID, quoted=False)))],
  from=From(
    this=Table(
      this=Identifier(this=Run_Time_Description, quoted=False),
      db=Identifier(this=QRM, quoted=False))),
  where=Where(
    this=And(
      this=And(
        this=And(
          this=And(
            this=EQ(
              this=Column(
                this=Identifier(this=Portfolio, quoted=False)),
              expression=Parameter(
                this=Var(this=PortfolioCurrent))),
            expression=EQ(
              this=Cast(
                this=Column(
                  this=Identifier(this=date, quoted=False)),
                to=DataType(this=Type.USERDEFINED, kind=Market)),
              expression=Cast(
                this=Column(
                  this=Identifier(this=date, quoted=False)),
                to=DataType(this=Type.USERDEFINED, kind=Market)))),
          express

In [22]:
# parse update set statements

for i, query in enumerate(sql_queries):
    if 'update' in query.lower() and 'select' not in query.lower():
        #print(repr(parse_one(query.replace('[', '').replace(']', ''))))
        query = replace_spaces_in_brackets(query)

        ast = parse_one(query.replace('[', '').replace(']', ''))
        update = list(ast.find_all(exp.Update))[0]

        table = list(list(update.find_all(exp.Table))[0].find_all(exp.Table))
        print('source and dest table: ', list(list(update.find_all(exp.Table))[0].find_all(exp.Table)))


        columns = list(update.expressions)
        #print('columns: ', list(update.expressions))#.find_all(exp.EQ)))


        for column in list(update.expressions):
            #print(repr(column))
            print(column.this)
            print(column.expression)
            print()

        join  = list(ast.find_all(exp.Join))#[0]
        #print('join: ', join)

        if join != []:
            print('join table: ', list(join[0].find_all(exp.Table)))
            print('join type: ', join[0].kind)
            print('join condition: ', join[0].on)

        where  = list(ast.find_all(exp.Where))#[0]
        print('where: ', where)

        #print(list(update.find_all(exp.Table)))
        print()
       

source and dest table:  [Table(
  this=Identifier(this=FTP_Unpivot, quoted=False),
  db=Identifier(this=staging, quoted=False))]
Funding_space_Component
d.Funding_Component

Receiving_Legal_Entity
d.Receiving_Legal_Entity

join table:  [Table(
  this=Identifier(this=ALM_LKP_Receiving_legal_entity, quoted=False),
  db=Identifier(this=cfg, quoted=False),
  alias=TableAlias(
    this=Identifier(this=d, quoted=False)))]
join type:  INNER
join condition:  <bound method Join.on of Join(
  this=Table(
    this=Identifier(this=ALM_LKP_Receiving_legal_entity, quoted=False),
    db=Identifier(this=cfg, quoted=False),
    alias=TableAlias(
      this=Identifier(this=d, quoted=False))),
  kind=INNER,
  on=EQ(
    this=Column(
      this=Identifier(this=Funding_space_Component, quoted=False),
      table=Identifier(this=a, quoted=False)),
    expression=Column(
      this=Identifier(this=Funding_Component_Export, quoted=False),
      table=Identifier(this=d, quoted=False))))>
where:  []

source and

## Convert Where Statements to Natural Language

In [23]:
where[0].sql('tsql')

"WHERE (Level_4 = 'Fixed Rate Mortgages' AND instrument_set IN ('mortgage', 'offset mortgage') AND external_counterparty_segment = 'households' AND (market_shock LIKE '%SOT%' OR market_shock LIKE '%EVEatR%')) OR (Level_4 = 'Fixed Rate Mortgage Savings' AND instrument_set = 'mortgage savings' AND (market_shock LIKE 'SOT%' OR market_shock LIKE 'EVEatR%')) AND (Portfolio LIKE '%OBV' OR Portfolio LIKE '%ABB')"

In [14]:

def sql_to_natural_language(sql_where_clause):
    """
    Converts a SQL WHERE clause into a natural language explanation.

    Parameters:
        sql_where_clause (str): The SQL WHERE clause to be translated.

    Returns:
        str: The natural language explanation.
    """
    # Replace common SQL syntax with natural language equivalents
    replacements = [
        (r"\bAND\b", "and"),
        (r"\bOR\b", "or"),
        (r"=", "is"),
        (r"IN \((.*?)\)", r"is one of \1"),
        (r"LIKE '%(.*?)%'", r"contains '\1'"),
        (r"LIKE '(.*?)%'", r"starts with '\1'"),
        (r"LIKE '%(.*?)'", r"ends with '\1'"),
        (r"\(\s*(.*?)\s*\)", r"(\1)")  # Remove extra spaces inside parentheses
    ]
    
    natural_lang = sql_where_clause.strip()
    for pattern, replacement in replacements:
        natural_lang = re.sub(pattern, replacement, natural_lang, flags=re.IGNORECASE)

    # Add a period after OR conditions for better readability
    groups = re.split(r"\\s*\\bOR\\b\\s*", natural_lang, flags=re.IGNORECASE)
    explanation = []
    
    for group in groups:
        # Keep AND intact in the explanation
        readable_group = re.sub(r"\\s*\\bAND\\b\\s*", " and ", group)
        explanation.append(f"({readable_group.strip()})")
    
    # Rejoin with " or "
    return " or ".join(explanation)


natural_language = sql_to_natural_language(where[0].sql('tsql'))
print('tsql statement:')
print(where[0].sql('tsql'))
print("Natural Language Explanation:")

print(natural_language)


tsql statement:
WHERE (Level_4 = 'Fixed Rate Mortgages' AND instrument_set IN ('mortgage', 'offset mortgage') AND external_counterparty_segment = 'households' AND (market_shock LIKE '%SOT%' OR market_shock LIKE '%EVEatR%')) OR (Level_4 = 'Fixed Rate Mortgage Savings' AND instrument_set = 'mortgage savings' AND (market_shock LIKE 'SOT%' OR market_shock LIKE 'EVEatR%')) AND (Portfolio LIKE '%OBV' OR Portfolio LIKE '%ABB')
Natural Language Explanation:
(WHERE (Level_4 is 'Fixed Rate Mortgages' and instrument_set is one of 'mortgage', 'offset mortgage' and external_counterparty_segment is 'households' and (market_shock contains 'SOT' or market_shock contains 'EVEatR')) or (Level_4 is 'Fixed Rate Mortgage Savings' and instrument_set is 'mortgage savings' and (market_shock starts with 'SOT' or market_shock starts with 'EVEatR')) and (Portfolio ends with 'OBV' or Portfolio ends with 'ABB'))


In [None]:
def sql_to_natural_language(sql_where_clause):
    """
    Converts a SQL WHERE clause into a natural language explanation.

    Parameters:
        sql_where_clause (str): The SQL WHERE clause to be translated.

    Returns:
        str: The natural language explanation.
    """
    # Replace common SQL syntax with natural language equivalents
    replacements = [
        (r"\bAND\b", "and"),
        (r"\bOR\b", "or"),
        (r"=", "is"),
        (r"IN \((.*?)\)", r"is one of \1"),
        (r"LIKE '%(.*?)%'", r"contains '\1'"),
        (r"LIKE '(.*?)%'", r"starts with '\1'"),
        (r"LIKE '%(.*?)'", r"ends with '\1'"),
        (r"ISDATE\((.*?)\) = 1", r"where \1 is a valid date"),  # Handle ISDATE
        (r"CAST\((.*?) AS (.*?)\)", r"convert \1 to \2"),  # Handle CAST
        (r"@(\w+)", r"the variable '\1'"),  # Handle variables like @PortfolioCurrent
        (r"\(\s*(.*?)\s*\)", r"(\1)")  # Remove extra spaces inside parentheses
    ]
    
    natural_lang = sql_where_clause.strip()
    for pattern, replacement in replacements:
        natural_lang = re.sub(pattern, replacement, natural_lang, flags=re.IGNORECASE)

    # Handle breaking down logical groups
    groups = re.split(r"\\s*\\bOR\\b\\s*", natural_lang, flags=re.IGNORECASE)
    explanation = []
    
    for group in groups:
        # Keep AND intact in the explanation
        readable_group = re.sub(r"\\s*\\bAND\\b\\s*", " and ", group)
        explanation.append(f"({readable_group.strip()})")

    if 'or' in sql_where_clause.lower():
        return " or ".join(explanation)
    else:
        return natural_lang

where_condition = "WHERE Portfolio = @PortfolioCurrent AND CAST(date AS Market) = CAST(date AS Market) AND CompanyName = @CompanyName AND RunParameter IN ('Val_shock', 'Val_shock_old', 'CSRBB') AND ISDATE(Market) = 1"
natural_language = sql_to_natural_language(where_condition)
print('tsql statement:')
print(where[0].sql('tsql'))
print("Natural Language Explanation:")

tsql statement:
WHERE (Level_4 = 'Fixed Rate Mortgages' AND instrument_set IN ('mortgage', 'offset mortgage') AND external_counterparty_segment = 'households' AND (market_shock LIKE '%SOT%' OR market_shock LIKE '%EVEatR%')) OR (Level_4 = 'Fixed Rate Mortgage Savings' AND instrument_set = 'mortgage savings' AND (market_shock LIKE 'SOT%' OR market_shock LIKE 'EVEatR%')) AND (Portfolio LIKE '%OBV' OR Portfolio LIKE '%ABB')
Natural Language Explanation:


In [None]:
parse_one("""From  staging.FTP_Unpivot a
          INNER JOIN cfg.ALM_LKP_Receiving_legal_entity d 
ON 
   a.Funding_Component=d.Funding_Component_Export""")

Select(
  expressions=[
    Star()],
  from=From(
    this=Table(
      this=Identifier(this=FTP_Unpivot, quoted=False),
      db=Identifier(this=staging, quoted=False),
      alias=TableAlias(
        this=Identifier(this=a, quoted=False)))),
  joins=[
    Join(
      this=Table(
        this=Identifier(this=ALM_LKP_Receiving_legal_entity, quoted=False),
        db=Identifier(this=cfg, quoted=False),
        alias=TableAlias(
          this=Identifier(this=d, quoted=False))),
      kind=INNER,
      on=EQ(
        this=Column(
          this=Identifier(this=Funding_Component, quoted=False),
          table=Identifier(this=a, quoted=False)),
        expression=Column(
          this=Identifier(this=Funding_Component_Export, quoted=False),
          table=Identifier(this=d, quoted=False))))])