In [2]:
import pandas as pd
import os
import json
import xmltodict
import json
import xml.etree.ElementTree as ET



In [3]:
# parse nodes - generic parser

def parse_dtsx(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    # Define namespaces
    namespaces = {
        'DTS': 'www.microsoft.com/SqlServer/Dts'
    }
    
    # Function to parse Executable elements
    def parse_executables(executables):
        for executable in executables:
            exec_type = executable.attrib.get(f'{{{namespaces["DTS"]}}}ExecutableType')
            name = executable.attrib.get(f'{{{namespaces["DTS"]}}}Name')
            refid = executable.attrib.get(f'{{{namespaces["DTS"]}}}refId')
            description = executable.attrib.get(f'{{{namespaces["DTS"]}}}Description')
            object_data = executable.attrib.get(f'{{{namespaces["DTS"]}}}ObjectData')


            print(f'Executable: {name}, Type: {exec_type}, refId: {refid}, description: {description}, object data: {object_data}')
            
            # Check for nested executables
            nested_executables = executable.findall('DTS:Executables/DTS:Executable', namespaces)
            if nested_executables:
                parse_executables(nested_executables)


    
    # Parse control flow
    print("Control Flow:")
    executables = root.findall('.//DTS:Executable', namespaces)
    parse_executables(executables)
    
    # Parse data flow components
    print("\nData Flow Components:")
    data_flows = root.findall('.//DTS:Executable[@DTS:ExecutableType="SSIS.Pipeline.2"]/DTS:ObjectData/pipeline/components/component', namespaces)
    for component in data_flows:
        comp_name = component.attrib.get('name')
        comp_class_id = component.attrib.get('componentClassID')
        print(f'Component: {comp_name}, ClassID: {comp_class_id}')

# Example usage
file_path = 'data/DailyETLMain.dtsx'
parse_dtsx(file_path)


Control Flow:
Executable: None, Type: Microsoft.ExpressionTask, refId: Package\Calculate ETL Cutoff Time backup, description: Expression Task, object data: None
Executable: None, Type: Microsoft.ExecuteSQLTask, refId: Package\Ensure Date Dimension includes current year, description: Execute SQL Task, object data: None
Executable: None, Type: STOCK:SEQUENCE, refId: Package\Load City Dimension, description: Sequence Container, object data: None
Executable: None, Type: Microsoft.Pipeline, refId: Package\Load City Dimension\Extract Updated City Data to Staging, description: Data Flow Task, object data: None
Executable: None, Type: Microsoft.ExecuteSQLTask, refId: Package\Load City Dimension\Get Last City ETL Cutoff Time, description: Execute SQL Task, object data: None
Executable: None, Type: Microsoft.ExecuteSQLTask, refId: Package\Load City Dimension\Get Lineage Key, description: Execute SQL Task, object data: None
Executable: None, Type: Microsoft.ExecuteSQLTask, refId: Package\Load Cit

In [5]:
# parse nodes - DailyETLMain.dtsx specific


# Define namespaces
namespaces = {
    'DTS': 'www.microsoft.com/SqlServer/Dts'
}

# Function to parse Executable elements (Control Flow)
def parse_executables(executables, depth=0):
    executables_info = []
    for executable in executables:
        exec_type = executable.attrib.get(f'{{{namespaces["DTS"]}}}ExecutableType')
        name = executable.attrib.get(f'{{{namespaces["DTS"]}}}Name')
        refid = executable.attrib.get(f'{{{namespaces["DTS"]}}}refId')
        description = executable.attrib.get(f'{{{namespaces["DTS"]}}}Description')
        object_data = executable.attrib.get(f'{{{namespaces["DTS"]}}}ObjectData')

        executables_info.append({'name': name, 'type': exec_type, 'depth': depth, 'refid': refid, 'description': description, 'object_data': object_data})
        
        # Check for nested executables
        nested_executables = executable.findall('DTS:Executables/DTS:Executable', namespaces)
        if nested_executables:
            executables_info.extend(parse_executables(nested_executables, depth + 1))
    
    return executables_info

# Function to parse Data Flow components
def parse_data_flows(data_flow_tasks):
    data_flow_info = []
    for task in data_flow_tasks:
        task_name = task.attrib.get(f'{{{namespaces["DTS"]}}}Name')
        components = task.findall('.//component', namespaces)
        for component in components:
            comp_name = component.attrib.get('name')
            comp_class_id = component.attrib.get('componentClassID')
            refid = component.attrib.get('refId')
            name = component.attrib.get('name')
            data_flow_info.append({'task_name': task_name, 'component_name': comp_name, 'classID': comp_class_id, 'name':name, 'refid': refid})
    return data_flow_info

# Load and parse the uploaded .dtsx file
def parse_dtsx(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Parse control flow
    control_flow = parse_executables(root.findall('.//DTS:Executable', namespaces))
    
    # Parse data flow tasks
    data_flow_tasks = root.findall('.//DTS:Executable[@DTS:ExecutableType="Microsoft.Pipeline"]', namespaces)
    data_flow_info = parse_data_flows(data_flow_tasks)
    
    return control_flow, data_flow_info

file_path = 'data/DailyETLMain.dtsx'
control_flow, data_flow_info = parse_dtsx(file_path)

(control_flow, data_flow_info)


([{'name': None,
   'type': 'Microsoft.ExpressionTask',
   'depth': 0,
   'refid': 'Package\\Calculate ETL Cutoff Time backup',
   'description': 'Expression Task',
   'object_data': None},
  {'name': None,
   'type': 'Microsoft.ExecuteSQLTask',
   'depth': 0,
   'refid': 'Package\\Ensure Date Dimension includes current year',
   'description': 'Execute SQL Task',
   'object_data': None},
  {'name': None,
   'type': 'STOCK:SEQUENCE',
   'depth': 0,
   'refid': 'Package\\Load City Dimension',
   'description': 'Sequence Container',
   'object_data': None},
  {'name': None,
   'type': 'Microsoft.Pipeline',
   'depth': 1,
   'refid': 'Package\\Load City Dimension\\Extract Updated City Data to Staging',
   'description': 'Data Flow Task',
   'object_data': None},
  {'name': None,
   'type': 'Microsoft.ExecuteSQLTask',
   'depth': 1,
   'refid': 'Package\\Load City Dimension\\Get Last City ETL Cutoff Time',
   'description': 'Execute SQL Task',
   'object_data': None},
  {'name': None,
   '

In [6]:
file_path = 'data/import_sql_flowA.dtsx' # not good
control_flow, data_flow_info = parse_dtsx(file_path)

(control_flow, data_flow_info)

([{'name': None,
   'type': 'STOCK:SEQUENCE',
   'depth': 0,
   'refid': None,
   'description': None,
   'object_data': None},
  {'name': None,
   'type': 'Microsoft.SqlServer.Dts.Tasks.ScriptTask.ScriptTask, Microsoft.SqlServer.ScriptTask, Version=10.0.0.0, Culture=neutral, PublicKeyToken=89845dcd8080cc91',
   'depth': 0,
   'refid': None,
   'description': None,
   'object_data': None},
  {'name': None,
   'type': 'Microsoft.SqlServer.Dts.Tasks.ExecuteSQLTask.ExecuteSQLTask, Microsoft.SqlServer.SQLTask, Version=10.0.0.0, Culture=neutral, PublicKeyToken=89845dcd8080cc91',
   'depth': 0,
   'refid': None,
   'description': None,
   'object_data': None},
  {'name': None,
   'type': '{E3CFBEA8-1F48-40D8-91E1-2DEDC1EDDD56}',
   'depth': 0,
   'refid': None,
   'description': None,
   'object_data': None},
  {'name': None,
   'type': None,
   'depth': 0,
   'refid': None,
   'description': None,
   'object_data': None},
  {'name': None,
   'type': None,
   'depth': 0,
   'refid': None,
  

In [7]:
file_path = 'data/ISP_02.dtsx' # good
control_flow, data_flow_info = parse_dtsx(file_path)

(control_flow, data_flow_info)

([{'name': None,
   'type': 'STOCK:FOREACHLOOP',
   'depth': 0,
   'refid': 'Package\\Foreach File in Folder',
   'description': 'Foreach Loop Container',
   'object_data': None},
  {'name': None,
   'type': 'Microsoft.Pipeline',
   'depth': 1,
   'refid': 'Package\\Foreach File in Folder\\Extra Sample Data Task',
   'description': 'Data Flow Task',
   'object_data': None},
  {'name': None,
   'type': 'Microsoft.Pipeline',
   'depth': 0,
   'refid': 'Package\\Foreach File in Folder\\Extra Sample Data Task',
   'description': 'Data Flow Task',
   'object_data': None},
  {'name': None,
   'type': 'Microsoft.ExecuteSQLTask',
   'depth': 0,
   'refid': 'Package\\Reset the NewFactCurrencyRate table',
   'description': 'Execute SQL Task - Reset the NewFactCurrencyRate table',
   'object_data': None}],
 [{'task_name': None,
   'component_name': 'Lookup Currency Key',
   'classID': 'Microsoft.Lookup',
   'name': 'Lookup Currency Key',
   'refid': 'Package\\Foreach File in Folder\\Extra Sample 

In [11]:
import xml.etree.ElementTree as ET

# Read the contents of the .dtsx file
with open('data/import_sql_flowB.dtsx', 'r', encoding='utf-8') as file:
    dtsx_content = file.read()


# Define namespaces
namespaces = {
    'DTS': 'www.microsoft.com/SqlServer/Dts'
}

# Function to parse Executable elements (Control Flow)
def parse_executables(executables, precedence_constraints, depth=0):
    executables_info = []
    for executable in executables:
        exec_id = executable.attrib.get(f'{{{namespaces["DTS"]}}}ID')
        exec_type = executable.attrib.get(f'{{{namespaces["DTS"]}}}ExecutableType')
        name = executable.attrib.get(f'{{{namespaces["DTS"]}}}Name')
        executables_info.append({'id': exec_id, 'name': name, 'type': exec_type, 'depth': depth})

        # Check for nested executables
        nested_executables = executable.findall(f'./DTS:Executables/DTS:Executable', namespaces)
        if nested_executables:
            executables_info.extend(parse_executables(nested_executables, precedence_constraints, depth + 1))

    # Sort executables based on precedence constraints
    ordered_executables = sort_by_precedence(executables_info, precedence_constraints)
    return ordered_executables

# Function to parse Precedence Constraints
def parse_precedence_constraints(root):
    precedence_constraints = []
    constraints = root.findall('.//DTS:PrecedenceConstraints/DTS:PrecedenceConstraint', namespaces)
    for constraint in constraints:
        from_id = constraint.attrib.get(f'{{{namespaces["DTS"]}}}From')
        to_id = constraint.attrib.get(f'{{{namespaces["DTS"]}}}To')
        precedence_constraints.append({'from': from_id, 'to': to_id})
    return precedence_constraints

# Function to sort executables based on precedence constraints
def sort_by_precedence(executables, precedence_constraints):
    exec_dict = {exec['id']: exec for exec in executables}
    ordered_executables = []

    def add_executable(exec_id):
        if exec_id in exec_dict:
            ordered_executables.append(exec_dict.pop(exec_id))
            for constraint in precedence_constraints:
                if constraint['from'] == exec_id:
                    add_executable(constraint['to'])

    # Find and add initial executables (those not targeted by any precedence constraint)
    initial_executables = [exec['id'] for exec in executables if all(constraint['to'] != exec['id'] for constraint in precedence_constraints)]
    for exec_id in initial_executables:
        add_executable(exec_id)

    # Add any remaining executables
    for exec_id in list(exec_dict.keys()):
        add_executable(exec_id)

    return ordered_executables

# Function to parse Data Flow components
def parse_data_flows(data_flow_tasks):
    data_flow_info = []
    for task in data_flow_tasks:
        task_name = task.attrib.get(f'{{{namespaces["DTS"]}}}Name')
        components = task.findall('.//component', namespaces)
        for component in components:
            comp_name = component.attrib.get('name')
            comp_class_id = component.attrib.get('componentClassID')
            data_flow_info.append({'task_name': task_name, 'component_name': comp_name, 'classID': comp_class_id})
    return data_flow_info

# Parse the dtsx file content
def parse_dtsx(dtsx_content):
    root = ET.fromstring(dtsx_content)

    # Parse precedence constraints
    precedence_constraints = parse_precedence_constraints(root)

    # Parse control flow
    control_flow = parse_executables(root.findall('.//DTS:Executable', namespaces), precedence_constraints)
    
    # Parse data flow tasks
    data_flow_tasks = root.findall('.//DTS:Executable[@DTS:ExecutableType="Microsoft.Pipeline"]', namespaces)
    data_flow_info = parse_data_flows(data_flow_tasks)
    
    return control_flow, data_flow_info

# Example usage with the actual content
control_flow, data_flow_info = parse_dtsx(dtsx_content)

(control_flow, data_flow_info)


([{'id': None, 'name': None, 'type': None, 'depth': 0}], [])