# XML schema extraction
 If you want to load complex XML data into a database but no XSD schema is available, you need to profile a representative sample of the XML files to collect all the possible XPaths in these documents, so that you can design the data pipeline and the target model.  
 
The Python code below addresses this issue. It gives you:
1. xpaths for all tags (+ info about parent tag for easier data pipe development)
2. info if attributes occur on a tag
3. document name with the tag/attribute variant to be used for tests

Check the result for the sample XML documents in `xml_files_to_profile` in the generated file `xml_schema_extraction.xlsx`.

## Set Up

In [79]:
import os
from lxml import etree as ElementTree
import pandas as pd

## Extract paths from xml documents
1. get the file list to process
2. get paths, attributes with namespace prefix for each document and add to pandas dataframe

In [84]:
from lxml import etree as ElementTree
import pandas as pd

# replace namespace in node path with its prefix 
def replace_ns_w_prefix(file_namespaces, path_w_ns):
    """Rewrite Clark-notation namespaces (``{uri}tag``) as ``prefix:tag``.

    Args:
        file_namespaces: Mapping of namespace prefix to namespace URI. The
            default namespace may be keyed by ``''`` or ``None``.
        path_w_ns: Tag or path string in which namespaces appear as ``{uri}``.

    Returns:
        The string with every known ``{uri}`` replaced by ``prefix:``. A tag
        in the default namespace becomes ``:tag``. A URI that is missing from
        the mapping is kept, with its braces turned into a trailing ``:``.
    """
    for prefix, uri in file_namespaces.items():
        # Replace the complete braced token, not the bare URI text: otherwise
        # a URI that is a prefix of another URI (or that happens to occur
        # inside a tag name) would corrupt the result.
        path_w_ns = path_w_ns.replace('{' + uri + '}', (prefix or '') + ':')
    # Fallback formatting for namespaces that are not present in the mapping.
    return path_w_ns.replace('{', '').replace('}', ':')

# find all xml nodes and their paths
def etree_iter_path(node, tag=None, path='.'):
    """Yield ``(element, path)`` for *node* and its descendants in document order.

    Args:
        node: Element to start from; it is reported under *path*.
        tag: When given, only elements whose ``.tag`` equals this value are
            yielded (the whole subtree is still traversed). ``None`` and
            ``"*"`` both mean "every element".
        path: Path assigned to *node*; each child path is built as
            ``<parent path>/<child tag>``.

    Yields:
        Tuples of ``(element, path_string)``.
    """
    if tag == "*":
        tag = None
    if tag is None or node.tag == tag:
        yield node, path
    for child in node:
        # Comments and processing instructions expose a factory function as
        # ``.tag`` instead of a string. They are not elements, and formatting
        # them into a path would embed the function's repr, so skip them.
        if not isinstance(child.tag, str):
            continue
        child_path = '{path}/{child_tag}'.format(path=path, child_tag=child.tag)
        yield from etree_iter_path(child, tag, path=child_path)

# get paths and attributes for an xml file
def xml_node_paths(file_path):
    """Profile one XML file and list its distinct tag/attribute/path variants.

    Args:
        file_path: Path of the XML document to parse.

    Returns:
        A pandas DataFrame with the columns ``tag``, ``parent_tag``,
        ``attributes`` (comma-separated attribute names), ``xpath`` and
        ``filename``, with duplicate rows removed. Namespaces in tags,
        attribute names and paths are shown as prefixes instead of URIs.
    """
    xmldoc = ElementTree.parse(file_path)
    # 'start-ns' events carry one (prefix, uri) pair per namespace declaration.
    file_namespaces = dict(
        ns for _, ns in ElementTree.iterparse(file_path, events=['start-ns'])
    )
    # The 'xml' prefix (xml:lang, xml:space, ...) is bound implicitly and is
    # never reported by 'start-ns'.
    file_namespaces.setdefault('xml', 'http://www.w3.org/XML/1998/namespace')

    node_list = []
    for elem, path in etree_iter_path(xmldoc.getroot()):
        # Comments and processing instructions have a non-string tag; they
        # carry no schema information and cannot be namespace-rewritten.
        if not isinstance(elem.tag, str):
            continue
        parent = elem.getparent()
        elem_parent = ''
        if parent is not None:
            elem_parent = replace_ns_w_prefix(file_namespaces, parent.tag)
        # Namespaced attribute names use the same {uri}name notation as tags.
        attr_keys_string = ",".join(
            replace_ns_w_prefix(file_namespaces, key) for key in elem.attrib.keys()
        )
        node_list.append([
            replace_ns_w_prefix(file_namespaces, elem.tag),
            elem_parent,
            attr_keys_string,
            replace_ns_w_prefix(file_namespaces, path),
            file_path,
        ])
    file_data = pd.DataFrame(
        node_list,
        columns=['tag', 'parent_tag', 'attributes', 'xpath', 'filename'],
    )
    return file_data.drop_duplicates()

## Run extraction from multiple xml files and export result to xlsx for convenience

In [86]:
# Profile every file in the input directory and export the distinct
# tag / attribute / path variants to an Excel workbook.
input_dir = './xml_files_to_profile/'
result_columns = ['tag', 'parent_tag', 'attributes', 'xpath', 'filename']

# Keep regular files only: sub-directories (e.g. .ipynb_checkpoints created by
# Jupyter) cannot be parsed as XML and would abort the whole run.
files_to_profile = sorted(
    name for name in os.listdir(input_dir)
    if os.path.isfile(os.path.join(input_dir, name))
)

per_file_results = []
for file in files_to_profile:
    file_path = os.path.join(input_dir, file)
    result = xml_node_paths(file_path)
    per_file_results.append(result)

# Concatenate once at the end instead of growing a DataFrame inside the loop.
if per_file_results:
    all_files_data = pd.concat(per_file_results)
else:
    all_files_data = pd.DataFrame(columns=result_columns)

# keep a single example file name for each path variant found in several files
all_files_data_examles = all_files_data.groupby(['xpath','attributes','tag','parent_tag'])['filename'].max()

# write result to xlsx; the context manager saves and closes the workbook
# (ExcelWriter.save() no longer exists in pandas 2.x)
with pd.ExcelWriter('xml_schema_extraction.xlsx') as writer:
    all_files_data_examles.to_excel(writer, sheet_name='Sheet1')
