In [None]:
# imports
import os
import csv
from xml.dom import minidom
from pathlib import Path

# --- NOTE FOR ANYONE RE-USING THIS CODE:
# --- point start_from_folder at your own local copy of the xsd folder.
# start_from_folder is the absolute folder that gets scanned;
# start_from_relative_path is the prefix used for the "Path" column of the CSV.
start_from_relative_path = Path("./xsd")
start_from_folder = Path(r"C:\Users\e546315\Desktop\dev_analysis\conda\netex\xsd")

# Helper: read the documentation text attached directly to one type element.
def _enum_type_description(type_node):
    """Return the text of the xsd:documentation that belongs to *type_node*.

    Only documentation elements whose grandparent is *type_node* itself are
    considered (type > annotation > documentation), so the documentation of
    the individual enumeration values is ignored. When several match, the
    last one wins. Returns "" when there is none or when it has no content.
    """
    description = ""
    for documentation in type_node.getElementsByTagName("xsd:documentation"):
        if documentation.parentNode.parentNode is not type_node:
            continue
        # An empty <xsd:documentation/> has no child node to read text from.
        if documentation.firstChild is not None:
            description = documentation.firstChild.nodeValue or ""
    return description


# Recursive scan: dives into sub-folders and collects all xsd:enumeration values.
def findEnums(full_file_path, relative_path, writer, parent_container):
    """Collect the XSD enumerations found below *full_file_path*.

    Every regular file in the folder is parsed with minidom; every other
    directory entry is treated as a sub-folder and scanned recursively,
    except for the folders named in ``skipped_folders``.

    Args:
        full_file_path: Folder to scan (anything ``os.listdir`` accepts).
        relative_path: ``pathlib`` path used as prefix for the "Path" column;
            the entry name without extension is appended for each level.
        writer: Object with a ``writerow`` method (e.g. a ``csv.writer``).
            One row ``[path, name, description, "v1 v2 ..."]`` is written per
            enumeration type.
        parent_container: List that receives one dict per file containing
            enumerations. Each dict maps the type name to
            ``[path, name, description, [value, ...]]``; the values stay a
            list so callers can analyse them individually.

    Raises:
        Whatever ``minidom.parse`` raises for a file that is not well-formed
        XML.
    """
    skipped_folders = ["gml", "wsdl", "wsdl_model", "siri", "siri_utility"]

    for dir_item in os.listdir(full_file_path):
        item_path = os.path.join(full_file_path, dir_item)
        item_rel_path = relative_path / os.path.splitext(dir_item)[0]

        # Not a regular file: go in recursively unless the folder is excluded.
        if not os.path.isfile(item_path):
            if dir_item not in skipped_folders:
                findEnums(item_path, item_rel_path, writer, parent_container)
            continue

        document = minidom.parse(item_path)

        # We look at both enumerations without a namespace prefix and those
        # carrying the expected "xsd:" prefix.
        enumerations = document.getElementsByTagName('enumeration')
        xsd_enumerations = document.getElementsByTagName('xsd:enumeration')

        if enumerations:
            # Unprefixed enumerations are unexpected: report and skip the file.
            print("-------this should not be existing-------")
            print(enumerations[0].parentNode.tagName)
            print("--------------")
            continue

        if not xsd_enumerations:
            continue

        # The structure is usually enumeration > restriction > simpleType;
        # the first enumeration is used as a sanity check for the whole file.
        first_owner = xsd_enumerations[0].parentNode.parentNode
        if "simpleType" not in first_owner.tagName:
            print("-------this should not be existing-------")
            print(first_owner.tagName)
            print("--------------")
            continue

        # type name -> [path, type name, description, list of enum values]
        parents = {}
        for xsd_enumeration in xsd_enumerations:
            owner = xsd_enumeration.parentNode.parentNode
            parent = owner.getAttribute("name")
            if parent not in parents:
                parents[parent] = [
                    "./" + item_rel_path.as_posix(),
                    parent,
                    _enum_type_description(owner),
                    [],
                ]
            parents[parent][3].append(xsd_enumeration.getAttribute("value"))

        # The CSV gets the values joined into one cell; the container keeps
        # the untouched list so later analysis works on whole values.
        for row in parents.values():
            writer.writerow(row[0:3] + [' '.join(map(str, row[3]))])

        parent_container.append(parents)

# parent container: findEnums appends one dict per schema file to this list
parent_container = []

# Write the result file next to the scanned folder. The with-block closes
# the file even when findEnums raises half-way through the scan.
with open(os.path.join(start_from_folder.parent, "enumerations.csv"), 'w', newline='') as f:
    # create the csv writer and emit the header row
    writer = csv.writer(f, delimiter=';')
    writer.writerow(["Path", "EnumName", "EnumNameDescription", "EnumValues"])

    # call recursive find, print, and write
    findEnums(start_from_folder, start_from_relative_path, writer, parent_container)


In [None]:
# ---- Here we find duplicate items within a parent enum
from collections import Counter

for parent_dictionary in parent_container:
    for parent, row in parent_dictionary.items():
        enum_values = row[3]
        # The values may arrive as the space-joined text that was written to
        # the CSV; split it so whole values are counted, not characters.
        # (assumes enum values themselves contain no spaces)
        if isinstance(enum_values, str):
            enum_values = enum_values.split()

        # Each duplicated value is reported once, in order of first appearance.
        dup = [value for value, count in Counter(enum_values).items() if count > 1]
        if dup:
            print("Parent: ", parent)
            print("duplicates: ", dup)

In [None]:
# ---- Here we find duplicate parent items across parent enums
from collections import Counter

# every enum type name, once per file that defines it
parent_list = [
    parent
    for parent_dictionary in parent_container
    for parent in parent_dictionary
]

# Counting once up front reports each duplicated name a single time instead
# of once per occurrence.
for parent_in_list, occurrences in Counter(parent_list).items():
    if occurrences > 1:
        print("parent ", parent_in_list, " has duplicates")

        # show every definition that shares the name
        for parent_dictionary in parent_container:
            if parent_in_list in parent_dictionary:
                print(parent_dictionary[parent_in_list])

In [None]:
# ---- Here we find duplicate child items across parent enums
import difflib  # not used in this cell; kept because later cells call difflib

# a blacklist of too common similarities
too_common = ["other", "unknown", "any", "all", "none", "undefined"]

# Map every enum type name to its values once. Looking both names of a pair
# up in this single mapping also covers two enums defined in the same file.
# When a name occurs in several files, the last definition wins.
children_by_parent = {}
for parent_dictionary in parent_container:
    for parent, row in parent_dictionary.items():
        children = row[3]
        # The values may arrive as the space-joined text that was written to
        # the CSV; split it so whole values are compared, not characters.
        # (assumes enum values themselves contain no spaces)
        if isinstance(children, str):
            children = children.split()
        children_by_parent[parent] = children

parent_list = list(children_by_parent)

for index, p1 in enumerate(parent_list):
    print("parent ", p1, " has the following items that match those of parent ")
    childrenp1 = children_by_parent[p1]

    # only names after p1, so every pair is compared exactly once
    for p2 in parent_list[index + 1:]:
        childrenp2 = set(children_by_parent[p2])
        duplicates = [
            child
            for child in childrenp1
            if child in childrenp2 and child not in too_common
        ]

        if duplicates:
            print(p2, ": ", duplicates)

In [None]:
# ---- Here we find similar items within a parent enum
import difflib

for parent_dictionary in parent_container:
    for parent, row in parent_dictionary.items():
        print("Parent: ", parent)
        children = row[3]
        # The values may arrive as the space-joined text that was written to
        # the CSV; split it so whole values are matched, not characters.
        # (assumes enum values themselves contain no spaces)
        if isinstance(children, str):
            children = children.split()

        for child in children:
            # compare against every other value of the same enum
            others = [x for x in children if x != child]
            similars = difflib.get_close_matches(child, others)
            if similars:
                print("similarities for ", child, "are: ", similars)

In [None]:
# ---- Here we find similar parent items across parent enums
import difflib  # imported here so the cell also runs on its own

# every enum type name, once per file that defines it
parent_list = [
    parent
    for parent_dictionary in parent_container
    for parent in parent_dictionary
]

for parent_in_list in parent_list:
    # candidates are all names that differ from the one being checked
    candidates = [x for x in parent_list if x != parent_in_list]
    print("parent ", parent_in_list, " has similarities: ", difflib.get_close_matches(parent_in_list, candidates))