In [4]:
#%pip install xmltodict
import xmltodict
import json
import os
import xml.etree.ElementTree as ET
import csv

In [32]:
def flatten_xml_to_csv(xml_file: str, csv_file_path: str) -> None:
    """Flatten an XML document into a two-column ``Path``/``Value`` CSV file.

    The XML file is read as UTF-8, a leading ``<?xml ...?>`` declaration (if
    any) is removed, and the remainder is parsed with ``xmltodict.parse``.
    The nested result is flattened so that every leaf value becomes one CSV
    row. Paths are built by joining keys with ``/``; items of a list are
    addressed as ``key[0]``, ``key[1]``, ...  A dict whose only entry is a
    single ``@attribute`` key is written as ``parent/attribute`` (the ``@``
    is dropped); in every other case keys are used exactly as xmltodict
    produced them.

    Args:
        xml_file: Path of the XML document to read.
        csv_file_path: Path of the CSV file to write. A leading ``~`` is
            expanded to the user's home directory.

    Raises:
        OSError: If the XML file cannot be read or the CSV cannot be written.

    Errors raised by ``xmltodict.parse`` propagate unchanged.
    """
    with open(xml_file, 'r', encoding='utf-8') as xf:
        xml_payload = xf.read()

    # Strip the declaration only when the document actually *starts* with one.
    # Cutting at the first '?>' found anywhere would truncate documents whose
    # first '?>' belongs to a processing instruction further down.
    # NOTE(review): the declaration is presumably removed so that its declared
    # encoding cannot conflict with the already-decoded text - confirm.
    leading = xml_payload.lstrip('\ufeff \t\r\n')
    if leading.startswith('<?xml'):
        dec_end = leading.find('?>')
        if dec_end != -1:
            xml_payload = leading[dec_end + 2:]

    print(xml_payload)

    xd = xmltodict.parse(xml_payload)

    def flatten_array(arr: list, parent_key='', sep='/'):
        """Return ``(path, value)`` pairs for every leaf reachable from ``arr``."""
        items = []
        for i, val in enumerate(arr):
            indexed_key = f"{parent_key}[{i}]"
            if isinstance(val, dict):
                # .items() - the original called the non-existent dict.item().
                items.extend(flatten_dict(val, indexed_key, sep=sep).items())
            elif isinstance(val, list):
                items.extend(flatten_array(val, indexed_key, sep=sep))
            else:
                items.append((indexed_key, val))

        return items

    def flatten_dict(d: dict, parent_key='', sep='/'):
        """Return a flat ``{path: value}`` dict for every leaf reachable from ``d``."""
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{sep}{k}" if parent_key else k
            if isinstance(v, dict) and len(v) == 1 and next(iter(v)).startswith("@"):
                # Element carrying exactly one attribute and nothing else:
                # emit it as parent<sep>attribute without the '@' marker.
                attr_name, attr_value = next(iter(v.items()))
                items.append((f"{new_key}{sep}{attr_name[1:]}", attr_value))
            elif isinstance(v, dict):
                items.extend(flatten_dict(v, new_key, sep=sep).items())
            elif isinstance(v, list):
                # Same per-item rules as nested lists, so share the helper.
                items.extend(flatten_array(v, new_key, sep=sep))
            else:
                items.append((new_key, v))

        return dict(items)

    data = flatten_dict(xd)

    # expanduser only touches a leading '~', so no pre-check is required.
    csv_file_path = os.path.expanduser(csv_file_path)

    print(csv_file_path)

    # Write UTF-8 explicitly: the input is read as UTF-8, and the platform
    # default encoding may be unable to represent some of its characters.
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(["Path", "Value"])
        csv_writer.writerows(data.items())
        




In [None]:
# Flatten each sample document into a CSV that shares its base name.
for x_path, c_path in (
    ('states.xml', 'states.csv'),
    ('states2.xml', 'states2.csv'),
):
    flatten_xml_to_csv(x_path, c_path)

In [20]:
def gen_simple_xml():
    """Write a small pretty-printed ``<states>`` document to ``./files/states_simple.xml``.

    Each ``<state>`` element has the same four child elements (``name``,
    ``population``, ``largest_city``, ``land_mass_sqmi``) and no attributes.
    The ``./files`` directory is created if it does not exist yet, so the
    function also works in a fresh working directory.

    Raises:
        OSError: If the directory or the file cannot be created.
    """
    import os
    import xml.etree.ElementTree as ET
    from xml.dom.minidom import parseString

    output_path = "./files/states_simple.xml"
    fields = ("name", "population", "largest_city", "land_mass_sqmi")

    data = [
        {"name": "Texas", "population": "30503301", "largest_city": "Houston", "land_mass_sqmi": "261232"},
        {"name": "New York", "population": "19571216", "largest_city": "New York City", "land_mass_sqmi": "54555"},
    ]

    root = ET.Element("states")
    for state in data:
        state_element = ET.SubElement(root, "state")
        # Child order follows `fields`, which fixes the element order in the file.
        for field in fields:
            ET.SubElement(state_element, field).text = state[field]

    # ElementTree serialises without indentation; round-trip through minidom
    # to get a human-readable, indented document.
    rough_string = ET.tostring(root, encoding="utf-8")
    pretty_xml_as_string = parseString(rough_string).toprettyxml(indent="  ")

    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(pretty_xml_as_string)

In [35]:

def gen_simple_xml_varying_elements_and_attrs():
    """Write a ``<states>`` document whose rows differ in attributes and children.

    The output goes to ``./files/gen_simple_xml_varying_elements_and_attrs.xml``.
    Every ``<name>`` element carries a ``nickname`` attribute and, when the
    state defines one, a ``climate`` attribute. A ``<water_sqmi>`` child is
    only emitted for states that define it. The ``./files`` directory is
    created if it does not exist yet.

    Raises:
        OSError: If the directory or the file cannot be created.
    """
    import os
    import xml.etree.ElementTree as ET
    from xml.dom.minidom import parseString

    output_path = "./files/gen_simple_xml_varying_elements_and_attrs.xml"

    data = [
        {"name": "Georgia", "nickname": "Peach State", "climate":"warm", "population": "10711908", "largest_city": "Atlanta", "land_mass_sqmi": "57906"},
        {"name": "California", "nickname": "Granola State", "population": "39538223", "largest_city": "Los Angeles", "land_mass_sqmi": "163696", "water_sqmi": "7737"},
    ]

    root = ET.Element("states")
    for state in data:
        state_element = ET.SubElement(root, "state")

        # nickname is mandatory, climate is optional.
        attributes = {"nickname": state["nickname"]}
        if "climate" in state:
            attributes["climate"] = state["climate"]
        ET.SubElement(state_element, "name", **attributes).text = state["name"]

        for field in ("population", "largest_city", "land_mass_sqmi"):
            ET.SubElement(state_element, field).text = state[field]

        # Optional child element: present for some states only.
        if "water_sqmi" in state:
            ET.SubElement(state_element, "water_sqmi").text = state["water_sqmi"]

    # ElementTree serialises without indentation; round-trip through minidom
    # to get a human-readable, indented document.
    rough_string = ET.tostring(root, encoding="utf-8")
    pretty_xml_as_string = parseString(rough_string).toprettyxml(indent="  ")

    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(pretty_xml_as_string)

In [36]:
# Build both sample documents, in the same order as before.
for build_sample in (gen_simple_xml, gen_simple_xml_varying_elements_and_attrs):
    build_sample()

In [37]:
import pandas as pd

# Load the document written by gen_simple_xml_varying_elements_and_attrs()
# into a DataFrame using pandas' default read_xml settings.
xml_file = './files/gen_simple_xml_varying_elements_and_attrs.xml'
df = pd.read_xml(xml_file)

# Print the frame. In the captured output below there are no nickname/climate
# columns (the attributes set on <name> do not appear), and water_sqmi is NaN
# for the state that has no <water_sqmi> element.
print(df)

         name  population largest_city  land_mass_sqmi  water_sqmi
0     Georgia    10711908      Atlanta           57906         NaN
1  California    39538223  Los Angeles          163696      7737.0


In [19]:
def gen_xml_doc_1():
    """Write ``states.xml``: ten ``<state>`` rows with name, population and city.

    The file is created in the current working directory, UTF-8 encoded and
    prefixed with an XML declaration. Each ``<state>`` has three child
    elements, in the order ``name``, ``population``, ``city``.
    """
    import xml.etree.ElementTree as ET

    # Child element tags, in the order they are appended to each <state>.
    columns = ("name", "population", "city")

    # One tuple per state, positionally aligned with `columns`.
    rows = [
        ("California", "39538223", "Los Angeles"),
        ("Texas", "29145505", "Houston"),
        ("Florida", "21538187", "Miami"),
        ("New York", "20201249", "New York City"),
        ("Pennsylvania", "13002700", "Philadelphia"),
        ("Illinois", "12812508", "Chicago"),
        ("Ohio", "11799448", "Columbus"),
        ("Georgia", "10711908", "Atlanta"),
        ("North Carolina", "10439388", "Charlotte"),
        ("Michigan", "10077331", "Detroit"),
    ]

    document_root = ET.Element("states")
    for row in rows:
        state_node = ET.SubElement(document_root, "state")
        for tag, text in zip(columns, row):
            ET.SubElement(state_node, tag).text = text

    # Serialise the finished tree, declaration included.
    ET.ElementTree(document_root).write("states.xml", encoding="utf-8", xml_declaration=True)


XML file has been generated.


In [35]:
def gen_xml_doc_2():
    """Write ``states2.xml``: ten ``<state>`` rows carrying a ``land_size`` attribute.

    The file is created in the current working directory, UTF-8 encoded and
    prefixed with an XML declaration. Each ``<state>`` stores ``land_size``
    as an attribute and has three child elements, in the order ``name``,
    ``population``, ``city``.
    """
    import xml.etree.ElementTree as ET

    # (name, population, city, land_size) for every state.
    rows = [
        ("California", "39538223", "Los Angeles", "423967"),
        ("Texas", "29145505", "Houston", "695662"),
        ("Florida", "21538187", "Miami", "170312"),
        ("New York", "20201249", "New York City", "141297"),
        ("Pennsylvania", "13002700", "Philadelphia", "119280"),
        ("Illinois", "12812508", "Chicago", "149995"),
        ("Ohio", "11799448", "Columbus", "116096"),
        ("Georgia", "10711908", "Atlanta", "153910"),
        ("North Carolina", "10439388", "Charlotte", "139391"),
        ("Michigan", "10077331", "Detroit", "250487"),
    ]

    document_root = ET.Element("states")
    for name, population, city, land_size in rows:
        # land_size is the only value stored as an attribute.
        state_node = ET.SubElement(document_root, "state", land_size=land_size)
        for tag, text in (("name", name), ("population", population), ("city", city)):
            ET.SubElement(state_node, tag).text = text

    # Serialise the finished tree, declaration included.
    ET.ElementTree(document_root).write("states2.xml", encoding="utf-8", xml_declaration=True)

# Run the generator; it writes states2.xml into the current working directory.
gen_xml_doc_2()

In [4]:
import pandas as pd

# Read the XML file
# NOTE(review): gen_xml_doc_2() writes "states2.xml" to the working directory,
# not to ./files - confirm this path (or move the file) so the cell works
# after Restart & Run All.
xml_file = './files/states2.xml'
df = pd.read_xml(xml_file)

# Display the DataFrame. In the captured output below, the land_size attribute
# of each <state> appears as a column next to the child elements.
print(df)

# Save the DataFrame to a CSV file (index=False leaves the row index out)
csv_file = './files/states3.csv'
df.to_csv(csv_file, index=False)

print(f"Data has been written to {csv_file}")


   land_size            name  population           city
0     423967      California    39538223    Los Angeles
1     695662           Texas    29145505        Houston
2     170312         Florida    21538187          Miami
3     141297        New York    20201249  New York City
4     119280    Pennsylvania    13002700   Philadelphia
5     149995        Illinois    12812508        Chicago
6     116096            Ohio    11799448       Columbus
7     153910         Georgia    10711908        Atlanta
8     139391  North Carolina    10439388      Charlotte
9     250487        Michigan    10077331        Detroit
Data has been written to ./files/states3.csv
