In [16]:
#%pip install xmltodict
import xmltodict
import json
import os
import xml.etree.ElementTree as ET
import csv

In [18]:
def flatten_xml_to_csv(xml_file: str, csv_file_path: str) -> None:
    """Flatten an XML document into a two-column ``Path,Value`` CSV file.

    The XML text is parsed with ``xmltodict.parse`` and the resulting nested
    dict/list structure is walked recursively. Every leaf becomes one CSV row
    whose ``Path`` is the ``/``-joined chain of keys leading to it; list
    entries are addressed by position, e.g. ``states/state[0]/name``.

    Args:
        xml_file: Path of the XML file to read (decoded as UTF-8). A leading
            ``~`` is expanded to the user's home directory.
        csv_file_path: Path of the CSV file to write (UTF-8). A leading ``~``
            is expanded to the user's home directory. An existing file is
            overwritten.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        Exception: Any parse error raised by ``xmltodict.parse`` propagates
            unchanged.
    """

    def flatten_array(arr: list, parent_key='', sep='/'):
        """Return ``(path, value)`` pairs for every leaf below list ``arr``."""
        items = []
        for i, val in enumerate(arr):
            indexed_key = f"{parent_key}[{i}]"
            if isinstance(val, dict):
                items.extend(flatten_dict(val, indexed_key, sep=sep).items())
            elif isinstance(val, list):
                items.extend(flatten_array(val, indexed_key, sep=sep))
            else:
                items.append((indexed_key, val))
        return items

    def flatten_dict(d: dict, parent_key='', sep='/'):
        """Return a ``{path: value}`` dict for every leaf below dict ``d``."""
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{sep}{k}" if parent_key else k
            if isinstance(v, dict) and len(v) == 1 and next(iter(v)).startswith("@"):
                # A dict holding a single "@"-prefixed key: emit it under the
                # key name with the "@" marker dropped.
                (attr_name, attr_value), = v.items()
                items.append((f"{new_key}{sep}{attr_name[1:]}", attr_value))
            elif isinstance(v, dict):
                items.extend(flatten_dict(v, new_key, sep=sep).items())
            elif isinstance(v, list):
                # One shared implementation for lists keeps indexing rules
                # identical at every nesting depth.
                items.extend(flatten_array(v, new_key, sep=sep))
            else:
                items.append((new_key, v))
        return dict(items)

    with open(os.path.expanduser(xml_file), 'r', encoding='utf-8') as xf:
        xml_payload = xf.read()

    # Drop the XML declaration before parsing, but only when the document
    # really starts with one. Searching for the first "?>" anywhere would cut
    # away real content in a file that has no declaration but does contain a
    # processing instruction further down.
    leading = xml_payload.lstrip('\ufeff \t\r\n')
    if leading.startswith('<?xml'):
        dec_end = leading.find('?>')
        if dec_end != -1:
            xml_payload = leading[dec_end + 2:]

    data = flatten_dict(xmltodict.parse(xml_payload))

    # Write with an explicit encoding so non-ASCII values survive regardless
    # of the platform default; newline='' is what the csv module expects.
    with open(os.path.expanduser(csv_file_path), 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(["Path", "Value"])
        csv_writer.writerows(data.items())

In [3]:
def states_simple_xml():
    """Write one small sample XML document per state to ``./files/<name>.xml``.

    Each file has the shape ``<states><state>...</state></states>`` with the
    child elements ``name``, ``population``, ``largest_city`` and
    ``land_mass_sqmi`` (the ``nickname`` field of the sample data is not
    emitted by this simple variant). The XML is pretty-printed with a
    two-space indent and written as UTF-8. Existing files are overwritten.

    Raises:
        OSError: If the output directory or a file cannot be created.
    """
    import os
    import xml.etree.ElementTree as ET
    from xml.dom.minidom import parseString

    data = [
        {"name": "Georgia", "nickname": "Peach State", "population": "10711908", "largest_city": "Atlanta", "land_mass_sqmi": "57906"},
        {"name": "California", "nickname": "Granola State", "population": "39538223", "largest_city": "Los Angeles", "land_mass_sqmi": "163696"},
    ]

    # Create the target directory up front so the function also works on a
    # fresh checkout where ./files does not exist yet.
    os.makedirs("./files", exist_ok=True)

    for state in data:
        # Every state gets its own document, so the root is built per file.
        root = ET.Element("states")
        state_element = ET.SubElement(root, "state")
        for field in ("name", "population", "largest_city", "land_mass_sqmi"):
            ET.SubElement(state_element, field).text = state[field]

        # ElementTree output is unindented; round-trip it through minidom
        # to get human-readable, indented XML.
        rough_string = ET.tostring(root, encoding="utf-8")
        pretty_xml_as_string = parseString(rough_string).toprettyxml(indent="  ")

        with open(f"./files/{state['name']}.xml", "w", encoding="utf-8") as f:
            f.write(pretty_xml_as_string)
        

In [4]:
# Generate the simple per-state sample files (./files/<name>.xml).
states_simple_xml()

In [7]:
def states_complex_xml():
    """Write one richer sample XML document per state to ``./files/<name>_complex.xml``.

    Compared with the simple variant, each ``<state>`` here also carries:

    * a ``nickname`` attribute on the ``<name>`` element,
    * an optional ``<water_sqmi>`` element (only when the sample row has it),
    * a ``<misc>`` element with one child per key of each ``misc`` entry.

    The XML is pretty-printed with a two-space indent and written as UTF-8.
    Existing files are overwritten.

    Raises:
        OSError: If the output directory or a file cannot be created.
    """
    import os
    import xml.etree.ElementTree as ET
    from xml.dom.minidom import parseString

    data = [
        {"name": "Georgia", "nickname": "Peach State", "population": "10711908", "largest_city": "Atlanta", "land_mass_sqmi": "57906", "misc": [{"climate": "warm"}, {"state_flower": "Cherokee Rose"}]},
        {"name": "California", "nickname": "Granola State", "population": "39538223", "largest_city": "Los Angeles", "land_mass_sqmi": "163696", "water_sqmi": "7737", "misc":[{"bird":"California Quail"}]},
    ]

    # Create the target directory up front so the function also works on a
    # fresh checkout where ./files does not exist yet.
    os.makedirs("./files", exist_ok=True)

    for state in data:
        # Every state gets its own document, so the root is built per file.
        root = ET.Element("states")
        state_element = ET.SubElement(root, "state")

        name_element = ET.SubElement(state_element, "name", nickname=state["nickname"])
        name_element.text = state["name"]

        for field in ("population", "largest_city", "land_mass_sqmi"):
            ET.SubElement(state_element, field).text = state[field]

        # Optional field: emitted only for rows that define it.
        if "water_sqmi" in state:
            ET.SubElement(state_element, "water_sqmi").text = state["water_sqmi"]

        if "misc" in state:
            misc_el = ET.SubElement(state_element, "misc")
            # Each misc entry is a small dict; every key becomes a child tag.
            for misc_item in state["misc"]:
                for key, value in misc_item.items():
                    ET.SubElement(misc_el, key).text = value

        # ElementTree output is unindented; round-trip it through minidom
        # to get human-readable, indented XML.
        rough_string = ET.tostring(root, encoding="utf-8")
        pretty_xml_as_string = parseString(rough_string).toprettyxml(indent="  ")

        with open(f"./files/{state['name']}_complex.xml", "w", encoding="utf-8") as f:
            f.write(pretty_xml_as_string)


In [8]:
# Regenerate both sample sets: ./files/<name>.xml and ./files/<name>_complex.xml.
states_simple_xml()
states_complex_xml()

In [None]:
import os
files = ['./files/California.xml', './files/Georgia.xml', './files/California_complex.xml', './files/Georgia_complex.xml']

for file_path in files:
    directory, fname = os.path.split(file_path)
    # Swap only the extension. A plain str.replace('xml', 'csv') would also
    # rewrite "xml" inside the file name itself and would leave an upper-case
    # ".XML" untouched, making the output path equal to the input path.
    stem, _ext = os.path.splitext(fname)
    csv_path = os.path.join(directory, stem + '.csv')
    flatten_xml_to_csv(file_path, csv_path)



In [11]:
import pandas as pd
import glob
# NOTE(review): states_simple_xml() in this notebook writes ./files/<name>.xml
# (no "_simple" suffix), so on a fresh run this pattern may match nothing.
# Confirm which files are meant to be stacked here.
f_pattern = './files/*_simple.xml'
# glob returns matches in arbitrary, OS-dependent order; sort so the stacked
# row order is reproducible between runs.
file_paths = sorted(glob.glob(f_pattern))
if not file_paths:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError(f"No XML files match pattern {f_pattern!r}")
df_list = [pd.read_xml(file_path) for file_path in file_paths]
df_stacked = pd.concat(df_list, ignore_index=True)
print(df_stacked)

         name  population largest_city  land_mass_sqmi
0  California    39538223  Los Angeles          163696
1     Georgia    10711908      Atlanta           57906


In [33]:
# Load the complex sample straight into a DataFrame.
# NOTE(review): ./files/states_complex.xml is not written by any cell visible
# in this notebook (states_complex_xml() writes <name>_complex.xml files) -
# confirm where this input comes from.
# The recorded output below shows the `misc` column holding only a newline,
# i.e. the nested <misc> children are not expanded into columns by this call.
df = pd.read_xml('./files/states_complex.xml')
print(df)

         name  population largest_city  land_mass_sqmi      misc  water_sqmi
0     Georgia    10711908      Atlanta           57906  \n               NaN
1  California    39538223  Los Angeles          163696  \n            7737.0


In [4]:
import pandas as pd

# Read the XML file into a DataFrame
# NOTE(review): ./files/states2.xml is not created by any cell visible in this
# notebook - it must already exist on disk for this cell to run.
xml_file = './files/states2.xml'
df = pd.read_xml(xml_file)

# Display the DataFrame
print(df)

# Save the DataFrame to a CSV file (index=False: do not write the row index as a column)
csv_file = './files/states3.csv'
df.to_csv(csv_file, index=False)

print(f"Data has been written to {csv_file}")


   land_size            name  population           city
0     423967      California    39538223    Los Angeles
1     695662           Texas    29145505        Houston
2     170312         Florida    21538187          Miami
3     141297        New York    20201249  New York City
4     119280    Pennsylvania    13002700   Philadelphia
5     149995        Illinois    12812508        Chicago
6     116096            Ohio    11799448       Columbus
7     153910         Georgia    10711908        Atlanta
8     139391  North Carolina    10439388      Charlotte
9     250487        Michigan    10077331        Detroit
Data has been written to ./files/states3.csv


In [7]:
import pandas as pd
from google.cloud import bigquery
import os

def load_csv_to_bigquery(file_path, table_id, client=None):
    """Append the rows of one CSV file to a BigQuery table.

    The CSV is read into a DataFrame, a ``filename`` column holding the
    file's base name (without extension) is added to every row so rows from
    different files stay distinguishable, and the frame is loaded with
    ``WRITE_APPEND`` / ``CREATE_IF_NEEDED`` and schema autodetection. The
    call blocks until the load job has finished.

    Args:
        file_path: Path of the CSV file to load.
        table_id: Destination table identifier passed to
            ``load_table_from_dataframe``.
        client: Optional existing ``bigquery.Client`` to reuse. When omitted,
            a new client is created for this call; passing one in avoids
            building a fresh client for every file when loading in a loop.

    Raises:
        Exception: Errors from reading the CSV or from the load job
            (``job.result()``) propagate unchanged.
    """
    df = pd.read_csv(file_path)

    # Tag each row with its source file, e.g. "./files/Georgia.csv" -> "Georgia".
    df['filename'] = os.path.splitext(os.path.basename(file_path))[0]

    if client is None:
        client = bigquery.Client()

    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        autodetect=True,
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
        create_disposition=bigquery.CreateDisposition.CREATE_IF_NEEDED,
    )

    job = client.load_table_from_dataframe(df, table_id, job_config=job_config)

    # Wait for the load job to complete
    job.result()

    print(f"Loaded {df.shape[0]} rows into {table_id} from {file_path}")

def main():
    """Load each flattened state CSV into the shared BigQuery table.

    Every path in the fixed list below is handed, in order, to
    ``load_csv_to_bigquery`` together with the same destination table.
    """
    # Replace with your project ID, dataset ID, and table ID
    destination_table = 'test_ds.states_flat'

    # Paths to your CSV files
    source_paths = ('./files/California.csv', './files/Georgia.csv')

    for source_path in source_paths:
        load_csv_to_bigquery(source_path, destination_table)

# Run the load when this cell is executed as the top-level module; the
# recorded output below shows both CSV files being loaded.
if __name__ == "__main__":
    main()





Loaded 7 rows into test_ds.states_flat from ./files/California.csv




Loaded 7 rows into test_ds.states_flat from ./files/Georgia.csv
