In [1]:
# load required libraries
import os
import gzip
import xml.etree.ElementTree as ET
import pandas as pd

In [4]:
def flatten_item(item):
    """Turn one XML element into a flat ``{tag: text}`` dict.

    Only the direct children of ``item`` are visited. When several children
    share a tag, the last one wins; a child with no text maps to ``None``.
    """
    return {node.tag: node.text for node in item}

def xml_to_csv(root, output):
    """Gather ``<item>`` records from gzipped XML feeds under ``root`` into one CSV.

    Every ``*.xml.gz`` file below ``root`` whose file name does not contain
    "tmc" is decompressed and parsed, and each ``channel/item`` element is
    flattened with ``flatten_item``. Only rows whose ``type`` field is
    numerically equal to 3 are kept (presumably the incident events, judging
    by the output file names -- confirm against the feed specification), and
    only the latitude/longitude/starttime/description_en/road_en columns are
    written.

    Parameters
    ----------
    root : str or os.PathLike
        Directory that is walked recursively for ``*.xml.gz`` files.
    output : str, os.PathLike or file-like
        Destination handed to ``DataFrame.to_csv``. When it is a path, its
        parent directory is created if it does not exist yet.

    Notes
    -----
    Files that cannot be decompressed or parsed are reported on stdout and
    skipped. If nothing usable is found, a header-only CSV is written instead
    of raising.
    """
    wanted_columns = ["latitude", "longitude", "starttime", "description_en", "road_en"]

    # get all .gz files, skipping tmc files; sorted so the row order of the
    # CSV does not depend on the filesystem's directory listing order
    gz_files = []
    for dirpath, dirnames, filenames in os.walk(root):
        dirnames.sort()  # in-place sort steers the order os.walk descends in
        for filename in sorted(filenames):
            lowered = filename.lower()
            if lowered.endswith(".xml.gz") and "tmc" not in lowered:
                gz_files.append(os.path.join(dirpath, filename))

    all_rows = []

    for gz_file in gz_files:
        try:
            # decompress the file and read XML
            with gzip.open(gz_file, 'rb') as f:
                xml_data = f.read()

            # parse XML (kept under its own name so the `root` argument is
            # not silently rebound)
            xml_root = ET.fromstring(xml_data)

            # extract all <item> elements
            for item in xml_root.findall(".//channel/item"):
                row = flatten_item(item)
                row["source_file"] = gz_file
                all_rows.append(row)

        except Exception as e:
            # deliberate best-effort: truncated or empty dumps are logged and
            # skipped so that one bad file does not abort the whole run
            print(f"❌ Error parsing {gz_file}: {e}")

    df = pd.DataFrame(all_rows)

    # reindex guarantees every column exists (filled with NaN) even when no
    # row was parsed or a field never occurs in the input
    df = df.reindex(columns=["type"] + wanted_columns)

    # filter needed rows/columns; missing or non-numeric types become NaN and
    # are dropped by the comparison instead of crashing an int conversion
    event_type = pd.to_numeric(df["type"], errors="coerce")
    df = df.loc[event_type == 3, wanted_columns]

    # save to CSV, creating the target directory for path-like outputs
    if isinstance(output, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(output))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    df.to_csv(output, index=False)


In [None]:
# Convert each yearly raw event dump into its own incidents CSV.
for raw_dir, incidents_csv in [
    ("../raw_data/iTIC-Longdo-Traffic-events-2021", "../processed_data/2021_incidents.csv"),
    ("../raw_data/iTIC-Longdo-Traffic-events-2022", "../processed_data/2022_incidents.csv"),
    ("../raw_data/iTIC-Longdo-Traffic-events-2023-incomplete", "../processed_data/2023_incidents.csv"),
]:
    xml_to_csv(raw_dir, incidents_csv)

❌ Error parsing data/iTIC-Longdo-Traffic-events-2021/03/05/202103051230-event.xml.gz: no element found: line 1, column 0
❌ Error parsing data/iTIC-Longdo-Traffic-events-2021/03/17/202103172200-event.xml.gz: no element found: line 1, column 0
❌ Error parsing data/iTIC-Longdo-Traffic-events-2021/04/04/202104041900-event.xml.gz: no element found: line 1, column 0
❌ Error parsing data/iTIC-Longdo-Traffic-events-2021/04/04/202104041700-event.xml.gz: no element found: line 1, column 0
❌ Error parsing data/iTIC-Longdo-Traffic-events-2021/04/27/202104270900-event.xml.gz: no element found: line 1, column 0
❌ Error parsing data/iTIC-Longdo-Traffic-events-2021/04/16/202104160600-event.xml.gz: mismatched tag: line 9367, column 185
❌ Error parsing data/iTIC-Longdo-Traffic-events-2021/04/16/202104160730-event.xml.gz: no element found: line 1, column 0
❌ Error parsing data/iTIC-Longdo-Traffic-events-2021/04/16/202104160630-event.xml.gz: mismatched tag: line 12106, column 6
❌ Error parsing data/iTIC-L