In [4]:
import xml.etree.ElementTree as ET
from xml.dom import minidom

def sort_xml(elem):
    """Reorder, in place, the children of ``elem`` and of every descendant.

    Siblings are ordered by tag name first and by their sorted
    ``(attribute, value)`` pairs second. Elements without children are left
    untouched. Returns ``None``; the tree is modified in place.
    """
    def _sibling_order(node):
        return node.tag, sorted(node.attrib.items())

    # Explicit worklist instead of recursion: each element is visited once and
    # the order in which parents are processed does not affect the result.
    pending = [elem]
    while pending:
        current = pending.pop()
        if len(current) == 0:
            continue
        current[:] = sorted(current, key=_sibling_order)
        pending.extend(current)

def prettify_xml(elem):
    """Return a pretty-printed XML string for ``elem``.

    The element is serialised with ElementTree, re-parsed with minidom and
    rendered with a two-space indent. Whitespace-only text nodes that sit next
    to child elements (i.e. the indentation of an already formatted document)
    are dropped first; otherwise ``toprettyxml`` emits each of them as an extra
    blank line and the output grows every time a file is re-processed.

    Parameters:
        elem: an ``xml.etree.ElementTree.Element``; it is not modified.

    Returns:
        str: the indented document, starting with an XML declaration.
    """
    rough_string = ET.tostring(elem, encoding='utf-8')
    parsed = minidom.parseString(rough_string)
    _drop_layout_whitespace(parsed)
    return parsed.toprettyxml(indent="  ")


def _drop_layout_whitespace(node):
    """Remove whitespace-only text nodes from every node that has element children."""
    has_element_child = False
    for child in list(node.childNodes):
        if child.nodeType == child.ELEMENT_NODE:
            has_element_child = True
            _drop_layout_whitespace(child)

    # A lone whitespace text inside a leaf element is real content; keep it.
    if not has_element_child:
        return

    for child in list(node.childNodes):
        if child.nodeType == child.TEXT_NODE and not child.data.strip():
            node.removeChild(child)

# Input document and destination of the sorted, pretty-printed copy
SOURCE_XML = "01_opendata.swiss/test/__.xml"
SORTED_XML = "file1_sorted.xml"

# Parse the input and keep a handle on its root element
tree = ET.parse(SOURCE_XML)
root = tree.getroot()

# Order every level of the tree in place
sort_xml(root)

# Write the formatted result as UTF-8 text
with open(SORTED_XML, mode="w", encoding="utf-8") as sorted_file:
    sorted_file.write(prettify_xml(root))


In [3]:
import os
import pandas as pd

folder_path = "01_opendata.swiss/test"  # Replace with the actual folder path

# Keep only the XML files and strip the extension to obtain the dataset name
xml_suffix = ".xml"
dataset_names = [
    entry[:-len(xml_suffix)]
    for entry in os.listdir(folder_path)
    if entry.endswith(xml_suffix)
]

# One-column table of dataset names, written without the index column
df = pd.DataFrame({"Dataset_Name": dataset_names})
df.to_csv("01_opendata.swiss/dataset_names.csv", index=False)


In [4]:
import hashlib
import os
import pandas as pd

# Folder whose XML files are hashed below (relative path, resolved against the
# kernel's working directory)
folder_path = "01_opendata.swiss/test"  # Change this to the correct folder

# Function to compute SHA-256 hash of a file
def file_hash(file_path):
    """Return the SHA-256 hex digest of the file at ``file_path``.

    The file is read in binary mode and fed to the hash in fixed-size chunks.
    Returns ``None`` when the file does not exist.
    """
    digest = hashlib.sha256()
    try:
        with open(file_path, 'rb') as stream:
            for chunk in iter(lambda: stream.read(65536), b""):
                digest.update(chunk)
    except FileNotFoundError:
        return None
    return digest.hexdigest()

# List all XML files in the folder and compute their hashes.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# generated CSV stable between runs.
files = sorted(f for f in os.listdir(folder_path) if f.endswith(".xml"))
df = pd.DataFrame({"Dataset_Name": files, "dataset_hash": [file_hash(os.path.join(folder_path, f)) for f in files]})

# Save the hashes to a CSV file
hash_csv = "01_opendata.swiss/NHF.csv"
df.to_csv(hash_csv, index=False)

# Display first few rows
print(df.head())

# Report the file that was actually written (the message used to name
# dataset_names.csv, which this cell never touches)
print(f"Hashes calculated and saved to {hash_csv}")


  Dataset_Name                                       dataset_hash
0     test.xml  e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b93...
Hashes calculated and saved to dataset_names.csv


In [5]:
import pandas as pd
import os

# File paths: NHF holds the hashes of the current files, OHF the hashes
# recorded by the previous run
nhf_file = "01_opendata.swiss/NHF.csv"
ohf_file = "01_opendata.swiss/OHF.csv"

# Load NHF file
nhf_df = pd.read_csv(nhf_file)

# Check if OHF file exists
if os.path.exists(ohf_file):
    ohf_df = pd.read_csv(ohf_file)

    if not nhf_df.empty:
        # Left merge keeps every NHF entry; the OHF hash is NaN when the
        # dataset was not present in the previous run
        comparison_df = pd.merge(nhf_df, ohf_df, on="Dataset_Name", how="left", suffixes=('_NHF', '_OHF'))

        # Define status column for NHF-based comparison
        def get_status(row):
            """Classify one merged row as 'new', 'found' or 'changed'."""
            if pd.isna(row["dataset_hash_OHF"]):  # Not found in OHF
                return "new"
            elif row["dataset_hash_NHF"] == row["dataset_hash_OHF"]:  # Exact match
                return "found"
            else:  # Hash is different
                return "changed"

        comparison_df["status"] = comparison_df.apply(get_status, axis=1)

        # Keep the NHF hash under the plain column name and drop the OHF hash
        comparison_df = (
            comparison_df
            .rename(columns={"dataset_hash_NHF": "dataset_hash"})
            .drop(columns=["dataset_hash_OHF"])
        )
    else:
        # apply(axis=1) on an empty frame does not yield a Series that can be
        # assigned to a single column, so build the empty result directly
        comparison_df = pd.DataFrame(columns=["Dataset_Name", "dataset_hash", "status"])

    # Identify entries in OHF that are missing in NHF (i.e., removed).
    # .copy() makes the slice independent before a column is added to it.
    removed_df = ohf_df[~ohf_df["Dataset_Name"].isin(nhf_df["Dataset_Name"])].copy()
    removed_df["status"] = "removed"

    # Select relevant columns for removed_df
    removed_df = removed_df[["Dataset_Name", "dataset_hash", "status"]]

    # Combine both dataframes
    final_df = pd.concat([comparison_df, removed_df], ignore_index=True)
else:
    # If OHF does not exist, mark all datasets as new. Work on a copy so that
    # nhf_df stays as loaded and the cell can be re-run safely.
    final_df = nhf_df.copy()
    final_df["status"] = "new"

# Display or save the output
print(final_df)
final_df.to_csv("01_opendata.swiss/comparison_result.csv", index=False)


  Dataset_Name                                       dataset_hash status
0     test.xml  e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b93...    new


In [6]:
import pandas as pd
import os

# Read the comparison table produced by the previous step
comparison_file = "01_opendata.swiss/comparison_result.csv"
df = pd.read_csv(comparison_file)

# Directory the listed files live in
folder_path = "01_opendata.swiss/test"

# Names of all entries whose status is "found"
files_to_remove = df.loc[df["status"] == "found", "Dataset_Name"].tolist()

# Delete each of them, reporting the ones that are already gone
for file_name in files_to_remove:
    file_path = os.path.join(folder_path, file_name)
    if not os.path.exists(file_path):
        print(f"File not found: {file_name}")
        continue
    os.remove(file_path)
    print(f"Removed: {file_name}")

print("Cleanup complete.")


Cleanup complete.


In [7]:
import pandas as pd

# Read the comparison table
comparison_file = "01_opendata.swiss/comparison_result.csv"
df = pd.read_csv(comparison_file)

# Rows whose status is "changed" or "removed"
needs_removal = df["status"].isin(["changed", "removed"])
remove_order_df = df.loc[needs_removal]

# Write those rows to their own CSV file
remove_order_file = "01_opendata.swiss/removeorder_metadata.csv"
remove_order_df.to_csv(remove_order_file, index=False)

print(f"Remove order metadata saved to {remove_order_file}")

Remove order metadata saved to 01_opendata.swiss/removeorder_metadata.csv


In [8]:
import pandas as pd

# Read the comparison table
comparison_file = "01_opendata.swiss/comparison_result.csv"
df = pd.read_csv(comparison_file)

# Keep name and hash of every entry that is found, changed or new
valid_statuses = ["found", "changed", "new"]
keep_mask = df["status"].isin(valid_statuses)
filtered_df = df.loc[keep_mask, ["Dataset_Name", "dataset_hash"]]

# Overwrite OHF.csv with the retained entries
oh_file = "01_opendata.swiss/OHF.csv"
filtered_df.to_csv(oh_file, index=False)

print(f"OHF.csv has been updated with relevant dataset entries.")


OHF.csv has been updated with relevant dataset entries.


### Combined pipeline: hash, compare, clean up, and update the previous import


In [1]:
import os
import hashlib
import pandas as pd

# All pipeline files except the remove-order CSV live under one base directory
base_dir = "01_opendata.swiss"

folder_path = f"{base_dir}/test"  # Replace with the actual folder path
latest_import = f"{base_dir}/latest_import.csv"
previous_import = f"{base_dir}/previous_import.csv"
comparison_file = f"{base_dir}/comparison_result.csv"
# Written relative to the working directory, not under base_dir
remove_order_file = "removeorder_metadata.csv"

# Function to compute SHA-256 hash of a file
def file_hash(file_path):
    """Compute the SHA-256 hex digest of a file's bytes.

    Returns the digest as a hexadecimal string, or ``None`` if ``file_path``
    does not exist.
    """
    block_size = 1 << 16
    hasher = hashlib.sha256()
    try:
        with open(file_path, 'rb') as source:
            while True:
                block = source.read(block_size)
                if not block:
                    break
                hasher.update(block)
    except FileNotFoundError:
        return None
    return hasher.hexdigest()

# List all XML files in the folder and compute their names and hashes.
# os.listdir() returns entries in arbitrary order; sorting keeps the CSVs
# reproducible between runs.
dataset_names = []
dataset_hashes = []

for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".xml"):
        dataset_names.append(filename[:-4])  # Remove the file extension
        file_path = os.path.join(folder_path, filename)
        dataset_hashes.append(file_hash(file_path))

# Save latest import
latest_df = pd.DataFrame({"Dataset_Name": dataset_names, "dataset_hash": dataset_hashes})
latest_df.to_csv(latest_import, index=False)

# Compare latest with previous if previous exists
if os.path.exists(previous_import):
    # Read the names as text: without an explicit dtype, a column of purely
    # numeric names ("101", "007") is parsed as integers and no longer matches
    # the file names on disk.
    previous_df = pd.read_csv(previous_import, dtype={"Dataset_Name": str})

    # Ensure consistent types for merge
    latest_df["Dataset_Name"] = latest_df["Dataset_Name"].astype(str)
    previous_df["Dataset_Name"] = previous_df["Dataset_Name"].astype(str)

    if not latest_df.empty:
        # Left merge keeps every current entry; the previous hash is NaN when
        # the dataset was not part of the previous import
        comparison_df = pd.merge(latest_df, previous_df, on="Dataset_Name", how="left", suffixes=('_latest', '_previous'))

        # Define status column for comparison
        def get_status(row):
            """Classify one merged row as 'new', 'found' or 'changed'."""
            if pd.isna(row["dataset_hash_previous"]):
                return "new"
            elif row["dataset_hash_latest"] == row["dataset_hash_previous"]:
                return "found"
            else:
                return "changed"

        comparison_df["status"] = comparison_df.apply(get_status, axis=1)

        # Rename and clean columns
        comparison_df = (
            comparison_df
            .rename(columns={"dataset_hash_latest": "dataset_hash"})
            .drop(columns=["dataset_hash_previous"])
        )
    else:
        comparison_df = pd.DataFrame(columns=["Dataset_Name", "dataset_hash", "status"])

    # Identify entries in previous import missing in latest (i.e., removed).
    # .copy() detaches the slice so that adding a column does not trigger a
    # SettingWithCopyWarning or write into previous_df.
    removed_df = previous_df[~previous_df["Dataset_Name"].isin(latest_df["Dataset_Name"])].copy()
    removed_df["status"] = "removed"

    # Combine current and removed datasets
    final_df = pd.concat([comparison_df, removed_df], ignore_index=True)
else:
    # No previous import: everything is new. Work on a copy so latest_df keeps
    # exactly the columns that were written to latest_import.
    final_df = latest_df.copy()
    final_df["status"] = "new"

# Save comparison result
final_df.to_csv(comparison_file, index=False)

# Display sample
print(final_df.head())
print("Hashes calculated and saved to comparison_result.csv")

# Cleanup step: remove files marked as 'found'.
# Use the in-memory table rather than re-reading the CSV, which would re-infer
# column types and could turn numeric-looking names into numbers.
df = final_df
files_to_remove = df.loc[df["status"] == "found", "Dataset_Name"].tolist()

for file_name in files_to_remove:
    file_path = os.path.join(folder_path, file_name + ".xml")
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Removed: {file_name}.xml")
    else:
        print(f"File not found: {file_name}.xml")

print("Cleanup complete.")

# Save metadata for files marked as 'changed' or 'removed'
remove_order_df = df[df["status"].isin(["changed", "removed"])]
remove_order_df.to_csv(remove_order_file, index=False)
print(f"Remove order metadata saved to {remove_order_file}")

# Update previous import with found, changed, and new entries
valid_statuses = ["found", "changed", "new"]
filtered_df = df.loc[df["status"].isin(valid_statuses), ["Dataset_Name", "dataset_hash"]]
filtered_df.to_csv(previous_import, index=False)
print("previous_import.csv has been updated with relevant dataset entries.")


                               Dataset_Name  \
0             10-ag-bieneninspektionskreise   
1                         11-ag-schulkreise   
2  116-ch-kataster-der-belasteten-standorte   
3                                        __   
4                                     __101   

                                        dataset_hash status  
0  c50317be73339eecb370cf489b4379b79b6e8675e85031...  found  
1  6d5f3b6d1fbc67cbd9d19d7fb9802a98376e7e5b284693...  found  
2  85be8ac70f3fd7158b3a86e71b3e668f86f6011bd44e34...  found  
3  c4c3e4b2e37f13dfbec088dcd36841a3e7e6db3d3fa716...  found  
4  2178eb1e48cde5c0b9c0a24689203e4b72be9768404563...  found  
Hashes calculated and saved to comparison_result.csv
Removed: 10-ag-bieneninspektionskreise.xml
Removed: 11-ag-schulkreise.xml
Removed: 116-ch-kataster-der-belasteten-standorte.xml
Removed: __.xml
Removed: __101.xml
Cleanup complete.
Remove order metadata saved to removeorder_metadata.csv
previous_import.csv has been updated with relevant data

In [88]:
# compare_dataset_hashes comes from the project-local module dataset_change_detector;
# its signature is not visible in this notebook.
# NOTE(review): presumably the first argument is the portal's base folder and the
# second the name of the remove-order CSV - confirm against the module.
from dataset_change_detector import compare_dataset_hashes

# NOTE(review): the recorded output of this cell reports "removeorder_metadata.csv"
# although a different file name is passed here - verify that the helper actually
# uses this argument.
compare_dataset_hashes("01_opendata.swiss", "removeorder_metadata_opendata.swiss.csv")


Empty DataFrame
Columns: [Dataset_Name, dataset_hash, status]
Index: []
Hashes calculated and saved to comparison_result.csv
Cleanup complete.
Remove order metadata saved to removeorder_metadata.csv
previous_import.csv has been updated with relevant dataset entries.
