In [7]:
import pandas as pd
import requests
import datetime
import hashlib
import json

def fetch_metadata(url):
    """Fetch JSON metadata from an API endpoint."""
    response = requests.get(url)
    response.raise_for_status()
    return response.json()

def generate_hash(data):
    """Generate a SHA-256 hash for the dataset metadata."""
    data_str = json.dumps(data, sort_keys=True)  # Convert JSON to sorted string
    return hashlib.sha256(data_str.encode()).hexdigest()  # Compute hash

def merge_metadata(dataframes):
    """Merge multiple metadata DataFrames while preserving all columns."""
    merged_df = pd.concat(dataframes, ignore_index=True, sort=False)
    return merged_df

def save_to_csv(df, filename="merged_metadata.csv"):
    """Save the merged DataFrame to a CSV file."""
    df.to_csv(filename, index=False)
    print(f"Saved merged metadata to {filename}")

if __name__ == "__main__":
    # Read dataset names from datasets.csv
    datasets_df = pd.read_csv("opendata_swiss_datasets.csv")

    # Pick 10 random datasets from the CSV (assuming the dataset names are in the 'Dataset_Name' column)
    random_datasets = datasets_df['Dataset_Name'].sample(n=10, random_state=42).tolist()

    # List to store dataframes
    dataframes = []

    # Fetch and store API responses for the 10 random datasets
    for dataset in random_datasets:
        url = f"https://ckan.opendata.swiss/api/3/action/package_show?id={dataset}"
        timestamp = datetime.datetime.now().isoformat()

        # Fetch metadata
        data = fetch_metadata(url)

        # Generate hash
        dataset_hash = generate_hash(data)

        # Normalize JSON and add metadata columns
        df = pd.json_normalize(data)
        df['Dataset_Name'] = dataset  # Add dataset name column
        df['Request_Timestamp'] = timestamp  # Add timestamp column
        df['Metadata_Hash'] = dataset_hash  # Add hash column

        dataframes.append(df)

    # Merge all the dataframes
    merged_df = merge_metadata(dataframes)

    # Save the merged data to CSV
    save_to_csv(merged_df)


Saved merged metadata to merged_metadata.csv


### explode result.resources

In [8]:
import pandas as pd
import ast

# Read the CSV file
df_merged = pd.read_csv("merged_metadata.csv")

# Convert 'result.resources' column from string representation to a list of dictionaries
df_merged['result.resources'] = df_merged['result.resources'].apply(ast.literal_eval)

# Explode the 'result.resources' column while keeping the 'Dataset_Name' column
df_resources = df_merged.explode('result.resources')

# Convert dictionary entries to separate columns
df_resources = df_resources[['Dataset_Name', 'result.resources']].reset_index(drop=True)
df_resources = df_resources.join(df_resources.pop('result.resources').apply(pd.Series))



# Optionally, save the extracted resource details to a CSV file
df_resources.to_csv("resources_metadata.csv", index=False)
print("Saved extracted resources to resources_metadata.csv")


Saved extracted resources to resources_metadata.csv


In [10]:
from rdflib import Graph
import pandas as pd
from sqlalchemy import create_engine

# RDF-Datei laden
g = Graph()
g.parse("langsamverkehr-wanderland-schweiz.xml", format="xml")  # Oder "turtle" für .ttl

# In eine Liste von Tripeln konvertieren
data = [(str(s), str(p), str(o)) for s, p, o in g]

# In ein DataFrame umwandeln
df_tripple = pd.DataFrame(data, columns=["subject", "predicate", "object"])


df_tripple.to_csv("df_tripple.csv", index=False)
print("Saved extracted resources to df_tripple.csv")

Saved extracted resources to df_tripple.csv


In [11]:
from rdflib import Graph, Literal
import pandas as pd

# RDF-Datei laden
g = Graph()
g.parse("langsamverkehr-wanderland-schweiz.xml", format="xml")

# In eine Liste von Tripeln konvertieren (mit Sprachinformation)
data = []
for s, p, o in g:
    if isinstance(o, Literal):  # Nur für Literal-Objekte prüfen
        lang = o.language if o.language else "unknown"  # Falls keine Sprache angegeben ist
        data.append((str(s), str(p), str(o), lang))
    else:
        data.append((str(s), str(p), str(o), "N/A"))  # Falls es kein Literal ist, keine Sprache

# In ein DataFrame umwandeln
df_tripple = pd.DataFrame(data, columns=["subject", "predicate", "object", "language"])

# Speichern als CSV
df_tripple.to_csv("df_tripple_with_lang.csv", index=False)
print("Saved extracted resources with language information to df_tripple_with_lang.csv")


Saved extracted resources with language information to df_tripple_with_lang.csv


### Save all datasets

In [58]:
import requests
import xml.etree.ElementTree as ET
import os

# Set maximum number of datasets to download for testing
MAX_DATASETS = 5  # Change this value for development

def fetch_xml_metadata(identifier):
    """Fetch XML metadata from an API endpoint."""
    url = f"https://ckan.opendata.swiss/dataset/{identifier}.xml"
    print(f"Fetching XML: {url}")

    response = requests.get(url)
    response.raise_for_status()  # Raise an error if request fails
    return response.content  # Return raw XML content

def parse_xml_metadata(xml_content):
    """Parse XML metadata and extract relevant information."""
    root = ET.fromstring(xml_content)

    # Define namespaces
    namespace = {"dct": "http://purl.org/dc/terms/", "dcat": "http://www.w3.org/ns/dcat#"}

    # Extract key metadata
    title = root.find(".//dct:title", namespace)
    description = root.find(".//dct:description", namespace)
    modified = root.find(".//dct:modified", namespace)

    # Return extracted metadata
    return {
        "Title": title.text if title is not None else "N/A",
        "Description": description.text if description is not None else "N/A",
        "Modified": modified.text if modified is not None else "N/A"
    }

# === READ IDENTIFIERS FROM CSV ===
data_file = "opendata_swiss_datasets.csv"
identifiers = []
with open(data_file, "r", encoding="utf-8") as file:
    identifiers = [line.strip() for line in file.readlines()[1:]]  # Skip header

# Limit the number of datasets for testing
identifiers = identifiers[:MAX_DATASETS]

# === SET FOLDER PATH ===
save_folder = "saved_metadata_xml"  # Change this to your desired folder name

# Create folder if it doesn't exist
os.makedirs(save_folder, exist_ok=True)

# === FETCH AND SAVE XML METADATA ===
for index, identifier in enumerate(identifiers, start=1):
    print(f"\nProcessing {index}/{len(identifiers)}: {identifier}")
    try:
        xml_data = fetch_xml_metadata(identifier)
        metadata = parse_xml_metadata(xml_data)

        # Print extracted metadata
        print("✅ Extracted Metadata:")
        for key, value in metadata.items():
            print(f"{key}: {value}")

        # Save raw XML to the folder
        xml_filename = os.path.join(save_folder, f"{identifier}.xml")
        with open(xml_filename, "wb") as file:
            file.write(xml_data)

        print(f"📂 XML file saved in: {xml_filename}")
    except Exception as e:
        print(f"⚠️ Failed to process {identifier}: {e}")


OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: 'saved_metadata_xml\x07eromagnetische-karte-der-schweiz-1-500000.xml'

### lets go

In [11]:
import requests
import xml.etree.ElementTree as ET
import os

# Set maximum number of datasets to download for testing
MAX_DATASETS = None  # Set to None to download all datasets  # Change this value for development

def fetch_xml_metadata(identifier):
    """Fetch XML metadata from an API endpoint."""
    url = f"https://ckan.opendata.swiss/dataset/{identifier}.xml"
    print(f"Fetching XML: {url}")

    response = requests.get(url)
    response.raise_for_status()  # Raise an error if request fails
    return response.content  # Return raw XML content

def parse_xml_metadata(xml_content):
    """Parse XML metadata and extract relevant information."""
    root = ET.fromstring(xml_content)

    # Define namespaces
    namespace = {"dct": "http://purl.org/dc/terms/", "dcat": "http://www.w3.org/ns/dcat#"}

    # Extract key metadata
    title = root.find(".//dct:title", namespace)
    description = root.find(".//dct:description", namespace)
    modified = root.find(".//dct:modified", namespace)

    # Return extracted metadata
    return {
        "Title": title.text if title is not None else "N/A",
        "Description": description.text if description is not None else "N/A",
        "Modified": modified.text if modified is not None else "N/A"
    }

# === READ IDENTIFIERS FROM CSV ===
data_file = "opendata_swiss_datasets.csv"
identifiers = []
with open(data_file, "r", encoding="utf-8") as file:
    identifiers = [line.strip() for line in file.readlines()[1:]]  # Skip header

# Limit the number of datasets for testing
identifiers = identifiers[:MAX_DATASETS]

# === SET FOLDER PATH ===
save_folder = "saved_metadata_xml"  # Change this to your desired folder name

# Create folder if it doesn't exist
os.makedirs(save_folder, exist_ok=True)

# === FETCH AND SAVE XML METADATA ===
for index, identifier in enumerate(identifiers, start=1):
    print(f"\nProcessing {index}/{len(identifiers)}: {identifier}")
    try:
        xml_data = fetch_xml_metadata(identifier)
        metadata = parse_xml_metadata(xml_data)

        # Print extracted metadata
        print("✅ Extracted Metadata:")
        for key, value in metadata.items():
            print(f"{key}: {value}")

        # Save raw XML to the folder
        xml_filename = os.path.join(save_folder, f"{identifier}.xml")
        with open(xml_filename, "wb") as file:
            file.write(xml_data)

        print(f"📂 XML file saved in: {xml_filename}")
    except Exception as e:
        print(f"⚠️ Failed to process {identifier}: {e}")



Processing 1/13135: __
Fetching XML: https://ckan.opendata.swiss/dataset/__.xml
✅ Extracted Metadata:
Title: Carta del rischio di erosione dei terreni coltivi, categorizzazione qualitativa
Description: Carta del rischio di erosione dei terreni coltivi, categorizzazione qualitativa (Ufficio federale dell’agricoltura) Carta del rischio di erosione dei terreni coltivi della Svizzera realizzata con un reticolo a celle di 2x2 metri sulla base di SwissALTI3D e dei dati cantonali sulle superfici riguardanti i terreni coltivi (stato 2021). È riportato il potenziale rischio di erosione qualitativo. La valutazione complessiva viene classificata in uno dei tre livelli di pericolo (nessun pericolo, pericolo, pericolo elevato), senza considerare l’utilizzo o il tipo di gestione del suolo. La perdita di suolo media pluriennale è calcolata basandosi sulla Universal Soil Loss Equation (USLE).
Modified: N/A
📂 XML file saved in: saved_metadata_xml\__.xml

Processing 2/13135: __101
Fetching XML: https:/

KeyboardInterrupt: 

### Kill all carriage returns :)

In [33]:
import os
import re
import xml.etree.ElementTree as ET

def remove_html_tags(text):
    """Remove HTML tags from a string while preserving content."""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def clean_xml_file(input_path, output_path):
    """Process an XML file to remove HTML tags and save it in a single line."""
    try:
        # Parse XML
        tree = ET.parse(input_path)
        root = tree.getroot()

        # Recursively clean text elements
        for elem in root.iter():
            if elem.text:
                elem.text = remove_html_tags(elem.text)

        # Convert XML tree to a single-line string
        cleaned_xml = ET.tostring(root, encoding='utf-8').decode('utf-8')
        cleaned_xml = cleaned_xml.replace('\n', '').replace('\t', '')

        # Write output
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_xml)

        print(f"Processed: {os.path.basename(input_path)} -> {os.path.basename(output_path)}")
    
    except Exception as e:
        print(f"Error processing {input_path}: {e}")

def process_folder(input_folder, output_folder):
    """Process all XML files in the input folder."""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(input_folder):
        if filename.endswith(".xml"):
            input_path = os.path.join(input_folder, filename)
            output_path = os.path.join(output_folder, filename)
            clean_xml_file(input_path, output_path)

# Example Usage
input_folder = "saved_metadata_xml"  # Change to your actual folder path
output_folder = "saved_metadata_xml"

process_folder(input_folder, output_folder)


Processed: 1-ag-bezirke.xml -> 1-ag-bezirke.xml
Processed: 1-personenhaushalte-anz.xml -> 1-personenhaushalte-anz.xml
Processed: 10-ag-bieneninspektionskreise.xml -> 10-ag-bieneninspektionskreise.xml
Processed: 11-ag-schulkreise.xml -> 11-ag-schulkreise.xml
Processed: 116-ch-kataster-der-belasteten-standorte.xml -> 116-ch-kataster-der-belasteten-standorte.xml
Processed: 13-ag-inventare-der-baudenkmaler-und-kulturobjekte.xml -> 13-ag-inventare-der-baudenkmaler-und-kulturobjekte.xml
Processed: 130-ch-gewasserschutzbereiche.xml -> 130-ch-gewasserschutzbereiche.xml
Processed: 131-ch-grundwasserschutzzonen.xml -> 131-ch-grundwasserschutzzonen.xml
Processed: 132-ch-grundwasserschutzareale.xml -> 132-ch-grundwasserschutzareale.xml
Processed: 145-ch-larmempfindlichkeitsstufen-in-nutzungszonen.xml -> 145-ch-larmempfindlichkeitsstufen-in-nutzungszonen.xml
Processed: 15-ag-archaologische-fundstellen.xml -> 15-ag-archaologische-fundstellen.xml
Processed: 15-jahresverbrauch-ha.xml -> 15-jahresverbr

In [6]:
import os
import re

# Define folder path
folder_path = "saved_metadata_xml"

# Ensure folder exists
if not os.path.exists(folder_path):
    print(f"Error: Folder '{folder_path}' not found.")
    exit()

# Process each XML file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".xml"):
        file_path = os.path.join(folder_path, filename)

        # Read file content
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        # Remove all occurrences of carriage return codes
        content = re.sub(r"(&#13;|&#10;|&#xD;|&#xA;)", " ", content)

        # Ensure all lines end with ">"
        cleaned_lines = []
        temp_line = ""

        for line in content.splitlines():
            line = line.strip()

            # If the line doesn't end with ">", merge it with the next
            if not line.endswith(">"):
                temp_line += line + " "
            else:
                temp_line += line
                cleaned_lines.append(temp_line.strip())
                temp_line = ""

        # If there's any remaining text in temp_line, add it
        if temp_line:
            cleaned_lines.append(temp_line.strip())

        # Save the cleaned file
        with open(file_path, "w", encoding="utf-8") as file:
            file.write("\n".join(cleaned_lines) + "\n")

        print(f"Processed: {filename}")

print("Carriage return and line break cleanup complete for all XML files!")


Processed: 1-ag-bezirke.xml
Processed: 1-personenhaushalte-anz.xml
Processed: 10-ag-bieneninspektionskreise.xml
Processed: 11-ag-schulkreise.xml
Processed: 116-ch-kataster-der-belasteten-standorte.xml
Processed: 13-ag-inventare-der-baudenkmaler-und-kulturobjekte.xml
Processed: 130-ch-gewasserschutzbereiche.xml
Processed: 131-ch-grundwasserschutzzonen.xml
Processed: 132-ch-grundwasserschutzareale.xml
Processed: 145-ch-larmempfindlichkeitsstufen-in-nutzungszonen.xml
Processed: 15-ag-archaologische-fundstellen.xml
Processed: 15-jahresverbrauch-ha.xml
Processed: 15-jahresverbrauch.xml
Processed: 159-ch-waldabstandslinien.xml
Processed: 16-ag-bevolkerungsschutzregionen.xml
Processed: 17-ag-beurteilungsgebiete-zur-schutzraumsteuerung.xml
Processed: 17-ch-inventar-der-historischen-verkehrswege-der-schweiz-regional-und-lokal.xml
Processed: 184-ch-kantonale-ausnahmetransportrouten.xml
Processed: 19-ag-polizeiregionen-oeffentlich.xml
Processed: 2-ag-kreise-des-kantons-aargau.xml
Processed: 2-per

### Extract Data

In [7]:
import xml.etree.ElementTree as ET
import pandas as pd
import os

def extract_multilang_elements(element_name, dataset_element, namespace, default_label):
    """Extract multilingual elements like descriptions, titles, and keywords from XML."""
    elements = {}
    collected_values = {}
    
    for element in dataset_element.findall(f".//{element_name}", namespace):
        lang_attr = element.get("{http://www.w3.org/XML/1998/namespace}lang", "unknown").upper()
        text_value = element.text.strip() if element.text else "N/A"
        
        if lang_attr in collected_values:
            collected_values[lang_attr].append(text_value)
        else:
            collected_values[lang_attr] = [text_value]
    
    for lang, values in collected_values.items():
        elements[f"{default_label}_{lang}"] = ", ".join(values)
    
    if not elements:
        elements[f"{default_label}_UNKNOWN"] = "N/A"
    
    return elements

def extract_text(element, tag, namespace, default="N/A"):
    """Extract text from an XML element."""
    found_element = element.find(tag, namespace)
    return found_element.text.strip() if found_element is not None and found_element.text else default

def extract_attribute(element, tag, attribute, namespace, default="N/A"):
    """Extract attribute from an XML element."""
    found_element = element.find(tag, namespace)
    return found_element.get(attribute, default) if found_element is not None else default

def extract_identifier(dataset_element, namespace):
    """Extract dataset identifier."""
    identifier_element = dataset_element.find("dct:identifier", namespace)
    return identifier_element.text if identifier_element is not None else "N/A"

def extract_list(element, tag, namespace):
    """Extract a list of text values from multiple occurrences of a tag."""
    return [elem.text.strip() for elem in element.findall(tag, namespace) if elem.text] or ["N/A"]

def extract_issued_date(distribution_element, namespace):
    """Extract issued date from distribution."""
    issued_element = distribution_element.find(".//dct:issued", namespace)
    return issued_element.text if issued_element is not None else "N/A"

def extract_multilang_attributes(element, tag, namespace, default_label):
    """Extract multilingual elements from XML."""
    elements = {}
    for elem in element.findall(tag, namespace):
        lang_attr = elem.get("{http://www.w3.org/XML/1998/namespace}lang", "unknown").upper()
        elements[f"{default_label}_{lang_attr}"] = elem.text.strip() if elem.text else "N/A"
    return elements if elements else {f"{default_label}_UNKNOWN": "N/A"}

def extract_distributions(dataset_element, namespace, dataset_id):
    """Extract distribution metadata linked to dataset ID."""
    distributions = []
    for distribution_element in dataset_element.findall(".//dcat:Distribution", namespace):
        access_url_element = distribution_element.find("dcat:accessURL", namespace)
        access_url = access_url_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if access_url_element is not None else "N/A"
        license_element = distribution_element.find("dct:license", namespace)
        license_url = license_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if license_element is not None else "N/A"
        rights_element = distribution_element.find("dct:rights", namespace)
        rights_text = rights_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if rights_element is not None else "N/A"
        byte_size_element = distribution_element.find("dcat:byteSize", namespace)
        byte_size = byte_size_element.text if byte_size_element is not None else "N/A"
        format_element = distribution_element.find("dct:format", namespace)
        format_url = format_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if format_element is not None else "N/A"
        media_type_element = distribution_element.find("dcat:mediaType", namespace)
        media_type = media_type_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if media_type_element is not None else "N/A"
        modified_element = distribution_element.find("dct:modified", namespace)
        modified_date = modified_element.text if modified_element is not None else "N/A"
        languages = [lang.text.strip() for lang in distribution_element.findall("dct:language", namespace) if lang.text] or ["N/A"]
        titles = extract_multilang_elements("dct:title", distribution_element, namespace, "title")
        descriptions = extract_multilang_elements("dct:description", distribution_element, namespace, "description")
        distribution_identifier_element = distribution_element.find("dct:identifier", namespace)
        distribution_identifier = distribution_identifier_element.text.strip() if distribution_identifier_element is not None else "N/A"
        download_url_element = distribution_element.find("dcat:downloadURL", namespace)
        download_url = download_url_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if download_url_element is not None else "N/A"
        documentation_element = distribution_element.find(".//foaf:page/foaf:Document", namespace)
        documentation_url = documentation_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", "N/A") if documentation_element is not None else "N/A"
        distribution_temporal_resolution_element = distribution_element.find("dcat:temporalResolution", namespace)
        distribution_temporal_resolution = distribution_temporal_resolution_element.text if distribution_temporal_resolution_element is not None else "N/A"
        coverage_elements = [elem.text.strip() for elem in distribution_element.findall("{http://purl.org/dc/terms/}coverage") if elem.text]
        coverage = coverage_elements if coverage_elements else ["N/A"]
        distribution_entry = {
            "dataset_id": dataset_id,
            "distribution_id": distribution_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", "N/A"),
            "issued_date": extract_issued_date(distribution_element, namespace),
            "modified_date": modified_date,
            "access_url": access_url,
            "license": license_url,
            "rights": rights_text,
            "byte_size": byte_size,
            "format": format_url,
            "media_type": media_type,
            "language": languages,
            "download_url": download_url,
            "coverage": coverage,
            "distribution_temporal_resolution": distribution_temporal_resolution,
            "distribution_documentation": documentation_url,
            "distribution_identifier": distribution_identifier
        }
        distribution_entry.update(titles)
        distribution_entry.update(descriptions)
        distributions.append(distribution_entry)
    return distributions

def extract_contact_points(dataset_element, namespace, dataset_id):
    """Extract contact points from dataset."""
    contact_points = []
    for contact_element in dataset_element.findall(".//dcat:contactPoint", namespace):
        organization_element = contact_element.find("vcard:Organization", namespace)
        individual_element = contact_element.find("vcard:Individual", namespace)

        if organization_element is not None:
            contact_type = "Organization"
            node_id = organization_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID", "N/A")
            email_element = organization_element.find("vcard:hasEmail", namespace)
            email = email_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if email_element is not None else "N/A"
            name_element = organization_element.find("vcard:fn", namespace)
            name = name_element.text.strip() if name_element is not None else "N/A"

        elif individual_element is not None:
            contact_type = "Individual"
            node_id = ""
            email_element = individual_element.find("vcard:hasEmail", namespace)
            email = email_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if email_element is not None else "N/A"
            name_element = individual_element.find("vcard:fn", namespace)
            name = name_element.text.strip() if name_element is not None else "N/A"
        
        else:
            continue
        
        contact_points.append({
            "contact_type": contact_type,
            "dataset_id": dataset_id,
            "contact_nodeID": node_id,
            "contact_email": email,
            "contact_name": name
        })
    return contact_points


import xml.etree.ElementTree as ET
import os
import pandas as pd

def extract_metadata_from_xml(xml_file,xml_filename):
    """Extract metadata from an XML file."""
    tree = ET.parse(xml_file)
    root = tree.getroot()
    namespace = {
        "dct": "http://purl.org/dc/terms/",
        "foaf": "http://xmlns.com/foaf/0.1/",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "dcat": "http://www.w3.org/ns/dcat#",
        "vcard": "http://www.w3.org/2006/vcard/ns#",
        "xml": "http://www.w3.org/XML/1998/namespace",
        "schema": "http://schema.org/",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    }
    
    dataset_element = root.find(".//dcat:Dataset", namespace)
    if dataset_element is None:
        return {}, []
    
    dataset_id = extract_identifier(dataset_element, namespace)
    sorted_keywords = extract_multilang_elements("dcat:keyword", dataset_element, namespace, "keyword")
    dataset_descriptions = extract_multilang_elements("dct:description", dataset_element, namespace, "dataset_description")
    dataset_titles = extract_multilang_elements("dct:title", dataset_element, namespace, "dataset_title")
    distributions = extract_distributions(dataset_element, namespace, dataset_id)
    
    dataset_issued = extract_text(dataset_element, "dct:issued", namespace, "N/A")
    dataset_modified = extract_text(dataset_element, "dct:modified", namespace, "N/A")
    dataset_theme_elements = dataset_element.findall("dcat:theme", namespace)
    dataset_theme = [elem.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") for elem in dataset_theme_elements]
    dataset_theme = dataset_theme if dataset_theme else ["N/A"]
    landing_page = extract_attribute(dataset_element, "dcat:landingPage", "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", namespace, "N/A")
    dataset_languages = extract_list(dataset_element, "dct:language", namespace)
    dataset_spatial = extract_list(dataset_element, "dct:spatial", namespace)
    dataset_coverage = extract_list(dataset_element, "dct:coverage", namespace)
    dataset_accrual_periodicity = extract_attribute(dataset_element, "dct:accrualPeriodicity", "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", namespace, "N/A")
    
    # Extract temporal start and end dates correctly
    temporal_element = dataset_element.find("dct:temporal/dct:PeriodOfTime", namespace)
    dataset_temporal_startDate = extract_text(temporal_element, "dcat:startDate", namespace, "N/A") if temporal_element is not None else "N/A"
    dataset_temporal_endDate = extract_text(temporal_element, "dcat:endDate", namespace, "N/A") if temporal_element is not None else "N/A"
    relation_elements = dataset_element.findall("dct:relation/rdf:Description", namespace)
    dataset_relation = [f'"{elem.find("rdfs:label", namespace).text if elem.find("rdfs:label", namespace) is not None else "N/A"}", "{elem.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", "N/A")}"' for elem in relation_elements]
    dataset_relation = ";".join(dataset_relation) if dataset_relation else "N/A"


    # Extract qualified relations correctly
    qualified_relation_elements = dataset_element.findall("dcat:qualifiedRelation/dcat:Relationship", namespace)
    qualified_relations = []
    for rel in qualified_relation_elements:
        relation = extract_attribute(rel, "dct:relation", "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", namespace, "N/A")
        had_role = extract_attribute(rel, "dcat:hadRole", "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", namespace, "N/A")
        qualified_relations.append(f'"{relation}", "{had_role}"')
    dataset_qualified_relation = "; ".join(qualified_relations) if qualified_relations else "N/A"
    
    # Extract multiple documentation links
    page_elements = dataset_element.findall(".//foaf:page/foaf:Document", namespace)
    dataset_page = "; ".join([elem.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", "N/A") for elem in page_elements]) if page_elements else "N/A"
    
    dataset_conforms_to = extract_attribute(dataset_element, "dct:conformsTo", "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", namespace, "N/A")
    
    dataset_metadata = {
        "dataset_id": dataset_id,
        "dataset_issued": dataset_issued,
        "dataset_modified": dataset_modified,
        "dataset_theme": dataset_theme,
        "dataset_landing_page": landing_page,
        "dataset_language": dataset_languages,
        "dataset_spatial": dataset_spatial,
        "dataset_coverage": dataset_coverage,
        "dataset_accrual_periodicity": dataset_accrual_periodicity,
        "dataset_temporal_startDate": dataset_temporal_startDate,
        "dataset_temporal_endDate": dataset_temporal_endDate,
        "dataset_relation": dataset_relation,
        "dataset_qualified_relation": dataset_qualified_relation,
        "dataset_documentation": dataset_page,
        "dataset_conforms_to": dataset_conforms_to
    }
    
    dataset_metadata.update(sorted_keywords)
    dataset_metadata.update(dataset_descriptions)
    dataset_metadata.update(dataset_titles)
    
    return dataset_metadata, distributions


contact_point_data = []
folder_path = "saved_metadata_xml"
dataset_data = []
distribution_data = []
namespace = {
    "dct": "http://purl.org/dc/terms/",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "dcat": "http://www.w3.org/ns/dcat#",
    "vcard": "http://www.w3.org/2006/vcard/ns#",
    "xml": "http://www.w3.org/XML/1998/namespace",
    "schema": "http://schema.org/",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
}

for filename in os.listdir(folder_path):
    if filename.endswith(".xml"):
        file_path = os.path.join(folder_path, filename)
        print(f"Processing: {filename}")
        
        # Extract dataset metadata and distributions, passing the filename
        dataset_metadata, distributions = extract_metadata_from_xml(file_path, filename)
        
        # Add the filename to dataset metadata
        dataset_metadata["xml_filename"] = filename
        dataset_data.append(dataset_metadata)

        # Add the filename to each distribution entry
        for distribution in distributions:
            distribution["xml_filename"] = filename
        distribution_data.extend(distributions)

        # Extract contact points and add the filename
        contact_points = extract_contact_points(ET.parse(file_path).getroot().find(".//dcat:Dataset", namespace), namespace, dataset_metadata["dataset_id"])
        for contact in contact_points:
            contact["xml_filename"] = filename
        contact_point_data.extend(contact_points)


df_dataset = pd.DataFrame(dataset_data)
df_dataset = pd.DataFrame(dataset_data)
df_contact_point = pd.DataFrame(contact_point_data)
df_distribution = pd.DataFrame(distribution_data)
df_distribution = pd.DataFrame(distribution_data)

print("\nExtracted Dataset Metadata:")
print(df_dataset)
df_dataset.to_csv("datasets_metadata.csv", index=False)
print("\nDataset metadata saved as datasets_metadata.csv")

print("\nExtracted Distribution Metadata:")
print(df_distribution)
df_distribution.to_csv("distribution_metadata.csv", index=False)
df_contact_point.to_csv("contact_point_metadata.csv", index=False)
print("Distribution metadata saved as distribution_metadata.csv")
print("Contact Point metadata saved as contact_point_metadata.csv")


Processing: 1-ag-bezirke.xml
Processing: 1-personenhaushalte-anz.xml
Processing: 10-ag-bieneninspektionskreise.xml
Processing: 11-ag-schulkreise.xml
Processing: 116-ch-kataster-der-belasteten-standorte.xml
Processing: 13-ag-inventare-der-baudenkmaler-und-kulturobjekte.xml
Processing: 130-ch-gewasserschutzbereiche.xml
Processing: 131-ch-grundwasserschutzzonen.xml
Processing: 132-ch-grundwasserschutzareale.xml
Processing: 145-ch-larmempfindlichkeitsstufen-in-nutzungszonen.xml
Processing: 15-ag-archaologische-fundstellen.xml
Processing: 15-jahresverbrauch-ha.xml
Processing: 15-jahresverbrauch.xml
Processing: 159-ch-waldabstandslinien.xml
Processing: 16-ag-bevolkerungsschutzregionen.xml
Processing: 17-ag-beurteilungsgebiete-zur-schutzraumsteuerung.xml
Processing: 17-ch-inventar-der-historischen-verkehrswege-der-schweiz-regional-und-lokal.xml
Processing: 184-ch-kantonale-ausnahmetransportrouten.xml
Processing: 19-ag-polizeiregionen-oeffentlich.xml
Processing: 2-ag-kreise-des-kantons-aargau.

### Sort the languages dataset

In [13]:
# Load dataset
df_dataset = pd.read_csv("datasets_metadata.csv")

# Ensure dataset_language column is parsed as lists (if stored as strings)
df_dataset["dataset_language"] = df_dataset["dataset_language"].apply(lambda x: sorted(eval(x)) if isinstance(x, str) else x)

# Save the updated dataset with sorted language lists
df_dataset.to_csv("datasets_metadata.csv", index=False)

print("✅ Language lists sorted and updated in datasets_metadata.csv.")


✅ Language lists sorted and updated in datasets_metadata.csv.


### Sort the languages distribution

In [14]:
# Load dataset
df_dataset = pd.read_csv("distribution_metadata.csv")

# Ensure dataset_language column is parsed as lists (if stored as strings)
df_dataset["language"] = df_dataset["language"].apply(lambda x: sorted(eval(x)) if isinstance(x, str) else x)

# Save the updated dataset with sorted language lists
df_dataset.to_csv("distribution_metadata.csv", index=False)

print("✅ Language lists sorted and updated in distribution_metadata.csv.")


  df_dataset = pd.read_csv("distribution_metadata.csv")


✅ Language lists sorted and updated in distribution_metadata.csv.


### Same precision for date format dataset

In [15]:
import pandas as pd
from datetime import datetime

# Load dataset
df_dataset = pd.read_csv("datasets_metadata.csv")


# Function to transform date to the required precision
def transform_date(date_str):
    try:
        if pd.isna(date_str) or date_str.strip() == "":
            return "N/A"  # Handle empty or NaN values
        return datetime.fromisoformat(date_str).strftime("%Y-%m-%dT%H:%M:%S")
    except ValueError:
        return date_str  # Return as-is if conversion fails

# Transform date columns
date_columns = ["dataset_issued", "dataset_modified", "dataset_temporal_endDate", "dataset_temporal_startDate"]
for col in date_columns:
    if col in df_dataset.columns:
        df_dataset[col] = df_dataset[col].apply(transform_date)

# Save the updated dataset with sorted language lists and formatted dates
df_dataset.to_csv("datasets_metadata.csv", index=False)

print("✅ Language lists sorted and dates transformed in datasets_metadata.csv.")

✅ Language lists sorted and dates transformed in datasets_metadata.csv.


### Same precision for date format distribution

In [16]:
import pandas as pd
from datetime import datetime

# Load dataset
df_dataset = pd.read_csv("distribution_metadata.csv")


# Function to transform date to the required precision
def transform_date(date_str):
    try:
        if pd.isna(date_str) or date_str.strip() == "":
            return "N/A"  # Handle empty or NaN values
        return datetime.fromisoformat(date_str).strftime("%Y-%m-%dT%H:%M:%S")
    except ValueError:
        return date_str  # Return as-is if conversion fails

# Transform date columns
date_columns = ["issued_date", "modified_date"]
for col in date_columns:
    if col in df_dataset.columns:
        df_dataset[col] = df_dataset[col].apply(transform_date)

# Save the updated dataset with sorted language lists and formatted dates
df_dataset.to_csv("distribution_metadata.csv", index=False)

print("✅ Language lists sorted and dates transformed in distribution_metadata.csv.")

  df_dataset = pd.read_csv("distribution_metadata.csv")


✅ Language lists sorted and dates transformed in distribution_metadata.csv.


### Remove all N/A and ['N/A']

In [19]:
import pandas as pd
import ast

# List of CSV file paths (modifying the original files)
csv_files = ["datasets_metadata.csv", "distribution_metadata.csv", "contact_point_metadata.csv"]

# Define values to be removed
values_to_remove = {"N/A", "[N/A]"}

# Function to clean individual values in the DataFrame
def clean_value(value):
    if pd.isna(value) or str(value).strip() in values_to_remove:
        return ""  # Replace with an empty string instead of NaN
    try:
        # Convert string representation of lists into actual lists
        parsed_value = ast.literal_eval(value)
        if isinstance(parsed_value, list):
            # Remove 'N/A' from lists
            parsed_value = [item for item in parsed_value if str(item).strip() not in values_to_remove]
            return parsed_value if parsed_value else ""  # Convert empty lists to empty string
    except (ValueError, SyntaxError):
        pass
    return value.strip()  # Strip whitespace from normal strings

# Process each CSV file and overwrite with cleaned data
for file in csv_files:
    df = pd.read_csv(file, dtype=str)  # Read all columns as strings
    df = df.applymap(clean_value)  # Apply cleaning function
    df.to_csv(file, index=False)  # Overwrite original file

print("Cleaning complete. Original CSV files have been updated.")


  df = df.applymap(clean_value)  # Apply cleaning function
  df = df.applymap(clean_value)  # Apply cleaning function
  df = df.applymap(clean_value)  # Apply cleaning function


Cleaning complete. Original CSV files have been updated.
