In [7]:
import pandas as pd
import requests
import datetime
import hashlib
import json

def fetch_metadata(url):
    """Fetch JSON metadata from an API endpoint."""
    response = requests.get(url)
    response.raise_for_status()
    return response.json()

def generate_hash(data):
    """Generate a SHA-256 hash for the dataset metadata."""
    data_str = json.dumps(data, sort_keys=True)  # Convert JSON to sorted string
    return hashlib.sha256(data_str.encode()).hexdigest()  # Compute hash

def merge_metadata(dataframes):
    """Merge multiple metadata DataFrames while preserving all columns."""
    merged_df = pd.concat(dataframes, ignore_index=True, sort=False)
    return merged_df

def save_to_csv(df, filename="merged_metadata.csv"):
    """Save the merged DataFrame to a CSV file."""
    df.to_csv(filename, index=False)
    print(f"Saved merged metadata to {filename}")

if __name__ == "__main__":
    # Read dataset names from datasets.csv
    datasets_df = pd.read_csv("opendata_swiss_datasets.csv")

    # Pick 10 random datasets from the CSV (assuming the dataset names are in the 'Dataset_Name' column)
    random_datasets = datasets_df['Dataset_Name'].sample(n=10, random_state=42).tolist()

    # List to store dataframes
    dataframes = []

    # Fetch and store API responses for the 10 random datasets
    for dataset in random_datasets:
        url = f"https://ckan.opendata.swiss/api/3/action/package_show?id={dataset}"
        timestamp = datetime.datetime.now().isoformat()

        # Fetch metadata
        data = fetch_metadata(url)

        # Generate hash
        dataset_hash = generate_hash(data)

        # Normalize JSON and add metadata columns
        df = pd.json_normalize(data)
        df['Dataset_Name'] = dataset  # Add dataset name column
        df['Request_Timestamp'] = timestamp  # Add timestamp column
        df['Metadata_Hash'] = dataset_hash  # Add hash column

        dataframes.append(df)

    # Merge all the dataframes
    merged_df = merge_metadata(dataframes)

    # Save the merged data to CSV
    save_to_csv(merged_df)


Saved merged metadata to merged_metadata.csv


### explode result.resources

In [8]:
import pandas as pd
import ast

# Read the CSV file
df_merged = pd.read_csv("merged_metadata.csv")

# Convert 'result.resources' column from string representation to a list of dictionaries
df_merged['result.resources'] = df_merged['result.resources'].apply(ast.literal_eval)

# Explode the 'result.resources' column while keeping the 'Dataset_Name' column
df_resources = df_merged.explode('result.resources')

# Convert dictionary entries to separate columns
df_resources = df_resources[['Dataset_Name', 'result.resources']].reset_index(drop=True)
df_resources = df_resources.join(df_resources.pop('result.resources').apply(pd.Series))



# Optionally, save the extracted resource details to a CSV file
df_resources.to_csv("resources_metadata.csv", index=False)
print("Saved extracted resources to resources_metadata.csv")


Saved extracted resources to resources_metadata.csv


In [10]:
from rdflib import Graph
import pandas as pd
from sqlalchemy import create_engine

# RDF-Datei laden
g = Graph()
g.parse("langsamverkehr-wanderland-schweiz.xml", format="xml")  # Oder "turtle" für .ttl

# In eine Liste von Tripeln konvertieren
data = [(str(s), str(p), str(o)) for s, p, o in g]

# In ein DataFrame umwandeln
df_tripple = pd.DataFrame(data, columns=["subject", "predicate", "object"])


df_tripple.to_csv("df_tripple.csv", index=False)
print("Saved extracted resources to df_tripple.csv")

Saved extracted resources to df_tripple.csv


In [11]:
from rdflib import Graph, Literal
import pandas as pd

# RDF-Datei laden
g = Graph()
g.parse("langsamverkehr-wanderland-schweiz.xml", format="xml")

# In eine Liste von Tripeln konvertieren (mit Sprachinformation)
data = []
for s, p, o in g:
    if isinstance(o, Literal):  # Nur für Literal-Objekte prüfen
        lang = o.language if o.language else "unknown"  # Falls keine Sprache angegeben ist
        data.append((str(s), str(p), str(o), lang))
    else:
        data.append((str(s), str(p), str(o), "N/A"))  # Falls es kein Literal ist, keine Sprache

# In ein DataFrame umwandeln
df_tripple = pd.DataFrame(data, columns=["subject", "predicate", "object", "language"])

# Speichern als CSV
df_tripple.to_csv("df_tripple_with_lang.csv", index=False)
print("Saved extracted resources with language information to df_tripple_with_lang.csv")


Saved extracted resources with language information to df_tripple_with_lang.csv


### Save all datasets

In [47]:
import requests
import xml.etree.ElementTree as ET
import os

# Set maximum number of datasets to download for testing
MAX_DATASETS = 5  # Change this value for development

def fetch_xml_metadata(identifier):
    """Fetch XML metadata from an API endpoint."""
    url = f"https://ckan.opendata.swiss/dataset/{identifier}.xml"
    print(f"Fetching XML: {url}")

    response = requests.get(url)
    response.raise_for_status()  # Raise an error if request fails
    return response.content  # Return raw XML content

def parse_xml_metadata(xml_content):
    """Parse XML metadata and extract relevant information."""
    root = ET.fromstring(xml_content)

    # Define namespaces
    namespace = {"dct": "http://purl.org/dc/terms/", "dcat": "http://www.w3.org/ns/dcat#"}

    # Extract key metadata
    title = root.find(".//dct:title", namespace)
    description = root.find(".//dct:description", namespace)
    modified = root.find(".//dct:modified", namespace)

    # Return extracted metadata
    return {
        "Title": title.text if title is not None else "N/A",
        "Description": description.text if description is not None else "N/A",
        "Modified": modified.text if modified is not None else "N/A"
    }

# === READ IDENTIFIERS FROM CSV ===
data_file = "opendata_swiss_datasets.csv"
identifiers = []
with open(data_file, "r", encoding="utf-8") as file:
    identifiers = [line.strip() for line in file.readlines()[1:]]  # Skip header

# Limit the number of datasets for testing
identifiers = identifiers[:MAX_DATASETS]

# === SET FOLDER PATH ===
save_folder = "saved_metadata_xml"  # Change this to your desired folder name

# Create folder if it doesn't exist
os.makedirs(save_folder, exist_ok=True)

# === FETCH AND SAVE XML METADATA ===
for index, identifier in enumerate(identifiers, start=1):
    print(f"\nProcessing {index}/{len(identifiers)}: {identifier}")
    try:
        xml_data = fetch_xml_metadata(identifier)
        metadata = parse_xml_metadata(xml_data)

        # Print extracted metadata
        print("✅ Extracted Metadata:")
        for key, value in metadata.items():
            print(f"{key}: {value}")

        # Save raw XML to the folder
        xml_filename = os.path.join(save_folder, f"{identifier}.xml")
        with open(xml_filename, "wb") as file:
            file.write(xml_data)

        print(f"📂 XML file saved in: {xml_filename}")
    except Exception as e:
        print(f"⚠️ Failed to process {identifier}: {e}")



Processing 1/5: __
Fetching XML: https://ckan.opendata.swiss/dataset/__.xml
✅ Extracted Metadata:
Title: ch.blw.erosion
Description: Download Server von geo.admin.ch
Modified: N/A
📂 XML file saved in: saved_metadata_xml\__.xml

Processing 2/5: __101
Fetching XML: https://ckan.opendata.swiss/dataset/__101.xml
✅ Extracted Metadata:
Title: ADRESSES GEOREFERENCEES A L'ENTREE DU BATIMENT
Description: Les adresses géoréférencées permettent de définir avec exactitude l'entrée des bâtiments de logements ou d'activités. « Aujourd'hui, l'adressage des bâtiments joue un rôle central dans l'administration publique comme dans la vie privée des citoyennes et des citoyens. La position d'un bâtiment est établie sans équivoque par son adresse. Celle-ci aide par exemple les services de secours, les pompiers, la police ou une personne peu familière d'un lieu à trouver rapidement le bâtiment recherché. Les systèmes de navigation pour les véhicules n'ont du reste cessé de gagner en importance. Les adresse

### lets go

In [52]:
import requests
import xml.etree.ElementTree as ET
import os

# Set maximum number of datasets to download for testing
MAX_DATASETS = None  # Set to None to download all datasets  # Change this value for development

def fetch_xml_metadata(identifier):
    """Fetch XML metadata from an API endpoint."""
    url = f"https://ckan.opendata.swiss/dataset/{identifier}.xml"
    print(f"Fetching XML: {url}")

    response = requests.get(url)
    response.raise_for_status()  # Raise an error if request fails
    return response.content  # Return raw XML content

def parse_xml_metadata(xml_content):
    """Parse XML metadata and extract relevant information."""
    root = ET.fromstring(xml_content)

    # Define namespaces
    namespace = {"dct": "http://purl.org/dc/terms/", "dcat": "http://www.w3.org/ns/dcat#"}

    # Extract key metadata
    title = root.find(".//dct:title", namespace)
    description = root.find(".//dct:description", namespace)
    modified = root.find(".//dct:modified", namespace)

    # Return extracted metadata
    return {
        "Title": title.text if title is not None else "N/A",
        "Description": description.text if description is not None else "N/A",
        "Modified": modified.text if modified is not None else "N/A"
    }

# === READ IDENTIFIERS FROM CSV ===
data_file = "opendata_swiss_datasets.csv"
identifiers = []
with open(data_file, "r", encoding="utf-8") as file:
    identifiers = [line.strip() for line in file.readlines()[1:]]  # Skip header

# Limit the number of datasets for testing
identifiers = identifiers[:MAX_DATASETS]

# === SET FOLDER PATH ===
save_folder = "saved_metadata_xml"  # Change this to your desired folder name

# Create folder if it doesn't exist
os.makedirs(save_folder, exist_ok=True)

# === FETCH AND SAVE XML METADATA ===
for index, identifier in enumerate(identifiers, start=1):
    print(f"\nProcessing {index}/{len(identifiers)}: {identifier}")
    try:
        xml_data = fetch_xml_metadata(identifier)
        metadata = parse_xml_metadata(xml_data)

        # Print extracted metadata
        print("✅ Extracted Metadata:")
        for key, value in metadata.items():
            print(f"{key}: {value}")

        # Save raw XML to the folder
        xml_filename = os.path.join(save_folder, f"{identifier}.xml")
        with open(xml_filename, "wb") as file:
            file.write(xml_data)

        print(f"📂 XML file saved in: {xml_filename}")
    except Exception as e:
        print(f"⚠️ Failed to process {identifier}: {e}")



Processing 1/13135: __
Fetching XML: https://ckan.opendata.swiss/dataset/__.xml
✅ Extracted Metadata:
Title: Erosionsrisikokarte des Ackerlandes, qualitative Einstufung
Description: Carte des risques d’érosion des terres arables, classification qualitative (Office fédéral de l'agriculture) Carte des risques d’érosion des terres arables de la Suisse au raster 2x2 m, sur la base du modèle swissALTI3D et des données cantonales sur les terres arables (situation en 2021). La carte indique le risque d’érosion qualitatif potentiel. L’attribution à l’un des trois niveaux de risque (pas de risque, risque avéré, risque élevé) est évaluée globalement, sans tenir compte de l’utilisation ou du mode d’exploitation du sol. Les pertes de sol moyennes sur le long terme sont calculées sur la base du modèle Universal Soil Loss Equation (USLE).
Modified: N/A
📂 XML file saved in: saved_metadata_xml\__.xml

Processing 2/13135: __101
Fetching XML: https://ckan.opendata.swiss/dataset/__101.xml
✅ Extracted Me

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import os

def extract_multilang_elements(element_name, dataset_element, namespace, default_label):
    """Extract multilingual elements like descriptions, titles, and keywords from XML."""
    elements = {}
    collected_values = {}
    
    for element in dataset_element.findall(f".//{element_name}", namespace):
        lang_attr = element.get("{http://www.w3.org/XML/1998/namespace}lang", "unknown").upper()
        text_value = element.text.strip() if element.text else "N/A"
        
        if lang_attr in collected_values:
            collected_values[lang_attr].append(text_value)
        else:
            collected_values[lang_attr] = [text_value]
    
    for lang, values in collected_values.items():
        elements[f"{default_label}_{lang}"] = ", ".join(values)
    
    if not elements:
        elements[f"{default_label}_UNKNOWN"] = "N/A"
    
    return elements

def extract_text(element, tag, namespace, default="N/A"):
    """Extract text from an XML element."""
    found_element = element.find(tag, namespace)
    return found_element.text.strip() if found_element is not None and found_element.text else default

def extract_attribute(element, tag, attribute, namespace, default="N/A"):
    """Extract attribute from an XML element."""
    found_element = element.find(tag, namespace)
    return found_element.get(attribute, default) if found_element is not None else default

def extract_identifier(dataset_element, namespace):
    """Extract dataset identifier."""
    identifier_element = dataset_element.find("dct:identifier", namespace)
    return identifier_element.text if identifier_element is not None else "N/A"

def extract_list(element, tag, namespace):
    """Extract a list of text values from multiple occurrences of a tag."""
    return [elem.text.strip() for elem in element.findall(tag, namespace) if elem.text] or ["N/A"]

def extract_issued_date(distribution_element, namespace):
    """Extract issued date from distribution."""
    issued_element = distribution_element.find(".//dct:issued", namespace)
    return issued_element.text if issued_element is not None else "N/A"

def extract_multilang_attributes(element, tag, namespace, default_label):
    """Extract multilingual elements from XML."""
    elements = {}
    for elem in element.findall(tag, namespace):
        lang_attr = elem.get("{http://www.w3.org/XML/1998/namespace}lang", "unknown").upper()
        elements[f"{default_label}_{lang_attr}"] = elem.text.strip() if elem.text else "N/A"
    return elements if elements else {f"{default_label}_UNKNOWN": "N/A"}

def extract_distributions(dataset_element, namespace, dataset_id):
    """Extract distribution metadata linked to dataset ID."""
    distributions = []
    for distribution_element in dataset_element.findall(".//dcat:Distribution", namespace):
        access_url_element = distribution_element.find("dcat:accessURL", namespace)
        access_url = access_url_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if access_url_element is not None else "N/A"
        license_element = distribution_element.find("dct:license", namespace)
        license_url = license_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if license_element is not None else "N/A"
        rights_element = distribution_element.find("dct:rights", namespace)
        rights_text = rights_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if rights_element is not None else "N/A"
        byte_size_element = distribution_element.find("dcat:byteSize", namespace)
        byte_size = byte_size_element.text if byte_size_element is not None else "N/A"
        format_element = distribution_element.find("dct:format", namespace)
        format_url = format_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if format_element is not None else "N/A"
        media_type_element = distribution_element.find("dcat:mediaType", namespace)
        media_type = media_type_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if media_type_element is not None else "N/A"
        modified_element = distribution_element.find("dct:modified", namespace)
        modified_date = modified_element.text if modified_element is not None else "N/A"
        languages = [lang.text.strip() for lang in distribution_element.findall("dct:language", namespace) if lang.text] or ["N/A"]
        titles = extract_multilang_elements("dct:title", distribution_element, namespace, "title")
        descriptions = extract_multilang_elements("dct:description", distribution_element, namespace, "description")
        distribution_identifier_element = distribution_element.find("dct:identifier", namespace)
        distribution_identifier = distribution_identifier_element.text.strip() if distribution_identifier_element is not None else "N/A"
        download_url_element = distribution_element.find("dcat:downloadURL", namespace)
        download_url = download_url_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if download_url_element is not None else "N/A"
        documentation_element = distribution_element.find(".//foaf:page/foaf:Document", namespace)
        documentation_url = documentation_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", "N/A") if documentation_element is not None else "N/A"
        distribution_temporal_resolution_element = distribution_element.find("dcat:temporalResolution", namespace)
        distribution_temporal_resolution = distribution_temporal_resolution_element.text if distribution_temporal_resolution_element is not None else "N/A"
        coverage_elements = [elem.text.strip() for elem in distribution_element.findall("dct:coverage", namespace) if elem.text]
        coverage = coverage_elements if coverage_elements else ["N/A"]
        distribution_entry = {
            "dataset_id": dataset_id,
            "distribution_id": distribution_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", "N/A"),
            "issued_date": extract_issued_date(distribution_element, namespace),
            "modified_date": modified_date,
            "access_url": access_url,
            "license": license_url,
            "rights": rights_text,
            "byte_size": byte_size,
            "format": format_url,
            "media_type": media_type,
            "language": languages,
            "download_url": download_url,
            "coverage": coverage,
            "distribution_temporal_resolution": distribution_temporal_resolution,
            "distribution_documentation": documentation_url,
            "distribution_identifier": distribution_identifier
        }
        distribution_entry.update(titles)
        distribution_entry.update(descriptions)
        distributions.append(distribution_entry)
    return distributions

def extract_contact_points(dataset_element, namespace, dataset_id):
    """Extract contact points from dataset."""
    contact_points = []
    for contact_element in dataset_element.findall(".//dcat:contactPoint", namespace):
        organization_element = contact_element.find("vcard:Organization", namespace)
        individual_element = contact_element.find("vcard:Individual", namespace)

        if organization_element is not None:
            contact_type = "Organization"
            node_id = organization_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID", "N/A")
            email_element = organization_element.find("vcard:hasEmail", namespace)
            email = email_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if email_element is not None else "N/A"
            name_element = organization_element.find("vcard:fn", namespace)
            name = name_element.text.strip() if name_element is not None else "N/A"

        elif individual_element is not None:
            contact_type = "Individual"
            node_id = ""
            email_element = individual_element.find("vcard:hasEmail", namespace)
            email = email_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if email_element is not None else "N/A"
            name_element = individual_element.find("vcard:fn", namespace)
            name = name_element.text.strip() if name_element is not None else "N/A"
        
        else:
            continue
        
        contact_points.append({
            "contact_type": contact_type,
            "dataset_id": dataset_id,
            "contact_nodeID": node_id,
            "contact_email": email,
            "contact_name": name
        })
    return contact_points


def extract_metadata_from_xml(xml_file):
    """Extract metadata from an XML file."""
    tree = ET.parse(xml_file)
    root = tree.getroot()
    namespace = {
    "dct": "http://purl.org/dc/terms/",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "dcat": "http://www.w3.org/ns/dcat#",
    "vcard": "http://www.w3.org/2006/vcard/ns#",
    "xml": "http://www.w3.org/XML/1998/namespace",
    "schema": "http://schema.org/",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
}

    dataset_element = root.find(".//dcat:Dataset", namespace)
    if dataset_element is None:
        return {}, []
    dataset_id = extract_identifier(dataset_element, namespace)
    sorted_keywords = extract_multilang_elements("dcat:keyword", dataset_element, namespace, "keyword")
    dataset_descriptions = extract_multilang_elements("dct:description", dataset_element, namespace, "dataset_description")
    dataset_titles = extract_multilang_elements("dct:title", dataset_element, namespace, "dataset_title")
    distributions = extract_distributions(dataset_element, namespace, dataset_id)
    dataset_metadata = {"dataset_id": dataset_id}
    dataset_issued_element = dataset_element.find("dct:issued", namespace)
    dataset_issued = dataset_issued_element.text if dataset_issued_element is not None else "N/A"
    dataset_modified_element = dataset_element.find("dct:modified", namespace)
    dataset_modified = dataset_modified_element.text if dataset_modified_element is not None else "N/A"
    dataset_theme_elements = [elem.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource") for elem in dataset_element.findall("dcat:theme", namespace) if elem.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource")]
    dataset_theme = dataset_theme_elements if dataset_theme_elements else ["N/A"]
    landing_page_element = dataset_element.find("dcat:landingPage", namespace)
    landing_page = landing_page_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if landing_page_element is not None else "N/A"
    dataset_languages = [lang.text.strip() for lang in dataset_element.findall("dct:language", namespace) if lang.text] or ["N/A"]
    spatial_element = dataset_element.find("dct:spatial", namespace)
    dataset_spatial = [spatial_element.text.strip()] if spatial_element is not None else ["N/A"]
    dataset_coverage_elements = [elem.text.strip() for elem in dataset_element.findall("dct:coverage", namespace) if elem.text]
    dataset_coverage = dataset_coverage_elements if dataset_coverage_elements else ["N/A"]
    accrual_periodicity_element = dataset_element.find("dct:accrualPeriodicity", namespace)
    dataset_accrual_periodicity = accrual_periodicity_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if accrual_periodicity_element is not None else "N/A"
    temporal_element = dataset_element.find("dct:temporal/dct:PeriodOfTime", namespace)
    start_date_element = temporal_element.find("schema:startDate", namespace) if temporal_element is not None else None
    end_date_element = temporal_element.find("schema:endDate", namespace) if temporal_element is not None else None
    dataset_temporal_startDate = start_date_element.text if start_date_element is not None else "N/A"
    dataset_temporal_endDate = end_date_element.text if end_date_element is not None else "N/A"
    relation_elements = dataset_element.findall("dct:relation/rdf:Description", namespace)
    dataset_relation = [f'"{elem.find("rdfs:label", namespace).text if elem.find("rdfs:label", namespace) is not None else "N/A"}", "{elem.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", "N/A")}"' for elem in relation_elements]
    dataset_relation = ";".join(dataset_relation) if dataset_relation else "N/A"
    qualified_relation_elements = dataset_element.findall("dcat:qualifiedRelation/dcat:Relationship", namespace)
    dataset_qualified_relation = [
        f'"{rel.find("dct:relation", namespace).get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A")}", ' 
        f'"{rel.find("dcat:hadRole", namespace).get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A")}"'
        for rel in qualified_relation_elements if rel.find("dct:relation", namespace) is not None and rel.find("dcat:hadRole", namespace) is not None
    ]
    dataset_qualified_relation = ";".join(dataset_qualified_relation) if dataset_qualified_relation else "N/A"
    page_element = dataset_element.find(".//foaf:page/foaf:Document", namespace)
    dataset_page = page_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", "N/A") if page_element is not None else "N/A"
    conforms_to_element = dataset_element.find("dct:conformsTo", namespace)
    dataset_conforms_to = conforms_to_element.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource", "N/A") if conforms_to_element is not None else "N/A"
    
    dataset_metadata = {"dataset_id": dataset_id,
                        "dataset_issued": dataset_issued,
                        "dataset_modified": dataset_modified,
                        "dataset_theme": dataset_theme,
                        "dataset_landing_page": landing_page,
                        "dataset_language": dataset_languages,
                        "dataset_spatial": dataset_spatial,
                        "dataset_coverage": dataset_coverage,
                        "dataset_accrual_periodicity": dataset_accrual_periodicity,
                        "dataset_temporal_startDate": dataset_temporal_startDate,
                        "dataset_temporal_endDate": dataset_temporal_endDate,
                        "dataset_relation": dataset_relation,
                        "dataset_qualified_relation":dataset_qualified_relation,
                        "dataset_documentation": dataset_page,
                        "dataset_conforms_to": dataset_conforms_to}


    dataset_metadata.update(sorted_keywords)
    dataset_metadata.update(dataset_descriptions)
    dataset_metadata.update(dataset_titles)
    return dataset_metadata, distributions

contact_point_data = []
folder_path = "saved_metadata_xml"
dataset_data = []
distribution_data = []
namespace = {
    "dct": "http://purl.org/dc/terms/",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "dcat": "http://www.w3.org/ns/dcat#",
    "vcard": "http://www.w3.org/2006/vcard/ns#",
    "xml": "http://www.w3.org/XML/1998/namespace",
    "schema": "http://schema.org/",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
}

for filename in os.listdir(folder_path):
    if filename.endswith(".xml"):
        file_path = os.path.join(folder_path, filename)
        print(f"Processing: {filename}")
        dataset_metadata, distributions = extract_metadata_from_xml(file_path)
        dataset_data.append(dataset_metadata)
        distribution_data.extend(distributions)
        contact_points = extract_contact_points(ET.parse(file_path).getroot().find(".//dcat:Dataset", namespace), namespace, dataset_metadata["dataset_id"])
        contact_point_data.extend(contact_points)

df_dataset = pd.DataFrame(dataset_data)
df_dataset = pd.DataFrame(dataset_data)
df_contact_point = pd.DataFrame(contact_point_data)
df_distribution = pd.DataFrame(distribution_data)
df_distribution = pd.DataFrame(distribution_data)

print("\nExtracted Dataset Metadata:")
print(df_dataset)
df_dataset.to_csv("datasets_metadata.csv", index=False)
print("\nDataset metadata saved as datasets_metadata.csv")

print("\nExtracted Distribution Metadata:")
print(df_distribution)
df_distribution.to_csv("distribution_metadata.csv", index=False)
df_contact_point.to_csv("contact_point_metadata.csv", index=False)
print("Distribution metadata saved as distribution_metadata.csv")
print("Contact Point metadata saved as contact_point_metadata.csv")


Processing: 10-ag-bieneninspektionskreise.xml
Processing: 11-ag-schulkreise.xml
Processing: 116-ch-kataster-der-belasteten-standorte.xml
Processing: masterplan-velo.xml
Processing: mobilitat-und-verkehr4.xml
Processing: ogdch_dcatapch_v2_import.xml
Processing: parkplatzbelegung-stadt-frauenfeld.xml
Processing: __.xml
Processing: __101.xml

Extracted Dataset Metadata:
                                          dataset_id  \
0  eef7063e-7c15-47f1-aa10-ce61f81778ad-6571@agis...   
1  544e34c5-88b5-4c81-8290-05c3dd2f0a4f-6571@agis...   
2  0e57f315-2b70-48c3-9802-4cde2db10c49-6571@agis...   
3  0d3ca2e3-e87f-4cb8-87c0-a2d0da5b3ef6@stadt-zurich   
4               18144776@bundesamt-fur-statistik-bfs   
5       325@bundesamt-fur-landestopografie-swisstopo   
6                        frauenfeld-1@kanton-thurgau   
7  02210bb3-1c51-4c2c-a665-a696286b945c@bundesamt...   
8  SITG_1735@sitg-systeme-dinformation-du-territo...   

                     dataset_issued                  dataset_modified