In [16]:
import requests
import xml.etree.ElementTree as ET
import os
import time
import re

# CSW endpoint and schema
CSW_URL = "https://www.geocat.ch/geonetwork/srv/deu/csw"
# SCHEMA = "http://www.geocat.ch/2008/che"
SCHEMA = "http://www.isotc211.org/2005/gmd"
SAVE_DIR = "test"
BATCH_SIZE = 1
WAIT_TIME = 1  # Time in seconds between requests
MAX_RECORDS = None  # Set to an integer for development, or None to fetch all

os.makedirs(SAVE_DIR, exist_ok=True)

def sanitize_filename(identifier):
    """Replace invalid filename characters with underscores."""
    return re.sub(r'[<>:"/\\|?*]', '_', identifier)

def fetch_records(start_position, max_records=100):
    params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "namespace": "xmlns(csw=http://www.opengis.net/cat/csw/2.0.2)",
        "typeNames": "csw:Record",
        "elementSetName": "full",
        "resultType": "results",
        "outputFormat": "application/xml",
        "outputSchema": SCHEMA,
        "maxRecords": max_records,
        "startPosition": start_position,
        "sortBy": "title:A"
    }

    print(requests.get(CSW_URL, params=params).url)
    response = requests.get(CSW_URL, params=params)
    print(response.text)
    response.raise_for_status()
    return response.text

def extract_and_save_each_metadata(xml_text):
    namespaces = {
        "che": "http://www.geocat.ch/2008/che",
        "gmd": "http://www.isotc211.org/2005/gmd",
        "gco": "http://www.isotc211.org/2005/gco"
    }
    root = ET.fromstring(xml_text)
    records = root.findall(".//che:CHE_MD_Metadata", namespaces)
    saved_count = 0
    for record in records:
        identifier_el = record.find(".//gmd:fileIdentifier/gco:CharacterString", namespaces)
        identifier = identifier_el.text.strip() if identifier_el is not None else None
        if identifier:
            safe_name = sanitize_filename(identifier)
            filename = os.path.join(SAVE_DIR, f"{safe_name}.xml")
            with open(filename, "w", encoding="utf-8") as f:
                xml_str = ET.tostring(record, encoding="unicode")
                f.write(xml_str)
            print(f"✅ Saved: {filename}")
            saved_count += 1
        else:
            print("⚠️ Skipping record with missing identifier")
    return saved_count

if __name__ == "__main__":
    print("📥 Starting download of metadata records...")
    start = 271
    total_saved = 0

    while True:
        if MAX_RECORDS is not None and total_saved >= MAX_RECORDS:
            print(f"⏹️ Reached MAX_RECORDS = {MAX_RECORDS}. Stopping.")
            break

        print(f"🔄 Fetching records {start} to {start + BATCH_SIZE - 1}...")
        try:
            xml_data = fetch_records(start_position=start, max_records=BATCH_SIZE)
            saved = extract_and_save_each_metadata(xml_data)
            if saved == 0:
                print("✅ No more records found. Stopping.")
                break
            total_saved += saved
            start += BATCH_SIZE
            time.sleep(WAIT_TIME)
        except Exception as e:
            print(f"❌ Error during fetch: {e}. Stopping.")
            break

    print(f"✅ Done. Total records saved: {total_saved}")


📥 Starting download of metadata records...
🔄 Fetching records 271 to 271...
https://www.geocat.ch/geonetwork/srv/deu/csw?service=CSW&version=2.0.2&request=GetRecords&namespace=xmlns%28csw%3Dhttp%3A%2F%2Fwww.opengis.net%2Fcat%2Fcsw%2F2.0.2%29&typeNames=csw%3ARecord&elementSetName=full&resultType=results&outputFormat=application%2Fxml&outputSchema=http%3A%2F%2Fwww.isotc211.org%2F2005%2Fgmd&maxRecords=1&startPosition=271&sortBy=title%3AA
<?xml version="1.0" encoding="UTF-8"?>
<csw:GetRecordsResponse xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
  <csw:SearchStatus timestamp="2025-03-31T08:05:01.045Z" />
  <csw:SearchResults numberOfRecordsMatched="16277" numberOfRecordsReturned="1" elementSet="full" nextRecord="272">
    <gmd:MD_Metadata xmlns:gmd="http://www.isotc211.org/2005/gmd" xmlns:geonet="http://www.fao.org/geonet

In [2]:
import requests
import time
import csv
import xml.etree.ElementTree as ET

# Configuration Variables
BATCH_SIZE = 100
WAIT_TIME = 1
FETCH_LIMIT = 20

CSW_URL = "https://www.geocat.ch/geonetwork/srv/eng/csw"
OUTPUT_XML = "geocat_datasets.csv"

def get_total_records():
    params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "typeNames": "csw:Record",
        "elementSetName": "brief",
        "resultType": "hits",
        "maxRecords": 1,
    }
    response = requests.get(CSW_URL, params=params)
    response.raise_for_status()
    
    root = ET.fromstring(response.text)
    namespaces = {"csw": "http://www.opengis.net/cat/csw/2.0.2"}
    search_results = root.find(".//csw:SearchResults", namespaces)
    if search_results is None:
        print("⚠️ Could not find <csw:SearchResults>. Full response:")
        print(response.text)
        raise ValueError("Missing <csw:SearchResults> in CSW response.")
    return int(search_results.attrib.get("numberOfRecordsMatched", 0))


def fetch_csw_records(start_position, sort_order="A", max_records=BATCH_SIZE):
    params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "resultType": "results",
        "outputFormat": "application/xml",
        "outputSchema": "http://www.opengis.net/cat/csw/2.0.2",
        "typeNames": "csw:Record",
        "elementSetName": "brief",
        "maxRecords": max_records,
        "startPosition": start_position,
        "sortBy": f"title:{sort_order}",
    }
    response = requests.get(CSW_URL, params=params)
    response.raise_for_status()
    return response.text

def parse_csw_response(xml_data):
    namespaces = {
        "csw": "http://www.opengis.net/cat/csw/2.0.2",
        "dc": "http://purl.org/dc/elements/1.1/",
    }
    root = ET.fromstring(xml_data)
    records = []
    for record in root.findall(".//csw:BriefRecord", namespaces):
        dataset_id = record.find("dc:identifier", namespaces)
        dataset_title = record.find("dc:title", namespaces)
        records.append((
            dataset_id.text.strip() if dataset_id is not None else "N/A",
            dataset_title.text.strip() if dataset_title is not None else "N/A"
        ))
    return records

def save_to_csv(data, filename):
    with open(filename, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Identifier", "Title"])
        writer.writerows(data)

if __name__ == "__main__":
    total_records = get_total_records()
    if FETCH_LIMIT:
        total_records = min(total_records, FETCH_LIMIT)

    all_records = []
    print(f"Fetching {total_records} records in steps of {BATCH_SIZE}...")
    for start_pos in range(1, min(15001, total_records + 1), BATCH_SIZE):
        print(f"Fetching records from {start_pos} to {start_pos + BATCH_SIZE - 1}...")
        xml_response = fetch_csw_records(start_pos, sort_order="A")
        records = parse_csw_response(xml_response)
        all_records.extend(records)
        time.sleep(WAIT_TIME)

    if total_records > 15000:
        remaining_records = total_records - 15000
        print(f"Fetching last {remaining_records} records in descending order...")
        for start_pos in range(1, remaining_records + 1, BATCH_SIZE):
            print(f"Fetching reversed records from {start_pos} to {start_pos + BATCH_SIZE - 1}...")
            xml_response = fetch_csw_records(start_pos, sort_order="D")
            records = parse_csw_response(xml_response)
            all_records.extend(records)
            time.sleep(WAIT_TIME)

    print(f"Saving {len(all_records)} records to CSV...")
    save_to_csv(all_records, OUTPUT_XML)
    print(f"Dataset saved to {OUTPUT_XML}")


Fetching 20 records in steps of 100...
Fetching records from 1 to 100...
Saving 100 records to CSV...
Dataset saved to geocat_datasets.csv


### Time for request

In [3]:
import requests
import time
from datetime import datetime

# Configuration
BATCH_SIZE = 10  # Number of records per request
CSW_URL = "https://www.geocat.ch/geonetwork/srv/eng/csw"

# Function to measure request time
def measure_request_time(batch_size):
    params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "resultType": "results",
        "outputFormat": "application/xml",
        "outputSchema": "http://www.opengis.net/cat/csw/2.0.2",
        "typeNames": "csw:Record",
        "elementSetName": "brief",
        "maxRecords": batch_size,
        "startPosition": 1,
    }
    
    start_time = datetime.now()
    response = requests.get(CSW_URL, params=params)
    print(response.text)
    response.raise_for_status()
    elapsed_time = datetime.now() - start_time
    
    print(response.text)
    print(f"Request for {batch_size} records took: {elapsed_time}")

if __name__ == "__main__":
    measure_request_time(BATCH_SIZE)


<?xml version="1.0" encoding="UTF-8"?>
<csw:GetRecordsResponse xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
  <csw:SearchStatus timestamp="2025-03-31T07:00:33.202Z" />
  <csw:SearchResults numberOfRecordsMatched="16277" numberOfRecordsReturned="10" elementSet="brief" nextRecord="11">
    <csw:BriefRecord xmlns:geonet="http://www.fao.org/geonetwork" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ows="http://www.opengis.net/ows">
      <dc:identifier>d9e6d06a-bc9f-4b1c-8801-7581f211a5b6</dc:identifier>
      <dc:title>Geodatenmodell "RAMSAR_V1"</dc:title>
      <dc:type>model</dc:type>
    </csw:BriefRecord>
    <csw:BriefRecord xmlns:geonet="http://www.fao.org/geonetwork" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ows="http://www.opengis.net/ows">
      <dc:identifier>geoportal-157-85</dc:identifier>
      <

### get Metadata

In [3]:
https://www.geocat.ch/geonetwork/srv/api/records/geoportal-152-69/formatters/xml?approved=true

Fetched metadata for geoportal-689-111 (Time taken: 0:00:00.296855)
Fetched metadata for geoportal-152-69 (Time taken: 0:00:00.288492)
Fetched metadata for afe12880-3dc3-8a66-94be-7cb3aa19529f (Time taken: 0:00:00.333897)
Fetched metadata for a02205bf-c14f-4c0d-9fb4-cfaf700b1525 (Time taken: 0:00:00.289680)
Fetched metadata for 9a5c3b20-8e56-8044-3df4-33b2a92b8316 (Time taken: 0:00:00.227687)
Fetched metadata for geoportal-7-135 (Time taken: 0:00:00.246135)
Fetched metadata for 6f90d5ba-c8aa-45ea-be6c-0e1c1b03dbd0 (Time taken: 0:00:00.317112)
Fetched metadata for geoportal-951-78 (Time taken: 0:00:00.259755)
Fetched metadata for 3d088b3c-4281-4d19-8169-9bbe6069eed0 (Time taken: 0:00:00.256784)
Fetched metadata for geoportal-215-84-7993 (Time taken: 0:00:00.244221)
Saved full metadata for 10 datasets to geocat_full_metadata.xml


### DEV: Get only one certain XML File

In [8]:
import requests
import pandas as pd
import os

# Configurations
XML_SAVE_DIR = "saved_metadata_xml"
os.makedirs(XML_SAVE_DIR, exist_ok=True)
MAX_FILES = 1000  # Set the maximum number of files to download

def fetch_and_save_metadata(identifier):
    """Fetch XML metadata from GeoCat API and save it to a folder without encoding issues or extra spaces."""
    url = f"https://www.geocat.ch/geonetwork/srv/api/records/{identifier}/formatters/xml?approved=true"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        
        # Ensure correct encoding (UTF-8 handling)
        xml_content = response.content.decode('utf-8')

        # Normalize whitespace (removes excessive blank lines)
        xml_content = '\n'.join([line.strip() for line in xml_content.splitlines() if line.strip()])

        # Save cleaned XML to file
        xml_path = os.path.join(XML_SAVE_DIR, f"{identifier}.xml")
        with open(xml_path, "w", encoding="utf-8") as file:
            file.write(xml_content)
        
        print(f"Saved cleaned XML response to {xml_path}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching metadata for {identifier}: {e}")

if __name__ == "__main__":
    # Read the identifiers from the CSV file

    identifier = "3b283616-0db1-48b0-beb3-0f276f778ba0"

    fetch_and_save_metadata(identifier)

    print("Metadata retrieval completed.")

Saved cleaned XML response to saved_metadata_xml\3b283616-0db1-48b0-beb3-0f276f778ba0.xml
Metadata retrieval completed.


### Request XML from geocat_datasets.csv


In [2]:
import requests
import pandas as pd
import os
import re

# Configurations
XML_SAVE_DIR = "saved_metadata_xml"
os.makedirs(XML_SAVE_DIR, exist_ok=True)
MAX_FILES = None  # Set to an integer to limit downloads, or None for no limit

def sanitize_filename(identifier):
    """Replace invalid filename characters with underscores."""
    return re.sub(r'[<>:"/\\|?*]', '_', identifier)

def fetch_and_save_metadata(identifier):
    """Fetch XML metadata from GeoCat API and save it to a folder without encoding issues or extra spaces."""
    url = f"https://www.geocat.ch/geonetwork/srv/api/records/{identifier}/formatters/xml?approved=true"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        
        # Ensure correct encoding (UTF-8 handling)
        xml_content = response.content.decode('utf-8')

        # Normalize whitespace (removes excessive blank lines)
        xml_content = '\n'.join([line.strip() for line in xml_content.splitlines() if line.strip()])

        # Sanitize filename to avoid OS errors
        safe_identifier = sanitize_filename(identifier)
        xml_path = os.path.join(XML_SAVE_DIR, f"{safe_identifier}.xml")

        # Save cleaned XML to file
        with open(xml_path, "w", encoding="utf-8") as file:
            file.write(xml_content)
        
        print(f"✅ Saved cleaned XML response to {xml_path}")
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching metadata for {identifier}: {e}")
    except OSError as e:
        print(f"❌ Error saving metadata for {identifier}: {e}")

if __name__ == "__main__":
    # Read the identifiers from the CSV file
    dataset_file = "geocat_datasets.csv"
    df_datasets = pd.read_csv(dataset_file)

    # Determine the number of records to process
    num_records = len(df_datasets) if MAX_FILES is None else min(MAX_FILES, len(df_datasets))

    # Process the dataset
    for index, row in df_datasets.head(num_records).iterrows():
        fetch_and_save_metadata(row["Identifier"])

    print("✅ Metadata retrieval completed.")


❌ Error fetching metadata for 1433b1e4-d318-4aa3-b472-97e8537b37c3: 404 Client Error: Not Found for url: https://www.geocat.ch/geonetwork/srv/api/records/1433b1e4-d318-4aa3-b472-97e8537b37c3/formatters/xml?approved=true
✅ Saved cleaned XML response to saved_metadata_xml\124abf9c-8b21-4807-ad7f-72d821259a47.xml
✅ Metadata retrieval completed.


### Kill all carriage returns :)

In [12]:
import os
import re
import xml.etree.ElementTree as ET

def remove_html_tags(text):
    """Remove HTML tags from a string while preserving content."""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def clean_xml_file(input_path, output_path):
    """Process an XML file to remove HTML tags and save it in a single line."""
    try:
        # Parse XML
        tree = ET.parse(input_path)
        root = tree.getroot()

        # Recursively clean text elements
        for elem in root.iter():
            if elem.text:
                elem.text = remove_html_tags(elem.text)

        # Convert XML tree to a single-line string
        cleaned_xml = ET.tostring(root, encoding='utf-8').decode('utf-8')
        cleaned_xml = cleaned_xml.replace('\n', '').replace('\t', '')

        # Write output
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_xml)

        print(f"Processed: {os.path.basename(input_path)} -> {os.path.basename(output_path)}")
    
    except Exception as e:
        print(f"Error processing {input_path}: {e}")

def process_folder(input_folder, output_folder):
    """Process all XML files in the input folder."""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(input_folder):
        if filename.endswith(".xml"):
            input_path = os.path.join(input_folder, filename)
            output_path = os.path.join(output_folder, filename)
            clean_xml_file(input_path, output_path)

# Example Usage
input_folder = "saved_metadata_xml"  # Change to your actual folder path
output_folder = "saved_metadata_xml"

process_folder(input_folder, output_folder)


Processed: 000336c5-ce51-4518-a5f9-957da8c9c57e.xml -> 000336c5-ce51-4518-a5f9-957da8c9c57e.xml
Processed: 0005cc49-b66a-49b4-a30b-0aba36b28d01.xml -> 0005cc49-b66a-49b4-a30b-0aba36b28d01.xml
Processed: 000fdf62-3b9f-4039-8090-af8a0b5eee6e.xml -> 000fdf62-3b9f-4039-8090-af8a0b5eee6e.xml
Processed: 001adc09-3aea-4641-9eb6-c06d9dc9f4af.xml -> 001adc09-3aea-4641-9eb6-c06d9dc9f4af.xml
Processed: 001d9c13-6501-4de5-b0f8-5e97afd8e258.xml -> 001d9c13-6501-4de5-b0f8-5e97afd8e258.xml
Processed: 0020544e-e56c-44ac-8d09-2064e4d560a2-6571.xml -> 0020544e-e56c-44ac-8d09-2064e4d560a2-6571.xml
Processed: 00215252-85d5-49e7-8893-490ed4aa6fc8.xml -> 00215252-85d5-49e7-8893-490ed4aa6fc8.xml
Processed: 0022888a-a4f2-4d5c-881c-00b6591bf178-6571.xml -> 0022888a-a4f2-4d5c-881c-00b6591bf178-6571.xml
Processed: 0025C3A7-2E41-4E38-8D7A-6C970DACFFEE.xml -> 0025C3A7-2E41-4E38-8D7A-6C970DACFFEE.xml
Processed: 00287E28-6589-47F2-864C-E9BB3A51DC4B.xml -> 00287E28-6589-47F2-864C-E9BB3A51DC4B.xml
Processed: 00347ff2-

In [13]:
import os
import re

# Define folder path
folder_path = "saved_metadata_xml"

# Ensure folder exists
if not os.path.exists(folder_path):
    print(f"Error: Folder '{folder_path}' not found.")
    exit()

# Process each XML file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".xml"):
        file_path = os.path.join(folder_path, filename)

        # Read file content
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        # Remove all occurrences of carriage return codes
        content = re.sub(r"(&#13;|&#10;|&#xD;|&#xA;)", " ", content)

        # Ensure all lines end with ">"
        cleaned_lines = []
        temp_line = ""

        for line in content.splitlines():
            line = line.strip()

            # If the line doesn't end with ">", merge it with the next
            if not line.endswith(">"):
                temp_line += line + " "
            else:
                temp_line += line
                cleaned_lines.append(temp_line.strip())
                temp_line = ""

        # If there's any remaining text in temp_line, add it
        if temp_line:
            cleaned_lines.append(temp_line.strip())

        # Save the cleaned file
        with open(file_path, "w", encoding="utf-8") as file:
            file.write("\n".join(cleaned_lines) + "\n")

        print(f"Processed: {filename}")

print("Carriage return and line break cleanup complete for all XML files!")


Processed: 000336c5-ce51-4518-a5f9-957da8c9c57e.xml
Processed: 0005cc49-b66a-49b4-a30b-0aba36b28d01.xml
Processed: 000fdf62-3b9f-4039-8090-af8a0b5eee6e.xml
Processed: 001adc09-3aea-4641-9eb6-c06d9dc9f4af.xml
Processed: 001d9c13-6501-4de5-b0f8-5e97afd8e258.xml
Processed: 0020544e-e56c-44ac-8d09-2064e4d560a2-6571.xml
Processed: 00215252-85d5-49e7-8893-490ed4aa6fc8.xml
Processed: 0022888a-a4f2-4d5c-881c-00b6591bf178-6571.xml
Processed: 0025C3A7-2E41-4E38-8D7A-6C970DACFFEE.xml
Processed: 00287E28-6589-47F2-864C-E9BB3A51DC4B.xml
Processed: 00347ff2-fc9d-4171-bc6d-5fe49658e80b.xml
Processed: 0038bad2-d932-4e69-a627-4a6623a64636-6571.xml
Processed: 004ff1f9-f245-4517-ad82-e4a71cf23371.xml
Processed: 0054E877-00C5-470A-B431-AA093E0CD028.xml
Processed: 00565aa8-a60c-490d-91ed-b2692fa25beb.xml
Processed: 00567aec-423e-498b-9071-c24f9b143d75-8371.xml
Processed: 00573233-565a-4ab4-a9c5-2a8c957aa6ad.xml
Processed: 0068aac4-bc1f-4b73-9bcd-d1de34755ff5.xml
Processed: 006bdf0c-0f80-4e18-a06a-8bb959549

### extract data from the xml 

In [14]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

def extract_metadata(xml_file):
    """Extract metadata from an XML file."""
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    namespace = {
        "gmd": "http://www.isotc211.org/2005/gmd",
        "gco": "http://www.isotc211.org/2005/gco",
        "che": "http://www.geocat.ch/2008/che",
        "xsi": "http://www.w3.org/2001/XMLSchema-instance"
    }

    # Extract fileIdentifier
    file_identifier_element = root.find(".//gmd:fileIdentifier/gco:CharacterString", namespace)
    file_identifier = file_identifier_element.text.strip() if file_identifier_element is not None else "N/A"

    # Extract dataset language
    dataset_language = "N/A"  # Default value

    # Try to extract language from LanguageCode (preferred method)
    language_element = root.find(".//gmd:language/gmd:LanguageCode", namespace)

    if language_element is not None and "codeListValue" in language_element.attrib:
        dataset_language = language_element.attrib["codeListValue"].strip()
    else:
        # If LanguageCode is missing, check gco:CharacterString
        language_element = root.find(".//gmd:language/gco:CharacterString", namespace)
        if language_element is not None and language_element.text:
            dataset_language = language_element.text.strip()

    # Extract dataset titles
    titles = {}
    title_element = root.find(".//gmd:identificationInfo//gmd:citation//gmd:title/gco:CharacterString", namespace)
    if title_element is not None:
        titles["dataset_title"] = title_element.text.strip()

    title_localized_element = root.find(".//gmd:identificationInfo//gmd:citation//gmd:title/gmd:PT_FreeText", namespace)
    if title_localized_element is not None:
        for text_group in title_localized_element.findall("gmd:textGroup/gmd:LocalisedCharacterString", namespace):
            locale = text_group.attrib.get("locale", "").replace("#", "").strip()
            if text_group.text:
                titles[f"dataset_title_{locale}"] = text_group.text.strip()

    # Extract dataset descriptions
    descriptions = {}
    description_element = root.find(".//gmd:identificationInfo//gmd:abstract/gco:CharacterString", namespace)
    descriptions["dataset_description"] = description_element.text.strip() if description_element is not None and description_element.text else "N/A"

    description_localized_element = root.find(".//gmd:identificationInfo//gmd:abstract/gmd:PT_FreeText", namespace)
    if description_localized_element is not None:
        for text_group in description_localized_element.findall("gmd:textGroup/gmd:LocalisedCharacterString", namespace):
            locale = text_group.attrib.get("locale", "").replace("#", "").strip()
            if text_group.text:
                descriptions[f"dataset_description_{locale}"] = text_group.text.strip()

    # Extract dataset issued date
    issued_date = "N/A"  # Default value

    # First, try extracting gco:Date
    issued_date_element = root.find(".//gmd:identificationInfo//gmd:citation/gmd:CI_Citation/gmd:date/gmd:CI_Date/gmd:date/gco:Date", namespace)

    # If gco:Date is missing, try gco:DateTime
    if issued_date_element is None:
        issued_date_element = root.find(".//gmd:identificationInfo//gmd:citation/gmd:CI_Citation/gmd:date/gmd:CI_Date/gmd:date/gco:DateTime", namespace)

    # Extract text if an element was found
    if issued_date_element is not None and issued_date_element.text:
        issued_date = issued_date_element.text.strip()


    # Extract dataset publisher name
    publisher_name = "N/A"
    publisher_url = "N/A"

    # Possible XPaths for organisation name
    publisher_name_paths = [
        ".//gmd:contact//gmd:organisationName/gco:CharacterString",
        ".//gmd:pointOfContact//gmd:organisationName/gco:CharacterString"
    ]

    # Extract the first valid publisher name
    for path in publisher_name_paths:
        publisher_element = root.find(path, namespace)
        if publisher_element is not None and publisher_element.text:
            publisher_name = publisher_element.text.strip()
            break  # Stop at the first found value

    # Possible XPaths for publisher URL
    url_paths = [
        ".//gmd:pointOfContact//gmd:contactInfo//gmd:CI_Contact//gmd:onlineResource//gmd:linkage/gco:CharacterString",
        ".//gmd:contact//gmd:contactInfo//gmd:CI_Contact//gmd:onlineResource//gmd:CI_OnlineResource//gmd:linkage/gmd:URL",
        ".//gmd:contact//gmd:contactInfo//gmd:CI_Contact//gmd:onlineResource//gmd:CI_OnlineResource//gmd:linkage[@xsi:type='che:PT_FreeURL_PropertyType']/gmd:URL"
    ]

    # Extract the first valid publisher URL
    for path in url_paths:
        url_element = root.find(path, namespace)
        if url_element is not None and url_element.text:
            publisher_url = url_element.text.strip()
            break  # Stop at the first found value


    # Extract dataset topic categories
    dataset_theme = []  # List to store multiple topic categories

    # Find all MD_TopicCategoryCode elements
    topic_category_elements = root.findall(".//gmd:topicCategory/gmd:MD_TopicCategoryCode", namespace)

    # Extract text from each element and store in the list
    for element in topic_category_elements:
        if element is not None and element.text:
            dataset_theme.append(element.text.strip())

    # If no topic categories are found, return ["N/A"]
    if not dataset_theme:
        dataset_theme = ["N/A"]




    # Extract dataset keywords dynamically
    keywords = {"UNKNOWN": []}  #Separate key for non-localized keywords

    keyword_elements = root.findall(".//gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword", namespace)

    for keyword_element in keyword_elements:
        #Extract non-localized keywords (gco:CharacterString)
        keyword_text_element = keyword_element.find("gco:CharacterString", namespace)
        if keyword_text_element is not None and keyword_text_element.text:
            keywords["UNKNOWN"].append(keyword_text_element.text.strip())

        #Extract localized keywords (gmd:PT_FreeText)
        localized_texts = keyword_element.findall("gmd:PT_FreeText/gmd:textGroup/gmd:LocalisedCharacterString", namespace)
        for text_element in localized_texts:
            lang_code = text_element.attrib.get("locale", "").replace("#", "").strip()
            if text_element.text and lang_code:
                if lang_code not in keywords:
                    keywords[lang_code] = []
                keywords[lang_code].append(text_element.text.strip())


    distribution_formats = []
    format_elements = root.findall(".//gmd:distributionInfo/gmd:MD_Distribution/gmd:transferOptions//gmd:CI_OnlineResource", namespace)

    for contact_element in format_elements:
        download_url_element = contact_element.find("gmd:linkage/gmd:URL", namespace)
        format_name_element = contact_element.find("gmd:protocol/gco:CharacterString", namespace)
        resource_name_element = contact_element.find("gmd:name/gco:CharacterString", namespace)
        resource_description_element = contact_element.find("gmd:description/gco:CharacterString", namespace)

        download_url = download_url_element.text.strip() if download_url_element is not None and download_url_element.text else "N/A"
        format_name = format_name_element.text.strip() if format_name_element is not None and format_name_element.text else "N/A"
        resource_name = resource_name_element.text.strip() if resource_name_element is not None and resource_name_element.text else "N/A"
        resource_description = resource_description_element.text.strip() if resource_description_element is not None and resource_description_element.text else "N/A"

        # Store as a dictionary
        distribution_formats.append({"format_name": format_name, "download_url": download_url,"resource_name": resource_name, "resource_description": resource_description})


    contact_points = []
    # First variant: Extracting contacts using CHE namespace
    contact_elements_che = root.findall(".//gmd:identificationInfo/che:CHE_MD_DataIdentification/gmd:pointOfContact/che:CHE_CI_ResponsibleParty", namespace)

    # Second variant: Extracting contacts using standard gmd namespace
    contact_elements_gmd = root.findall(".//gmd:identificationInfo//gmd:pointOfContact//gmd:CI_ResponsibleParty", namespace)

    # Combine both contact lists
    all_contact_elements = contact_elements_che + contact_elements_gmd

    for contact_element in all_contact_elements:
        contact_name_element = contact_element.find("gmd:organisationName/gco:CharacterString", namespace)
        contact_email_element = contact_element.find(".//gmd:electronicMailAddress/gco:CharacterString", namespace)

        contact_name = contact_name_element.text.strip() if contact_name_element is not None and contact_name_element.text else "N/A"
        contact_email = contact_email_element.text.strip() if contact_email_element is not None and contact_email_element.text else "N/A"

        contact_points.append({"contact_name": contact_name, "contact_email" : contact_email})


    return (
        file_identifier,
        dataset_language,
        titles,
        descriptions,
        publisher_name,
        publisher_url,
        dataset_theme,
        issued_date,
        keywords,
        distribution_formats,
        contact_points
    )





# Folder path where the XML files are stored
folder_path = "saved_metadata_xml"

# List to store extracted dataset metadata and distribution data
dataset_geocat_data = []
distribution_data = []
contact_data = []

# Iterate over XML files
for filename in os.listdir(folder_path):
    if filename.endswith(".xml"):
        file_path = os.path.join(folder_path, filename)

        # Extract metadata (expecting 8 values)
        file_identifier, dataset_language, titles, descriptions, publisher_name, publisher_url,dataset_theme, issued_date,keywords, distribution_formats,contact_points = extract_metadata(file_path)

        # Base metadata fields
        entry = {
            "dataset_identifier": file_identifier,
            "dataset_language": dataset_language,
            "dataset_publisher_name": publisher_name,
            "dataset_publisher_URL": publisher_url,
            "dataset_theme": dataset_theme,
            "dataset_issued": issued_date,
            "xml_filename": filename,
            "origin": "geocat.ch"
        }

        # Store keywords as lists
        for lang_code, words in keywords.items():
            entry[f"dataset_keyword_{lang_code}"] = words

        # Merge extracted titles and descriptions dynamically
        entry.update(titles)
        entry.update(descriptions)

        dataset_geocat_data.append(entry)

        # Store distribution formats linked to dataset with correct URLs
        for dist_entry in distribution_formats:
            distribution_data.append({
                "xml_filename": filename,
                "distribution_format": dist_entry["format_name"],
                "distribution_download_url": dist_entry["download_url"],
                "resource_name": dist_entry["resource_name"],
                "resource_description": dist_entry["resource_description"],
                "origin": "geocat.ch",
            })

        for contact_entry in contact_points:
            contact_data.append({
                "contact_name_xml_filename": filename,
                "contact_name": contact_entry["contact_name"],
                "contact_email": contact_entry["contact_email"],
                "origin": "geocat.ch",
            })


# Convert dataset metadata to DataFrame
df_dataset_geocat = pd.DataFrame(dataset_geocat_data)

# Convert distribution formats to DataFrame
df_distribution = pd.DataFrame(distribution_data)

# Convert contact points to DataFrame
df_contact = pd.DataFrame(contact_data)

# Save to CSV files
df_dataset_geocat.to_csv("geocat_dataset_metadata.csv", index=False)
df_distribution.to_csv("geocat_distribution_metadata.csv", index=False)
df_contact.to_csv("geocat_contact_metadata.csv", index=False)



### Transform

### Solve language mapping

In [18]:
import pandas as pd

# Load dataset
df_dataset = pd.read_csv("geocat_dataset_metadata.csv")

# Define language code mappings
language_mapping = {
    "deu": "de",
    "ger": "de",
    "eng": "en",
    "fra": "fr",
    "fre": "fr",
    "ita": "it"
    # Add more mappings as needed
}

# Ensure dataset_language column is parsed as lists if stored as strings
def parse_language_column(lang_value):
    if isinstance(lang_value, str):
        try:
            # Convert string to list
            lang_list = eval(lang_value) if lang_value.startswith("[") else [lang_value]
            # Map values
            return sorted([language_mapping.get(lang, lang) for lang in lang_list])
        except:
            return lang_value  # If eval fails, keep original value
    return lang_value

# Apply mapping to dataset_language column
df_dataset["dataset_language"] = df_dataset["dataset_language"].apply(parse_language_column)

# Function to merge two columns (GE -> DE) and remove the GE column
def merge_columns(df, target_col, source_col):
    if source_col in df.columns:
        df[target_col] = df[target_col].fillna(df[source_col])  # Fill empty DE values with GE values
        df.drop(columns=[source_col], inplace=True)  # Drop GE column

# Merge 'dataset_description_GE' into 'dataset_description_DE' and delete 'dataset_description_GE'
merge_columns(df_dataset, "dataset_description_DE", "dataset_description_GE")

# Merge 'dataset_title_GE' into 'dataset_title_DE' and delete 'dataset_title_GE'
merge_columns(df_dataset, "dataset_title_DE", "dataset_title_GE")

# Save the updated dataset
df_dataset.to_csv("geocat_dataset_metadata.csv", index=False)

print("✅ Language codes mapped, and GE columns merged into DE.")


✅ Language codes mapped, and GE columns merged into DE.


### Same precision for date format dataset

In [19]:
import pandas as pd
from datetime import datetime

# Load dataset
df_dataset = pd.read_csv("geocat_dataset_metadata.csv")


# Function to transform date to the required precision
def transform_date(date_str):
    try:
        if pd.isna(date_str) or date_str.strip() == "":
            return "N/A"  # Handle empty or NaN values
        return datetime.fromisoformat(date_str).strftime("%Y-%m-%dT%H:%M:%S")
    except ValueError:
        return date_str  # Return as-is if conversion fails

# Transform date columns
date_columns = ["dataset_issued"]
for col in date_columns:
    if col in df_dataset.columns:
        df_dataset[col] = df_dataset[col].apply(transform_date)

# Save the updated dataset with sorted language lists and formatted dates
df_dataset.to_csv("geocat_dataset_metadata.csv", index=False)

print("✅ Language lists sorted and dates transformed in geocat_dataset_metadata.csv.")

✅ Language lists sorted and dates transformed in geocat_dataset_metadata.csv.


### Remove all N/A and ['N/A']

In [20]:
import pandas as pd
import ast

# List of CSV file paths (modifying the original files)
csv_files = ["geocat_dataset_metadata.csv", "geocat_distribution_metadata.csv", "geocat_contact_metadata.csv"]

# Define values to be removed
values_to_remove = {"N/A", "[N/A]"}

# Function to clean individual values in the DataFrame
def clean_value(value):
    if pd.isna(value) or str(value).strip() in values_to_remove:
        return ""  # Replace with an empty string instead of NaN
    try:
        # Convert string representation of lists into actual lists
        parsed_value = ast.literal_eval(value)
        if isinstance(parsed_value, list):
            # Remove 'N/A' from lists
            parsed_value = [item for item in parsed_value if str(item).strip() not in values_to_remove]
            return parsed_value if parsed_value else ""  # Convert empty lists to empty string
    except (ValueError, SyntaxError):
        pass
    return value.strip()  # Strip whitespace from normal strings

# Process each CSV file and overwrite with cleaned data
for file in csv_files:
    df = pd.read_csv(file, dtype=str)  # Read all columns as strings
    df = df.applymap(clean_value)  # Apply cleaning function
    df.to_csv(file, index=False)  # Overwrite original file

print("Cleaning complete. Original CSV files have been updated.")


  df = df.applymap(clean_value)  # Apply cleaning function
  df = df.applymap(clean_value)  # Apply cleaning function
  df = df.applymap(clean_value)  # Apply cleaning function


Cleaning complete. Original CSV files have been updated.
