In [1]:
import requests
import xml.etree.ElementTree as ET
import os
import time
import re
import csv

# CSW endpoint (no schema specified)
CSW_URL = "https://www.geocat.ch/geonetwork/srv/deu/csw"
# Number of record positions requested per batch by download_metadata().
BATCH_SIZE = 200
WAIT_TIME = 0  # Time in seconds between requests
# NOTE(review): MAX_RECORDS is not referenced by any function visible in this
# cell; the comment below describes the intended use.
MAX_RECORDS = None  # Set to an integer for development, or None to fetch all
# Output CSV with one (identifier, title) row per fetched record.
CSV_FILE = "titles_and_ids.csv"
# Log file that receives parse errors and positions that could not be fetched.
LOG_FILE = "skipped_records.log"

def fetch_records(start_position, max_records=100, timeout=120):
    """Fetch one page of records from the CSW endpoint as raw XML.

    Sends a CSW 2.0.2 ``GetRecords`` request to ``CSW_URL`` as an HTTP GET
    with key/value parameters, asking for ``csw:Record`` entries with the
    ``full`` element set.

    Args:
        start_position: Position of the first record to return (callers in
            this file start counting at 1).
        max_records: Maximum number of records to return in this page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the download forever.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "namespace": "xmlns(csw=http://www.opengis.net/cat/csw/2.0.2)",
        "typeNames": "csw:Record",
        "elementSetName": "full",
        "resultType": "results",
        "outputFormat": "application/xml",
        "maxRecords": max_records,
        "startPosition": start_position,
    }

    response = requests.get(CSW_URL, params=params, timeout=timeout)
    response.raise_for_status()
    return response.text

def is_exception_report(xml_text):
    """Tell whether *xml_text* is an ``ExceptionReport`` document.

    Returns ``True`` when the text parses as XML and the root tag ends with
    ``ExceptionReport``. Returns ``False`` otherwise, including when the text
    is not well-formed XML.
    """
    try:
        document_root = ET.fromstring(xml_text)
    except ET.ParseError:
        # Unparseable text is not treated as an exception report.
        return False
    return document_root.tag.endswith("ExceptionReport")

def extract_title_and_id(xml_text, start_position):
    """Parse a GetRecords response into ``(identifier, title)`` pairs.

    Args:
        xml_text: Raw XML text of the response.
        start_position: Position the request started at; only used in log
            and error messages.

    Returns:
        A tuple ``(results, root)``. ``results`` is a list of
        ``(identifier, title)`` tuples, one per ``csw:Record`` element, and
        ``root`` is the parsed root element. A record whose ``dc:title`` or
        ``dc:identifier`` is missing or empty gets the placeholder
        ``"(kein Titel)"`` / ``"(kein Identifier)"``. If the text is not
        well-formed XML, the error is appended to ``LOG_FILE`` and
        ``([], <EmptyRoot element>)`` is returned.

    Raises:
        ValueError: If the root element is an ``ExceptionReport``.
    """
    namespaces = {
        "csw": "http://www.opengis.net/cat/csw/2.0.2",
        "dc": "http://purl.org/dc/elements/1.1/",
    }

    def text_or(element, placeholder):
        # An element can be present but empty (``<dc:title/>``); its ``text``
        # is then None and calling ``.strip()`` on it would raise and fail
        # the whole batch.
        if element is None or element.text is None:
            return placeholder
        return element.text.strip() or placeholder

    # Parse once; the exception-report check reuses the same tree.
    try:
        root = ET.fromstring(xml_text)
    except ET.ParseError as e:
        with open(LOG_FILE, mode="a", encoding="utf-8") as log:
            log.write(f"ParseError at position {start_position}: {e}\n")
        return [], ET.Element("EmptyRoot")

    if root.tag.endswith("ExceptionReport"):
        raise ValueError(f"ExceptionReport at position {start_position}")

    results = [
        (
            text_or(record.find("dc:identifier", namespaces), "(kein Identifier)"),
            text_or(record.find("dc:title", namespaces), "(kein Titel)"),
        )
        for record in root.findall(".//csw:Record", namespaces)
    ]
    return results, root

def get_next_record(xml_root):
    """Read the ``nextRecord`` attribute of the response's SearchResults.

    Returns the attribute converted to ``int``, or ``0`` when the tree has
    no ``SearchResults`` descendant or that element lacks the attribute.
    """
    path = ".//{http://www.opengis.net/cat/csw/2.0.2}SearchResults"
    results_el = xml_root.find(path)
    # Compare against None explicitly: a childless Element is falsy.
    if results_el is None:
        return 0
    raw_value = results_el.attrib.get("nextRecord")
    if raw_value is None:
        return 0
    return int(raw_value)

def get_total_records(xml_root):
    """Read ``numberOfRecordsMatched`` from the response's SearchResults.

    Returns the attribute converted to ``int``, or ``None`` when the tree has
    no ``SearchResults`` descendant or that element lacks the attribute.
    """
    path = ".//{http://www.opengis.net/cat/csw/2.0.2}SearchResults"
    results_el = xml_root.find(path)
    # Compare against None explicitly: a childless Element is falsy.
    if results_el is None:
        return None
    matched = results_el.attrib.get("numberOfRecordsMatched")
    return None if matched is None else int(matched)

def robust_fetch(start, end):
    """Fetch positions ``start``..``end``, bisecting the range on failure.

    The whole range is requested in one call first. If anything raises, the
    range is split in half and each half is fetched recursively. A range of
    exactly one position that still fails is written to ``LOG_FILE`` and
    skipped.

    Returns:
        A tuple ``(pairs, next_record)``: the ``(identifier, title)`` pairs
        that could be fetched and the next position to continue from. After
        a split this is the larger of the two halves' values; for a skipped
        single position it is ``start + 1``.
    """
    span = end - start + 1
    try:
        pairs, root = extract_title_and_id(
            fetch_records(start_position=start, max_records=span), start
        )
        return pairs, get_next_record(root)
    except Exception as err:
        if span != 1:
            # Narrow down the failing position by fetching each half.
            midpoint = (start + end) // 2
            print(midpoint)
            lower_pairs, lower_next = robust_fetch(start, midpoint)
            upper_pairs, upper_next = robust_fetch(midpoint + 1, end)
            return lower_pairs + upper_pairs, max(lower_next, upper_next)
        # A single position that cannot be fetched is logged and skipped.
        with open(LOG_FILE, mode="a", encoding="utf-8") as log:
            log.write(f"FinalFetchError at position {start}: {err}\n")
        return [], start + 1

def download_metadata():
    """Download identifier/title pairs from the catalogue and save them as CSV.

    Steps:
      1. Truncate ``LOG_FILE`` if it exists.
      2. Request a single record to learn the total number of matches.
      3. Fetch positions 1..min(15000, total) in batches of ``BATCH_SIZE``
         through ``robust_fetch``.
      4. Fetch positions ``remaining``..1 in descending batches (see the
         review note on that loop).
      5. Write everything collected to ``CSV_FILE``.

    Any exception during steps 2-4 is printed and the records collected so
    far are still written, so a partial download is not lost.
    """
    print("Downloading metadata...")
    if os.path.exists(LOG_FILE):
        # Start every run with an empty log.
        with open(LOG_FILE, mode="w", encoding="utf-8"):
            pass

    all_titles_and_ids = []
    total_fetched = 0

    try:
        first_response = fetch_records(start_position=1, max_records=1)
        _, first_root = extract_title_and_id(first_response, 1)
        total_possible = get_total_records(first_root)
        print(f"Total records available: {total_possible}")
        if not total_possible:
            raise ValueError("No records found on server.")

        # Never page past what the server reports; otherwise a catalogue
        # with fewer than 15000 records would be polled position by position
        # up to 15000.
        # NOTE(review): presumably 15000 is a paging limit of the server -
        # confirm.
        first_half_limit = min(15000, total_possible)
        remaining = total_possible - first_half_limit

        start = 1
        while start <= first_half_limit:
            end = min(start + BATCH_SIZE - 1, first_half_limit)
            title_id_pairs, next_record = robust_fetch(start, end)
            print(f"Fetched records {start} to {end} (next: {next_record})")
            all_titles_and_ids.extend(title_id_pairs)
            total_fetched += len(title_id_pairs)

            if next_record > start:
                start = next_record
            elif title_id_pairs:
                # Records came back but there is no usable next position
                # (e.g. nextRecord is 0 at the end of the result set): move
                # past this batch so it is not fetched again.
                start = end + 1
            else:
                # Nothing usable came back; advance by one position.
                start += 1
            time.sleep(WAIT_TIME)

        # NOTE(review): this pass requests positions 1..remaining again.
        # fetch_records() sends no sort order, so these look like the same
        # records the first pass already fetched, not the tail beyond
        # first_half_limit - confirm the intent before relying on the CSV
        # being duplicate-free.
        start = remaining
        while start > 0:
            end = max(start - BATCH_SIZE + 1, 1)
            title_id_pairs, _ = robust_fetch(end, start)
            print(f"Fetched records {end} to {start}")
            all_titles_and_ids.extend(title_id_pairs[::-1])
            total_fetched += len(title_id_pairs)
            start = end - 1
            time.sleep(WAIT_TIME)

    except Exception as e:
        # Best effort: report the problem and still save what was collected.
        print(f"Error during metadata download: {e}")

    with open(CSV_FILE, mode="w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Identifier", "Title"])
        writer.writerows(all_titles_and_ids)
    # The function used to call itself here unconditionally, which restarted
    # the whole download until the recursion limit was hit.
    print(f"Done. Total records fetched: {total_fetched}")
    print(f"Results saved to {CSV_FILE}")

# Entry point: run the full download when this cell/script is executed directly.
if __name__ == "__main__":
    print("Starting download of metadata records (Dublin Core schema)...")
    download_metadata()


Starting download of metadata records (Dublin Core schema)...
Downloading metadata...
Total records available: 16411
100
50
75
63
69
72
71
Fetched records 1 to 200 (next: 201)
Fetched records 201 to 400 (next: 401)
500
550
525
513
507
504
502
503
519
516
515
518
517
522
521
520
524
523
538
532
529
527
526
528
531
530
535
534
533
537
536
544
541
540
539
543
542
547
546
545
549
548
575
563
557
554
552
551
553
556
555
560
559
558
562
561
569
566
565
564
568
567
572
571
570
574
573
588
582
579
577
576
578
581
580
585
584
583
587
586
594
591
590
589
593
592
597
596
595
599
598
Fetched records 401 to 600 (next: 601)
700
650
625
613
607
604
602
603
606
605
610
609
608
612
611
619
616
615
614
618
617
622
621
620
624
623
638
632
629
627
626
628
631
630
635
634
633
637
636
644
641
640
639
643
642
647
646
645
649
648
675
663
657
654
652
651
656
655
660
662
661
688
694
697
699
698
Fetched records 601 to 800 (next: 801)
Fetched records 801 to 1000 (next: 1001)
Fetched records 1001 to 1200 (next: 12

: 

### Development Version

In [1]:
import requests
import xml.etree.ElementTree as ET
import os
import time
import csv

# CSW endpoint that receives the GetRecords POST requests.
CSW_URL = "https://www.geocat.ch/geonetwork/srv/deu/csw"
# Number of record positions requested per batch.
BATCH_SIZE = 1000
# Seconds to sleep between batches.
WAIT_TIME = 0
START_POSITION = 1  # Set manually for dev mode
# Output CSV with one (identifier, title) row per fetched record.
CSV_FILE = "dev_titles_and_ids.csv"
# Log file that receives parse errors and positions that could not be fetched.
LOG_FILE = "dev_skipped_records.log"


def fetch_records_sorted_by_identifier(start_position=1, max_records=100, ascending=True, timeout=120):
    """Fetch one page of records sorted by ``dc:identifier`` as raw XML.

    Sends a CSW 2.0.2 ``GetRecords`` request as an XML POST to ``CSW_URL``,
    asking for ``csw:Record`` entries with the ``full`` element set.

    Args:
        start_position: Position of the first record to return within the
            sorted result set.
        max_records: Maximum number of records to return in this page.
        ascending: Sort ``ASC`` when true, ``DESC`` otherwise.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the download forever.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    sort_order = "ASC" if ascending else "DESC"

    headers = {
        'Content-Type': 'application/xml'
    }

    xml_payload = f"""<?xml version="1.0" encoding="UTF-8"?>
<csw:GetRecords
    xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
    xmlns:ogc="http://www.opengis.net/ogc"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    service="CSW"
    version="2.0.2"
    resultType="results"
    startPosition="{start_position}"
    maxRecords="{max_records}"
    outputFormat="application/xml"
    outputSchema="http://www.opengis.net/cat/csw/2.0.2"
    xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2
        http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
    <csw:Query typeNames="csw:Record">
        <csw:ElementSetName>full</csw:ElementSetName>
        <ogc:SortBy>
            <ogc:SortProperty>
                <ogc:PropertyName>dc:identifier</ogc:PropertyName>
                <ogc:SortOrder>{sort_order}</ogc:SortOrder>
            </ogc:SortProperty>
        </ogc:SortBy>
    </csw:Query>
</csw:GetRecords>"""

    # Send bytes in the encoding the XML declaration announces.
    response = requests.post(
        CSW_URL,
        headers=headers,
        data=xml_payload.encode("utf-8"),
        timeout=timeout,
    )
    response.raise_for_status()
    return response.text


def is_exception_report(xml_text):
    """Report whether *xml_text* has an ``ExceptionReport`` root element.

    Text that cannot be parsed as XML yields ``False``.
    """
    try:
        parsed = ET.fromstring(xml_text)
    except ET.ParseError:
        # Malformed XML is handled by the caller, not flagged here.
        return False
    root_tag = parsed.tag
    return root_tag.endswith("ExceptionReport")


def extract_title_and_id(xml_text, start_position):
    """Parse a GetRecords response into ``(identifier, title)`` pairs.

    Args:
        xml_text: Raw XML text of the response.
        start_position: Position the request started at; only used in log
            and error messages.

    Returns:
        A tuple ``(results, record_count, root)``. ``results`` is a list of
        ``(identifier, title)`` tuples, ``record_count`` the number of
        ``csw:Record`` elements found and ``root`` the parsed root element.
        A record whose ``dc:title`` or ``dc:identifier`` is missing or empty
        gets the placeholder ``"(kein Titel)"`` / ``"(kein Identifier)"``.
        If the text is not well-formed XML, the error is appended to
        ``LOG_FILE`` and ``([], 0, None)`` is returned.

    Raises:
        ValueError: If the root element is an ``ExceptionReport``.
    """
    namespaces = {
        "csw": "http://www.opengis.net/cat/csw/2.0.2",
        "dc": "http://purl.org/dc/elements/1.1/",
    }

    def text_or(element, placeholder):
        # An element can be present but empty (``<dc:title/>``); its ``text``
        # is then None and calling ``.strip()`` on it would raise and fail
        # the whole batch.
        if element is None or element.text is None:
            return placeholder
        return element.text.strip() or placeholder

    # Parse once; the exception-report check reuses the same tree.
    try:
        root = ET.fromstring(xml_text)
    except ET.ParseError as e:
        with open(LOG_FILE, mode="a", encoding="utf-8") as log:
            log.write(f"ParseError at position {start_position}: {e}\n")
        return [], 0, None

    if root.tag.endswith("ExceptionReport"):
        raise ValueError(f"ExceptionReport at position {start_position}")

    records = root.findall(".//csw:Record", namespaces)
    results = [
        (
            text_or(record.find("dc:identifier", namespaces), "(kein Identifier)"),
            text_or(record.find("dc:title", namespaces), "(kein Titel)"),
        )
        for record in records
    ]
    return results, len(records), root


def get_total_records(xml_root):
    """Return the total number of matched records announced by the server.

    Reads ``numberOfRecordsMatched`` from the ``SearchResults`` descendant of
    *xml_root* and converts it to ``int``. Returns ``None`` when the element
    or the attribute is absent.
    """
    for results_el in xml_root.iterfind(".//{http://www.opengis.net/cat/csw/2.0.2}SearchResults"):
        # Only the first match is inspected, like Element.find() would do.
        if "numberOfRecordsMatched" not in results_el.attrib:
            return None
        return int(results_el.attrib["numberOfRecordsMatched"])
    return None


def robust_fetch(start, end, ascending=True):
    """Fetch positions ``start``..``end``, bisecting the range on failure.

    The whole range is requested in one call first. If anything raises, the
    range is split in half and both halves are fetched recursively with the
    same sort direction. A range of exactly one position that still fails is
    written to ``LOG_FILE`` and yields an empty list.

    Returns:
        The list of ``(identifier, title)`` pairs that could be fetched.
    """
    span = end - start + 1
    try:
        response_xml = fetch_records_sorted_by_identifier(
            start_position=start, max_records=span, ascending=ascending
        )
        pairs = extract_title_and_id(response_xml, start)[0]
        return pairs
    except Exception as err:
        if span != 1:
            # Narrow down the failing position by fetching each half.
            midpoint = (start + end) // 2
            print(f"Splitting range: {start}–{end} at {midpoint}")
            lower_half = robust_fetch(start, midpoint, ascending)
            upper_half = robust_fetch(midpoint + 1, end, ascending)
            return lower_half + upper_half
        # A single position that cannot be fetched is logged and skipped.
        with open(LOG_FILE, mode="a", encoding="utf-8") as log:
            log.write(f"FinalFetchError at position {start}: {err}\n")
        return []


def download_metadata_from(start_pos=None):
    """Download identifier/title pairs in two passes and save them as CSV.

    The result set is sorted by identifier. The head is read in ascending
    order up to position ``first_half_limit - 1``. The rest is read from the
    other end by sorting descending and reading from position 1, so no
    request uses a start position at or beyond ``first_half_limit``.

    Args:
        start_pos: Position at which the ascending pass starts. ``None`` or
            ``0`` means start at position 1.

    Returns:
        ``None``. Results are written to ``CSV_FILE``. If the total record
        count cannot be determined, a message is printed and nothing is
        written.
    """
    print("Downloading metadata records...")
    if os.path.exists(LOG_FILE):
        # Start every run with an empty log.
        with open(LOG_FILE, mode="w", encoding="utf-8"):
            pass

    try:
        initial_xml = fetch_records_sorted_by_identifier(start_position=1, max_records=1, ascending=True)
        _, _, root = extract_title_and_id(initial_xml, 1)
        total_records = get_total_records(root)
        if total_records is None:
            # Without the total, the size of the reversed pass is unknown.
            raise ValueError("response has no numberOfRecordsMatched")
        print(f"Total records available: {total_records}")
    except Exception as e:
        print(f"Failed to fetch total number of records: {e}")
        return

    # NOTE(review): presumably the server cannot page at or beyond this
    # position, which is why the tail is read in descending order - confirm.
    first_half_limit = 15000
    # Last position covered by the ascending pass; never beyond the total.
    ascending_last = min(first_half_limit - 1, total_records)

    all_titles_and_ids = []
    total_fetched = 0

    start = start_pos or 1
    while start <= ascending_last:
        end = min(start + BATCH_SIZE - 1, ascending_last)
        title_id_pairs = robust_fetch(start, end, True)
        print(f"Fetched records {start} to {end}")
        all_titles_and_ids.extend(title_id_pairs)
        total_fetched += len(title_id_pairs)
        start += BATCH_SIZE
        time.sleep(WAIT_TIME)

    # Reversed order for second part. Everything after ``ascending_last`` is
    # read here, so the record at position ``first_half_limit`` itself is no
    # longer left out between the two passes.
    total_reverse_fetch = total_records - ascending_last
    print(f"Starting reversed batch loop from second part: {total_reverse_fetch} records")
    tail_descending = []
    for i in range(0, total_reverse_fetch, BATCH_SIZE):
        rel_start = i + 1
        rel_end = min(i + BATCH_SIZE, total_reverse_fetch)
        title_id_pairs = robust_fetch(rel_start, rel_end, False)
        print(f"Fetched records {rel_start} to {rel_end} (reversed order)")
        tail_descending.extend(title_id_pairs)
        total_fetched += len(title_id_pairs)
        time.sleep(WAIT_TIME)
    # Reverse the tail once as a whole: reversing each batch separately would
    # leave the batches themselves in descending order.
    all_titles_and_ids.extend(reversed(tail_descending))

    with open(CSV_FILE, mode="w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Identifier", "Title"])
        writer.writerows(all_titles_and_ids)

    print(f"Done. Total records fetched: {total_fetched}")
    print(f"Results saved to {CSV_FILE}")


# Entry point: start the download at the manually configured START_POSITION.
if __name__ == "__main__":
    download_metadata_from(start_pos=START_POSITION)


Downloading metadata records...
Total records available: 16411
Splitting range: 1–1000 at 500


In [23]:
def fetch_records_sorted_by_identifier(start_position=1, max_records=100, ascending=True, timeout=120):
    """POST a sorted CSW ``GetRecords`` request and return the raw XML text.

    The request asks ``CSW_URL`` for ``csw:Record`` entries (element set
    ``full``) ordered by ``dc:identifier``.

    Args:
        start_position: Position of the first record to return within the
            sorted result set.
        max_records: Maximum number of records to return.
        ascending: ``True`` sorts ``ASC``, ``False`` sorts ``DESC``.
        timeout: Seconds to wait for the server. A request without a timeout
            can block indefinitely when the server stops responding.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    sort_order = "ASC" if ascending else "DESC"

    headers = {
        'Content-Type': 'application/xml'
    }

    xml_payload = f"""<?xml version="1.0" encoding="UTF-8"?>
<csw:GetRecords
    xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
    xmlns:ogc="http://www.opengis.net/ogc"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    service="CSW"
    version="2.0.2"
    resultType="results"
    startPosition="{start_position}"
    maxRecords="{max_records}"
    outputFormat="application/xml"
    outputSchema="http://www.opengis.net/cat/csw/2.0.2"
    xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2
        http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
    <csw:Query typeNames="csw:Record">
        <csw:ElementSetName>full</csw:ElementSetName>
        <ogc:SortBy>
            <ogc:SortProperty>
                <ogc:PropertyName>dc:identifier</ogc:PropertyName>
                <ogc:SortOrder>{sort_order}</ogc:SortOrder>
            </ogc:SortProperty>
        </ogc:SortBy>
    </csw:Query>
</csw:GetRecords>"""

    response = requests.post(CSW_URL, headers=headers, data=xml_payload, timeout=timeout)
    response.raise_for_status()
    return response.text


# Manual check: print the raw response for the first 10 records, ascending.
print(fetch_records_sorted_by_identifier(1,10,True))
# Same check with descending sort order (disabled):
# print(fetch_records_sorted_by_identifier(1,20,False))

<?xml version="1.0" encoding="UTF-8"?>
<csw:GetRecordsResponse xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
  <csw:SearchStatus timestamp="2025-04-09T20:27:19.015Z" />
  <csw:SearchResults numberOfRecordsMatched="16411" numberOfRecordsReturned="10" elementSet="full" nextRecord="11">
    <csw:Record xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:geonet="http://www.fao.org/geonetwork" xmlns:ows="http://www.opengis.net/ows" xmlns:dct="http://purl.org/dc/terms/">
      <dc:identifier>000336c5-ce51-4518-a5f9-957da8c9c57e</dc:identifier>
      <dc:date>2024-05-17T12:41:17.758Z</dc:date>
      <dc:title>Kommunaler Nutzungsplan (Gemeinde Hölstein)</dc:title>
      <dc:type>dataset</dc:type>
      <dc:subject>Nutzungsplan</dc:subject>
      <dc:subject>Nutzungsplanung</dc:subject>
      <dc:subject>Zonenplan</dc:subject>
 