In [3]:
import requests
import time
import csv
import xml.etree.ElementTree as ET

# Configuration Variables
BATCH_SIZE = 100  # Number of records per request
WAIT_TIME = 1  # Time in seconds between requests
FETCH_LIMIT = None  # Set a fixed limit for testing, or None to fetch all datasets

# CSW API Base URL
CSW_URL = "https://www.geocat.ch/geonetwork/srv/eng/csw"

# CSV output file
OUTPUT_CSV = "geocat_datasets.csv"

# Function to get the total number of datasets
def get_total_records():
    params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "resultType": "results",
        "outputFormat": "application/xml",
        "outputSchema": "http://www.opengis.net/cat/csw/2.0.2",
        "typeNames": "csw:Record",
        "elementSetName": "brief",
        "maxRecords": 1,
        "startPosition": 1,
    }
    response = requests.get(CSW_URL, params=params)
    response.raise_for_status()
    root = ET.fromstring(response.text)
    namespaces = {"csw": "http://www.opengis.net/cat/csw/2.0.2"}
    total_records = int(root.find(".//csw:SearchResults", namespaces).attrib.get("numberOfRecordsMatched", 0))
    return total_records

# Function to fetch records
def fetch_csw_records(start_position, sort_order="A", max_records=BATCH_SIZE):
    params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "resultType": "results",
        "outputFormat": "application/xml",
        "outputSchema": "http://www.opengis.net/cat/csw/2.0.2",
        "typeNames": "csw:Record",
        "elementSetName": "brief",
        "maxRecords": max_records,
        "startPosition": start_position,
        "sortBy": f"title:{sort_order}",
    }

    response = requests.get(CSW_URL, params=params)
    print(response.url)
    response.raise_for_status()
    return response.text

# Function to parse XML and extract dataset ID and Title
def parse_csw_response(xml_data):
    namespaces = {
        "csw": "http://www.opengis.net/cat/csw/2.0.2",
        "dc": "http://purl.org/dc/elements/1.1/",
    }
    root = ET.fromstring(xml_data)
    records = []
    for record in root.findall(".//csw:BriefRecord", namespaces):
        dataset_id = record.find("dc:identifier", namespaces).text.strip() if record.find("dc:identifier", namespaces) is not None else "N/A"
        dataset_title = record.find("dc:title", namespaces).text.strip() if record.find("dc:title", namespaces) is not None else "N/A"
        records.append((dataset_id, dataset_title))
    return records

# Function to save to CSV
def save_to_csv(data, filename):
    with open(filename, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Identifier", "Title"])  # CSV Header
        writer.writerows(data)

# Main script execution
if __name__ == "__main__":
    total_records = get_total_records()
    if FETCH_LIMIT:
        total_records = min(total_records, FETCH_LIMIT)  # Use FETCH_LIMIT if set
    
    all_records = []
    print(f"Fetching {total_records} records in steps of {BATCH_SIZE}...")
    for start_pos in range(1, min(15001, total_records + 1), BATCH_SIZE):
        print(f"Fetching records from {start_pos} to {start_pos + BATCH_SIZE - 1}...")
        xml_response = fetch_csw_records(start_pos, sort_order="A")
        records = parse_csw_response(xml_response)
        all_records.extend(records)
        time.sleep(WAIT_TIME)

    if total_records > 15000:
        remaining_records = total_records - 15000
        print(f"Fetching last {remaining_records} records in descending order...")
        for start_pos in range(1, remaining_records + 1, BATCH_SIZE):
            print(f"Fetching reversed records from {start_pos} to {start_pos + BATCH_SIZE - 1}...")
            xml_response = fetch_csw_records(start_pos, sort_order="D")
            records = parse_csw_response(xml_response)
            all_records.extend(records)
            time.sleep(WAIT_TIME)

    print(f"Saving {len(all_records)} records to CSV...")
    save_to_csv(all_records, OUTPUT_CSV)
    print(f"Dataset saved to {OUTPUT_CSV}")

Fetching 1 records in steps of 1...
Fetching records from 1 to 1...
https://www.geocat.ch/geonetwork/srv/eng/csw?service=CSW&version=2.0.2&request=GetRecords&resultType=results&outputFormat=application%2Fxml&outputSchema=http%3A%2F%2Fwww.opengis.net%2Fcat%2Fcsw%2F2.0.2&typeNames=csw%3ARecord&elementSetName=brief&maxRecords=1&startPosition=1&sortBy=title%3AA
Saving 1 records to CSV...
Dataset saved to geocat_datasets.csv


In [15]:
import requests
import time
import csv
import xml.etree.ElementTree as ET
from datetime import datetime

# Configuration Variables
BATCH_SIZE = 100  # Number of records per request
WAIT_TIME = 1  # Time in seconds between requests
FETCH_LIMIT = None  # Set a fixed limit for testing, or None to fetch all datasets

# CSW API Base URL
CSW_URL = "https://www.geocat.ch/geonetwork/srv/eng/csw"

# CSV output file
OUTPUT_CSV = "geocat_datasets.csv"

# Function to get the total number of datasets
def get_total_records():
    start_time = datetime.now()
    params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "resultType": "results",
        "outputFormat": "application/xml",
        "outputSchema": "http://www.opengis.net/cat/csw/2.0.2",
        "typeNames": "csw:Record",
        "elementSetName": "brief",
        "maxRecords": 1,
        "startPosition": 1,
    }
    response = requests.get(CSW_URL, params=params)
    response.raise_for_status()
    root = ET.fromstring(response.text)
    namespaces = {"csw": "http://www.opengis.net/cat/csw/2.0.2"}
    total_records = int(root.find(".//csw:SearchResults", namespaces).attrib.get("numberOfRecordsMatched", 0))
    elapsed_time = datetime.now() - start_time
    print(f"Total records found: {total_records} (Time taken: {elapsed_time})")
    return total_records

# Function to fetch records
def fetch_csw_records(start_position, sort_order="A", max_records=BATCH_SIZE):
    start_time = datetime.now()
    params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "resultType": "results",
        "outputFormat": "application/xml",
        "outputSchema": "http://www.opengis.net/cat/csw/2.0.2",
        "typeNames": "csw:Record",
        "elementSetName": "brief",
        "maxRecords": max_records,
        "startPosition": start_position,
        "sortBy": f"title:{sort_order}",
    }
    response = requests.get(CSW_URL, params=params)
    response.raise_for_status()
    elapsed_time = datetime.now() - start_time
    print(f"Fetched records {start_position} to {start_position + max_records - 1} (Time taken: {elapsed_time})")
    return response.text

# Function to parse XML and extract dataset ID and Title
def parse_csw_response(xml_data):
    namespaces = {
        "csw": "http://www.opengis.net/cat/csw/2.0.2",
        "dc": "http://purl.org/dc/elements/1.1/",
    }
    root = ET.fromstring(xml_data)
    records = []
    for record in root.findall(".//csw:BriefRecord", namespaces):
        dataset_id = record.find("dc:identifier", namespaces).text.strip() if record.find("dc:identifier", namespaces) is not None else "N/A"
        dataset_title = record.find("dc:title", namespaces).text.strip() if record.find("dc:title", namespaces) is not None else "N/A"
        records.append((dataset_id, dataset_title))
    return records

# Function to save to CSV
def save_to_csv(data, filename):
    with open(filename, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Identifier", "Title"])  # CSV Header
        writer.writerows(data)

# Main script execution
if __name__ == "__main__":
    total_records = get_total_records()
    if FETCH_LIMIT:
        total_records = min(total_records, FETCH_LIMIT)  # Use FETCH_LIMIT if set
    
    all_records = []
    print(f"Fetching {total_records} records in steps of {BATCH_SIZE}...")
    for start_pos in range(1, min(15001, total_records + 1), BATCH_SIZE):
        xml_response = fetch_csw_records(start_pos, sort_order="A")
        records = parse_csw_response(xml_response)
        all_records.extend(records)
        time.sleep(WAIT_TIME)

    if total_records > 15000:
        remaining_records = total_records - 15000
        print(f"Fetching last {remaining_records} records in descending order...")
        for start_pos in range(1, remaining_records + 1, BATCH_SIZE):
            xml_response = fetch_csw_records(start_pos, sort_order="D")
            records = parse_csw_response(xml_response)
            all_records.extend(records)
            time.sleep(WAIT_TIME)

    print(f"Saving {len(all_records)} records to CSV...")
    save_to_csv(all_records, OUTPUT_CSV)
    print(f"Dataset saved to {OUTPUT_CSV}")


Total records found: 16219 (Time taken: 0:00:00.505865)
Fetching 1000 records in steps of 75...
Fetched records 1 to 75 (Time taken: 0:00:07.414833)
Fetched records 76 to 150 (Time taken: 0:00:03.413943)
Fetched records 151 to 225 (Time taken: 0:00:04.892857)
Fetched records 226 to 300 (Time taken: 0:00:05.312192)
Fetched records 301 to 375 (Time taken: 0:00:05.930582)
Fetched records 376 to 450 (Time taken: 0:00:05.750582)
Fetched records 451 to 525 (Time taken: 0:00:05.963369)
Fetched records 526 to 600 (Time taken: 0:00:05.329706)
Fetched records 601 to 675 (Time taken: 0:00:08.795084)
Fetched records 676 to 750 (Time taken: 0:00:04.164912)
Fetched records 751 to 825 (Time taken: 0:00:05.290947)
Fetched records 826 to 900 (Time taken: 0:00:06.948083)
Fetched records 901 to 975 (Time taken: 0:00:05.935938)
Fetched records 976 to 1050 (Time taken: 0:00:04.322687)
Saving 1050 records to CSV...
Dataset saved to geocat_datasets.csv


In [9]:
import requests
import time
from datetime import datetime

# Configuration
BATCH_SIZE = 1000  # Number of records per request
CSW_URL = "https://www.geocat.ch/geonetwork/srv/eng/csw"

# Function to measure request time
def measure_request_time(batch_size):
    params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "resultType": "results",
        "outputFormat": "application/xml",
        "outputSchema": "http://www.opengis.net/cat/csw/2.0.2",
        "typeNames": "csw:Record",
        "elementSetName": "brief",
        "maxRecords": batch_size,
        "startPosition": 1,
    }
    
    start_time = datetime.now()
    response = requests.get(CSW_URL, params=params)
    response.raise_for_status()
    elapsed_time = datetime.now() - start_time
    
    print(f"Request for {batch_size} records took: {elapsed_time}")

if __name__ == "__main__":
    measure_request_time(BATCH_SIZE)


Request for 1000 records took: 0:04:18.592734
