In [1]:
import requests
import time
import csv
import xml.etree.ElementTree as ET

# Configuration Variables
BATCH_SIZE = 100  # Number of records requested per GetRecords call
WAIT_TIME = 1  # Pause in seconds between consecutive requests
FETCH_LIMIT = None  # Set a fixed limit for testing, or None to fetch all datasets

# CSW API Base URL
CSW_URL = "https://www.geocat.ch/geonetwork/srv/eng/csw"

# CSV output file (despite the name OUTPUT_XML, this is the path of the CSV passed to save_to_csv)
OUTPUT_XML = "geocat_datasets.csv"

# Function to get the total number of datasets
def get_total_records(timeout=60):
    """Return the number of records the CSW endpoint reports as matching.

    Sends a minimal GetRecords request (a single brief record) and reads the
    ``numberOfRecordsMatched`` attribute of the ``csw:SearchResults`` element.

    Args:
        timeout: Seconds handed to ``requests.get`` so that an unresponsive
            server cannot block the notebook forever.

    Returns:
        int: The value of ``numberOfRecordsMatched``; 0 if the attribute is absent.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or an HTTP error status.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
        ValueError: If the response has no ``csw:SearchResults`` element.
    """
    params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "resultType": "results",
        "outputFormat": "application/xml",
        "outputSchema": "http://www.opengis.net/cat/csw/2.0.2",
        "typeNames": "csw:Record",
        "elementSetName": "brief",
        "maxRecords": 1,
        "startPosition": 1,
    }
    response = requests.get(CSW_URL, params=params, timeout=timeout)
    response.raise_for_status()

    # Parse the raw bytes so the XML parser applies the encoding declared in
    # the document instead of whatever encoding requests guessed for .text.
    root = ET.fromstring(response.content)
    namespaces = {"csw": "http://www.opengis.net/cat/csw/2.0.2"}
    search_results = root.find(".//csw:SearchResults", namespaces)
    if search_results is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            f"CSW response contains no csw:SearchResults element (root tag: {root.tag})"
        )
    return int(search_results.attrib.get("numberOfRecordsMatched", 0))

# Function to fetch records
def fetch_csw_records(start_position, sort_order="A", max_records=BATCH_SIZE, timeout=120):
    """Fetch one page of brief CSW records, sorted by title.

    Args:
        start_position: 1-based position of the first record to return.
        sort_order: Direction suffix appended to the ``sortBy`` parameter as
            ``title:<sort_order>``; the notebook uses "A" and "D".
        max_records: Maximum number of records requested for this page.
        timeout: Seconds handed to ``requests.get`` so that a stalled server
            cannot hang the whole harvest.

    Returns:
        str: The response body (XML) as decoded text.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or an HTTP error status.
    """
    params = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "resultType": "results",
        "outputFormat": "application/xml",
        "outputSchema": "http://www.opengis.net/cat/csw/2.0.2",
        "typeNames": "csw:Record",
        "elementSetName": "brief",
        "maxRecords": max_records,
        "startPosition": start_position,
        "sortBy": f"title:{sort_order}",
    }

    response = requests.get(CSW_URL, params=params, timeout=timeout)
    response.raise_for_status()
    return response.text

# Function to parse XML and extract dataset ID and Title
def parse_csw_response(xml_data):
    """Extract ``(identifier, title)`` pairs from a CSW GetRecords response.

    Args:
        xml_data: The XML document (str or bytes) returned by the CSW endpoint.

    Returns:
        list[tuple[str, str]]: One tuple per ``csw:BriefRecord`` in document
        order. A field is "N/A" when its element is missing or has no text;
        otherwise it is the element text with surrounding whitespace removed.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_data`` is not well-formed XML.
    """
    namespaces = {
        "csw": "http://www.opengis.net/cat/csw/2.0.2",
        "dc": "http://purl.org/dc/elements/1.1/",
    }

    def _text_or_default(record, path):
        # An element that exists but is empty (e.g. <dc:title/>) has
        # text == None, so both cases must fall back to the placeholder.
        element = record.find(path, namespaces)
        if element is None or element.text is None:
            return "N/A"
        return element.text.strip()

    root = ET.fromstring(xml_data)
    return [
        (_text_or_default(record, "dc:identifier"), _text_or_default(record, "dc:title"))
        for record in root.findall(".//csw:BriefRecord", namespaces)
    ]

# Function to save to CSV
def save_to_csv(data, filename):
    """Write the harvested rows to ``filename`` as UTF-8 CSV.

    The file is overwritten and starts with the header row
    ``Identifier,Title``; every item of ``data`` becomes one row.
    """
    header = ["Identifier", "Title"]
    with open(filename, "w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        for row in data:
            out.writerow(row)

# Main script execution
if __name__ == "__main__":
    # Records up to this position are fetched in ascending title order; the
    # rest is fetched from the other end of the list in descending order.
    # NOTE(review): presumably a server-side paging limit — confirm.
    ASCENDING_LIMIT = 15000

    total_records = get_total_records()
    if FETCH_LIMIT:
        total_records = min(total_records, FETCH_LIMIT)  # Use FETCH_LIMIT if set

    all_records = []
    print(f"Fetching {total_records} records in steps of {BATCH_SIZE}...")

    # Ascending pass: positions 1..ascending_total.
    ascending_total = min(ASCENDING_LIMIT, total_records)
    for start_pos in range(1, ascending_total + 1, BATCH_SIZE):
        # Trim the last batch so no more than the requested total is fetched
        # (matters when FETCH_LIMIT is not a multiple of BATCH_SIZE).
        end_pos = min(start_pos + BATCH_SIZE - 1, ascending_total)
        print(f"Fetching records from {start_pos} to {end_pos}...")
        xml_response = fetch_csw_records(
            start_pos, sort_order="A", max_records=end_pos - start_pos + 1
        )
        all_records.extend(parse_csw_response(xml_response))
        time.sleep(WAIT_TIME)

    # Descending pass: the records beyond ASCENDING_LIMIT are the first
    # `remaining_records` entries of the list sorted in the opposite direction.
    remaining_records = total_records - ascending_total
    if remaining_records > 0:
        print(f"Fetching last {remaining_records} records in descending order...")
        for start_pos in range(1, remaining_records + 1, BATCH_SIZE):
            # Trim the last batch: requesting a full batch here would re-fetch
            # records already collected by the ascending pass (duplicates).
            end_pos = min(start_pos + BATCH_SIZE - 1, remaining_records)
            print(f"Fetching reversed records from {start_pos} to {end_pos}...")
            xml_response = fetch_csw_records(
                start_pos, sort_order="D", max_records=end_pos - start_pos + 1
            )
            all_records.extend(parse_csw_response(xml_response))
            time.sleep(WAIT_TIME)

    print(f"Saving {len(all_records)} records to CSV...")
    save_to_csv(all_records, OUTPUT_XML)
    print(f"Dataset saved to {OUTPUT_XML}")

Fetching 16221 records in steps of 100...
Fetching records from 1 to 100...
Fetching records from 101 to 200...
Fetching records from 201 to 300...
Fetching records from 301 to 400...
Fetching records from 401 to 500...
Fetching records from 501 to 600...
Fetching records from 601 to 700...
Fetching records from 701 to 800...
Fetching records from 801 to 900...
Fetching records from 901 to 1000...
Fetching records from 1001 to 1100...
Fetching records from 1101 to 1200...
Fetching records from 1201 to 1300...
Fetching records from 1301 to 1400...
Fetching records from 1401 to 1500...
Fetching records from 1501 to 1600...
Fetching records from 1601 to 1700...
Fetching records from 1701 to 1800...
Fetching records from 1801 to 1900...
Fetching records from 1901 to 2000...
Fetching records from 2001 to 2100...
Fetching records from 2101 to 2200...
Fetching records from 2201 to 2300...
Fetching records from 2301 to 2400...
Fetching records from 2401 to 2500...
Fetching records from 2501 to

### Time a single request

In [9]:
import requests
import time
from datetime import datetime

# Configuration
# A much larger batch than the 100 used by the harvesting cell, to time one big request.
BATCH_SIZE = 1000  # Number of records per request
CSW_URL = "https://www.geocat.ch/geonetwork/srv/eng/csw"

# Function to measure request time
def measure_request_time(batch_size):
    """Time one GetRecords request for ``batch_size`` brief records.

    The request starts at position 1. The elapsed wall-clock time (difference
    of two ``datetime.now()`` readings) is printed; nothing is returned.
    An HTTP error status raises via ``raise_for_status``.
    """
    query = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecords",
        "resultType": "results",
        "outputFormat": "application/xml",
        "outputSchema": "http://www.opengis.net/cat/csw/2.0.2",
        "typeNames": "csw:Record",
        "elementSetName": "brief",
    }
    # Paging parameters are added last, keeping the original key order.
    query["maxRecords"] = batch_size
    query["startPosition"] = 1

    started_at = datetime.now()
    reply = requests.get(CSW_URL, params=query)
    reply.raise_for_status()
    elapsed_time = datetime.now() - started_at

    print(f"Request for {batch_size} records took: {elapsed_time}")

if __name__ == "__main__":
    # Time a single request using the batch size configured above.
    measure_request_time(batch_size=BATCH_SIZE)


Request for 1000 records took: 0:04:18.592734


### Get metadata

In [3]:
https://www.geocat.ch/geonetwork/srv/api/records/geoportal-152-69/formatters/xml?approved=true

Fetched metadata for geoportal-689-111 (Time taken: 0:00:00.296855)
Fetched metadata for geoportal-152-69 (Time taken: 0:00:00.288492)
Fetched metadata for afe12880-3dc3-8a66-94be-7cb3aa19529f (Time taken: 0:00:00.333897)
Fetched metadata for a02205bf-c14f-4c0d-9fb4-cfaf700b1525 (Time taken: 0:00:00.289680)
Fetched metadata for 9a5c3b20-8e56-8044-3df4-33b2a92b8316 (Time taken: 0:00:00.227687)
Fetched metadata for geoportal-7-135 (Time taken: 0:00:00.246135)
Fetched metadata for 6f90d5ba-c8aa-45ea-be6c-0e1c1b03dbd0 (Time taken: 0:00:00.317112)
Fetched metadata for geoportal-951-78 (Time taken: 0:00:00.259755)
Fetched metadata for 3d088b3c-4281-4d19-8169-9bbe6069eed0 (Time taken: 0:00:00.256784)
Fetched metadata for geoportal-215-84-7993 (Time taken: 0:00:00.244221)
Saved full metadata for 10 datasets to geocat_full_metadata.xml


In [6]:
import pandas as pd
import xml.etree.ElementTree as ET

def parse_xml(xml_string):
    """Flatten an XML document into a ``{tag: text}`` dictionary.

    Every element is visited in document order. Only the first occurrence of
    each tag is kept; later elements with the same tag are ignored. The value
    is the element text with surrounding whitespace removed, or None when the
    element has no text.

    Args:
        xml_string: The XML document as str or bytes.

    Returns:
        dict: Tag-to-text mapping. An empty dict is returned when the input
        is not str/bytes or is not well-formed XML.
    """
    # An empty CSV cell is read by pandas as NaN (a float); ET.fromstring
    # would raise TypeError for that, which the ParseError handler misses.
    if not isinstance(xml_string, (str, bytes)):
        return {}
    try:
        root = ET.fromstring(xml_string)
    except ET.ParseError:
        return {}

    data = {}
    for elem in root.iter():
        if elem.tag not in data:  # Avoid duplicate keys: first occurrence wins
            data[elem.tag] = elem.text.strip() if elem.text else None
    return data

def process_metadata(csv_input_path, csv_output_path):
    """Turn a CSV of raw XML metadata into a CSV of parsed fields.

    Reads ``csv_input_path``, parses the "Metadata" column of each row with
    ``parse_xml``, keeps the row's "Identifier" alongside the parsed fields,
    and writes the combined table to ``csv_output_path`` without the index.
    """
    source_df = pd.read_csv(csv_input_path, encoding="utf-8")

    def _row_to_record(row):
        # Parsed tags first, then the original identifier (overrides a same-named tag).
        record = parse_xml(row["Metadata"])
        record["Identifier"] = row["Identifier"]
        return record

    records = [_row_to_record(row) for _, row in source_df.iterrows()]

    # One DataFrame built from all records; columns are the union of their keys.
    pd.DataFrame(records).to_csv(csv_output_path, index=False, encoding="utf-8")
    print(f"Parsed data saved to: {csv_output_path}")

if __name__ == "__main__":
    input_csv = "geocat_full_metadata.csv"  # Update with your file path
    output_csv = "parsed_metadata.csv"
    process_metadata(input_csv, output_csv)
    # Read back the file that was just written. Using output_csv (not a
    # repeated literal) keeps this in sync if the output path is changed.
    parsed_metadata = pd.read_csv(output_csv)




Parsed data saved to: parsed_metadata.csv


### Get XML from metadata

In [1]:
import pandas as pd
import requests
import datetime
import xml.etree.ElementTree as ET

def fetch_metadata(identifier):
    """Download the XML metadata of one record from the GeoCat API.

    Returns the response text on success. Any ``requests`` error (including
    an HTTP error status) is printed and None is returned instead.
    """
    record_url = f"https://www.geocat.ch/geonetwork/srv/api/records/{identifier}/formatters/xml?approved=true"
    try:
        reply = requests.get(record_url, timeout=10)
        reply.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching metadata for {identifier}: {e}")
        return None
    else:
        return reply.text



def parse_xml(xml_string):
    """Flatten an XML document into a ``{tag: text}`` dictionary.

    Elements are visited in document order and only the first occurrence of
    each tag is stored. The value is the stripped element text, or None when
    the element has no text. A parse error is printed and yields ``{}``.
    """
    try:
        document = ET.fromstring(xml_string)
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        return {}

    extracted = {}
    for node in document.iter():
        if node.tag in extracted:
            continue  # first occurrence of a tag wins
        extracted[node.tag] = node.text.strip() if node.text else None
    return extracted

def save_to_csv(df, filename="geocat_metadata.csv"):
    """Write ``df`` to ``filename`` as UTF-8 CSV without the index and report it."""
    export_options = {"index": False, "encoding": "utf-8"}
    df.to_csv(filename, **export_options)
    print(f"Saved harvested metadata to {filename}")

if __name__ == "__main__":
    # Upper bound on how many datasets are sampled for harvesting.
    SAMPLE_SIZE = 100

    # Read dataset identifiers from CSV
    datasets_df = pd.read_csv("geocat_datasets.csv")

    # Select random datasets. The sample size is capped at the number of
    # available rows, because DataFrame.sample raises ValueError when asked
    # for more rows than exist (sampling without replacement).
    sampled_datasets = datasets_df.sample(
        n=min(SAMPLE_SIZE, len(datasets_df)), random_state=42
    )

    # One dict per successfully fetched dataset; turned into a DataFrame once
    # at the end instead of building and concatenating one frame per record.
    records = []

    # Fetch and store API responses for selected datasets
    for _, row in sampled_datasets.iterrows():
        identifier = row["Identifier"]
        title = row["Title"]
        timestamp = datetime.datetime.now().isoformat()  # taken just before the request

        # Fetch metadata
        xml_data = fetch_metadata(identifier)
        if not xml_data:
            continue  # Skip if request failed

        # Parse XML and attach the bookkeeping columns
        parsed_data = parse_xml(xml_data)
        parsed_data["Identifier"] = identifier
        parsed_data["Title"] = title
        parsed_data["Request_Timestamp"] = timestamp
        records.append(parsed_data)

    # Merge all records; columns are the union of the keys seen
    if records:
        merged_df = pd.DataFrame(records)
        save_to_csv(merged_df)
    else:
        print("No valid metadata retrieved.")


Error fetching metadata for 61ed4452-ac5b-4913-a3f8-fb5af1c0c157: 404 Client Error: Not Found for url: https://www.geocat.ch/geonetwork/srv/api/records/61ed4452-ac5b-4913-a3f8-fb5af1c0c157/formatters/xml?approved=true


KeyboardInterrupt: 

In [9]:
import pandas as pd

# Load the harvested metadata CSV ("geocat_metadata.csv" is the default output name of the harvesting cell's save_to_csv)
parsed_df = pd.read_csv("geocat_metadata.csv")

def remove_sparse_columns(df, threshold):
    """Drop columns whose share of non-NaN values is below ``threshold``.

    Args:
        df: The DataFrame to filter; it is not modified.
        threshold: Minimum fraction (0.0-1.0) of non-NaN values a column must
            have to be kept. For example, 0.05 keeps every column that is at
            least 5% populated and drops those that are more than 95% NaN.

    Returns:
        pandas.DataFrame: A new frame containing only the retained columns,
        in their original order. A frame without rows is returned unchanged
        (as a copy), since no fraction can be computed for it.
    """
    if len(df) == 0:
        return df.copy()

    # Compare fractions directly instead of converting the threshold to an
    # integer row count: int() truncation turned small products into 0, which
    # let completely empty columns through.
    populated_fraction = df.notna().mean()
    keep = (populated_fraction >= threshold).to_numpy()  # positional mask; safe with duplicate labels
    return df.loc[:, keep]

# Minimum share of non-NaN values a column needs in order to be kept
sparse_threshold = 0.05  # 0.05: keep columns that are at least ~5% non-NaN, i.e. drop the almost-empty ones

# Apply the function
cleaned_df = remove_sparse_columns(parsed_df, sparse_threshold)

# Save cleaned data
cleaned_df.to_csv("geocat_metadata_cleaned.csv", index=False, encoding='utf-8')
print("Saved cleaned metadata to geocat_metadata_cleaned.csv")


Saved cleaned metadata to geocat_metadata_cleaned.csv


### debug


In [7]:
import requests
import pandas as pd
import os

# Configurations
MAX_FILES = 1000  # Upper bound on the number of metadata files to download
XML_SAVE_DIR = "saved_metadata_xml"  # Folder that receives one XML file per record
os.makedirs(XML_SAVE_DIR, exist_ok=True)  # Create the folder up front; no-op if it already exists

def fetch_and_save_metadata(identifier):
    """Download one record's XML metadata and save a whitespace-cleaned copy.

    The response body is decoded as UTF-8, every line is stripped, blank
    lines are dropped, and the result is written to
    ``XML_SAVE_DIR/<identifier>.xml``. Path separators in the identifier are
    replaced by "_" so the file always lands inside ``XML_SAVE_DIR``.

    Failures (request errors, undecodable bodies, file-system errors) are
    printed and swallowed, so one bad record does not abort a batch run.

    Args:
        identifier: Record identifier used in the API URL and the file name.

    Returns:
        None.
    """
    url = f"https://www.geocat.ch/geonetwork/srv/api/records/{identifier}/formatters/xml?approved=true"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Ensure correct encoding (UTF-8 handling)
        xml_content = response.content.decode('utf-8')

        # Normalize whitespace (removes excessive blank lines)
        stripped_lines = (line.strip() for line in xml_content.splitlines())
        xml_content = '\n'.join(line for line in stripped_lines if line)

        # Identifiers come from the remote catalogue; keep path separators
        # out of the file name so the file cannot escape XML_SAVE_DIR.
        safe_name = str(identifier).replace("/", "_").replace("\\", "_")
        xml_path = os.path.join(XML_SAVE_DIR, f"{safe_name}.xml")
        with open(xml_path, "w", encoding="utf-8") as file:
            file.write(xml_content)

        print(f"Saved cleaned XML response to {xml_path}")
    # RequestException derives from OSError, so it must be handled first.
    except requests.exceptions.RequestException as e:
        print(f"Error fetching metadata for {identifier}: {e}")
    except UnicodeDecodeError as e:
        print(f"Error decoding metadata for {identifier}: {e}")
    except OSError as e:
        print(f"Error saving metadata for {identifier}: {e}")

if __name__ == "__main__":
    # Load the identifier list from the CSV file
    dataset_file = "geocat_datasets.csv"
    df_datasets = pd.read_csv(dataset_file)

    # Download records until the row index reaches MAX_FILES
    for row_index, record in df_datasets.iterrows():
        if row_index < MAX_FILES:
            fetch_and_save_metadata(record["Identifier"])
        else:
            break  # Limit reached; stop processing

    print("Metadata retrieval completed.")


Saved cleaned XML response to saved_metadata_xml\efbeacb0-f492-3fe6-6cd3-63127a09bf0b.xml
Saved cleaned XML response to saved_metadata_xml\da4696e6-c546-4a4f-bbf4-36d5d01c9e2f-6571.xml
Saved cleaned XML response to saved_metadata_xml\eef7063e-7c15-47f1-aa10-ce61f81778ad-6571.xml
Saved cleaned XML response to saved_metadata_xml\544e34c5-88b5-4c81-8290-05c3dd2f0a4f-6571.xml
Saved cleaned XML response to saved_metadata_xml\0e57f315-2b70-48c3-9802-4cde2db10c49-6571.xml
Metadata retrieval completed.
