# Geodatenmodelle von der BAFU-Webseite extrahieren
Das BAFU publiziert auf folgender Webseite minimale Geodatenmodelle mit Dokumentation in PDF-Files.
https://www.bafu.admin.ch/bafu/de/home/zustand/daten/geodatenmodelle.html

Mit folgendem Skript werden die Metadaten (URL, Beschreibung, Titel, Datum) der publizierten ZIP-Files extrahiert. Die Metadaten werden anschliessend in den BAFU Datenkatalog integriert.

In [13]:
import datetime
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [23]:
# All BAFU web pages that list MGDM ZIP files, mapped to the topic keyword
# that is attached to every file found on that page.
# Every page URL shares the same prefix; only the suffix differs per topic.
BafuSeiten = {
    "https://www.bafu.admin.ch/bafu/de/home/zustand/daten/geodatenmodelle" + page_suffix: topic
    for page_suffix, topic in (
        (".html", "BAFU"),
        ("/abfall--geodatenmodelle.html", "Abfall"),
        ("/altlasten--geodatenmodelle.html", "Altlasten"),
        ("/biodiversitaet--geodatenmodelle.html", "Biodiversität"),
        ("/biotechnologie--geodatenmodelle.html", "Biotechnologie"),
        ("/boden--geodatenmodelle.html", "Boden"),
        ("/chemikalien--geodatenmodelle.html", "Chemikalien"),
        ("/landschaft--geodatenmodelle.html", "Landschaft"),
        ("/laerm--geodatenmodelle.html", "Lärm"),
        ("/luft--geodatenmodelle.html", "Luft"),
        ("/naturgefahren--geodatenmodelle.html", "Naturgefahren"),
        ("/stoerfallvorsorge--geodatenmodelle.html", "Störfallvorsorge"),
        ("/wald--geodatenmodelle.html", "Wald"),
        ("/wasser--geodatenmodelle.html", "Wasser"),
    )
}

In [24]:
# Function that extracts a date from a string
def extract_date_from_text(text):
    """
    Find the first parseable date in ``text`` and return it as 'YYYY-MM-DD'.

    The supported notations are tried in this order of priority:
    DD.MM.YYYY, YYYY-MM-DD, D.M.YYYY and D.M.YY. Within one notation the
    candidates are tried from left to right, so a syntactically matching but
    invalid date (e.g. 31.02.2021) does not hide a valid one later in the text.

    Args:
        text (str): Text that may contain a date. ``None`` and other
            non-string values are treated as "no date".

    Returns:
        str | None: The date formatted as 'YYYY-MM-DD', or ``None`` if no
        valid date was found.
    """
    if not isinstance(text, str) or not text:
        return None

    # Each regex is paired with the strptime format that can parse its match.
    # The digit look-arounds stop a pattern from matching inside a longer
    # number, e.g. reading "29.02.20" out of "29.02.2023".
    date_patterns = (
        (r'(?<!\d)\d{2}\.\d{2}\.\d{4}(?!\d)', '%d.%m.%Y'),      # DD.MM.YYYY
        (r'(?<!\d)\d{4}-\d{2}-\d{2}(?!\d)', '%Y-%m-%d'),        # YYYY-MM-DD
        (r'(?<!\d)\d{1,2}\.\d{1,2}\.\d{4}(?!\d)', '%d.%m.%Y'),  # D.M.YYYY, DD.M.YYYY, D.MM.YYYY
        # Two-digit years need %y; %Y only accepts four digits
        (r'(?<!\d)\d{1,2}\.\d{1,2}\.\d{2}(?!\d)', '%d.%m.%y'),  # D.M.YY etc.
    )
    for pattern, date_format in date_patterns:
        for match in re.finditer(pattern, text):
            try:
                date_object = datetime.datetime.strptime(match.group(0), date_format)
            except ValueError:
                # Matches the pattern but is not a real calendar date
                continue
            return date_object.strftime('%Y-%m-%d')  # Standardize to YYYY-MM-DD
    return None  # No date found or none of the candidates could be parsed

In [22]:
# Function that fetches the metadata of all MGDM ZIP files linked on a BAFU web page

def _find_link_description(link):
    """Return descriptive text found next to ``link``, or 'N/A' if there is none."""
    # Text node directly in front of the link; whitespace-only nodes do not count
    prev_sibling = link.previous_sibling
    if isinstance(prev_sibling, str) and prev_sibling.strip():
        return prev_sibling.strip()

    if link.parent:
        # Text of the enclosing element without the link's own text
        parent_text = link.parent.get_text().replace(link.get_text(), '').strip()
        if parent_text:
            return parent_text

    # .previous_siblings / .next_siblings also yield text nodes, whereas
    # find_previous_siblings() / find_next_siblings() without arguments only
    # return tags and therefore can never contain a plain string.
    for siblings in (link.previous_siblings, link.next_siblings):
        for sibling in siblings:
            if isinstance(sibling, str) and sibling.strip():
                return sibling.strip()

    return 'N/A'


def extract_zip_metadata(url, keyword, timeout=30):
    """
    Extracts metadata of ZIP files linked on a given URL using BeautifulSoup.

    Args:
        url (str): The URL to scrape. Relative links on the page are resolved
            against this URL.
        keyword (str): A keyword associated with the URL (copied into the
            'keyword' column of every row).
        timeout (float): Seconds to wait for the server before the request is
            treated as failed.

    Returns:
        pandas.DataFrame: One row per ZIP link with the columns url, filename,
        description, link_text, extracted_date, keyword and link_text_cleaned
        (link_text with any parenthesised parts removed). If the page contains
        no ZIP links or the request fails, the DataFrame has these columns but
        no rows; a failed request additionally prints an error message.
    """
    columns = ['url', 'filename', 'description', 'link_text',
               'extracted_date', 'keyword', 'link_text_cleaned']

    try:
        # Without a timeout an unresponsive server would block the notebook forever
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return pd.DataFrame(columns=columns)  # Empty result on error

    soup = BeautifulSoup(response.content, 'html.parser')

    zip_data = []
    for link in soup.find_all('a', href=True):
        href = link['href'].strip()
        # Only links that point to a ZIP file are of interest (case-insensitive)
        if not href.lower().endswith('.zip'):
            continue

        # Resolves '/abs', './rel', '../rel', 'rel' and '//host/...' hrefs
        # against the page URL; absolute hrefs are returned unchanged.
        zip_url = urljoin(url, href)

        # Last path segment of the URL; case-insensitive like the check above
        filename_match = re.search(r'/([^/]+\.zip)$', zip_url, re.IGNORECASE)
        filename = filename_match.group(1) if filename_match else 'N/A'

        # Text within the <a> tag; the date is extracted from this text
        link_text = link.get_text().strip()

        zip_data.append({'url': zip_url,
                         'filename': filename,
                         'description': _find_link_description(link),
                         'link_text': link_text,
                         'extracted_date': extract_date_from_text(link_text),
                         'keyword': keyword,
                         # link_text without content in parentheses
                         'link_text_cleaned': re.sub(r'\([^)]*\)', '', link_text).strip()})

    # Passing the column list keeps the schema stable even when no link was found;
    # building the cleaned column from an empty frame used to raise a KeyError.
    return pd.DataFrame(zip_data, columns=columns)

In [44]:
# Visit every page in BafuSeiten and collect the metadata of all ZIP files

# One DataFrame per page; pages that returned no rows are left out
all_zip_metadata = [
    page_frame
    for page_frame in (
        extract_zip_metadata(page_url, topic) for page_url, topic in BafuSeiten.items()
    )
    if not page_frame.empty
]

# Merge the per-page results into a single DataFrame
if not all_zip_metadata:
    print("\nNo ZIP files found on any of the specified BAFU pages.")
    final_df = pd.DataFrame()  # Nothing was collected, so fall back to an empty frame
else:
    final_df = pd.concat(all_zip_metadata, ignore_index=True)
    print("\nCombined DataFrame of ZIP metadata:")
    print(f"\nTotal number of ZIP files found: {len(final_df)}")




Combined DataFrame of ZIP metadata:

Total number of ZIP files found: 82


In [58]:
# Replace line breaks in the description with ", " and add the catalogue fields
final_df["description"] = final_df["description"].str.replace('\n', ', ', regex=False)
final_df["title"] = "Geodatenmodell " + final_df["link_text_cleaned"]
final_df["kontakt"] = "gis@bafu.admin.ch"
final_df["Typ"] = "Geodatenmodell"

# Pick the catalogue columns in their final order, then rename the three
# columns whose catalogue name differs from the scraped one
Geodatenmodelle_df = (
    final_df[['keyword', 'title', 'description', 'extracted_date', 'Typ', 'kontakt', 'url']]
    .rename(columns={'extracted_date': 'modified',
                     'kontakt': 'Kontakt',
                     'url': 'URL'})
)

In [65]:
# Export Geodatenmodelle_df as a CSV file with ";" as separator and without the index

Geodatenmodelle_df.to_csv(
    path_or_buf='Geodatenmodelle.csv',
    index=False,
    sep=';',
)