In [1]:
import psycopg2
import json
import re
import pandas as pd
from IPython.display import display

# --- Load DB config ---
config_file = r"C:\FHNW_lokal\6000\4M\01_ETL\21_load\db_config.json"

def load_config(config_path=config_file):
    with open(config_path) as f:
        return json.load(f)

# --- SQL: get random text samples (limit 10) ---
query = """
SELECT
    ds.dataset_identifier,
    ds.dataset_title_DE, 
    ds.dataset_description_DE, 
    ds.dataset_publisher_name,
    dist.distribution_title_DE,
    dist.distribution_description_DE
FROM merged_dataset_metadata ds
LEFT JOIN merged_distribution_metadata dist 
    ON ds.dataset_identifier = dist.dataset_identifier
WHERE ds.dataset_title_DE IS NOT NULL
"""

# --- Load label data (for regex) ---
with open("03_Label/gemeinden_labels_v2.json", "r", encoding="utf-8") as f:
    categorized_labels = json.load(f)

label_lookup = {entry["label"]: entry for entry in categorized_labels}
sorted_labels = sorted(label_lookup.keys(), key=len, reverse=True)
pattern = r'(?<!\w)(' + '|'.join(re.escape(label) for label in sorted_labels) + r')(?!\w)'
label_regex = re.compile(pattern)

# --- Helper: find matches with metadata ---
def find_matches_with_meta(text):
    matches = []
    for match in label_regex.finditer(text):
        label_text = match.group()
        metadata = label_lookup.get(label_text)
        if metadata:
            matches.append({
                "text": label_text,
                "label_id": metadata.get("label_id"),
                "level": metadata.get("level"),
                "canton": metadata.get("canton"),
                "district": metadata.get("district")
            })
    return matches

# --- Connect to DB and fetch random samples ---
config = load_config()

try:
    conn = psycopg2.connect(
        database="4M",
        user=config["user"],
        password=config["password"],
        host=config["host"],
        port=config["port"]
    )
    cur = conn.cursor()
    cur.execute(query)
    rows = cur.fetchall()
    cur.close()
    conn.close()

    data = []
    for row in rows:
        dataset_identifier, title, description, publisher, dist_title, dist_description = row

        data.append({
            "dataset_identifier": dataset_identifier,
            "dataset_title_DE": title,
            "dataset_description_DE": description,
            "dataset_publisher_name": publisher,
            "distribution_title_DE": dist_title,
            "distribution_description_DE": dist_description,
            "matches_title": find_matches_with_meta(title or ""),
            "matches_description": find_matches_with_meta(description or ""),
            "matches_publisher": find_matches_with_meta(publisher or ""),
            "matches_distribution_title": find_matches_with_meta(dist_title or ""),
            "matches_distribution_description": find_matches_with_meta(dist_description or "")
        })

    df = pd.DataFrame(data)
    # display(df)

except Exception as e:
    print("❌ Error:", e)


In [2]:
def extract_location_info(text, source_text_id=None):
    matches = find_matches_with_meta(text or "")

    # 1. Only one match → use label_id
    if len(matches) == 1:
        return {"label_id": matches[0].get("label_id")}

    if len(matches) > 1:
        label_ids = {m.get("label_id") for m in matches}
        if len(label_ids) == 1:
            return {"label_id": matches[0].get("label_id")}

        # 2. All matches must have the same non-null district
        districts = [m.get("district") for m in matches]
        unique_districts = set(districts)
        if None not in districts and len(unique_districts) == 1:
            return {
                "label_id": next(
                    m.get("label_id") for m in matches
                    if m.get("district") == list(unique_districts)[0]
                )
            }

        # 3. All in the same canton → return canton's label_id (level 1)
        cantons = {m.get("canton") for m in matches if m.get("canton")}
        if len(cantons) == 1:
            canton_value = list(cantons)[0]
            for m in matches:
                if m.get("level") == 1 and m.get("canton") == canton_value:
                    return {"label_id": m.get("label_id")}

        # 4. Fallback: lowest level — only if unambiguous
        levels = [m.get("level", float("inf")) for m in matches]
        min_level = min(levels)
        lowest_level_matches = [m for m in matches if m.get("level", float("inf")) == min_level]

        if len(lowest_level_matches) == 1:
            return {"label_id": lowest_level_matches[0].get("label_id")}

        # 5. If all label_ids contain "CH", assign default fallback
        if all("CH" in (m.get("label_id") or "") for m in matches):
            return {"label_id": "CH0000000000"}

        # 6. Fallbacks for neighboring countries of Switzerland
        neighbor_fallbacks = {
            "AT": "AT0000000000",  # Österreich
            "DE": "DE0000000000",  # Deutschland
            "FR": "FR0000000000",  # Frankreich
            "IT": "IT0000000000",  # Italien
            "LI": "LI0000000000",  # Liechtenstein
        }

        # Check each neighboring country code
        for country_code, fallback_id in neighbor_fallbacks.items():
            if all(country_code in (m.get("label_id") or "") for m in matches):
                return {"label_id": fallback_id}


    return {"label_id": None}


In [3]:
fallback_label_id = "no_location_found"  # 👈 set your desired default

def get_priority_location_data(row):
    fields = [
        "dataset_title_DE",
        "dataset_description_DE",
        "dataset_publisher_name",
        "distribution_title_DE",
        "distribution_description_DE"
    ]

    for field in fields:
        result = extract_location_info(row.get(field, ""), source_text_id=row.name)
        if result.get("label_id") is not None:
            return {
                "label_id": result["label_id"],
                "source_field": field
            }

    # If no confident match is found → use fallback label_id
    return {
        "label_id": fallback_label_id,
        "source_field": "fallback"
    }


In [4]:
def get_priority_location_data(row):
    fields = [
        "dataset_title_DE",
        "dataset_description_DE",
        "dataset_publisher_name",
        "distribution_title_DE",
        "distribution_description_DE"
    ]

    for field in fields:
        text = row.get(field, "")
        matches = find_matches_with_meta(text or "")
        result = extract_location_info(text, source_text_id=row.name)
        if result["label_id"]:
            # Add match info directly
            return {
                "label_id": result["label_id"],
                "match_field": field,
                "match_level": next((m["level"] for m in matches if m["label_id"] == result["label_id"]), None)
            }

    return {"label_id": None, "match_field": None, "match_level": None}


In [8]:
from tqdm.notebook import tqdm
tqdm.pandas()  # Enable tqdm for Pandas

# Apply with a progress bar
df[["label_id", "match_field", "match_level"]] = df.progress_apply(
    get_priority_location_data, axis=1, result_type="expand"
)

display(df)


  0%|          | 0/90888 [00:00<?, ?it/s]

Unnamed: 0,dataset_identifier,dataset_title_DE,dataset_description_DE,dataset_publisher_name,distribution_title_DE,distribution_description_DE,matches_title,matches_description,matches_publisher,matches_distribution_title,matches_distribution_description,label_id,match_field,match_level
0,100081@kanton-basel-stadt,"Smart Climate Feinstaubmessungen, csv, fgb, ge...",Im Rahmen des Projektes [«Smart Climate» von S...,,geojson,Smart Climate Feinstaubmessungen (geojson),[],"[{'text': 'Basel', 'label_id': 'CH12120002701'...",[],[],[],CH0000000000,dataset_description_DE,
1,election-landratswahlen-2007-region-2-5-muench...,"Landratswahlen 2007 Wahlkreis Münchenstein, la...","Schlussresultate der regionalen Wahl ""Landrats...",,landratswahlen-2007-wahlkreis-muenchenstein-pa...,,"[{'text': 'Münchenstein', 'label_id': 'CH13130...","[{'text': 'Münchenstein', 'label_id': 'CH13130...",[],[],[],CH13130102769,dataset_title_DE,3.0
2,vote-volksinitiative-vom-21-september-2007-fue...,"Volksinitiative vom 21. September 2007 ""für ei...",Schlussresultate der eidgenössischen Abstimmun...,,volksinitiative-vom-21-september-2007-fuer-ein...,,[],"[{'text': 'eidgenössischen', 'label_id': 'CH00...",[],[],[],CH1300000000,dataset_description_DE,1.0
3,693@statistisches-amt-kanton-zuerich,"Archiv Wahlresultate Nationalratswahlen, Parte...",Ergebnisse der Erneuerungswahlen der zürcheris...,,Listenresultate Erneuerungswahl Nationalrat 2007,Anzahl erhaltener Stimmen pro Liste und Gebiet,[],"[{'text': 'Kanton Zürich', 'label_id': 'CH0100...",[],[],[],CH0100000000,dataset_description_DE,1.0
4,dd2c9dc4-b27f-4b40-9640-f036a620257b@stadt-zurich,Fahrzeiten 2018 der VBZ im SOLL-IST-Vergleich ...,Diese Daten ermöglichen einen sehr genauen Ver...,,Fahrzeiten_SOLL_IST_20180902_20180908.csv,,[],"[{'text': 'Zürich', 'label_id': 'CH0100000000'...",[],[],[],CH0100000000,dataset_description_DE,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90883,geoportal-294-68,Rapperswil-Jona - Kleinstrukturen Gde,"Kleinstrukturen umfassen Einzelbäume, Hecken, ...",Rapperswil-Jona,,,"[{'text': 'Rapperswil-Jona', 'label_id': 'CH17...",[],"[{'text': 'Rapperswil-Jona', 'label_id': 'CH17...",[],[],CH17172603340,dataset_title_DE,3.0
90884,geoportal-229-57,Wattwil - Parkplatzbewirtschaftung,Diese Karte zeigt die Verkehrszonen und Parkpl...,Wattwil,,,"[{'text': 'Wattwil', 'label_id': 'CH1717270337...",[],"[{'text': 'Wattwil', 'label_id': 'CH1717270337...",[],[],CH17172703379,dataset_title_DE,3.0
90885,geoportal-118-65,Altstätten - Werkplan Abwasser,Diese Karte zeigt die Werkleitungen der Abwass...,Altstätten,,,"[{'text': 'Altstätten', 'label_id': 'CH1717230...",[],"[{'text': 'Altstätten', 'label_id': 'CH1717230...",[],[],CH17172303251,dataset_title_DE,3.0
90886,geoportal-218-36,Eggersriet - GEP Abwassersanierungsplan,Diese Karte stellt den Abwassersanierungsplan ...,Ingenieurbüro Rüttimann AG,,,"[{'text': 'Eggersriet', 'label_id': 'CH1717210...",[],"[{'text': 'AG', 'label_id': 'CH1900000000', 'l...",[],[],CH17172103212,dataset_title_DE,3.0


In [10]:
import psycopg2
import pandas as pd
import json

# Load the label map
with open("03_Label/gemeinden_label_map_v2.json", "r", encoding="utf-8") as f:
    label_map_entries = json.load(f)

# Create lookup: label_id → label
label_id_to_label = {entry["label_id"]: entry["label"] for entry in label_map_entries}

# Add new column to df
df["dataset_location"] = df["label_id"].map(label_id_to_label)

# --- Load config ---
config = load_config()

# --- Update the DB with dataset_location ---
try:
    conn = psycopg2.connect(
        database="4M",
        user=config["user"],
        password=config["password"],
        host=config["host"],
        port=config["port"]
    )
    cur = conn.cursor()

    for _, row in df.iterrows():
        location = row["dataset_location"]
        identifier = row["dataset_identifier"]

        if pd.notna(location) and pd.notna(identifier):
            cur.execute("""
                UPDATE merged_dataset_metadata
                SET dataset_location = %s
                WHERE dataset_identifier = %s
            """, (location, identifier))

    conn.commit()
    cur.close()
    conn.close()
    print("✅ dataset_location successfully written to merged_dataset_metadata.")

except Exception as e:
    print("❌ Database update failed:", e)


✅ dataset_location successfully written to merged_dataset_metadata.


In [13]:
import psycopg2
import pandas as pd
import json

# --- Load label map ---
with open("03_Label/gemeinden_label_map_v2.json", "r", encoding="utf-8") as f:
    label_map_entries = json.load(f)

# --- Create lookups ---
label_id_to_label = {entry["label_id"]: entry["label"] for entry in label_map_entries}
label_id_to_district = {entry["label_id"]: entry.get("district") for entry in label_map_entries}
label_id_to_canton = {entry["label_id"]: entry.get("canton") for entry in label_map_entries}

# --- Prepare DataFrame with new columns ---
df["dataset_location"] = df["label_id"].map(label_id_to_label)
df["dataset_location_district"] = df["label_id"].map(label_id_to_district)
df["dataset_location_canton"] = df["label_id"].map(label_id_to_canton)
df["dataset_location_country"] = df["label_id"].str.slice(0, 2)  # First 2 characters for country

# --- Replace missing values with 'not_found' ---
df["dataset_location"] = df["dataset_location"].fillna("not_found")
df["dataset_location_district"] = df["dataset_location_district"].fillna("not_found")
df["dataset_location_canton"] = df["dataset_location_canton"].fillna("not_found")
df["dataset_location_country"] = df["dataset_location_country"].fillna("not_found")

# --- Load config ---
config = load_config()

# --- Update DB with all location attributes ---
try:
    conn = psycopg2.connect(
        database="4M",
        user=config["user"],
        password=config["password"],
        host=config["host"],
        port=config["port"]
    )
    cur = conn.cursor()

    for _, row in df.iterrows():
        location = row["dataset_location"]
        district = row["dataset_location_district"]
        canton = row["dataset_location_canton"]
        country = row["dataset_location_country"]
        identifier = row["dataset_identifier"]

        if pd.notna(identifier):
            cur.execute("""
                UPDATE merged_dataset_metadata
                SET
                    dataset_location = %s,
                    dataset_location_district = %s,
                    dataset_location_canton = %s,
                    dataset_location_country = %s
                WHERE dataset_identifier = %s
            """, (location, district, canton, country, identifier))

    conn.commit()
    cur.close()
    conn.close()
    print(f"✅ Successfully updated {len(df)} records in merged_dataset_metadata.")

except Exception as e:
    print("❌ Database update failed:", e)


✅ Successfully updated 90888 records in merged_dataset_metadata.


In [9]:
print("finished")

finished
