In [1]:
import os
import polars as pl

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time

import re
import unicodedata

In [2]:
# Mapping from the verbose address column headers of the spreadsheet to the
# short column names used in the rest of the notebook.
zkraceni = dict(
    zip(
        (
            "K - adresa knihovny: ulice",
            "K - adresa knihovny: PSČ",
            "K - adresa knihovny: město",
            "K - adresa knihovny: kraj",
        ),
        ("ulice", "psc", "mesto", "kraj"),
    )
)

In [3]:
# Load the library register, keep only the active entries (status "A"),
# shorten the address column names and build the free-text geocoding query.
#
# The query is assembled from null-filled copies of the address parts:
# concatenating with `+` propagates nulls, so a single missing part would
# otherwise turn the whole search_query into null. The original address
# columns themselves are left untouched (nulls stay nulls). The cast to
# string is a no-op for columns that are already strings.
df = pl.read_excel(
    "data/knihovny/Evidence knihoven-11122025.xlsx"
).filter(
    pl.col("aktivní / zrušená (vyřazená z evidence)") == "A"
).rename(
    zkraceni
).with_columns(
    (
        pl.col("ulice").cast(pl.Utf8).fill_null("") + ", " +
        pl.col("mesto").cast(pl.Utf8).fill_null("") + ", " +
        pl.col("psc").cast(pl.Utf8).fill_null("") + ", " +
        pl.col("kraj").cast(pl.Utf8).fill_null("") + ", Czechia"
    ).alias("search_query")
)

Could not determine dtype for column 7, falling back to string
Could not determine dtype for column 9, falling back to string
Could not determine dtype for column 10, falling back to string
Could not determine dtype for column 20, falling back to string
Could not determine dtype for column 24, falling back to string
Could not determine dtype for column 34, falling back to string


In [4]:
# Number of rows before de-duplicating the geocoding queries.
df.height

6040

In [5]:
# Drop rows that share the same geocoding query. keep="first" together with
# maintain_order=True makes both the surviving row and the resulting row
# order deterministic from run to run (the default keeps an arbitrary row
# and does not guarantee any order).
df = df.unique(
    subset=["search_query"],
    keep="first",
    maintain_order=True,
)

In [6]:
# Number of rows left after de-duplicating the geocoding queries.
df.height

6005

In [7]:
# Nominatim (OpenStreetMap) geocoding client. The timeout is set explicitly
# because geopy's default of 1 second is easily exceeded by the public
# service; a timed-out lookup would otherwise end up recorded as "not found".
geolocator = Nominatim(
    user_agent="cz_address_geocoder_project_v1",
    timeout=10,
)

In [8]:
# Throttled wrapper around the geocoder: consecutive calls are spaced at
# least 3 seconds apart.
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=3,
)

In [9]:
def sanitize_filename(filename, max_length=255):
    """Turn an arbitrary string into a portable ASCII file name.

    Steps, in order: decompose accented characters (NFKD) and drop everything
    that is not ASCII, remove the characters Windows forbids in file names,
    remove control characters, strip leading/trailing dots and spaces, and
    replace the remaining spaces with underscores. An empty result becomes
    'unnamed'.

    Args:
        filename: The text to convert.
        max_length: Maximum length of the returned name.

    Returns:
        The sanitized name, at most ``max_length`` characters long. When the
        name has to be shortened, the text after the last dot is kept as an
        extension if it leaves room for at least one character of the name;
        otherwise the name is simply cut at ``max_length``.
    """
    # Decompose accents, then drop whatever cannot be represented in ASCII.
    ascii_only = (
        unicodedata.normalize('NFKD', filename)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )

    # Characters Windows rejects in file names (the most restrictive platform).
    cleaned = re.sub(r'[<>:"/\\|?*]', '', ascii_only)

    # Control characters.
    cleaned = re.sub(r'[\x00-\x1f\x7f]', '', cleaned)

    # Leading/trailing dots and spaces go first, then inner spaces become underscores.
    cleaned = cleaned.strip('. ').replace(' ', '_')

    if not cleaned:
        cleaned = 'unnamed'

    if len(cleaned) <= max_length:
        return cleaned

    name, dot, ext = cleaned.rpartition('.')
    # Keep the extension only when it fits. If the text after the last dot is
    # itself max_length or longer, the slice index below would be zero or
    # negative and the result would overflow max_length, so fall back to a
    # plain cut instead.
    if dot and ext and len(ext) + 1 < max_length:
        return f"{name[:max_length - len(ext) - 1]}.{ext}"
    return cleaned[:max_length]

In [10]:
def get_location_with_fallback(row):
    """Geocode one address, retrying with progressively coarser queries.

    The queries are tried in this order and the first hit wins:
      1. street, city, postal code, region  -> precision 'exact'
      2. city, postal code, region          -> precision 'city_zip_fallback'
         (helps when the street has a typo or the house number is wrong)
      3. city, region                       -> precision 'city_only_fallback'
         (last resort when the postal code is wrong as well)

    Args:
        row: Mapping with the keys 'ulice', 'mesto', 'psc' and 'kraj'.
            Falsy values such as None are treated as empty strings.

    Returns:
        dict with the keys 'lat', 'lon' and 'precision'. When no query
        produces a result, 'lat' and 'lon' are None and 'precision' is
        'not_found'.
    """
    # Guard against missing values so the f-strings never contain "None".
    ulice = row['ulice'] or ""
    mesto = row['mesto'] or ""
    psc = row['psc'] or ""
    kraj = row['kraj'] or ""

    attempts = (
        (f"{ulice}, {mesto}, {psc}, {kraj}, Czechia", "exact"),
        (f"{mesto}, {psc}, {kraj}, Czechia", "city_zip_fallback"),
        (f"{mesto}, {kraj}, Czechia", "city_only_fallback"),
    )

    for query, precision in attempts:
        try:
            loc = geocode(query, country_codes="cz")
        except Exception:
            # Best effort: a failed request just moves on to the next,
            # coarser query. Only Exception is caught, so KeyboardInterrupt
            # and SystemExit still stop a long-running batch.
            continue
        if loc:
            return {"lat": loc.latitude, "lon": loc.longitude, "precision": precision}

    # Every attempt failed or returned nothing.
    return {"lat": None, "lon": None, "precision": "not_found"}

In [11]:
# Struct dtype of the dict returned by get_location_with_fallback:
# latitude, longitude and a label naming the query that produced the hit.
output_schema = pl.Struct(
    [
        pl.Field("lat", pl.Float64),
        pl.Field("lon", pl.Float64),
        pl.Field("precision", pl.Utf8),
    ]
)

In [12]:
# Directory that receives one parquet file per geocoded address.
cesta = "data/knihovny/adresy"

In [13]:
# Create the output directory together with any missing parents; nothing
# happens when it already exists.
os.makedirs(name=cesta, exist_ok=True)

In [14]:
# Names (without the ".parquet" extension) of the addresses that already have
# a result file. os.listdir returns names that include the extension, while
# the download loop looks up the extension-less sanitized name, so the
# extension is stripped here rather than appended a second time.
stazene = {
    os.path.splitext(nazev)[0]
    for nazev in os.listdir(cesta)
    if nazev.endswith('.parquet')
}

In [None]:
# Geocode every address that has no result file yet and store each result in
# its own parquet file, so an interrupted run can simply be restarted.
for radek in df.iter_rows(named=True):

    dotaz = radek['search_query']
    if dotaz is None:
        # A null query cannot be turned into a file name. Rebuild it with
        # empty strings for the missing parts, using the same layout as the
        # full-address query in get_location_with_fallback.
        casti = [radek[klic] or "" for klic in ("ulice", "mesto", "psc", "kraj")]
        dotaz = ", ".join(str(cast) for cast in casti) + ", Czechia"

    # Leave room for the ".parquet" suffix inside the 255-character limit
    # that sanitize_filename applies by default.
    soubor = sanitize_filename(dotaz, max_length=255 - len(".parquet"))
    cil = f"{cesta}/{soubor}.parquet"

    # `stazene` is only a snapshot taken before the loop started; the check
    # on disk is the authoritative one.
    if soubor in stazene or os.path.exists(cil):
        continue

    try:
        (
            # Reuse the dtypes of `df` so that every result file has the same
            # schema, even when some values of this row are null.
            pl.DataFrame([radek], schema=df.schema)
            .with_columns(
                pl.struct(["ulice", "mesto", "psc", "kraj"])
                .map_elements(get_location_with_fallback, return_dtype=output_schema)
                .alias("geo_data")
            )
            .with_columns(
                pl.col('geo_data').struct.field("lat").alias("lat"),
                pl.col('geo_data').struct.field("lon").alias("lon"),
                pl.col('geo_data').struct.field("precision").alias("geo_precision")
            )
            .write_parquet(cil)
        )
    except Exception as e:
        # Report the failure and carry on with the next address.
        print(f'{e}: {soubor}')