In [1]:
import os
import osmium
import pyarrow as pa
import pyarrow.parquet as pq
from shapely import from_wkb
from collections import defaultdict
from tqdm.auto import tqdm
import pandas as pd
import reverse_geocoder as rg

# ================= CONFIGURATION =================

# POI_MAPPING: (OSM tag key, OSM tag value) -> (poi_class, poi_group)
# An element is treated as a POI when any of its tags equals one of the (key, value)
# pairs below; all of the element's tags are kept in the output regardless.
# poi_class / poi_group are our own labels, used to bucket similar tags together.
POI_MAPPING = {
    # # --- ACCOMMODATION ---
    # ("tourism","hotel"): ("hotel","Accommodation"),
    # ("tourism","hostel"): ("hostel","Accommodation"),
    # ("tourism","guest_house"): ("guest_house","Accommodation"),
    # ("tourism","bed_and_breakfast"): ("b_and_b","Accommodation"),
    # ("tourism","apartment"): ("apartment","Accommodation"),
    # ("tourism","chalet"): ("chalet","Accommodation"),
    # ("tourism","camp_site"): ("camp_site","Accommodation"),
    # ("tourism","caravan_site"): ("caravan_site","Accommodation"),
    # ("tourism","alpine_hut"): ("alpine_hut","Accommodation"),

    # --- FOOD ---
    ("amenity","restaurant"): ("restaurant","Food"),
    ("amenity","fast_food"): ("fast_food","Food"),
    ("amenity","cafe"): ("cafe","Food"),
    ("amenity","food_court"): ("food_court","Food"),
    ("amenity","ice_cream"): ("ice_cream","Food"),
    ("amenity","biergarten"): ("biergarten","Food"),

    # --- NIGHTLIFE ---
    ("amenity","bar"): ("bar","Nightlife"),
    ("amenity","pub"): ("pub","Nightlife"),
    ("amenity","nightclub"): ("nightclub","Nightlife"),
    ("amenity","casino"): ("casino","Nightlife"),

    # --- CULTURE ---
    ("tourism","museum"): ("museum","Culture"),
    ("amenity","arts_centre"): ("arts_centre","Culture"),
    ("amenity","theatre"): ("theatre","Culture"),
    ("amenity","cinema"): ("cinema","Culture"),
    ("tourism","gallery"): ("gallery","Culture"),
    ("amenity","library"): ("library","Culture"),

    # --- SIGHTSEEING ---
    ("tourism","attraction"): ("attraction","Sightseeing"),
    ("tourism","viewpoint"): ("viewpoint","Sightseeing"),
    ("tourism","artwork"): ("artwork","Sightseeing"),
    ("historic","monument"): ("monument","Sightseeing"),
    ("historic","memorial"): ("memorial","Sightseeing"),
    ("historic","castle"): ("castle","Sightseeing"),
    ("historic","ruins"): ("ruins","Sightseeing"),
    ("historic","archaeological_site"): ("archaeological","Sightseeing"),
    ("historic","fort"): ("fort","Sightseeing"),
    ("amenity","place_of_worship"): ("place_of_worship","Sightseeing"),

    # --- NATURE ---
    ("natural","beach"): ("beach","Nature"),
    ("natural","peak"): ("peak","Nature"),
    ("natural","volcano"): ("volcano","Nature"),
    ("natural","cave_entrance"): ("cave_entrance","Nature"),
    ("natural","glacier"): ("glacier","Nature"),
    ("leisure","park"): ("park","Nature"),
    ("leisure","garden"): ("garden","Nature"),
    ("leisure","nature_reserve"): ("nature_reserve","Nature"),

    # --- FAMILY ---
    ("leisure","playground"): ("playground","Family"),
    ("tourism","zoo"): ("zoo","Family"),
    ("leisure","water_park"): ("water_park","Family"),
    ("leisure","swimming_pool"): ("swimming_pool","Family"),

    # --- LEISURE ---
    ("leisure","stadium"): ("stadium","Leisure"),
    ("leisure","sports_centre"): ("sports_centre","Leisure"),
    ("leisure","marina"): ("marina","Leisure"),
    ("tourism","picnic_site"): ("picnic_site","Leisure"),

    # --- SUPPLIES ---
    ("shop","supermarket"): ("supermarket","Supplies"),
    ("shop","convenience"): ("convenience","Supplies"),
    ("shop","bakery"): ("bakery","Supplies"),
    ("shop","greengrocer"): ("greengrocer","Supplies"),
    ("shop","general"): ("general_store","Supplies"),

    # --- SHOPPING ---
    ("shop","mall"): ("mall","Shopping"),
    ("shop","department_store"): ("department_store","Shopping"),
    ("shop","clothes"): ("clothes","Shopping"),
    ("shop","gift"): ("gift_shop","Shopping"),
    ("shop","books"): ("bookshop","Shopping"),

    # --- SERVICES ---
    ("tourism","information"): ("tourist_info","Services"),
    ("amenity","toilets"): ("toilets","Services"),
    ("amenity","bank"): ("bank","Services"),
    ("amenity","atm"): ("atm","Services"),
    ("amenity","post_office"): ("post_office","Services"),
    ("shop","laundry"): ("laundry","Services"),
    ("amenity","car_rental"): ("car_rental","Services"),
    ("amenity","bicycle_rental"): ("bicycle_rental","Services"),
    # Travel agencies: shop=travel_agency is the tag documented on the OSM wiki;
    # the amenity=travel_agent spelling is kept so existing matches are not lost.
    ("shop","travel_agency"): ("travel_agent","Services"),
    ("amenity","travel_agent"): ("travel_agent","Services"),

    # --- HEALTH & SAFETY ---
    ("amenity","pharmacy"): ("pharmacy","Health"),
    ("amenity","hospital"): ("hospital","Health"),
    ("amenity","clinic"): ("clinic","Health"),
    ("amenity","doctors"): ("doctors","Health"),
    ("amenity","police"): ("police","Health"),

    # --- TRANSPORT ---
    ("railway","station"): ("train_station","Transport"),
    # ("railway","tram_stop"): ("tram_stop","Transport"),
    ("railway","subway_entrance"): ("subway_entrance","Transport"),
    # ("highway","bus_stop"): ("bus_stop","Transport"),
    ("amenity","bus_station"): ("bus_station","Transport"),
    ("amenity","taxi"): ("taxi","Transport"),
    ("amenity","ferry_terminal"): ("ferry_terminal","Transport"),
    # Airports: OSM maps these as aeroway=aerodrome; amenity=airport is not a
    # documented tag, but is kept so nothing that matched before is dropped.
    # NOTE(review): aeroway=aerodrome also covers small airfields — confirm that is wanted.
    ("aeroway","aerodrome"): ("airport","Transport"),
    ("amenity","airport"): ("airport","Transport"),
}

# Rank of each POI group, used to pick one class when an element matches several
# mapping entries: the group listed first gets rank 1 and wins over later ones.
GROUP_PRIORITY = {
    group: rank
    for rank, group in enumerate(
        (
            "Accommodation",
            "Sightseeing",
            "Culture",
            "Family",
            "Nightlife",
            "Food",
            "Nature",
            "Transport",
            "Leisure",
            "Shopping",
            "Supplies",
            "Services",
            "Health",
        ),
        start=1,
    )
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class LimitReached(Exception):
    """Raised to abort a scan once the requested maximum number of POIs is collected."""

class POIHandler(osmium.SimpleHandler):
    """Stream points of interest from an OSM file into a Parquet file.

    Nodes and areas carrying at least one tag listed in POI_MAPPING are recorded
    with their centroid, the class/group of the best-ranked matching tag (lowest
    GROUP_PRIORITY value) and their complete tag set. Rows are buffered in memory
    and appended to the output file in batches.

    Args:
        output_path: Destination Parquet file; it is created on the first flush.
        total_elements: Expected number of scanned elements. Only used as the
            progress-bar total; may be None.
        batch_size: Number of buffered POIs that triggers a write.
        max_pois: Abort the scan (by raising LimitReached) once this many POIs
            have been collected. None or 0 disables the limit.
        step: Sampling stride; only every ``step``-th scanned element is examined.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """

    # Number of scanned elements to accumulate before refreshing the progress bar.
    NODE_PROGRESS_INTERVAL = 50_000
    AREA_PROGRESS_INTERVAL = 1_000

    def __init__(self, output_path, total_elements=None, batch_size=500_000, max_pois=None, step=1):
        # Fail early: a stride below 1 would otherwise surface as a ZeroDivisionError
        # (or select nothing useful) deep inside an osmium callback.
        if step < 1:
            raise ValueError(f"step must be >= 1, got {step!r}")

        super().__init__()
        self.wkb_factory = osmium.geom.WKBFactory()
        self.columns = defaultdict(list)
        self.output_path = output_path
        self.batch_size = batch_size
        self.writer = None  # opened lazily by _flush()
        self.max_pois = max_pois
        self.step = step

        # Counters
        self.total_pois = 0          # POIs collected over the whole run
        self.batch_count = 0         # POIs currently buffered
        self.batch_num = 0           # batches written so far
        self.scanned_count = 0       # nodes + areas seen, sampled or not
        self.skipped_geometries = 0  # matching elements dropped for lack of a geometry
        self._reported_count = 0     # part of scanned_count already shown on the bar

        # Run one throw-away query so the geocoder data is loaded up front.
        print("Loading Geocoder Database...")
        rg.search((0,0))
        print("Geocoder Ready.")

        self.pbar = tqdm(total=total_elements, unit=" elems", desc="Scanning", mininterval=2.0)

        # Schema of the output file; `tags` keeps every tag of the element.
        self.schema = pa.schema([
            ("osm_type", pa.string()),
            ("osm_id", pa.int64()),
            ("lat", pa.float64()),
            ("lon", pa.float64()),
            ("poi_class", pa.string()),
            ("poi_group", pa.string()),
            ("tags", pa.map_(pa.string(), pa.string())),
        ])

    def _count_element(self, report_every):
        """Count one scanned element and return True when it lies on the sampling stride.

        The progress bar is advanced by the exact number of elements seen since the
        last refresh, once at least ``report_every`` of them have accumulated.
        """
        self.scanned_count += 1
        pending = self.scanned_count - self._reported_count
        if pending >= report_every:
            self.pbar.update(pending)
            self._reported_count = self.scanned_count
        return self.scanned_count % self.step == 0

    def _extract_poi(self, tags, geom_func, osm_type, osm_id):
        """Buffer one element as a POI if any of its tags is listed in POI_MAPPING.

        Args:
            tags: The element's tag list.
            geom_func: Zero-argument callable returning the element's geometry as WKB.
            osm_type: "node", "way" or "relation".
            osm_id: OSM id of the original element.

        Raises:
            LimitReached: When ``max_pois`` has been reached by this POI.
        """
        # Keep the matching entry with the lowest priority number; on ties the
        # first matching tag wins.
        best = None
        for tag in tags:
            mapped = POI_MAPPING.get((tag.k, tag.v))
            if mapped is None:
                continue
            p_class, p_group = mapped
            priority = GROUP_PRIORITY.get(p_group, 99)
            if best is None or priority < best[0]:
                best = (priority, p_class, p_group)

        if best is None:
            return
        _, poi_class, poi_group = best

        # Resolve every value before touching the buffers, so a failure can never
        # leave the columns with different lengths.
        try:
            centroid = from_wkb(geom_func()).centroid
            lat, lon = centroid.y, centroid.x
        except Exception:
            # Best effort: elements without a usable geometry (e.g. a node with an
            # invalid location or an area that cannot be assembled) are skipped.
            self.skipped_geometries += 1
            return

        tag_items = [(t.k, t.v) for t in tags]

        self.columns["osm_type"].append(osm_type)
        self.columns["osm_id"].append(osm_id)
        self.columns["lat"].append(lat)
        self.columns["lon"].append(lon)
        self.columns["poi_class"].append(poi_class)
        self.columns["poi_group"].append(poi_group)
        self.columns["tags"].append(tag_items)

        self.batch_count += 1
        self.total_pois += 1

        if self.total_pois % 1000 == 0:
            self.pbar.set_postfix({"Found POIs": f"{self.total_pois:,}"})

        # The rows still buffered at this point are written by close().
        if self.max_pois and self.total_pois >= self.max_pois:
            raise LimitReached()

        if self.batch_count >= self.batch_size:
            self._flush()

    def _flush(self):
        """Write the buffered rows to the Parquet file and reset the buffers."""
        if not self.columns["osm_id"]:
            return
        table = pa.Table.from_pydict(self.columns, schema=self.schema)
        if self.writer is None:
            self.writer = pq.ParquetWriter(self.output_path, self.schema, compression='snappy')
        self.writer.write_table(table)
        self.batch_num += 1
        self.columns = defaultdict(list)
        self.batch_count = 0

    def node(self, n):
        """osmium callback: examine one node."""
        if not self._count_element(self.NODE_PROGRESS_INTERVAL):
            return
        if not n.tags:
            return
        # Geometry failures (including invalid locations) are handled inside
        # _extract_poi; LimitReached and write errors must propagate to the caller.
        self._extract_poi(n.tags, lambda: self.wkb_factory.create_point(n), "node", n.id)

    def area(self, a):
        """osmium callback: examine one area (closed way or multipolygon relation)."""
        if not self._count_element(self.AREA_PROGRESS_INTERVAL):
            return
        if not a.tags:
            return
        # No blanket `except` here: it used to swallow LimitReached, so the scan
        # never stopped once the limit was hit during the area pass, and it also
        # hid Parquet write failures. Geometry errors are handled in _extract_poi.
        osm_type = "way" if a.from_way() else "relation"
        self._extract_poi(a.tags, lambda: self.wkb_factory.create_multipolygon(a), osm_type, a.orig_id())

    def close(self):
        """Write any remaining rows, release the writer and print a short summary."""
        try:
            self._flush()
        finally:
            # Always release the progress bar and the file, even if the last write failed.
            pending = self.scanned_count - self._reported_count
            if pending > 0:
                self.pbar.update(pending)
                self._reported_count = self.scanned_count
            self.pbar.close()
            if self.writer is not None:
                self.writer.close()

        print(f"\nFinished! Extracted {self.total_pois:,} POIs.")
        if self.skipped_geometries:
            print(f"Skipped {self.skipped_geometries:,} matching elements without a usable geometry.")
        if self.writer is not None and os.path.exists(self.output_path):
            size_mb = os.path.getsize(self.output_path) / (1024 * 1024)
            print(f"Output Size: {size_mb:.2f} MB")
        else:
            print("No POIs were written; no output file was produced by this run.")

In [3]:
# Paths for the extraction stage.
raw_input_file, raw_output_file = (
    "europe-latest.osm.pbf",    # OSM PBF extract to scan
    "europe_pois_raw.parquet",  # Parquet file receiving the extracted POIs
)

In [4]:
# Progress-bar total is only a heuristic: one element per 5 bytes of the input file.
file_size = os.path.getsize(raw_input_file)
estimated_elements = file_size // 5

# step=1 with max_pois=None scans everything; a larger step examines only every
# step-th node/area, which is handy for quick trial runs.
handler = POIHandler(
    raw_output_file,
    total_elements=estimated_elements,
    max_pois=None,
    step=10,
)

Loading Geocoder Database...
Loading formatted geocoded file...
Geocoder Ready.


Scanning:   0%|          | 0/6717417940 [00:00<?, ? elems/s]

In [None]:
# Run the scan. Whatever happens (limit reached, error, manual interrupt), the
# handler is closed so buffered rows are written and the Parquet file is finalised.
print(f"Processing {raw_input_file}...")
try:
    handler.apply_file(
        raw_input_file,
        idx="flex_mem",
        locations=True,
    )
except LimitReached:
    print("\nLimit reached.")
finally:
    handler.close()

Processing europe-latest.osm.pbf...


Scanning:   2%|▏         | 102850000/6717417940 [02:48<2:05:52, 875865.96 elems/s, Found POIs=25,000]

In [16]:
# Enrichment stage: raw POI Parquet in, geocoded Parquet out, CHUNK_SIZE rows at a time.
# NOTE(review): these paths point at a South America extract, while the extraction
# cells write "europe_pois_raw.parquet" — confirm which region this notebook targets.
CHUNK_SIZE = 100_000
input_file = "south_america_pois_raw.parquet"
output_file = "south_america_pois_enriched.parquet"

In [17]:
# No output writer yet: it is created when the first enriched batch is ready,
# because the output schema is only known at that point.
writer = None
# Open the raw POI file so it can be read back in batches.
parquet_file = pq.ParquetFile(input_file)

In [18]:
# Reverse-geocode the raw POIs chunk by chunk and stream the enriched rows to
# `output_file`.
processed_count = 0
# Reset the writer here so re-running this cell never reuses a writer that an
# earlier run already closed.
writer = None

try:
    for i, batch in enumerate(parquet_file.iter_batches(batch_size=CHUNK_SIZE)):
        if batch.num_rows == 0:
            # Nothing to geocode in an empty batch.
            continue

        # Load chunk
        df = batch.to_pandas()

        # One (lat, lon) pair per POI, in row order
        coords = list(zip(df["lat"], df["lon"]))

        # Geocode the whole chunk in a single call
        geo_results = rg.search(coords, mode=2)
        geo_df = pd.DataFrame(geo_results)

        # The two frames are glued together by position, so they must line up 1:1.
        if len(geo_df) != len(df):
            raise RuntimeError(
                f"Geocoder returned {len(geo_df)} results for {len(df)} POIs in batch {i+1}"
            )

        # Drop the coordinates returned by the geocoder (the POI keeps its own
        # lat/lon columns) and namespace the remaining fields.
        geo_df = geo_df.drop(columns=['lat', 'lon'])
        geo_df.columns = [f"addr_{col}" for col in geo_df.columns]

        # Reset both indexes so the concat aligns rows by position
        enriched_df = pd.concat([df.reset_index(drop=True), geo_df.reset_index(drop=True)], axis=1)

        # Write; the writer is created from the first batch's schema
        table = pa.Table.from_pandas(enriched_df)
        if writer is None:
            writer = pq.ParquetWriter(output_file, table.schema, compression='snappy')
        writer.write_table(table)

        processed_count += len(df)
        print(f"Processed Batch {i+1}: {processed_count:,} rows")
finally:
    # Close even on failure so the file handle is released and the rows written
    # so far are finalised.
    if writer is not None:
        writer.close()

print(f"\nDone! Processed {processed_count:,} rows.")
print(f"Saved to: {output_file}")

Processed Batch 1: 100,000 rows
Processed Batch 2: 130,206 rows

Done! Processed 130,206 rows.
Saved to: south_america_pois_enriched.parquet


In [19]:
# Sanity check: report the size of the enriched file and read it back.
print("\n--- INSPECTING DATA ---")

size_gb = os.path.getsize(output_file) / 2**30
print(f"Parquet Size: {size_gb:.4f} GB")

# Round-trip through pandas to confirm the file is readable
df = pd.read_parquet(output_file)
print(f"Loaded DataFrame with shape: {df.shape}")
print(df.head())

# Show what the `tags` column of the first row looks like after the round trip
print("\nTag Structure Example:")
print(df["tags"].iloc[0])


--- INSPECTING DATA ---
Parquet Size: 0.0088 GB
Loaded DataFrame with shape: (130206, 11)
  osm_type    osm_id        lat        lon poi_class poi_group  \
0     node  96183523 -27.339303 -68.877766      peak    Nature   
1     node  96183913 -24.034400 -67.322309      peak    Nature   
2     node  96184413 -27.104828 -68.787111      peak    Nature   
3     node  96184669 -27.459390 -68.980419      peak    Nature   
4     node  96185824 -27.175681 -68.574729   volcano    Nature   

                                                tags  \
0                                  [[natural, peak]]   
1  [[ele, 5594], [name, Cerro Rincón], [natural, ...   
2                                  [[natural, peak]]   
3                                  [[natural, peak]]   
4  [[ele, 6501], [name, Volcán Ata], [natural, vo...   

                   addr_name addr_admin1           addr_admin2 addr_cc  
0                   Fiambala   Catamarca                            AR  
1  San Antonio de los Cobres 

In [20]:
# Summary statistics of the numeric columns. osm_id is an identifier, so only the
# lat/lon figures are meaningful (their min/max bound the area covered by the data).
df.describe()

Unnamed: 0,osm_id,lat,lon
count,130206.0,130206.0,130206.0
mean,3678604000.0,-17.817473,-59.801466
std,3866830000.0,14.733753,13.475882
min,447074.0,-59.447318,-109.443225
25%,657427100.0,-29.701813,-71.664506
50%,1369693000.0,-20.830165,-60.642518
75%,5806526000.0,-6.766048,-47.712576
max,13406610000.0,15.672022,-26.369892


In [21]:
# Random sample for eyeballing. The size is capped at the frame length so smaller
# extracts do not raise, and the seed is fixed so re-runs show the same rows.
df.sample(n=min(80_000, len(df)), random_state=42)

Unnamed: 0,osm_type,osm_id,lat,lon,poi_class,poi_group,tags,addr_name,addr_admin1,addr_admin2,addr_cc
24135,node,4870813912,-32.835481,-56.426459,restaurant,Food,"[[amenity, restaurant]]",Paso de los Toros,Tacuarembo,,UY
84402,way,484967031,-6.165269,-37.848065,park,Nature,"[[leisure, park]]",Sao Bento,Paraiba,Catole Do Rocha,BR
114808,way,1106331317,-19.465044,-44.248302,park,Nature,"[[leisure, park], [name, Praça Alexandre Lanza]]",Sete Lagoas,Minas Gerais,Sete Lagoas,BR
126910,way,1374463640,-5.579810,-36.936359,convenience,Supplies,"[[addr:city, Assu], [addr:housenumber, 3568], ...",Acu,Rio Grande do Norte,Acu,BR
69948,way,230359307,4.583360,-74.127398,park,Nature,"[[leisure, park]]",Bogota,Bogota D.C.,,CO
...,...,...,...,...,...,...,...,...,...,...,...
77874,way,397194644,-17.751610,-63.174662,park,Nature,"[[leisure, park]]",Santa Cruz de la Sierra,Santa Cruz,,BO
33212,node,5919380639,-42.508581,-71.429320,convenience,Supplies,"[[name, 8 Hnos.], [shop, convenience]]",Leleque,Chubut,,AR
29292,node,5371978841,-23.594589,-46.405735,place_of_worship,Sightseeing,"[[amenity, place_of_worship], [denomination, e...",Ferraz de Vasconcelos,Sao Paulo,Ferraz De Vasconcelos,BR
87711,way,542635576,-45.871418,-67.503012,park,Nature,"[[addr:city, Comodoro Rivadavia], [addr:postco...",Comodoro Rivadavia,Chubut,,AR


In [22]:
# POIs that come from OSM relations
df.loc[df["osm_type"] == "relation"]

Unnamed: 0,osm_type,osm_id,lat,lon,poi_class,poi_group,tags,addr_name,addr_admin1,addr_admin2,addr_cc
65021,relation,10997410,-29.260844,-71.479075,nature_reserve,Nature,"[[boundary, protected_area], [leisure, nature_...",La Serena,Coquimbo,Provincia de Elqui,CL
65023,relation,6775837,-53.660870,-72.213337,nature_reserve,Nature,"[[boundary, protected_area], [leisure, nature_...",Punta Arenas,Magallanes,Provincia de Magallanes,CL
65028,relation,6342969,-22.979300,-43.212488,sports_centre,Leisure,"[[access, private], [addr:street, Avenida Epit...",Rio de Janeiro,Rio de Janeiro,Rio De Janeiro,BR
65861,relation,447074,-13.708398,-65.924393,nature_reserve,Nature,"[[addr:country, BO], [boundary, protected_area...",Santa Rosa,El Beni,,BO
65868,relation,451551,-3.595277,-79.254555,nature_reserve,Nature,"[[boundary, protected_area], [iba_code, EC068]...",Zaruma,El Oro,,EC
...,...,...,...,...,...,...,...,...,...,...,...
130030,relation,5891637,-3.720879,-38.508233,beach,Nature,"[[lifeguard, yes], [name, Praia de Iracema], [...",Fortaleza,Ceara,Fortaleza,BR
130058,relation,6644285,-33.812132,-58.427354,beach,Nature,"[[name, Playa de la Agraciada], [natural, beac...",Nueva Palmira,Colonia,,UY
130078,relation,19179864,-15.454291,-56.087368,park,Nature,"[[image, https://www.midianews.com.br//storage...",Cuiaba,Mato Grosso,Cuiaba,BR
130117,relation,19972521,-32.948400,-68.856310,park,Nature,"[[leisure, park], [name, Paseo Nuestra Señora ...",Godoy Cruz,Mendoza,Departamento de Godoy Cruz,AR


In [23]:
# POIs that come from OSM ways
df.loc[df["osm_type"] == "way"]

Unnamed: 0,osm_type,osm_id,lat,lon,poi_class,poi_group,tags,addr_name,addr_admin1,addr_admin2,addr_cc
65016,way,8105832,-23.535091,-46.635314,attraction,Sightseeing,"[[addr:city, São Paulo], [addr:housenumber, 1]...",Sao Paulo,Sao Paulo,Sao Paulo,BR
65017,way,10241503,-34.562175,-58.458690,park,Nature,"[[addr:city, Ciudad Autónoma de Buenos Aires],...",Colegiales,Buenos Aires F.D.,,AR
65018,way,10429335,-34.576329,-58.503240,park,Nature,"[[leisure, park], [name, Plaza Leandro N. Alem...",General San Martin,Buenos Aires,Partido de General San Martin,AR
65019,way,10596781,-34.581360,-58.399876,park,Nature,"[[leisure, park], [name, Plaza República de Ch...",Retiro,Buenos Aires F.D.,,AR
65020,way,11482710,-19.924773,-43.925598,hospital,Health,"[[addr:city, Belo Horizonte], [addr:housenumbe...",Belo Horizonte,Minas Gerais,Belo Horizonte,BR
...,...,...,...,...,...,...,...,...,...,...,...
130201,way,1461562641,-31.152548,-64.475381,swimming_pool,Family,"[[leisure, swimming_pool]]",Valle Hermoso,Cordoba,,AR
130202,way,1461635491,-31.095347,-64.483910,playground,Family,"[[leisure, playground]]",La Falda,Cordoba,,AR
130203,way,1461644608,-23.493052,-47.511176,pharmacy,Health,"[[addr:housenumber, 1820], [addr:street, Aveni...",Sorocaba,Sao Paulo,Sorocaba,BR
130204,way,1461647495,-41.869160,-73.828990,place_of_worship,Sightseeing,"[[amenity, place_of_worship], [building, resid...",Ancud,Los Lagos,Provincia de Chiloe,CL


In [24]:
# POIs that come from OSM nodes
df.loc[df["osm_type"] == "node"]

Unnamed: 0,osm_type,osm_id,lat,lon,poi_class,poi_group,tags,addr_name,addr_admin1,addr_admin2,addr_cc
0,node,96183523,-27.339303,-68.877766,peak,Nature,"[[natural, peak]]",Fiambala,Catamarca,,AR
1,node,96183913,-24.034400,-67.322309,peak,Nature,"[[ele, 5594], [name, Cerro Rincón], [natural, ...",San Antonio de los Cobres,Salta,,AR
2,node,96184413,-27.104828,-68.787111,peak,Nature,"[[natural, peak]]",Fiambala,Catamarca,,AR
3,node,96184669,-27.459390,-68.980419,peak,Nature,"[[natural, peak]]",Copiapo,Atacama,Provincia de Copiapo,CL
4,node,96185824,-27.175681,-68.574729,volcano,Nature,"[[ele, 6501], [name, Volcán Ata], [natural, vo...",Fiambala,Catamarca,,AR
...,...,...,...,...,...,...,...,...,...,...,...
65011,node,13401933466,-31.311029,-64.462588,doctors,Health,"[[amenity, doctors], [healthcare, doctor], [he...",Cosquin,Cordoba,,AR
65012,node,13405007405,-34.439106,-71.075018,convenience,Supplies,"[[brand, Shell Select], [brand:wikidata, Q1243...",San Vicente,O'Higgins,Provincia de Cachapoal,CL
65013,node,13405274749,-31.804567,-59.166003,toilets,Services,"[[access, yes], [amenity, toilets], [check_dat...",Villaguay,Entre Rios,Departamento de Villaguay,AR
65014,node,13406341801,-26.993739,-54.483027,clothes,Shopping,"[[addr:housenumber, 80], [addr:postcode, 3354]...",Dos de Mayo,Misiones,,AR
