## Get Data

### Download Railway Dataset

In [2]:
!pip install -r requirements.txt

Collecting requests (from -r requirements.txt (line 1))
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tqdm (from -r requirements.txt (line 2))
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting charset_normalizer<4,>=2 (from requests->-r requirements.txt (line 1))
  Using cached charset_normalizer-3.4.4-cp314-cp314-win_amd64.whl.metadata (38 kB)
Collecting idna<4,>=2.5 (from requests->-r requirements.txt (line 1))
  Using cached idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests->-r requirements.txt (line 1))
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests->-r requirements.txt (line 1))
  Downloading certifi-2025.11.12-py3-none-any.whl.metadata (2.5 kB)
Using cached requests-2.32.5-py3-none-any.whl (64 kB)
Using cached charset_normalizer-3.4.4-cp314-cp314-win_amd64.whl (107 kB)
Using cached idna-3.11-py3-none-any.whl (71 kB)
Using cache

In [None]:
import requests
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import traceback

# Overpass API endpoint that every tile query is POSTed to.
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
# Directory that receives one JSON file per successfully downloaded tile; created if missing.
OUTPUT_DIR = Path("stations_world")
OUTPUT_DIR.mkdir(exist_ok=True)

# Text log that HTTP errors and exceptions are appended to.
LOG_FILE = Path("download_errors.log")
# JSON list of the tiles that could not be downloaded, written at the end of main().
FAILED_FILE = Path("failed_tiles.json")

# The globe is cut into ROWS x COLS tiles, each 180/ROWS degrees of latitude
# by 360/COLS degrees of longitude.
ROWS = 100
COLS = 200
# Size of the thread pool, i.e. the maximum number of requests in flight at once.
# NOTE(review): 200 parallel requests is a lot for a shared public endpoint —
# confirm this against the server's usage policy / rate limits.
WORKERS = 200

def log_error(message: str):
    """Append ``message`` as one newline-terminated entry to ``LOG_FILE``."""
    with LOG_FILE.open("a", encoding="utf-8") as log_handle:
        log_handle.write(f"{message}\n")

def make_query(s, w, n, e):
    """Build the Overpass QL query text for one bounding box.

    The query requests JSON output with a 180 second server timeout and
    selects nodes, ways and relations tagged ``railway=station`` plus nodes
    and ways tagged ``amenity=bus_station`` inside the box, followed by the
    output statements ``out body; >; out skel qt;``.

    Args:
        s: South edge of the box.
        w: West edge of the box.
        n: North edge of the box.
        e: East edge of the box.

    Returns:
        The query as a multi-line string.
    """
    # Every selector is filtered by the same (south,west,north,east) box.
    bbox = f"({s},{w},{n},{e})"
    query_lines = [
        "",
        "    [out:json][timeout:180];",
        "",
        "    (",
        f'      node["railway"="station"]{bbox};',
        f'      way["railway"="station"]{bbox};',
        f'      relation["railway"="station"]{bbox};',
        "",
        f'      node["amenity"="bus_station"]{bbox};',
        f'      way["amenity"="bus_station"]{bbox};',
        "    );",
        "",
        "    out body;",
        "    >;",
        "    out skel qt;",
        "    ",
    ]
    return "\n".join(query_lines)

def download_tile(args, max_attempts=3, backoff_seconds=5.0):
    """Download one tile from the Overpass API and store the response on disk.

    Transient failures (HTTP 429/502/503/504 and dropped connections) are
    retried with exponential backoff instead of failing on the first attempt.

    Args:
        args: Tuple ``(tile_id, south, west, north, east)`` describing the tile.
        max_attempts: Maximum number of requests sent for this tile
            (values below 1 are treated as 1).
        backoff_seconds: Wait before the second attempt; doubled after every
            further failed attempt.

    Returns:
        True if the response was written to
        ``OUTPUT_DIR / "stations_tile_<tile_id>.json"``, False otherwise.
        Failures are reported through ``log_error``; exceptions raised while
        requesting or writing are caught and logged, not propagated.
    """
    import time  # local import: the cell's import block does not provide `time`

    tile_id, south, west, north, east = args
    query = make_query(south, west, north, east)

    # Statuses that usually mean "try again later" rather than "bad request".
    transient_statuses = {429, 502, 503, 504}
    attempts = max(1, max_attempts)

    try:
        for attempt in range(1, attempts + 1):
            is_last = attempt == attempts
            delay = backoff_seconds * 2 ** (attempt - 1)

            try:
                resp = requests.post(OVERPASS_URL, data=query, timeout=300)
            except requests.ConnectionError:
                if is_last:
                    raise  # logged with its traceback by the outer handler
                time.sleep(delay)
                continue

            if resp.status_code == 200:
                # NOTE(review): the body of a 200 response is stored without
                # being inspected — confirm whether the API can report query
                # errors inside a 200 payload.
                out_file = OUTPUT_DIR / f"stations_tile_{tile_id}.json"
                with open(out_file, "w", encoding="utf-8") as f:
                    f.write(resp.text)
                return True

            if is_last or resp.status_code not in transient_statuses:
                error_msg = f"HTTP {resp.status_code} for tile {tile_id} ({south},{west},{north},{east})"
                log_error(error_msg)
                return False

            time.sleep(delay)

        # Defensive: the final iteration above always returns or raises.
        return False

    except Exception as e:
        error_msg = (
            f"Exception for tile {tile_id} ({south},{west},{north},{east}): "
            f"{repr(e)}\n{traceback.format_exc()}"
        )
        log_error(error_msg)
        return False

def main():
    """Download every tile of a ROWS x COLS world grid and record the failures.

    Tiles are numbered row by row starting at the south-west corner
    (-90, -180). All tiles are submitted to a thread pool of ``WORKERS``
    threads; the tiles for which ``download_tile`` reports failure are
    written to ``FAILED_FILE`` as a JSON list.
    """
    lat_step = 180 / ROWS
    lon_step = 360 / COLS

    tasks = []
    for tile_id in range(ROWS * COLS):
        row, col = divmod(tile_id, COLS)
        tasks.append((
            tile_id,
            -90 + row * lat_step,          # south
            -180 + col * lon_step,         # west
            -90 + (row + 1) * lat_step,    # north
            -180 + (col + 1) * lon_step,   # east
        ))

    record_keys = ("tile_id", "south", "west", "north", "east")
    failed_tiles = []

    with ThreadPoolExecutor(max_workers=WORKERS) as pool:
        # Map each future back to its task so failures can be reported with their bounds.
        pending = {pool.submit(download_tile, task): task for task in tasks}
        progress = tqdm(as_completed(pending), total=len(tasks), desc="Downloading world tiles")

        for finished in progress:
            if finished.result():
                continue
            failed_tiles.append(dict(zip(record_keys, pending[finished])))

    # Persist the failed tiles so they can be retried later.
    with FAILED_FILE.open("w", encoding="utf-8") as failed_handle:
        json.dump(failed_tiles, failed_handle, indent=2)

    print(f"\nFinished. Failed tiles saved in {FAILED_FILE}")
    print(f"Error log saved in {LOG_FILE}")

# Entry point: start the world download when this code runs as the main module.
if __name__ == "__main__":
    main()


Downloading world tiles:   2%|▏         | 382/20000 [00:18<15:54, 20.54it/s]  


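### Retry script (for `failed_tiles.json`)

Re-requests every tile listed in `failed_tiles.json`, saves successful responses to `stations_world_retry/` and writes the tiles that still fail to `failed_tiles_retry.json`. Note that this cell redefines `OUTPUT_DIR`, `LOG_FILE`, `WORKERS`, `log_error`, `download_tile` and `main` from the download cell above, so re-run that cell before starting a fresh download.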

In [None]:
import requests
import json
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import traceback

# Overpass API endpoint that every tile query is POSTed to.
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
# Directory that receives one JSON file per successfully re-downloaded tile; created if missing.
OUTPUT_DIR = Path("stations_world_retry")
OUTPUT_DIR.mkdir(exist_ok=True)

# Input: JSON list of tiles to retry (objects with tile_id/south/west/north/east).
FAILED_FILE = Path("failed_tiles.json")
# Output: JSON list of the tiles that failed again during the retry.
FAILED_RETRY_FILE = Path("failed_tiles_retry.json")
# Text log that HTTP errors and exceptions of the retry run are appended to.
LOG_FILE = Path("retry_errors.log")

# Size of the thread pool, i.e. the maximum number of requests in flight at once.
WORKERS = 50

def log_error(message: str):
    """Append ``message`` as one newline-terminated entry to ``LOG_FILE``."""
    with LOG_FILE.open("a", encoding="utf-8") as log_handle:
        log_handle.write(f"{message}\n")

def make_query(s, w, n, e):
    """Build the Overpass QL query text for one bounding box.

    The query requests JSON output with a 180 second server timeout and
    selects nodes, ways and relations tagged ``railway=station`` plus nodes
    and ways tagged ``amenity=bus_station`` inside the box, followed by the
    output statements ``out body; >; out skel qt;``.

    Args:
        s: South edge of the box.
        w: West edge of the box.
        n: North edge of the box.
        e: East edge of the box.

    Returns:
        The query as a multi-line string.
    """
    # Every selector is filtered by the same (south,west,north,east) box.
    bbox = f"({s},{w},{n},{e})"
    railway_selectors = [
        f'      {element}["railway"="station"]{bbox};'
        for element in ("node", "way", "relation")
    ]
    bus_selectors = [
        f'      {element}["amenity"="bus_station"]{bbox};'
        for element in ("node", "way")
    ]
    header = ["", "    [out:json][timeout:180];", "", "    ("]
    footer = ["    );", "", "    out body;", "    >;", "    out skel qt;", "    "]
    return "\n".join(header + railway_selectors + [""] + bus_selectors + footer)

def download_tile(tile, max_attempts=3, backoff_seconds=5.0):
    """Re-download one previously failed tile and store the response on disk.

    Transient failures (HTTP 429/502/503/504 and dropped connections) are
    retried with exponential backoff instead of failing on the first attempt.

    Args:
        tile: Mapping with the keys ``tile_id``, ``south``, ``west``,
            ``north`` and ``east``.
        max_attempts: Maximum number of requests sent for this tile
            (values below 1 are treated as 1).
        backoff_seconds: Wait before the second attempt; doubled after every
            further failed attempt.

    Returns:
        True if the response was written to
        ``OUTPUT_DIR / "stations_tile_<tile_id>.json"``, False otherwise.
        Failures are reported through ``log_error``; exceptions raised while
        requesting or writing are caught and logged, not propagated.
    """
    import time  # local import: the cell's import block does not provide `time`

    tile_id = tile["tile_id"]
    s, w, n, e = tile["south"], tile["west"], tile["north"], tile["east"]
    query = make_query(s, w, n, e)

    # Statuses that usually mean "try again later" rather than "bad request".
    transient_statuses = {429, 502, 503, 504}
    attempts = max(1, max_attempts)

    try:
        for attempt in range(1, attempts + 1):
            is_last = attempt == attempts
            delay = backoff_seconds * 2 ** (attempt - 1)

            try:
                resp = requests.post(OVERPASS_URL, data=query, timeout=300)
            except requests.ConnectionError:
                if is_last:
                    raise  # logged with its traceback by the outer handler
                time.sleep(delay)
                continue

            if resp.status_code == 200:
                # NOTE(review): the body of a 200 response is stored without
                # being inspected — confirm whether the API can report query
                # errors inside a 200 payload.
                out_file = OUTPUT_DIR / f"stations_tile_{tile_id}.json"
                with open(out_file, "w", encoding="utf-8") as f:
                    f.write(resp.text)
                return True

            if is_last or resp.status_code not in transient_statuses:
                log_error(f"HTTP {resp.status_code} for retry tile {tile_id}")
                return False

            time.sleep(delay)

        # Defensive: the final iteration above always returns or raises.
        return False

    except Exception as exc:  # named `exc` so the east bound `e` is not shadowed
        log_error(
            f"Exception for retry tile {tile_id}: {repr(exc)}\n{traceback.format_exc()}"
        )
        return False

def main():
    """Retry every tile listed in ``FAILED_FILE`` and record the ones that fail again.

    The tiles are submitted to a thread pool of ``WORKERS`` threads; tiles
    for which ``download_tile`` reports failure are written to
    ``FAILED_RETRY_FILE`` as a JSON list.
    """
    tiles = json.loads(FAILED_FILE.read_text(encoding="utf-8"))

    with ThreadPoolExecutor(max_workers=WORKERS) as pool:
        # Map each future back to its tile so failures keep their full description.
        pending = {pool.submit(download_tile, tile): tile for tile in tiles}
        progress = tqdm(as_completed(pending), total=len(tiles), desc="Retrying failed tiles")
        failed_retry = [pending[finished] for finished in progress if not finished.result()]

    with FAILED_RETRY_FILE.open("w", encoding="utf-8") as retry_handle:
        json.dump(failed_retry, retry_handle, indent=2)

    print(f"\nRetry finished. Still failing: {len(failed_retry)}")
    print(f"Saved to {FAILED_RETRY_FILE}")

# Entry point: start the retry run when this code runs as the main module.
if __name__ == "__main__":
    main()


### Download OpenAddresses Dataset

1. Open https://batch.openaddresses.io/data#map=0/0/0
2. Create an account or log in
3. Download the "Global" file

**TODO:** Optimize the code that handles failed downloads.