# City Explorer: Multi-Source Attraction & Activity Discovery with Routing

This notebook is intentionally **readable** and **step-based**.

## What it does
1. Loads required environment variables (API keys + DB path)
2. Initializes a shared SQLite database (migrations + tables)
3. Retrieves **Top 10** items for a city (ranked by review count) from:
   - Google Places (New) — tourist attractions
   - TripAdvisor Content API — attractions/activities (depending on API behavior)
4. Uses a city-level snapshot cache (`city_top10`) and an item-level cache (`item_summary`)
5. Provides an interactive UI using **ipywidgets**

## Required environment variables
- `GOOGLE_MAPS_API_KEY`
- `TRIPADVISOR_API_KEY`
- `DABN23_DB_PATH` (full file path, e.g. `G:\My Drive\dabn23_SharedDatabase\dabn23_cache.sqlite`)


In [1]:
# 0) Dependency check (optional)
# This notebook does NOT auto-install by default (cleaner + more reproducible).
AUTO_INSTALL = False

required = [
    ("requests", "requests"),
    ("pandas", "pandas"),
    ("ipywidgets", "ipywidgets"),
]

missing = []
for import_name, pip_name in required:
    try:
        __import__(import_name)
    except ImportError:
        missing.append(pip_name)

if missing:
    print("Missing packages:", ", ".join(missing))
    print("Install command:")
    print("  pip install " + " ".join(missing))
    if AUTO_INSTALL:
        import sys, subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install", *missing])
        print("Installed. Re-run this cell if needed.")
else:
    print("All required packages are installed.")


All required packages are installed.


In [2]:
# 1) Make sure we can import from /src (works when running from notebooks/ folder)
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("Project root:", PROJECT_ROOT)


Project root: c:\Users\Samuel\Desktop\Git Repo\dabn23-project1\dabn23


In [3]:
# 2) Load configuration (API keys + DB path)
# config.py fails fast with a helpful error message if something is missing.

from src.config import GOOGLE_API_KEY, TA_API_KEY, DB_PATH

print("Google API key loaded (length):", len(GOOGLE_API_KEY))
print("TripAdvisor API key loaded (length):", len(TA_API_KEY))
print("DB_PATH:", DB_PATH)


Google API key loaded (length): 39
TripAdvisor API key loaded (length): 32
DB_PATH: G:\My Drive\dabn23_SharedDatabase\dabn23_places_cache.sqlite


In [4]:
# 3) Initialize the shared SQLite database (creates the file if it doesn't exist)

from pathlib import Path
from src.db import connect, migrate_if_needed, create_tables

# Ensure parent folder exists (SQLite can create the file, but not the folder)
Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)

conn = connect(DB_PATH)
migrate_if_needed(conn)   # handles legacy schemas (e.g., place_ids_json -> item_ids_json)
create_tables(conn)

tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
print("✅ DB ready. Tables:", [t[0] for t in tables])


✅ DB ready. Tables: ['city_top10', 'item_summary']


In [5]:
import src.pipelines as p
with open(p.__file__, "r", encoding="utf-8") as f:
    txt = f.read()

print("Length:", len(txt))
print(txt[:800])

Length: 5897
# src/pipelines.py
"""
Top-10 pipelines (snapshot + cache) for DABN23.

This module is meant to replace the notebook-defined pipeline functions so that:
- notebooks stay thin (just call functions)
- snapshot + caching logic lives in src/
- TripAdvisor group filtering is applied BEFORE snapshot is saved
"""

from __future__ import annotations

from typing import Any, Dict, List, Optional

from .cache import (
    get_city_snapshot_item_ids,
    save_city_snapshot_item_ids,
    get_cached_item_summary,
    upsert_item_summary,
)

from . import google_places as g
from . import tripadvisor as ta


def top10_google_attractions(
    conn,
    city: str,
    n: int = 10,
    language: str = "en",
    search_pool: int = 50,
) -> List[Dict[str, Any]]:
    """Top-N Google tourist attractions by revi


In [6]:
# Step 4 definitions with imports

from src.pipelines import top10_city

ALLOW = ["Tours", "Food & Drink", "Outdoor Activities", "Boat Tours & Water Sports", "Nightlife", "Shopping"]
DENY  = ["Sights & Landmarks", "Museums"]

def city_search(city: str):
    return top10_city(conn, city, allow_groups=ALLOW, deny_groups=DENY)

## 5) Interactive search UI (ipywidgets)

Use the controls to choose:
- city
- data source (Google or TripAdvisor)
- type (attraction/activity)

Then click **Search Top 10**.


In [7]:
from src.ui import build_city_widget

LAST_SEARCHED_CITY = None
LAST_SEARCH_RESULTS = None

build_city_widget(city_search)

VBox(children=(HBox(children=(Text(value='Paris', description='City:', layout=Layout(width='420px'), placehold…

{'last_city': None, 'last_results': None}

In [8]:
print(LAST_SEARCHED_CITY)

None


## 6) Optional: "closest two" demo (fallback)

This uses a straight-line distance fallback (Haversine) so the demo works even before
Google Routes API is integrated into `src/routing.py`.


In [9]:
from src.routing import closest_two_fallback

# Example: compute closest two among Google top-10 (needs lat/lng)
city = "Paris"
results = top10_google_attractions(city)

start = results[0]
others = results[1:]

closest = closest_two_fallback(start, others)

print("Start:", start.get("name"))
print("Closest two (fallback distance):")
for c in closest:
    print(" -", c.get("name"))


NameError: name 'top10_google_attractions' is not defined

# Selenium

## Selenium implementation


In [None]:
import json
import sqlite3
import datetime

def get_attraction_names(city: str, conn: sqlite3.Connection):
    """
    Look up stored attraction names for a city from city_top10 / ta_place_summary.
    Returns a list of name strings, or None if city not found.
    """
    citykey = city.strip().lower()

    cur = conn.cursor()

    # Read the stored place_ids_json for this city
    row = cur.execute(
        "SELECT item_ids_json FROM city_top10 "
        "WHERE city_key = ? AND source = ? AND item_type = ?",
        (citykey, "google", "attraction")
    ).fetchone()

    if not row:
        # City not in DB
        return None

    place_ids = json.loads(row[0])
    if not place_ids:
        return []

    # Fetch names in the same ranked order
    placeholders = ",".join("?" * len(place_ids))
    name_rows = cur.execute(
        f"SELECT item_id, name FROM item_summary "
        f"WHERE source = ? AND item_id IN ({placeholders})",
        ["google", *place_ids]
    ).fetchall()

    name_map = {pid: name for pid, name in name_rows}

    # Preserve original ranking order
    return [name_map[pid] for pid in place_ids if pid in name_map]

In [34]:
import os
import sqlite3
import pathlib

# Path to TripAdvisor cache in the same folder as this notebook's working dir
TA_DB_PATH = str(pathlib.Path().cwd() / "dabn23_tripadvisor_cache.sqlite")

print("TripAdvisor DB path:", TA_DB_PATH)
print("Exists?", os.path.exists(TA_DB_PATH))

taconn = sqlite3.connect(TA_DB_PATH)
#taconn.execute("PRAGMA journal_mode=WAL;")  # safe even if wal/shm files present

TripAdvisor DB path: c:\Users\Samuel\Desktop\Git Repo\dabn23-project1\dabn23\notebooks\dabn23_tripadvisor_cache.sqlite
Exists? True


In [35]:
def _parse_busy_bar(aria: str):
    """
    Parses one peak-hours bar aria-label.
    Handles English ("77% busy at 2 pm") and Swedish/Nordic ("77 aktivitet kl. 1400.").
    Returns (hour_24, pct) or None.
    """
    # Swedish/Nordic: "37 aktivitet kl. 1300."
    m = re.search(r"^(\d+)\D+?kl\.\s*(\d{2})\d{2}", aria.strip())
    if m:
        return int(m.group(2)), int(m.group(1))

    # English: "77% busy at 2 pm"
    m = re.search(r"(\d+)%.*?(\d{1,2})\s*(am|pm)", aria, re.IGNORECASE)
    if m:
        pct, h, mer = int(m.group(1)), int(m.group(2)), m.group(3).lower()
        hour_24 = (h % 12) + (12 if mer == "pm" else 0)
        return hour_24, pct

    return None


def get_current_busyness(driver, attraction_name: str):
    """
    Searches Google Maps for attraction_name and scrapes the full-day busyness.
    Returns list[int|None] with 24 entries (index = hour 0–23),
    or None if the place has no peak-hours section at all.
    """
    print(f"\n  Searching: {attraction_name}")

    # 1. Navigate and type in search bar
    driver.get("https://www.google.com/maps")
    time.sleep(2)
    dismiss_google_consent(driver)

    search_bar = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, "q"))
    )
    search_bar.clear()
    search_bar.send_keys(attraction_name)
    driver.find_element(By.CSS_SELECTOR, "button.mL3xi").click()
    time.sleep(3)

    # 2. Disambiguation list → click first result if present
    try:
        WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "a.hfpxzc"))
        ).click()
        time.sleep(3)
        print("    Clicked top result from list.")
    except TimeoutException:
        print("    Landed directly on place page.")

    # 3. Find peak-hours section
    try:
        peak_section = WebDriverWait(driver, 6).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.UmE4Qe"))
        )
    except TimeoutException:
        print("    No peak hours data available.")
        return None

    # 4. Parse all hourly bars into a 24-slot list
    hourly_data = [None] * 24
    bars = peak_section.find_elements(By.CSS_SELECTOR, "div.dpoVLd")
    print(f"    Found {len(bars)} hourly bars.")

    for bar in bars:
        aria = bar.get_attribute("aria-label") or ""

        # Live "Currently X% busy" → slot into current hour
        live = re.search(r"(?:Currently|Nuvarande).*?(\d+)%", aria, re.IGNORECASE)
        if live:
            hourly_data[datetime.datetime.now().hour] = int(live.group(1))
            print(f"    ✓ Live now: {int(live.group(1))}%")
            continue

        parsed = _parse_busy_bar(aria)
        if parsed:
            hour_24, pct = parsed
            if 0 <= hour_24 <= 23:
                hourly_data[hour_24] = pct

    filled = sum(1 for x in hourly_data if x is not None)
    print(f"    ✓ Stored {filled}/24 hours.")
    return hourly_data


def dismiss_google_consent(driver):
    """Dismiss the GDPR consent banner on Google Maps (EU only)."""
    try:
        accept_btn = WebDriverWait(driver, 8).until(
            EC.element_to_be_clickable((
                By.XPATH,
                '//button[.//span[contains(text(),"Accept all") '
                'or contains(text(),"Reject all")]]'
            ))
        )
        accept_btn.click()
        time.sleep(1)
        print("  Google consent dismissed.")
    except:
        print("  No consent popup found.")

In [None]:
import datetime
def scrape_peak_hours(city: str, conn: sqlite3.Connection):
    """
    Scrapes Google Maps peak hours for all TA top-10 attractions of a city.
    Saves results into the global `busyness_data` dict.
    Supports multiple cities — each call adds/updates one city entry.
    Prints the current-hour snapshot when done.
    """
    scraped_at = datetime.datetime.now().strftime("%H:%M")
    city_key   = city.strip()

    print(f"Looking up '{city_key}' in  DB...")
    names = get_attraction_names(city_key, conn)

    if names is None:
        print(f"  ✗ '{city_key}' not found in DB. Run the scraper first.")
        return
    if not names:
        print(f"  ✗ No attractions stored for '{city_key}'.")
        return

    print(f"  Found {len(names)} attractions: {', '.join(names[:3])}...")

    attractions = {}
    try:
        for name in names:
            hourly = get_current_busyness(driver, name)
            attractions[name] = hourly   # list[int|None] or None
            time.sleep(2)
    finally:
        driver.quit()
        print("\nDriver closed.")

    # Save into global dict (safe to call again for a different city)
    busyness_data[city_key] = {
        "scraped_at":  scraped_at,
        "attractions": attractions,
    }

    # Print current-hour snapshot
    now_hour = datetime.now().hour
    print(f"\n{'='*54}")
    print(f"  CURRENT BUSYNESS — {city_key}  (scraped at {scraped_at})")
    print(f"{'='*54}")
    for name, hourly in attractions.items():
        if hourly is None:
            val = "N/A (no GM data)"
        elif hourly[now_hour] is None:
            val = "N/A (no data this hour)"
        else:
            val = f"{hourly[now_hour]}%"
        print(f"  {name:<44} {val}")

In [44]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from selenium.common.exceptions import NoSuchElementException, TimeoutException

options = Options()
options.add_argument("--lang=en-US")
options.add_experimental_option("prefs", {
    "intl.accept_languages": "en-US,en",
    "profile.default_content_setting_values.geolocation": 2,
})
options.add_argument(
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
driver = webdriver.Chrome(options=options)
driver.execute_cdp_cmd("Emulation.setGeolocationOverride", {
    "latitude": 40.7128, "longitude": -74.0060, "accuracy": 100
})
print("Driver started.")

# Global storage for peak-hours data (multi-city)
busyness_data = {}
scrape_peak_hours("Paris", conn)

driver.close()
print("Finished scraping.")
# To scrape additional cities, re-run this cell with a new city name.
# busyness_data will accumulate entries for all cities scraped this session.

Driver started.
Looking up 'Paris' in TA DB...


OperationalError: no such table: ta_city_top10

In [38]:
import pandas as pd

def print_busyness_summary(city: str = None):
    """Print a DataFrame of current-hour busyness. Omit city= for all cities."""
    cities = [city] if city else list(busyness_data.keys())
    now_hour = datetime.now().hour

    if not cities:
        print("busyness_data is empty — run scrape_peak_hours() first.")
        return

    for c in cities:
        if c not in busyness_data:
            print(f"No data for '{c}'.")
            continue
        entry = busyness_data[c]
        rows = []
        for name, hourly in entry["attractions"].items():
            pct = None if (hourly is None) else hourly[now_hour]
            rows.append({
                "Attraction":           name,
                f"Busy at {now_hour:02d}:00": f"{pct}%" if pct is not None else "N/A",
                "Has full-day data":    hourly is not None,
            })
        print(f"\n=== {c}  (scraped at {entry['scraped_at']}) ===")
        display(pd.DataFrame(rows))

print_busyness_summary("Venice")

AttributeError: module 'datetime' has no attribute 'now'