# Place Discovery Quickstart

Purpose: Find canonical Google Maps place IDs (ChIJ...) for selected businesses and cities in Morocco.

What you will do:
- Ensure city map centers and aliases are available (built from OSM once).
- Run discovery for a small list of businesses and cities.
- Validate canonical IDs and export a clean CSV for the next step (review collection).

Requirements:
- SERPAPI_API_KEY in your environment (.env or export).
- Internet access for SerpAPI.



In [None]:
# Setup: import from src and verify OSM-derived centers
import sys
from pathlib import Path

# Path() is the current working directory; taking its parent treats the
# directory one level above the CWD as the project root.
# NOTE(review): this presumes the notebook is run from a direct subfolder of
# the project root — confirm.
project_root = Path().resolve().parent
# Put <project_root>/src at the front of sys.path so it is searched first for
# the imports that follow.
sys.path.insert(0, str(project_root / "src"))

from review_analyzer.geo import ensure_osm_derived_files, resolve_city_name
from review_analyzer.discover import DiscoveryEngine
from review_analyzer import config

# Load the city centers and aliases (per the cell header, these are the
# OSM-derived files) and print their sizes as a quick sanity check.
centers, aliases = ensure_osm_derived_files()
print(f" Centers: {len(centers)} | Aliases: {len(aliases)}")
print("Sample:")
# Only preview the first few entries instead of dumping the whole mapping.
for i, (k, v) in enumerate(centers.items()):
    print(" ", k, "->", v)
    # NOTE(review): the original limit literal was lost from the source;
    # 2 (i.e. three sample rows) is an assumption — confirm.
    if i == 2:
        break


In [None]:
# Run a small discovery (canonical IDs preferred automatically)
from datetime import datetime
import pandas as pd

# Engine used for this run, with debug enabled.
engine = DiscoveryEngine(debug=True)

# Inputs for the quick run. Accented names are OK; aliases resolve
# automatically.
businesses = ["Crédit Agricole", "CFG Bank"]
cities = ["Rabat", "Kénitra"]

# Each run writes to its own timestamped CSV under data/0_processed/discovery.
run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
discovery_dir = project_root / "data" / "0_processed" / "discovery"
output_path = discovery_dir / f"quick_discovery_{run_stamp}.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)

print("Running discovery...")
df = engine.discover_branches(
    businesses=businesses,
    cities=cities,
    business_type="banque",
    map_centers=None,  # engine loads OSM/default
    brand_filter=None,
    output_path=output_path,
)
print(f" Done {output_path.relative_to(project_root)}")

# Last expression of the cell: the notebook renders the first rows.
df.head()


In [None]:
# Validate canonical IDs and basic summary
import re

# Pick the column to validate: the canonical column when it holds at least one
# value, otherwise the raw place_id column, otherwise nothing.
if 'canonical_place_id' in df.columns and df['canonical_place_id'].notna().any():
    id_col = 'canonical_place_id'
elif 'place_id' in df.columns:
    id_col = 'place_id'
else:
    id_col = None
print("ID column:", id_col)

rx = re.compile(r"^ChIJ[0-9A-Za-z_-]+$")
non = 0
checked = 0  # number of non-null IDs actually inspected
if id_col:
    present_ids = df[id_col].dropna().astype(str)
    checked = len(present_ids)
    # fullmatch: with match(), "$" would also accept an ID followed by a
    # trailing newline.
    non = sum(rx.fullmatch(value) is None for value in present_ids)
print("rows:", len(df), "non_canonical:", non)

# Only claim success when at least one ID was checked; an empty result or an
# all-null column must not be reported as "all canonical".
if checked and non == 0:
    print("All IDs are canonical (ChIJ…)")

# Short overview of what was discovered (at most 8 names each).
print("Businesses:", sorted(df['business'].dropna().unique().tolist())[:8] if 'business' in df.columns else [])
print("Cities:", sorted(df['city'].dropna().unique().tolist())[:8] if 'city' in df.columns else [])


In [None]:
# Export for next step (review collection)

# Choose best ID per row: canonical_place_id > place_id > data_id (string fallback)
import pandas as pd

def choose_id(row):
    """Return the best available identifier for one discovery row.

    Preference order: ``canonical_place_id``, then ``place_id`` (each must be
    a ``str`` accepted by ``config.validate_place_id``), then ``data_id``
    converted with ``str()`` as a last resort.

    Args:
        row: Record exposing ``.get(key)``, e.g. a DataFrame row (Series) or
            a dict.

    Returns:
        The chosen ID as a string, or ``None`` when the row has no usable ID.
    """
    # Validated place IDs win, canonical first.
    for key in ('canonical_place_id', 'place_id'):
        candidate = row.get(key)
        if isinstance(candidate, str) and config.validate_place_id(candidate):
            return candidate

    # Fallback: any non-null, non-blank data_id, returned as a string.
    did = row.get('data_id')
    if pd.notna(did) and str(did).strip() != "":
        return str(did)
    return None

# Resolve one ID per row; axis=1 hands each row to choose_id.
ids = df.apply(choose_id, axis=1)

# Build the export frame. df.get(...) returns None when a column is absent.
# Rows without any usable ID are dropped, then duplicates of the same ID are
# removed (first occurrence kept).
df_out = pd.DataFrame({
    '_place_id': ids,
    '_business': df.get('business'),
    '_city': df.get('city'),
    'title': df.get('name'),
    'address': df.get('address'),
}).dropna(subset=['_place_id']).drop_duplicates(subset=['_place_id'])

# Save to new data architecture
next_step_file = project_root / 'data' / '0_processed' / 'discovery' / 'agencies_discovered.csv'
next_step_file.parent.mkdir(parents=True, exist_ok=True)
df_out.to_csv(next_step_file, index=False)

# Also save to legacy path for backward compatibility
legacy_file = project_root / 'data' / 'output' / 'agencies_for_collection.csv'
legacy_file.parent.mkdir(parents=True, exist_ok=True)
df_out.to_csv(legacy_file, index=False)

print("Exported:")
print(f" Primary: {next_step_file.relative_to(project_root)}")
print(f" Legacy: {legacy_file.relative_to(project_root)}")
print(f" Rows: {len(df_out)}")
print("\nData saved to: data/0_processed/discovery/")
print("\nNext: Open collect_reviews.ipynb")
