# Google Places Data Pull – Quick Assessment

Purpose: pull POIs for a location via Google Places + Geocoding, measure time, and inspect data quality (counts, sample rows, missing fields).

⚠️ **API key**: set the environment variable `GOOGLE_MAP_API` or `GOOGLE_MAPS_API_KEY` (either name is accepted; `GOOGLE_MAP_API` is checked first). **Do not hardcode keys in the notebook.**



In [None]:
import os, time, requests, math, json
from collections import defaultdict
from typing import List, Dict, Any

# The key is read from the environment only; GOOGLE_MAP_API takes precedence
# and GOOGLE_MAPS_API_KEY is the fallback when the first is unset or empty.
API_KEY = os.environ.get("GOOGLE_MAP_API") or os.environ.get("GOOGLE_MAPS_API_KEY")
if not API_KEY:
    # Fail at import/cell-run time rather than on the first request.
    raise ValueError("Set GOOGLE_MAP_API or GOOGLE_MAPS_API_KEY in your environment")

# Google Maps web-service endpoints (JSON variants).
GEOCODE_URL = "https://maps.googleapis.com/maps/api/geocode/json"
PLACES_URL = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
DISTANCE_URL = "https://maps.googleapis.com/maps/api/distancematrix/json"

def geocode_address(address: str):
    """Resolve a free-text address to coordinates via the Geocoding endpoint.

    Args:
        address: Free-text address or place name to geocode.

    Returns:
        A ``(lat, lng, formatted_address)`` tuple built from the first result
        in the response. ``formatted_address`` is ``None`` when the result
        does not carry that field.

    Raises:
        RuntimeError: If the response status is not ``"OK"``, or if the
            response contains no results.
    """
    params = {"address": address, "key": API_KEY}
    resp = requests.get(GEOCODE_URL, params=params, timeout=20)
    data = resp.json()

    status = data.get("status")
    if status != "OK":
        # When present, error_message explains the failure (e.g. key or quota
        # problems); surface it instead of only the bare status code.
        detail = data.get("error_message")
        suffix = f" ({detail})" if detail else ""
        raise RuntimeError(f"Geocoding failed: {status}{suffix}")

    matches = data.get("results") or []
    if not matches:
        # Guard against an "OK" payload without results so callers get the
        # same exception type as every other geocoding failure.
        raise RuntimeError("Geocoding failed: no results returned")

    top = matches[0]
    loc = top["geometry"]["location"]
    return loc["lat"], loc["lng"], top.get("formatted_address")

def fetch_places(location, radius_m: int, place_types: List[str], page_pause: float = 2.0):
    """Collect Nearby Search results around a point for one or more place types.

    The endpoint accepts a single ``type`` per request, so each entry of
    ``place_types`` is queried separately (following pagination tokens) and
    the results are merged. A place returned more than once (same
    ``place_id``) is kept only the first time it is seen.

    Args:
        location: ``(lat, lng)`` pair for the search centre.
        radius_m: Search radius passed to the API as ``radius``.
        place_types: Place types to query, one request chain per type.
            An empty list yields an empty result.
        page_pause: Seconds to sleep before each request that uses a
            ``next_page_token``.

    Returns:
        A list of raw result dicts exactly as returned by the API.

    Warns:
        RuntimeWarning: If a request ends with a status other than ``"OK"`` or
            ``"ZERO_RESULTS"``. Results gathered so far are still returned,
            but the caller is told the data is incomplete.
    """
    import warnings  # local import keeps the block self-contained

    lat, lng = location
    results = []
    seen_place_ids = set()
    max_token_retries = 3  # bounded retries for a not-yet-active page token

    for place_type in place_types:
        next_page_token = None
        token_retries = 0
        while True:
            params = {
                "location": f"{lat},{lng}",
                "radius": radius_m,
                "type": place_type,  # one type per query
                "key": API_KEY,
            }
            if next_page_token:
                params["pagetoken"] = next_page_token
                time.sleep(page_pause)  # required before using next_page_token

            resp = requests.get(PLACES_URL, params=params, timeout=30)
            data = resp.json()
            status = data.get("status")

            # A freshly issued token can be rejected with INVALID_REQUEST until
            # it becomes active; wait again and retry a few times before
            # giving up on the remaining pages.
            if (
                status == "INVALID_REQUEST"
                and next_page_token
                and token_retries < max_token_retries
            ):
                token_retries += 1
                continue

            if status not in ("OK", "ZERO_RESULTS"):
                # Do not let quota/auth failures masquerade as "no places".
                detail = data.get("error_message")
                suffix = f" ({detail})" if detail else ""
                warnings.warn(
                    f"Places request for type '{place_type}' stopped with "
                    f"status {status}{suffix}; results may be incomplete",
                    RuntimeWarning,
                    stacklevel=2,
                )
                break

            token_retries = 0
            for place in data.get("results", []):
                place_id = place.get("place_id")
                if place_id is not None:
                    if place_id in seen_place_ids:
                        continue
                    seen_place_ids.add(place_id)
                results.append(place)

            next_page_token = data.get("next_page_token")
            if not next_page_token:
                break
    return results

def summarize_results(raw_results: List[Dict[str, Any]]):
    """Compute simple data-quality statistics for a list of place dicts.

    Args:
        raw_results: Place dicts; only the ``name`` and ``rating`` keys are read.

    Returns:
        A dict with ``count`` (number of rows), ``missing_name`` (rows whose
        name is absent or falsy), ``with_rating`` (rows whose rating is not
        ``None``) and ``avg_rating`` (mean rating rounded to two decimals, or
        ``None`` when no row has a rating).
    """
    stats = {"count": 0, "missing_name": 0, "with_rating": 0, "avg_rating": None}
    if not raw_results:
        return stats

    stats["count"] = len(raw_results)

    found_ratings = []
    for place in raw_results:
        if not place.get("name"):
            stats["missing_name"] += 1
        rating = place.get("rating")
        if rating is not None:
            found_ratings.append(rating)

    stats["with_rating"] = len(found_ratings)
    if found_ratings:
        stats["avg_rating"] = round(sum(found_ratings) / len(found_ratings), 2)
    return stats

# (category label, Google place type) pairs, in reporting order.
_CATEGORY_TYPE_PAIRS = (
    ("schools", "school"),
    ("hospitals", "hospital"),
    ("clinics", "doctor"),
    ("pharmacies", "pharmacy"),
    ("banks", "bank"),
    ("atms", "atm"),
    ("restaurants", "restaurant"),
    ("cafes", "cafe"),
    ("fast_food", "meal_takeaway"),
    ("gyms", "gym"),
    ("libraries", "library"),
    ("worship", "place_of_worship"),
    ("nightlife", "bar"),
    ("cinemas", "movie_theater"),
    ("parks", "park"),
    ("hotels", "lodging"),
    ("shops", "store"),
    ("transit", "transit_station"),
)

# Category → list of Google place types (one type per call; adjust as needed).
CATEGORY_TYPES = {
    category: [place_type] for category, place_type in _CATEGORY_TYPE_PAIRS
}

def run_category_pull(location, radius_m=2000):
    """Fetch every category in ``CATEGORY_TYPES`` and collect per-category stats.

    Args:
        location: Search centre, forwarded unchanged to ``fetch_places``.
        radius_m: Search radius, forwarded unchanged to ``fetch_places``.

    Returns:
        A ``(summary, timings, samples)`` tuple of dicts keyed by category:
        the ``summarize_results`` output, the wall-clock seconds spent in
        ``fetch_places``, and the first three raw results.
    """
    summary, timings, samples = {}, {}, {}
    for cat in CATEGORY_TYPES:
        # Time only the fetch, not the summarising that follows.
        t0 = time.perf_counter()
        places = fetch_places(location, radius_m, CATEGORY_TYPES[cat])
        timings[cat] = time.perf_counter() - t0

        summary[cat] = summarize_results(places)
        samples[cat] = places[:3]
    return summary, timings, samples



In [2]:
# Target area for this assessment run
address = "Indiranagar, Bangalore"
radius_m = 2000  # 2 km

# Geocode once and time it separately from the Places calls
geo_start = time.perf_counter()
lat, lng, formatted = geocode_address(address)
geo_time = time.perf_counter() - geo_start
print(f"Geocoded '{address}' → {lat:.6f}, {lng:.6f} | {formatted} | {geo_time:.2f}s")

print("\nFetching places per category (Google Places Nearby)...")
summary, timings, samples = run_category_pull((lat, lng), radius_m=radius_m)

# Per-category counts and completeness, in CATEGORY_TYPES order
print("\n=== Counts & Data Quality ===")
for cat in summary:
    stats = summary[cat]
    print(f"{cat:12s} | count={stats['count']:3d} | missing_name={stats['missing_name']:2d} | with_rating={stats['with_rating']:2d} | avg_rating={stats['avg_rating']}")

# Slowest categories first
print("\n=== Timings (s) ===")
for cat in sorted(timings, key=timings.get, reverse=True):
    t = timings[cat]
    print(f"{cat:12s}: {t:.2f}s")

# A trimmed view of each sample row; categories with no results are skipped
print("\n=== Sample Rows (first 3 per category, if available) ===")
for cat, rows in samples.items():
    if rows:
        print(f"\n[{cat}] sample:")
        for r in rows:
            preview = {
                "name": r.get("name"),
                "types": r.get("types"),
                "rating": r.get("rating"),
                "user_ratings_total": r.get("user_ratings_total"),
                "geometry": r.get("geometry", {}).get("location", {}),
            }
            print(json.dumps(preview, ensure_ascii=False))



Geocoded 'Indiranagar, Bangalore' → 12.978369, 77.640836 | Indiranagar, Bengaluru, Karnataka, India | 0.53s

Fetching places per category (Google Places Nearby)...

=== Counts & Data Quality ===
schools      | count= 60 | missing_name= 0 | with_rating=33 | avg_rating=4.53
hospitals    | count= 60 | missing_name= 0 | with_rating=41 | avg_rating=4.19
clinics      | count= 60 | missing_name= 0 | with_rating=57 | avg_rating=4.45
pharmacies   | count= 60 | missing_name= 0 | with_rating=52 | avg_rating=3.96
banks        | count= 60 | missing_name= 0 | with_rating=60 | avg_rating=3.25
atms         | count= 60 | missing_name= 0 | with_rating=58 | avg_rating=2.72
restaurants  | count= 60 | missing_name= 0 | with_rating=60 | avg_rating=4.03
cafes        | count= 60 | missing_name= 0 | with_rating=57 | avg_rating=4.28
fast_food    | count= 60 | missing_name= 0 | with_rating=50 | avg_rating=4.12
gyms         | count= 60 | missing_name= 0 | with_rating=55 | avg_rating=4.64
libraries    | count=  8 

## Notes
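- Google Places Nearby caps results per query (20 per page, up to ~60 with pagination). Counts in dense areas will be underestimates: a count of exactly 60 (as seen for most categories above) means the cap was hit, not that only 60 places exist.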
- Geometry is point-only; no polygons. Area-based metrics (park area, road density) are not available from this endpoint.
- Transit coverage may be sparse; `transit_station` is a coarse type. For precise metro/bus stops, OSM is usually richer.
- If you need travel times/distances, call Distance Matrix on the nearest few POIs.
- Respect Google Maps Platform Terms; avoid long-term storage beyond allowed caching.

