In [1]:
import os
import json
import time
import pickle
import requests
import pandas as pd
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file (if present) into the process
# environment, then read the Google Maps API key from the environment.
load_dotenv()
API_KEY = os.getenv("GOOGLE_MAPS_API_KEY")
if not API_KEY:
    # Fail fast (missing or empty key) and name the variable that was actually
    # looked up, so the error tells the user exactly what to set.
    raise ValueError("GOOGLE_MAPS_API_KEY not found in environment variables.")

In [3]:
# Places API endpoints. A place ID is appended to DETAILS_BASE_URL to form the
# URL of a single place's details resource.
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
DETAILS_BASE_URL = "https://places.googleapis.com/v1/places/"

# Field mask for text search: only the place IDs plus the pagination token.
TEXT_FIELD_MASK = "places.id,nextPageToken"

# Fields requested for each place-details call, one per line so individual
# fields are easy to add or drop.
fields_list = [
    # identifiers and media
    "attributions",
    "id",
    "name",
    "photos",
    # address and geometry
    "addressComponents",
    "adrFormatAddress",
    "formattedAddress",
    "location",
    "plusCode",
    "shortFormattedAddress",
    "types",
    "viewport",
    # general business metadata
    "accessibilityOptions",
    "businessStatus",
    "containingPlaces",
    "displayName",
    "googleMapsLinks",
    "googleMapsUri",
    "iconBackgroundColor",
    "iconMaskBaseUri",
    "primaryType",
    "primaryTypeDisplayName",
    "pureServiceAreaBusiness",
    "subDestinations",
    "utcOffsetMinutes",
    # opening hours, contact, pricing and ratings
    "currentOpeningHours",
    "currentSecondaryOpeningHours",
    "internationalPhoneNumber",
    "nationalPhoneNumber",
    "priceLevel",
    "priceRange",
    "rating",
    "regularOpeningHours",
    "regularSecondaryOpeningHours",
    "userRatingCount",
    "websiteUri",
]
# The details field mask is the comma-separated form of the list above.
DETAILS_FIELD_MASK = ",".join(fields_list)

# Headers shared by both request types; each call adds its own X-Goog-FieldMask.
common_headers = {
    "Content-Type": "application/json",
    "X-Goog-Api-Key": API_KEY,
}


In [4]:

# Egyptian governorates to search in (edit this list to widen or narrow coverage).
governorates = [
    "Cairo",
    "Giza",
    "Alexandria",
    "Dakahlia",
    "Red Sea",
    "Beheira",
    "Fayoum",
    "Gharbia",
    "Ismailia",
    "Menoufia",
    "Minya",
    "Qaliubiya",
    "New Valley",
    "Suez",
    "Aswan",
    "Assiut",
    "Beni Suef",
    "Port Said",
    "Damietta",
    "Sharkia",
    "South Sinai",
    "Kafr El Sheikh",
    "Matrouh",
    "Luxor",
    "Qena",
    "Sohag",
]

# Kinds of places to look for; each one is combined with every governorate.
place_queries = [
    "restaurant",
    "cafe",
    "museum",
    "art gallery",
    "tourist attraction",
    "monument",
]


In [5]:
# Checkpoint files: one for the collected place IDs, one for the fetched details.
PLACE_IDS_CHECKPOINT = "place_ids_checkpoint.pkl"
DETAILS_CHECKPOINT = "place_details_checkpoint.pkl"

# Resume from a previous run's place-ID checkpoint when one exists; otherwise
# start with an empty collection.
# NOTE(review): pickle.load executes arbitrary code from the file, so only load
# checkpoints that this notebook wrote itself.
if not os.path.exists(PLACE_IDS_CHECKPOINT):
    collected_place_ids = set()
else:
    with open(PLACE_IDS_CHECKPOINT, "rb") as checkpoint_file:
        collected_place_ids = pickle.load(checkpoint_file)
    print(f"Loaded {len(collected_place_ids)} place IDs from checkpoint.")



In [6]:
# -------------------------------
# STEP 1: Text Search to collect Place IDs
# -------------------------------
# For every (governorate, query) pair, page through the text-search results and
# add each returned place ID to `collected_place_ids`. The set is pickled to
# PLACE_IDS_CHECKPOINT after every page so an interrupted run keeps its IDs.

# Upper bound (seconds) on how long one search request may take. Without a
# timeout, `requests` can wait indefinitely on a stalled connection and hang
# the whole crawl.
TEXT_SEARCH_TIMEOUT = 30

# Headers are identical for every search request, so build them once.
text_search_headers = {**common_headers, "X-Goog-FieldMask": TEXT_FIELD_MASK}

for governorate in governorates:
    for query in place_queries:
        # Build the text query: e.g., "restaurant in Cairo, Egypt"
        text_query = f"{query} in {governorate}, Egypt"
        print(f"Searching for: {text_query}")

        # Initial request body; "pageToken" is added for follow-up pages.
        request_body = {
            "textQuery": text_query,
            "pageSize": 20  # maximum allowed per page
        }
        next_page_token = None

        while True:
            if next_page_token:
                request_body["pageToken"] = next_page_token

            try:
                response = requests.post(
                    TEXT_SEARCH_URL,
                    headers=text_search_headers,
                    json=request_body,
                    timeout=TEXT_SEARCH_TIMEOUT,
                )
                response.raise_for_status()
                # Decode inside the try block: a malformed body should abandon
                # this query like any other request failure, not abort the run.
                data = response.json()
            except Exception as e:
                # Best effort: report the problem and move on to the next query.
                print(f"Error during text search for '{text_query}': {e}")
                break

            for place in data.get("places", []):
                # Each place entry carries its ID under the "id" key.
                place_id = place.get("id")
                if place_id:
                    collected_place_ids.add(place_id)

            # Save checkpoint after each page
            with open(PLACE_IDS_CHECKPOINT, "wb") as f:
                pickle.dump(collected_place_ids, f)
            print(f"Collected {len(collected_place_ids)} unique place IDs so far.")

            # A missing nextPageToken means this was the last page for the query.
            next_page_token = data.get("nextPageToken")
            if not next_page_token:
                break

            # Sleep a bit before requesting next page (rate limit caution)
            time.sleep(2)
        # Pause between different queries to be safe
        time.sleep(2)

# Convert set to list for iteration
place_ids_list = list(collected_place_ids)
print(f"Total unique place IDs collected: {len(place_ids_list)}")

Searching for: restaurant in Cairo, Egypt
Collected 20 unique place IDs so far.
Collected 40 unique place IDs so far.
Collected 60 unique place IDs so far.
Searching for: cafe in Cairo, Egypt
Collected 77 unique place IDs so far.
Collected 96 unique place IDs so far.
Collected 114 unique place IDs so far.
Searching for: museum in Cairo, Egypt
Collected 124 unique place IDs so far.
Searching for: art gallery in Cairo, Egypt
Collected 143 unique place IDs so far.
Collected 162 unique place IDs so far.
Collected 182 unique place IDs so far.
Searching for: tourist attraction in Cairo, Egypt
Collected 198 unique place IDs so far.
Collected 217 unique place IDs so far.
Collected 237 unique place IDs so far.
Searching for: monument in Cairo, Egypt
Collected 254 unique place IDs so far.
Collected 272 unique place IDs so far.
Collected 288 unique place IDs so far.
Searching for: restaurant in Giza, Egypt
Collected 303 unique place IDs so far.
Collected 320 unique place IDs so far.
Collected 339

In [7]:

# -------------------------------
# STEP 2: Get Place Details for each ID
# -------------------------------
# Fetch the details record for every ID in `place_ids_list`, skipping IDs that
# are already present in the details checkpoint, and pickle the accumulated
# list to DETAILS_CHECKPOINT periodically and once more at the end.

# Upper bound (seconds) on one details request; without it a stalled
# connection would block the loop indefinitely.
DETAILS_TIMEOUT = 30
# Number of newly fetched places between checkpoint writes.
DETAILS_CHECKPOINT_EVERY = 10
# Pause (seconds) after every request, successful or not, to avoid rate limits.
DETAILS_PAUSE = 0.5

# Load checkpoint for details if available
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# checkpoints written by this notebook.
if os.path.exists(DETAILS_CHECKPOINT):
    with open(DETAILS_CHECKPOINT, "rb") as f:
        place_details_list = pickle.load(f)
    print(f"Loaded {len(place_details_list)} place details from checkpoint.")
else:
    place_details_list = []

# Create a set of already processed place IDs to skip duplicates
processed_ids = {detail.get("id") for detail in place_details_list if "id" in detail}

# Headers are the same for every details request, so build them once.
details_headers = {**common_headers, "X-Goog-FieldMask": DETAILS_FIELD_MASK}

# Counts successful fetches since the last checkpoint write. Keying the
# checkpoint on this counter (rather than on the loop index) keeps the save
# interval steady even when IDs are skipped or requests fail.
fetched_since_checkpoint = 0

for idx, place_id in enumerate(place_ids_list):
    if place_id in processed_ids:
        continue  # skip already processed details

    details_url = f"{DETAILS_BASE_URL}{place_id}"
    # The field mask is passed as a header rather than as a URL parameter.
    try:
        details_response = requests.get(
            details_url,
            headers=details_headers,
            timeout=DETAILS_TIMEOUT,
        )
        details_response.raise_for_status()
        # Decode inside the try block so a malformed body only skips this place.
        details_data = details_response.json()
    except Exception as e:
        print(f"Error fetching details for {place_id}: {e}")
        # Pause on failures too; otherwise a burst of errors (e.g. rate
        # limiting) would hit the API back-to-back with no delay.
        time.sleep(DETAILS_PAUSE)
        continue

    place_details_list.append(details_data)
    processed_ids.add(place_id)
    print(f"Fetched details for {place_id} ({idx+1}/{len(place_ids_list)})")

    # Save checkpoint every DETAILS_CHECKPOINT_EVERY newly fetched places
    fetched_since_checkpoint += 1
    if fetched_since_checkpoint >= DETAILS_CHECKPOINT_EVERY:
        with open(DETAILS_CHECKPOINT, "wb") as f:
            pickle.dump(place_details_list, f)
        # Report the number of records actually stored in the checkpoint.
        print(f"Checkpoint saved for {len(place_details_list)} places.")
        fetched_since_checkpoint = 0

    time.sleep(DETAILS_PAUSE)  # pause to avoid rate limits

# Save final checkpoint for details
with open(DETAILS_CHECKPOINT, "wb") as f:
    pickle.dump(place_details_list, f)
print("Final details checkpoint saved.")



Fetched details for ChIJg0IZSDt5TRQRVoRPioDGiMY (1/4128)
Fetched details for ChIJh3exDCjF9RQR3xJz46pVXXA (2/4128)
Fetched details for ChIJKamZp-FLqxUR4o76ZTA47NE (3/4128)
Fetched details for ChIJKZPDWgC79xQR97MAX3mGZHk (4/4128)
Fetched details for ChIJTcMeB5wP9hQRxPyPlFiVmRE (5/4128)
Fetched details for ChIJZ__09cAXSRQR8VCqKdSKI4I (6/4128)
Fetched details for ChIJt__hh85LqxURTml3_5oV69s (7/4128)
Fetched details for ChIJ9zVTOQB5WRQRAVZF4Kr_F34 (8/4128)
Fetched details for ChIJgf-WyqBbWBQRgo0D3xpcu60 (9/4128)
Fetched details for ChIJkzb0rCuc9xQR0hviznZPeYs (10/4128)
Checkpoint saved for 10 places.
Fetched details for ChIJl117NrrJ9xQROA3_0IYzVIQ (11/4128)
Fetched details for ChIJlWXwtTEnWhQRAupzdYvCjHU (12/4128)
Fetched details for ChIJCSnOoFqc-RQRbZZPQ0mTLTU (13/4128)
Fetched details for ChIJS9e4pRIh-BQRkpYW-n3ZaDk (14/4128)
Fetched details for ChIJnxPjJf2PVBQR752Ed8fpEyA (15/4128)
Fetched details for ChIJYf_zlR5BWBQRHSDJylWv_IU (16/4128)
Fetched details for ChIJUxTLV2_J9xQRT7TP6gXcr3U (

In [8]:
# -------------------------------
# STEP 3: Parse the detailed responses to a DataFrame
# -------------------------------
# Flatten the list of detail dictionaries into one table; nested objects become
# dotted column names. Adjust the parsing here if the JSON structure requires it.
df_places = pd.json_normalize(place_details_list)

# Show the first rows as a quick sanity check of the flattened result.
head_preview = df_places.head()
print("Final DataFrame head:", head_preview, sep="\n")

# Keep a CSV copy of the table as a backup.
df_places.to_csv("places_details.csv", index=False)
print("Data saved to places_details.csv")


Final DataFrame head:
                                 name                           id  \
0  places/ChIJg0IZSDt5TRQRVoRPioDGiMY  ChIJg0IZSDt5TRQRVoRPioDGiMY   
1  places/ChIJh3exDCjF9RQR3xJz46pVXXA  ChIJh3exDCjF9RQR3xJz46pVXXA   
2  places/ChIJKamZp-FLqxUR4o76ZTA47NE  ChIJKamZp-FLqxUR4o76ZTA47NE   
3  places/ChIJKZPDWgC79xQR97MAX3mGZHk  ChIJKZPDWgC79xQR97MAX3mGZHk   
4  places/ChIJTcMeB5wP9hQRxPyPlFiVmRE  ChIJTcMeB5wP9hQRxPyPlFiVmRE   

                                               types nationalPhoneNumber  \
0  [tourist_attraction, point_of_interest, establ...        011 40663325   
1  [tourist_attraction, point_of_interest, establ...                 NaN   
2  [breakfast_restaurant, brunch_restaurant, coff...        010 61480619   
3  [tourist_attraction, point_of_interest, establ...                 NaN   
4  [japanese_restaurant, restaurant, food, point_...                 NaN   

  internationalPhoneNumber                                   formattedAddress  \
0          +20 11 4