In [3]:
!pip install python-dotenv --quiet

In [3]:
# -------------------------------------------------------------
# This script sets up the Colab environment for interacting with BigQuery and Google Drive.
#
# Purpose:
# - Authenticate the Colab session with Google Cloud services.
# - Initialize the BigQuery client for querying and managing datasets.
# - Mount Google Drive to access project files (e.g., saved CSVs, templates).
# - Load environment variables (such as API keys) securely from a .env file.
#
# Notes:
# - Must run this block at the start of every Colab session.
# - Assumes the .env file is stored at /MyDrive/google/.env.
# - Enables seamless access to BigQuery and external APIs (e.g., Google Places API).
# -------------------------------------------------------------

# %load /content/drive/MyDrive/ColabTemplates/colab_bigquery_startup.txt

!pip install python-dotenv --quiet

from google.colab import auth
auth.authenticate_user()

from google.cloud import bigquery
client = bigquery.Client()

from google.auth import default
creds, _ = default()
print(f"🔐 Authenticated as: {creds.service_account_email}")

from google.colab import drive
drive.mount('/content/drive')

from dotenv import load_dotenv
import os

load_dotenv("/content/drive/MyDrive/google/.env")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")


🔐 Authenticated as: default
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# -------------------------------------------------------------
# This script performs a basic test query against the Google Places API.
#
# Purpose:
# - Verify that the Places API key is working correctly.
# - Confirm that the API returns a valid response status.
# - Ensure connectivity before starting batch restaurant enrichment.
#
# Notes:
# - This is a lightweight diagnostic call using a generic search query.
# - "restaurants in Chicago IL" is used as a simple test case.
# - Expected status is "OK" if the API key and quota are configured correctly.
# -------------------------------------------------------------



import requests

SEARCH_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json"

params = {
    "query": "restaurants in Chicago IL",
    "key": GOOGLE_API_KEY
}

response = requests.get(SEARCH_URL, params=params)
data = response.json()
print("Status:", data.get("status"))


Status: OK


In [63]:
# -------------------------------------------------------------
# This script enriches a batch of restaurants using the Google Places API.
#
# Purpose:
# - Load a batch of restaurants from a master list saved in Drive.
# - Query Google Places to enrich each restaurant with rating, price level, popularity, and types.
# - Filter results to include only valid restaurants based on returned types.
# - Save progress every 50 matches to a checkpoint CSV in Drive.
#
# Notes:
# - The script is batch-driven using 'batch_number' and 'BATCH_SIZE' for systematic processing.
# - If batch_number = 0 and first iteration, any old checkpoint file is deleted automatically.
# - Google Places rate limits are respected using 1.5 second delays and handling 429 errors.
# - Final results are assembled into df_results for further cleaning and insertion into BigQuery.
#
# File Locations:
# - Input: /MyDrive/msds434_project/unique_restaurants_from_cleanedinspection.csv
# - Output: /MyDrive/msds434_project/places_batch_checkpoint.csv
# -------------------------------------------------------------


from google.cloud import bigquery
import pandas as pd
import time
import requests
import os
import csv

# BigQuery client
client = bigquery.Client(project="hygiene-prediction-434")

# Step 1: Load restaurants
df_all_restaurants = pd.read_csv("/content/drive/MyDrive/msds434_project/unique_restaurants_from_cleanedinspection.csv")

# Define batch number and size
batch_number = 9  # Change this each day: 0, 1, 2, etc.
BATCH_SIZE = 1000

start_offset = batch_number * BATCH_SIZE
end_offset = start_offset + BATCH_SIZE

# Save Google places query results with start_offset in filename
checkpoint_path = f"/content/drive/MyDrive/msds434_project/places_batch_checkpoint_start_offset_{start_offset}.csv"

total_rows = len(df_all_restaurants)
print(f"✅ Total restaurants available: {total_rows}")
print(f"🚀 Processing batch {batch_number}: rows {start_offset} to {min(end_offset - 1, total_rows - 1)}")

# Safety check to avoid slicing beyond the data
if start_offset >= total_rows:
    raise ValueError("🚨 start_offset exceeds number of restaurants in dataset. Update batch_number.")

# Slice batch from full restaurant list
df_sample = df_all_restaurants.iloc[start_offset:end_offset][["dba_name", "zip"]]


# Step 2: Query Google Places
SEARCH_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json"
results = []

print("🚀 Starting Google Places enrichment...")

for i, row in df_sample.iterrows():
    query_string = f"{row['dba_name']} {row['zip']}"
    params = {
        "query": query_string,
        "key": GOOGLE_API_KEY
    }

    response = requests.get(SEARCH_URL, params=params)

    if response.status_code == 429:
      print("⏳ Rate limit hit. Sleeping 10 seconds...")
      time.sleep(10)
      continue

    data = response.json()

    if data.get("status") == "OK" and data.get("results"):
        place = data["results"][0]
        place_types = place.get("types", [])

        if "restaurant" in place_types:
            results.append({
                "dba_name": row['dba_name'],
                "zip": row['zip'],
                "matched_name": place.get("name"),
                "rating": place.get("rating"),
                "price_level": place.get("price_level"),
                "user_ratings_total": place.get("user_ratings_total"),
                "business_status": place.get("business_status"),
                "place_id": place.get("place_id"),
                "address": place.get("formatted_address"),  # ✅ Save address here
                "types": place_types
            })
        # else:
        #     print(f"❌ Skipped: {row['dba_name']} matched to non-restaurant: {place.get('name')} ({place_types})")
    else:
        results.append({
            "dba_name": row['dba_name'],
            "zip": row['zip'],
            "matched_name": None,
            "rating": None,
            "price_level": None,
            "user_ratings_total": None,
            "business_status": None,
            "place_id": None,
            "address": None,  # ✅ Include address in the else block too
            "types": None
        })

    # If we are building a fresh file, delete the old stale one first
    if i == 0 and start_offset == 0 and os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)
        print("🧹 Old checkpoint file deleted. Starting fresh for batch 0.")

    # Inside your loop — save every 50 rows
    if i % 50 == 0:
        df = pd.DataFrame(results[-50:])  # only the last 50
        df.to_csv(
            checkpoint_path,
            mode="a",
            header=not os.path.exists(checkpoint_path),  # write header only if file doesn't exist
            index=False,
            quoting=csv.QUOTE_NONNUMERIC
        )
        print(f"💾 Saved checkpoint at {len(results)} total matches")


    time.sleep(0.25)  # Respect rate limits
    if i % 50 == 0:
      print(f"✅ Completed: {i}")

# Step 3: Results to DataFrame
df_results = pd.DataFrame(results)
df_results.head()
df_results.info()


✅ Total restaurants available: 9552
🚀 Processing batch 9: rows 9000 to 9551
🚀 Starting Google Places enrichment...
💾 Saved checkpoint at 1 total matches
✅ Completed: 9000
💾 Saved checkpoint at 39 total matches
✅ Completed: 9050
💾 Saved checkpoint at 71 total matches
✅ Completed: 9100
💾 Saved checkpoint at 108 total matches
✅ Completed: 9150
💾 Saved checkpoint at 148 total matches
✅ Completed: 9200
💾 Saved checkpoint at 181 total matches
✅ Completed: 9250
💾 Saved checkpoint at 214 total matches
✅ Completed: 9300
💾 Saved checkpoint at 257 total matches
✅ Completed: 9350
💾 Saved checkpoint at 304 total matches
✅ Completed: 9400
💾 Saved checkpoint at 342 total matches
✅ Completed: 9450
💾 Saved checkpoint at 386 total matches
✅ Completed: 9500
💾 Saved checkpoint at 432 total matches
✅ Completed: 9550


Unnamed: 0,dba_name,zip,matched_name,rating,price_level,user_ratings_total,business_status,place_id,address,types
0,tts tacos & tequilas,60642,TTS Tacos & Tequilas,4.8,2.0,109.0,OPERATIONAL,ChIJJafU65LTD4gRSr76vIkAVJE,"1438 W Chicago Ave, Chicago, IL 60642, United ...","[restaurant, food, point_of_interest, establis..."
1,tts tacos & tortas,60630,TTS Tacos & Tortas,4.5,2.0,346.0,OPERATIONAL,ChIJz3qH6bDND4gR1EAXehGRotM,"4507 W Lawrence Ave, Chicago, IL 60630, United...","[restaurant, food, point_of_interest, establis..."
2,tuco & blondie,60657,Tuco And Blondie,4.4,2.0,1574.0,OPERATIONAL,ChIJTcDpnlHSD4gRZOaxJicTdjo,"3358 N Southport Ave, Chicago, IL 60657, Unite...","[restaurant, food, point_of_interest, establis..."
3,tufano's vernon park tap,60607,Tufano's Vernon Park Tap,4.6,2.0,1332.0,OPERATIONAL,ChIJYymCJOEsDogRkh7RJpjEk3k,"1073 W Vernon Park Pl, Chicago, IL 60607, Unit...","[restaurant, food, point_of_interest, establis..."
4,tuk tuk,60607,Tuk Tuk Indian Street Food,4.4,2.0,351.0,OPERATIONAL,ChIJfaqw8a8tDogR8VeoB4Yj6_U,"1445 W Taylor St, Chicago, IL 60607, United St...","[restaurant, food, point_of_interest, establis..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   dba_name            432 non-null    object 
 1   zip                 432 non-null    int64  
 2   matched_name        405 non-null    object 
 3   rating              405 non-null    float64
 4   price_level         356 non-null    float64
 5   user_ratings_total  405 non-null    float64
 6   business_status     405 non-null    object 
 7   place_id            405 non-null    object 
 8   address             405 non-null    object 
 9   types               405 non-null    object 
dtypes: float64(3), int64(1), object(6)
memory usage: 33.9+ KB


In [64]:
# -------------------------------------------------------------
# Post-process formatted_address to extract street address and zip code
# -------------------------------------------------------------

import re

# Define the parser
def parse_address_components(formatted_address):
    if not formatted_address or not isinstance(formatted_address, str):
        return None, None
    try:
        parts = formatted_address.split(",")
        if len(parts) >= 3:
            street_address = parts[0].strip()
            zip_search = re.search(r'\b\d{5}\b', formatted_address)
            if zip_search:
                zip_code = int(zip_search.group(0))
            else:
                zip_code = None
            return street_address, zip_code
    except Exception as e:
        print(f"⚠️ Error parsing address: {formatted_address} ({e})")
        return None, None
    return None, None

# Apply the parser to the DataFrame
df_results[["address", "parsed_zip"]] = df_results["address"].apply(
    lambda addr: pd.Series(parse_address_components(addr))
)

# (Optional) Replace zip if parsed_zip is available
df_results["zip"] = df_results["parsed_zip"].combine_first(df_results["zip"])

# Drop the helper parsed_zip column
df_results.drop(columns=["parsed_zip"], inplace=True)

print("✅ Parsed address and zip fields cleanly.")
print(f"✅ Parsed {df_results['address'].notnull().sum()} addresses and {df_results['zip'].notnull().sum()} zip codes.")


✅ Parsed address and zip fields cleanly.
✅ Parsed 405 addresses and 432 zip codes.


In [65]:
# -------------------------------------------------------------
# This script classifies enriched restaurant data into operational categories, price tiers, and popularity tiers.
#
# Purpose:
# - Assign an operational category (e.g., full_service, bar/alcohol, takeout) based on Google Places types.
# - Categorize restaurants by price level (low, medium, high) based on Google Places price_level.
# - Categorize restaurants by popularity (low_popularity, medium_popularity, high_popularity) based on review counts.
# - Enhance df_results with new engineered fields to support ML feature engineering and EDA.
#
# Notes:
# - 'types' field must be a list of categories from Google Places API.
# - Missing or unknown price levels and review counts are categorized as 'unknown' safely.
# - Resulting new columns: 'category', 'price_category', and 'popularity_category' are added to df_results.
# -------------------------------------------------------------


def classify_types(types):
    if not types or not isinstance(types, list):
        return "unknown"
    if any(t in types for t in ["bar", "liquor_store", "night_club"]):
        return "bar/alcohol"
    if any(t in types for t in ["meal_delivery", "meal_takeaway"]):
        return "takeout"
    if "bakery" in types:
        return "bakery"
    if "cafe" in types:
        return "cafe"
    return "full_service"

def categorize_price(price):
    if price is None:
        return "unknown"
    if price <= 1:
        return "low"
    if price == 2:
        return "medium"
    if price >= 3:
        return "high"

def categorize_popularity(ratings_total):
    if ratings_total is None:
        return "unknown"
    if ratings_total <= 50:
        return "low_popularity"
    if 51 <= ratings_total <= 200:
        return "medium_popularity"
    return "high_popularity"

df_results["category"] = df_results["types"].apply(classify_types)
df_results["category"].value_counts()

df_results["price_category"] = df_results["price_level"].apply(categorize_price)
df_results["popularity_category"] = df_results["user_ratings_total"].apply(categorize_popularity)


df_results.info()
df_results.head()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
full_service,281
bar/alcohol,53
takeout,45
unknown,27
cafe,17
bakery,9


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   dba_name             432 non-null    object 
 1   zip                  432 non-null    float64
 2   matched_name         405 non-null    object 
 3   rating               405 non-null    float64
 4   price_level          356 non-null    float64
 5   user_ratings_total   405 non-null    float64
 6   business_status      405 non-null    object 
 7   place_id             405 non-null    object 
 8   address              405 non-null    object 
 9   types                405 non-null    object 
 10  category             432 non-null    object 
 11  price_category       356 non-null    object 
 12  popularity_category  432 non-null    object 
dtypes: float64(4), object(9)
memory usage: 44.0+ KB


Unnamed: 0,dba_name,zip,matched_name,rating,price_level,user_ratings_total,business_status,place_id,address,types,category,price_category,popularity_category
0,tts tacos & tequilas,60642.0,TTS Tacos & Tequilas,4.8,2.0,109.0,OPERATIONAL,ChIJJafU65LTD4gRSr76vIkAVJE,1438 W Chicago Ave,"[restaurant, food, point_of_interest, establis...",full_service,medium,medium_popularity
1,tts tacos & tortas,60630.0,TTS Tacos & Tortas,4.5,2.0,346.0,OPERATIONAL,ChIJz3qH6bDND4gR1EAXehGRotM,4507 W Lawrence Ave,"[restaurant, food, point_of_interest, establis...",full_service,medium,high_popularity
2,tuco & blondie,60657.0,Tuco And Blondie,4.4,2.0,1574.0,OPERATIONAL,ChIJTcDpnlHSD4gRZOaxJicTdjo,3358 N Southport Ave,"[restaurant, food, point_of_interest, establis...",full_service,medium,high_popularity
3,tufano's vernon park tap,60607.0,Tufano's Vernon Park Tap,4.6,2.0,1332.0,OPERATIONAL,ChIJYymCJOEsDogRkh7RJpjEk3k,1073 W Vernon Park Pl,"[restaurant, food, point_of_interest, establis...",full_service,medium,high_popularity
4,tuk tuk,60607.0,Tuk Tuk Indian Street Food,4.4,2.0,351.0,OPERATIONAL,ChIJfaqw8a8tDogR8VeoB4Yj6_U,1445 W Taylor St,"[restaurant, food, point_of_interest, establis...",full_service,medium,high_popularity


In [66]:
# -------------------------------------------------------------
# This script performs basic data validation checks on df_results.
#
# Purpose:
# - Identify missing values across all columns to detect incomplete data.
# - Detect and count duplicated place_id entries to ensure data uniqueness.
# - Optionally display sample duplicated rows for manual review.
#
# Notes:
# - place_id is a critical field and must be unique for RestaurantProfile.
# - Missing values and duplicates must be handled before inserting into BigQuery.
# - This check should be performed after enrichment and before final cleaning.
# -------------------------------------------------------------

# 1. Missing Value Check Across All Columns
print("🔍 Missing Values Per Column:")
print(df_results.isnull().sum())

# 2. Redundancy Check — Duplicated place_id
num_duplicate_place_ids = df_results["place_id"].duplicated().sum()
print(f"\n🔍 Duplicated place_id count: {num_duplicate_place_ids}")

# 3. Optional — Find rows with duplicate place_ids
if num_duplicate_place_ids > 0:
    print("\n🔍 Example duplicated place_ids:")
    display(df_results[df_results.duplicated(subset=["place_id"], keep=False)].sort_values("place_id").head(10))


🔍 Missing Values Per Column:
dba_name                0
zip                     0
matched_name           27
rating                 27
price_level            76
user_ratings_total     27
business_status        27
place_id               27
address                27
types                  27
category                0
price_category         76
popularity_category     0
dtype: int64

🔍 Duplicated place_id count: 65

🔍 Example duplicated place_ids:


Unnamed: 0,dba_name,zip,matched_name,rating,price_level,user_ratings_total,business_status,place_id,address,types,category,price_category,popularity_category
211,whale fish & chicken,60651.0,Whale Fish & Chicken,4.0,1.0,816.0,OPERATIONAL,ChIJ--SRaLUyDogRrDp1lJG2siE,3600 W Chicago Ave,"[restaurant, point_of_interest, food, establis...",full_service,low,high_popularity
212,whale fish & chicken,60651.0,Whale Fish & Chicken,4.0,1.0,816.0,OPERATIONAL,ChIJ--SRaLUyDogRrDp1lJG2siE,3600 W Chicago Ave,"[restaurant, food, point_of_interest, establis...",full_service,low,high_popularity
147,volcano sushi cafe,60607.0,volcano,3.8,2.0,391.0,CLOSED_PERMANENTLY,ChIJ-YRR8-MsDogRqnt3Dqwqb6Q,1062 W Taylor St,"[restaurant, food, point_of_interest, establis...",full_service,medium,high_popularity
146,volcano sushi 90,60607.0,volcano,3.8,2.0,391.0,CLOSED_PERMANENTLY,ChIJ-YRR8-MsDogRqnt3Dqwqb6Q,1062 W Taylor St,"[restaurant, food, point_of_interest, establis...",full_service,medium,high_popularity
145,volare,60611.0,Volare Ristorante Italiano,4.6,2.0,3660.0,OPERATIONAL,ChIJ07oClassDogRccjo-BBesIc,201 E Grand Ave,"[meal_delivery, meal_takeaway, restaurant, bar...",bar/alcohol,medium,high_popularity
144,volare,60611.0,Volare Ristorante Italiano,4.6,2.0,3660.0,OPERATIONAL,ChIJ07oClassDogRccjo-BBesIc,201 E Grand Ave,"[meal_delivery, meal_takeaway, bar, restaurant...",bar/alcohol,medium,high_popularity
389,yummy thai,60607.0,Yummy Thai,3.9,1.0,367.0,OPERATIONAL,ChIJ5XrZcRwtDogRQJL0JGNbY24,1418 W Taylor St,"[restaurant, food, point_of_interest, establis...",full_service,low,high_popularity
388,yummy thai,60607.0,Yummy Thai,3.9,1.0,367.0,OPERATIONAL,ChIJ5XrZcRwtDogRQJL0JGNbY24,1418 W Taylor St,"[restaurant, food, point_of_interest, establis...",full_service,low,high_popularity
195,"wendy's properties,llc",60617.0,Wendy's,3.9,1.0,874.0,OPERATIONAL,ChIJ6ZJHuJ3YEYgRTq02w1byRIc,3516 E 118th St,"[restaurant, point_of_interest, food, establis...",full_service,low,high_popularity
193,"wendy's properties,llc",60617.0,Wendy's,3.9,1.0,874.0,OPERATIONAL,ChIJ6ZJHuJ3YEYgRTq02w1byRIc,3516 E 118th St,"[restaurant, point_of_interest, food, establis...",full_service,low,high_popularity


In [67]:
# -------------------------------------------------------------
# This script finalizes the cleaning of the enriched restaurant batch before insertion into BigQuery.
#
# Purpose:
# - Drop any rows missing the required place_id field.
# - Fill missing price_category values with 'unknown' to ensure consistent data typing.
# - Deduplicate the dataset by keeping the first occurrence of each unique place_id.
# - Verify the cleaned dataset is ready for safe insertion into RestaurantProfile.
#
# Notes:
# - place_id is required for insertion and relational integrity.
# - Deduplication is critical to prevent multiple entries of the same restaurant.
# - Missing price_level is acceptable; missing place_id is not.
# - A final missing value summary is generated for quick validation.
# -------------------------------------------------------------


# Drop all bad rows (those missing place_id — which covers all other missing too)
df_cleaned = df_results[df_results["place_id"].notnull()].copy()

# (Optional) Fill missing price_category
df_cleaned["price_category"] = df_cleaned["price_category"].fillna("unknown")

print(f"✅ Cleaned dataset ready: {len(df_cleaned)} rows with valid place_id")

# 1. Drop duplicates, keeping the first occurrence of each place_id
df_cleaned_deduped = df_cleaned.drop_duplicates(subset="place_id", keep="first").copy()

print(f"✅ After deduplication: {len(df_cleaned_deduped)} unique place_ids ready for insert.")
df_cleaned_deduped.isnull().sum()


✅ Cleaned dataset ready: 405 rows with valid place_id
✅ After deduplication: 366 unique place_ids ready for insert.


Unnamed: 0,0
dba_name,0
zip,0
matched_name,0
rating,0
price_level,49
user_ratings_total,0
business_status,0
place_id,0
address,0
types,0


In [68]:
# -------------------------------------------------------------
# This script finalizes the cleaned enrichment batch and inserts it into the RestaurantProfile table in BigQuery.
#
# Purpose:
# - Drop any rows missing place_id (critical for join integrity).
# - Deduplicate entries based on place_id, keeping the first occurrence.
# - Serialize the 'types' field (list) into a JSON string to match BigQuery schema requirements.
# - Upload the cleaned, validated batch into the RestaurantProfile table.
#
# Notes:
# - place_id must not be null and must be unique across the inserted dataset.
# - 'types' field must be serialized to JSON format before insertion.
# - write_disposition is set to 'WRITE_APPEND' to accumulate batches over time.
# - Final insert count is printed for confirmation after successful load.
# -------------------------------------------------------------

from google.cloud import bigquery
import json

# 1. Prepare cleaned data
df_cleaned = df_results[df_results["place_id"].notnull()].copy()
df_cleaned_deduped = df_cleaned.drop_duplicates(subset="place_id", keep="first").copy()
df_to_insert = df_cleaned_deduped[df_cleaned_deduped["place_id"].notnull()].copy()

# 2. Fix 'types' column
df_to_insert["types"] = df_to_insert["types"].apply(lambda x: json.dumps(x) if isinstance(x, list) else None)

print(f"✅ Ready to insert {len(df_to_insert)} rows")

# 3. Upload to BigQuery
job = client.load_table_from_dataframe(
    df_to_insert,
    "hygiene-prediction-434.RestaurantModeling.RestaurantProfile",
    job_config=bigquery.LoadJobConfig(
        write_disposition="WRITE_APPEND"
    )
)

job.result()

print(f"✅ Successfully inserted {len(df_to_insert)} rows into RestaurantProfile")

✅ Ready to insert 366 rows


LoadJob<project=hygiene-prediction-434, location=US, id=d9c3686f-7886-4b1b-8787-baa24813d389>

✅ Successfully inserted 366 rows into RestaurantProfile


In [69]:
# -------------------------------------------------------------
# After inserting cleaned restaurant batch, backfill place_id into InspectionEvents
# -------------------------------------------------------------

backfill_query = """
-- Corrected backfill with distinct matching
UPDATE `hygiene-prediction-434.RestaurantModeling.InspectionEvents` AS inspections
SET inspections.place_id = match_table.place_id
FROM (
  SELECT
    DISTINCT CAST(cleaned.inspection_id AS STRING) AS inspection_id,
    restaurant.place_id
  FROM `hygiene-prediction-434.HygienePredictionRow.CleanedInspectionRow` AS cleaned
  JOIN `hygiene-prediction-434.RestaurantModeling.RestaurantProfile` AS restaurant
    ON LOWER(cleaned.dba_name) = LOWER(restaurant.dba_name)
    AND LOWER(cleaned.address) = LOWER(restaurant.address)
    AND cleaned.zip = restaurant.zip
) AS match_table
WHERE
  inspections.inspection_id = match_table.inspection_id
  AND inspections.place_id IS NULL


"""

# Execute backfill
job = client.query(backfill_query)
job.result()

print("✅ Successfully backfilled place_id into InspectionEvents after inserting new restaurants.")


<google.cloud.bigquery.table._EmptyRowIterator at 0x7be5dba37e50>

✅ Successfully backfilled place_id into InspectionEvents after inserting new restaurants.


In [70]:
# Place ID coverage audit query
coverage_query = """
SELECT
  COUNT(*) AS total_inspections,
  SUM(CASE WHEN place_id IS NOT NULL THEN 1 ELSE 0 END) AS inspections_with_place_id,
  SUM(CASE WHEN place_id IS NULL THEN 1 ELSE 0 END) AS inspections_missing_place_id,
  ROUND(SAFE_DIVIDE(SUM(CASE WHEN place_id IS NOT NULL THEN 1 ELSE 0 END), COUNT(*)), 3) AS pct_with_place_id
FROM `hygiene-prediction-434.RestaurantModeling.InspectionEvents`
"""

df_coverage = client.query(coverage_query).to_dataframe()
pct = df_coverage["pct_with_place_id"].iloc[0]
print(f"✅ Current place_id coverage: {pct * 100:.1f}% of inspections matched to restaurants.")



✅ Current place_id coverage: 29.8% of inspections matched to restaurants.


In [79]:
# -------------------------------------------------------------
# Rebuild clean 'types' field in RestaurantProfile using batch checkpoint CSVs
# -------------------------------------------------------------

from google.cloud import bigquery
import pandas as pd
import glob

# 1. Load all places_batch_checkpoint_start_offset_*.csv files
import ast

print("🔍 Loading checkpoint CSVs from Drive...")

csv_files = glob.glob('/content/drive/MyDrive/msds434_project/places_batch_checkpoint_start_offset_*.csv')

dfs = []
for file in csv_files:
    def safe_parse(x):
        try:
            return ast.literal_eval(x) if pd.notnull(x) else []
        except Exception:
            return []

    df = pd.read_csv(file, converters={"types": safe_parse})
    dfs.append(df)

df_combined = pd.concat(dfs, ignore_index=True)
print(f"✅ Loaded {len(df_combined)} rows combined from {len(csv_files)} files.")


# 2. Extract place_id and clean types
df_types = df_combined[["place_id", "types"]].dropna()

# 3. Deduplicate by place_id
df_types = df_types.drop_duplicates(subset="place_id")
print(f"✅ Deduplicated to {len(df_types)} unique place_ids.")

# 4. Upload cleaned types into a temporary BigQuery table
print("📤 Uploading cleaned place_id + types to TempTypesFix...")

client = bigquery.Client(project="hygiene-prediction-434")

table_id_temp = "hygiene-prediction-434.RestaurantModeling.TempTypesFix"

job = client.load_table_from_dataframe(
    df_types,
    table_id_temp,
    job_config=bigquery.LoadJobConfig(
        write_disposition="WRITE_TRUNCATE",
        schema=[
            bigquery.SchemaField("place_id", "STRING"),
            bigquery.SchemaField("types", "STRING", mode="REPEATED"),
        ]
    )
)

job.result()
print("✅ Uploaded clean types to TempTypesFix.")

# 5. Merge clean types back into RestaurantProfile
print("🔄 Merging clean types into RestaurantProfile...")

merge_query = """
UPDATE `hygiene-prediction-434.RestaurantModeling.RestaurantProfile` AS main
SET main.types = temp.types
FROM `hygiene-prediction-434.RestaurantModeling.TempTypesFix` AS temp
WHERE main.place_id = temp.place_id
"""

job = client.query(merge_query)
job.result()

print("✅ Successfully updated types in RestaurantProfile.")


🔍 Loading checkpoint CSVs from Drive...
✅ Loaded 9628 rows combined from 10 files.
✅ Deduplicated to 5996 unique place_ids.
📤 Uploading cleaned place_id + types to TempTypesFix...


LoadJob<project=hygiene-prediction-434, location=US, id=4dc7299e-4b67-4851-99d1-cb3e4d4a235e>

✅ Uploaded clean types to TempTypesFix.
🔄 Merging clean types into RestaurantProfile...


<google.cloud.bigquery.table._EmptyRowIterator at 0x7be5dbc3ef10>

✅ Successfully updated types in RestaurantProfile.
