In [1]:
import difflib
import re
import sqlite3
import time
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


# Find the closest matching name using string similarity
def get_most_similar_name(base_name, name_dict, threshold=0.6):
    """Pick the entry of ``name_dict`` whose value is most similar to ``base_name``.

    Both sides are stripped and lower-cased before being compared with
    ``difflib.SequenceMatcher``. A candidate only wins when its ratio is
    strictly greater than ``threshold`` and strictly greater than the best
    ratio seen so far, so on ties the earliest candidate is kept.

    Args:
        base_name: The name to look for.
        name_dict: Mapping of key -> candidate name.
        threshold: Ratio a candidate must exceed to be considered.

    Returns:
        ``(key, ratio)`` of the winning candidate, or ``(None, 0.0)`` when no
        candidate exceeds the threshold.
    """
    target = base_name.strip().lower()
    winner_key, winner_ratio = None, 0.0

    for key, candidate in name_dict.items():
        ratio = difflib.SequenceMatcher(
            None, target, candidate.strip().lower()
        ).ratio()

        # Reject anything that fails either strict comparison.
        if ratio <= threshold or ratio <= winner_ratio:
            continue

        winner_key, winner_ratio = key, ratio

    return winner_key, winner_ratio


class RestaurantDB:
    """SQLite helper for the restaurant table and its inspection table.

    The two table names are interpolated directly into SQL statements
    (identifiers cannot be bound as parameters), so they must come from
    trusted configuration, never from user input.

    Args:
        db_path: Path handed to ``sqlite3.connect``.
        table_name: Name of the restaurant table that is read and updated.
        inspection_table_name: Name of the table inspection rows are written to.
    """

    def __init__(self, db_path, table_name, inspection_table_name):
        self.db_path = db_path
        self.table_name = table_name
        self.inspection_table_name = inspection_table_name
        # Populated by connect_to_db(); None while disconnected.
        self.conn = None
        self.cursor = None

    def connect_to_db(self):
        """Open the SQLite connection and create the shared cursor."""
        self.conn = sqlite3.connect(self.db_path)
        self.cursor = self.conn.cursor()

    def create_inspection_table(self):
        """Create the inspection table if it does not exist yet, then commit."""
        self.cursor.execute(f"""
            CREATE TABLE IF NOT EXISTS {self.inspection_table_name} (
                F_Restaurant_Info_ID INTEGER,
                Inspection_Info_ID INTEGER PRIMARY KEY AUTOINCREMENT,
                Business_Name TEXT,
                Inspection_Date DATE,
                Score FLOAT,
                Grade TEXT,
                Address1 TEXT,
                City TEXT,
                Name_Similarity_Percentage FLOAT,
                IFrame_Number INTEGER,
                Created_At DATETIME DEFAULT (datetime('now')),
                Is_Processed INTEGER DEFAULT 0,
                Is_Valid INTEGER DEFAULT 1,
                FOREIGN KEY (F_Restaurant_Info_ID)
                    REFERENCES {self.table_name}(Restaurant_Info_ID)
            );
        """)
        self.conn.commit()

    def insert_inspection(
        self,
        f_restaurant_info_id,
        business_name,
        inspection_date,
        score,
        grade,
        address1,
        city,
        name_similarity_percentage,
        iframe_number
    ):
        """Insert one inspection row (with ``Is_Processed = 0``) and commit.

        The row goes into ``self.inspection_table_name`` so that the insert
        always targets the same table ``create_inspection_table`` created.
        All values are bound as parameters.
        """
        self.cursor.execute(f"""
            INSERT INTO {self.inspection_table_name} (
                F_Restaurant_Info_ID,
                Business_Name,
                Inspection_Date,
                Score,
                Grade,
                Address1,
                City,
                Name_Similarity_Percentage,
                IFrame_Number,
                Is_Processed
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 0)
        """, (
            f_restaurant_info_id,
            business_name,
            inspection_date,
            score,
            grade,
            address1,
            city,
            name_similarity_percentage,
            iframe_number
        ))
        self.conn.commit()

    def fetch_restaurants(self, limit=1000):
        """Return restaurants with ``is_valid = 1`` and ``is_processed = 0``.

        Args:
            limit: Maximum number of rows to return (defaults to 1000).

        Returns:
            A list of ``(Restaurant_Info_ID, Business_id, Name, Address1,
            City, Zip_Code)`` tuples.
        """
        self.cursor.execute(f"""
            SELECT Restaurant_Info_ID, Business_id, Name, Address1, City, Zip_Code
            FROM {self.table_name}
            WHERE is_valid = 1 AND is_processed = 0
            LIMIT ?
        """, (limit,))
        return self.cursor.fetchall()

    def update_processed_status(self, restaurant_info_id):
        """Set ``is_processed = 1`` for the given restaurant and commit."""
        self.cursor.execute(f"""
            UPDATE {self.table_name}
            SET is_processed = 1
            WHERE Restaurant_Info_ID = ?
        """, (restaurant_info_id,))
        self.conn.commit()

    def get_restaurant_grade(self, score):
        """Map a numeric score to a letter grade.

        Args:
            score: Anything ``float()`` accepts (number or numeric string).

        Returns:
            ``'A'`` for >= 90, ``'B'`` for >= 80, ``'C'`` for >= 70, else ``'F'``.

        Raises:
            ValueError: If ``score`` cannot be converted to ``float``.
        """
        value = float(score)  # convert once instead of once per comparison
        if value >= 90:
            return 'A'
        if value >= 80:
            return 'B'
        if value >= 70:
            return 'C'
        return 'F'

    def close_db(self):
        """Close the connection; a no-op when no connection is open."""
        if self.conn is not None:
            self.conn.close()
            self.conn = None
            self.cursor = None


# Database configuration
db_path = "../db/database_sqlite.db"
table_name = "restaurant_info"
INSPECTION_TABLE_NAME = "inspection_info"

# LA County health inspection search page
SEARCH_URL = "https://ehservices.publichealth.lacounty.gov/servlet/guest?service=1&enterprise=1"

# Score stored when the results table has an empty score cell.
# NOTE(review): this fabricates a value for missing data - confirm it is intended.
DEFAULT_SCORE = 80

# Minimum name similarity (0-1) for a result row to count as a match
NAME_SIMILARITY_THRESHOLD = 0.6

# Hidden form inputs that are read and written back before submitting the search
HIDDEN_FIELD_NAMES = (
    "CSRFToken",
    "sort_by",
    "sort_index",
    "sort_dirn",
    "vform",
    "ifowner",
    "target_url",
    "page_index",
    "page_size_idx",
    "archive_filter",
    "esdisplay",
)


def _fetch_results_page(address, driver_path, options):
    """Search the inspection site for ``address`` and return the resulting HTML.

    A fresh Chrome instance is started for the search and is always quit
    before returning, so no browser process is left behind even on errors.
    """
    driver = webdriver.Chrome(service=Service(driver_path), options=options)
    try:
        driver.get(SEARCH_URL)

        # Wait for search input to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "search_text"))
        )

        # Read hidden form values required for submission
        hidden_values = {
            field: driver.find_element(By.NAME, field).get_attribute("value")
            for field in HIDDEN_FIELD_NAMES
        }

        # Enter address into search box
        driver.find_element(By.NAME, "search_text").send_keys(address)

        # Restore hidden fields before submitting the form. The values are
        # passed as a script argument rather than spliced into the JavaScript
        # source, so quotes or backslashes in a value cannot break the script.
        driver.execute_script(
            """
            const values = arguments[0];
            for (const [name, value] of Object.entries(values)) {
                document.getElementsByName(name)[0].value = value;
            }
            """,
            hidden_values,
        )

        # Submit the search form
        driver.find_element(By.NAME, "Search_Button").click()

        # Wait for results table to load; a timeout just means there is no
        # results table, which the caller reports as "No table found."
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//table[@class="table-bordered"]//tr')
                )
            )
        except TimeoutException:
            pass

        return driver.page_source
    finally:
        driver.quit()


def _parse_inspection_rows(page_source):
    """Extract result rows from the search page HTML.

    Returns ``None`` when the page has no ``table-bordered`` table, otherwise a
    list of dicts (possibly empty) with the keys ``facility``,
    ``inspection_date``, ``score``, ``address``, ``city`` and ``iform_value``.
    """
    soup = BeautifulSoup(page_source, "html.parser")
    table = soup.find("table", {"class": "table-bordered"})
    if table is None:
        return None

    results = []
    for row in table.find_all("tr"):
        columns = row.find_all("td")
        # Cells 1..5 are read below, so at least six cells are required.
        if len(columns) < 6:
            continue

        inspections_button = row.find("a", href=True)
        if inspections_button is None:
            continue

        match = re.search(
            r"doInstanceSearch\((\d+)\)", inspections_button.get("href")
        )
        results.append({
            "facility": columns[1].get_text(strip=True),
            "inspection_date": columns[2].get_text(strip=True),
            "score": columns[3].get_text(strip=True) or DEFAULT_SCORE,
            "address": columns[4].get_text(strip=True),
            "city": columns[5].get_text(strip=True),
            "iform_value": match.group(1) if match else None,
        })
    return results


# Initialize database handler
restaurant_db = RestaurantDB(db_path, table_name, INSPECTION_TABLE_NAME)
restaurant_db.connect_to_db()

try:
    restaurant_db.create_inspection_table()

    # Fetch restaurants that still need inspection data
    restaurants = restaurant_db.fetch_restaurants()

    # Run Chrome in headless mode
    options = Options()
    options.add_argument("--headless")

    # Resolved lazily, and only once, the first time a browser is needed
    driver_path = None

    for restaurant in restaurants:
        restaurant_info_id, business_id, name, address1, city, zip_code = restaurant

        # Skip restaurants without an address
        if not address1:
            restaurant_db.update_processed_status(restaurant_info_id)
            continue

        print(f"Processing: '{name}' => {address1}")

        if driver_path is None:
            driver_path = ChromeDriverManager().install()

        results = _parse_inspection_rows(
            _fetch_results_page(address1, driver_path, options)
        )

        if results is not None:
            idx, sim = get_most_similar_name(
                name,
                {i: r["facility"] for i, r in enumerate(results)},
                threshold=NAME_SIMILARITY_THRESHOLD
            )

            if idx is not None:
                matched = results[idx]
                # Grade must be derived from the matched row's own score,
                # not from whichever row happened to be parsed last.
                matched_score = float(matched["score"])
                restaurant_db.insert_inspection(
                    f_restaurant_info_id=restaurant_info_id,
                    business_name=matched["facility"],
                    inspection_date=datetime.strptime(
                        matched["inspection_date"], "%m/%d/%Y"
                    ).strftime("%Y-%m-%d"),
                    score=matched_score,
                    grade=restaurant_db.get_restaurant_grade(matched_score),
                    address1=matched["address"],
                    city=matched["city"],
                    name_similarity_percentage=round(sim * 100, 2),
                    iframe_number=matched["iform_value"]
                )
            else:
                print("No similarity found.")
        else:
            print("No table found.")

        # Mark restaurant as processed after handling
        restaurant_db.update_processed_status(restaurant_info_id)
finally:
    # Close the database connection even if scraping fails part-way
    restaurant_db.close_db()

Processing: 'New Zealand Seafood Marketing' => 4321 S Boyle Ave
No table found!

Processing: 'Palmas Juice Bar & Cafe' => 1553 East 120th St
No table found!

Processing: 'No 1 Chef' => 843 W Gardena Blvd
Best match for 'No 1 Chef' is at index 1 with similarity 0.76
Processing: 'Intelligentsia Coffee' => 1331 Abbot Kinney Blvd
Best match for 'Intelligentsia Coffee' is at index 1 with similarity 0.88
Processing: 'Tacos Yeya’s' => 2557 Lincoln Blvd
No table found!

Processing: 'Goodies Restaurant' => 1875 Century Park E
No similarity found.

Processing: 'Perfect Donuts' => 4402 Vineland Ave
Best match for 'Perfect Donuts' is at index 1 with similarity 1.00
Processing: 'Alfred Coffee' => 8509 Melrose Ave
Best match for 'Alfred Coffee' is at index 1 with similarity 1.00
Processing: 'Blue Palms' => 829 N La Cienega Blvd
No table found!

Processing: 'Cilantro Fresh Mexican Grill' => 330 S Hope St
No similarity found.

Processing: 'Airlight at Conrad Los Angeles' => 100 S Grand Ave
Best match 

**Data view: load the stored inspection records**

In [7]:
import pandas as pd
import sqlite3

# Path to the SQLite database
DB_PATH = "../db/database_sqlite.db"

# Table to load from the database
TABLE_NAME = "inspection_info"

# Connect to the database
conn = sqlite3.connect(DB_PATH)
try:
    # Load the table into a pandas DataFrame
    df = pd.read_sql_query(f"SELECT * FROM {TABLE_NAME}", conn)
finally:
    # Close the database connection even if the query fails
    conn.close()

# Display the DataFrame
df

Unnamed: 0,F_Restaurant_Info_ID,Inspection_Info_ID,Business_Name,Inspection_date,Score,Grade,Address1,City,Name_Similarity_Percentage,IFrame_Number,Created_At,Is_Processed,Is_Valid
0,8,1,EL SENOR TACO,2025-02-12,97.0,A,1517 E FLORENCE AVE,LOS ANGELES,100.000000,904556,2025-04-14 00:50:07,0,1
1,12,2,MI PUEBLO RESTAURANT,2024-10-18,92.0,A,1341 E FLORENCE AVE,LOS ANGELES,72.727273,10137390,2025-04-14 00:50:28,0,1
2,13,3,BANGIN BUNS,2025-03-06,93.0,A,1457 E FLORENCE AVE # 113,LOS ANGELES,100.000000,17555671,2025-04-14 00:50:50,0,1
3,19,4,BIRRIERIA TLAQUEPAQUE #2,2023-01-10,92.0,A,1734 E FLORENCE AVE,LOS ANGELES,93.333333,911598,2025-04-14 00:51:33,0,1
4,20,5,TEQUERIA TIJUANA NUMERO 1 INC,2024-06-01,96.0,A,241 W FLORENCE AVE,LOS ANGELES,66.666667,14524932,2025-04-14 00:51:53,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8436,20550,8437,DANIEL'S TACOS,2024-12-02,92.0,A,5650 CAHUENGA BLVD,NORTH HOLLYWOOD,100.000000,17512549,2025-04-17 01:33:10,0,1
8437,20583,8438,CITY WOK,2024-09-10,99.0,A,10949 VENTURA BLVD,STUDIO CITY,100.000000,543848,2025-04-17 01:33:36,0,1
8438,15798,8439,THE CORNER STORE,2024-10-01,93.0,A,1118 W 37TH ST,SAN PEDRO,100.000000,19088826,2025-04-17 01:33:39,0,1
8439,17505,8440,FUNNEL HOUSE SOUL OF LA,2024-06-01,96.0,A,22458 VENTURA BLVD,WOODLAND HILLS,60.610000,21526753,2025-04-17 01:35:01,0,1
