In [41]:
import json
import pandas as pd
import numpy as np
import re
from datetime import datetime

Find additional missing restaurants

In [42]:
def find_missing_restaurants(file_pairs, base_path):
    """
    This function finds the restaurants present in the first CSV file of each pair 
    but missing in the second CSV file of each pair.

    Parameters:
    file_pairs (list of tuples): List of pairs of file names.
    base_path (str): The base path where the CSV files are located.

    Returns:
    dict: A dictionary where keys are the first file names in the pairs and values are lists of missing restaurants.
    """
    missing_restaurants_ff_website = {}

    # Loop through each pair of file paths
    for file_pair in file_pairs:
        # Extract the file paths for rounds 1 and 2
        file_path_1 = base_path + file_pair[0]
        file_path_2 = base_path + file_pair[1]

        # Load the CSV files
        df_rnd_1 = pd.read_csv(file_path_1)
        df_rnd_2 = pd.read_csv(file_path_2)

        # Extract the restaurant addresses
        restaurants_rnd_1 = set(df_rnd_1['restaurant_address'])
        restaurants_rnd_2 = set(df_rnd_2['restaurant_address'])

        # Find restaurants that are in round 1 but not in round 2
        missing_restaurants = list(restaurants_rnd_1 - restaurants_rnd_2)

        # Store the missing restaurants in the dictionary
        missing_restaurants_ff_website[file_pair[0]] = missing_restaurants

    return missing_restaurants_ff_website

# Define the list of pairs of CSV files for corresponding rounds
file_pairs = [
    ("raw_prices_burgerking_ca_03282024.csv", "raw_prices_burgerking_ca_05152024.csv"),
    ("raw_prices_burgerking_non_ca_03302024.csv", "raw_prices_burgerking_non_ca_05152024.csv"),
    ("raw_prices_carlsjr_ca_03292024.csv", "raw_prices_carlsjr_ca_05152024.csv"),
    ("raw_prices_carlsjr_non_ca_03302024.csv", "raw_prices_carlsjr_non_ca_05162024.csv"),
    ("raw_prices_hardees_non_ca_03292024.csv", "raw_prices_hardees_non_ca_05162024.csv"),
]

# Define the base path where the CSV files are located
base_path = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/"

# Call the function and store the result
missing_restaurants_ff_website = find_missing_restaurants(file_pairs, base_path)


In [43]:
def find_missing_addresses(file_pairs, base_path):
    """
    This function finds the addresses of restaurants present in the first CSV file of each pair 
    but missing in the second CSV file of each pair.

    Parameters:
    file_pairs (list of tuples): List of pairs of file names.
    base_path (str): The base path where the CSV files are located.

    Returns:
    dict: A dictionary where keys are restaurant names and values are lists of missing addresses.
    """
    missing_restaurants_ubereats = {}

    # Loop through each pair of file paths
    for file_pair in file_pairs:
        # Extract the file paths for rounds 1 and 2
        file_path_1 = base_path + file_pair[0]
        file_path_2 = base_path + file_pair[1]

        # Load the CSV files
        df_rnd_1 = pd.read_csv(file_path_1)
        df_rnd_2 = pd.read_csv(file_path_2)

        # Extract the restaurant names and addresses as dictionaries
        restaurants_rnd_1 = df_rnd_1.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
        restaurants_rnd_2 = df_rnd_2.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()

        # Find missing addresses for each restaurant
        for restaurant, addresses_rnd_1 in restaurants_rnd_1.items():
            addresses_rnd_2 = restaurants_rnd_2.get(restaurant, set())
            missing_addresses = addresses_rnd_1 - addresses_rnd_2
            if missing_addresses:
                if restaurant not in missing_restaurants_ubereats:
                    missing_restaurants_ubereats[restaurant] = []
                missing_restaurants_ubereats[restaurant].extend(missing_addresses)

    return missing_restaurants_ubereats

# Define the list of pairs of CSV files for corresponding rounds
file_pairs = [
    ("raw_prices_ubereats_ca_ff_03222024.csv", "raw_prices_ubereats_ca_ff_05142024.csv"),
    ("raw_prices_ubereats_ca_fflocal_03292024.csv", "raw_prices_ubereats_ca_fflocal05272024.csv"),
    ("raw_prices_ubereats_ca_fullserv_03252024.csv", "raw_prices_ubereats_ca_ffullserv_05142024.csv"),
    ("raw_prices_ubereats_nonca_ff_03292024.csv", "raw_prices_ubereats_nonca_ff_05162024.csv"),
    ("raw_prices_ubereats_nonca_fullserv_03252024.csv", "raw_prices_ubereats_nonca_fullserv_05162024.csv"),
]

# Define the base path where the CSV files are located
base_path = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/"

# Call the function and store the result
missing_restaurants_ubereats = find_missing_addresses(file_pairs, base_path)


In [44]:
file_pairs = [("raw_prices_ubereats_nonca_ff_03292024.csv", "missing_nonca.csv")]
    
# Define the base path where the CSV files are located
base_path = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/"



missing_restaurants_ubereats = find_missing_addresses(file_pairs, base_path)

In [45]:
# Define the list of pairs of CSV files for corresponding rounds
file_pairs = [
    ("raw_prices_wendys_ca_03302024.csv", "raw_prices_wendys_ca_05142024.csv"),
    ("raw_prices_wendys_nonca_03302024.csv", "raw_prices_wendys_nonca_05142024.csv")
]

# Initialize a dictionary to store missing restaurant names
wendys_missing = {}

# Loop through each pair of file paths
for file_pair in file_pairs:
    # Extract the file paths for rounds 1 and 2
    file_path_1 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[0]
    file_path_2 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[1]

    # Load the CSV files
    df_rnd_1 = pd.read_csv(file_path_1)
    df_rnd_2 = pd.read_csv(file_path_2)

    # Extract the restaurant addresses
    restaurants_rnd_1 = set(df_rnd_1['address'])
    restaurants_rnd_2 = set(df_rnd_2['address'])

    # Find restaurants that are in round 1 but not in round 2
    missing_restaurants = list(restaurants_rnd_1 - restaurants_rnd_2)

    # Store the missing restaurants in the dictionary
    wendys_missing[file_pair[0]] = missing_restaurants

After Re-scraping

In [67]:
#Combine Dataset with the missing data 
missing_data = pd.read_csv("/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/missing_nonca_rnd3.csv")
columns = missing_data.columns
prev_data = pd.read_csv("/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/nonca_ff_rnd2.csv")[columns]
complete_data = pd.concat([missing_data, prev_data])
complete_data['restaurant_name'] = complete_data['restaurant_name'].str.replace('Hardee', 'Hardees', case=False)
complete_data['restaurant_name'] = complete_data['restaurant_name'].str.replace('Hardeess', 'Hardees', case=False)



#Save Dataset as a csv to re-
complete_data.to_csv("nonca_ff_rnd3.csv")

In [68]:
file_pairs = [
    ("raw_prices_ubereats_nonca_ff_hardees_032920204.csv", "nonca_ff_rnd3.csv")
]

# Initialize a dictionary to store missing addresses by restaurant name
missing_restaurants_ubereats = {}

# Loop through each pair of file paths
for file_pair in file_pairs:
    # Extract the file paths for rounds 1 and 2
    file_path_1 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[0]
    file_path_2 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[1]

    # Load the CSV files
    df_rnd_1 = pd.read_csv(file_path_1)
    df_rnd_2 = pd.read_csv(file_path_2, low_memory=False)
    
    # Convert the addresses to lowercase
    df_rnd_1['restaurant_location'] = df_rnd_1['restaurant_location'].str.lower()
    df_rnd_2['restaurant_location'] = df_rnd_2['restaurant_location'].str.lower()
    
    #
    df_rnd_2['restaurant_name'] = df_rnd_2['restaurant_name'].str.replace('Hardee', 'Hardees', case=False)
    df_rnd_2['restaurant_name'] = df_rnd_2['restaurant_name'].str.replace('Hardeess', 'Hardees', case=False)
    df_rnd_2['restaurant_name'] = df_rnd_2['restaurant_name'].str.replace('Hardeessss', 'Hardees', case=False)




    # Extract the restaurant names and addresses as dictionaries
    restaurants_rnd_1 = df_rnd_1.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    restaurants_rnd_2 = df_rnd_2.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    
    # Find missing addresses for each restaurant
    for restaurant, addresses_rnd_1 in restaurants_rnd_1.items():
        addresses_rnd_2 = restaurants_rnd_2.get(restaurant, set())
        missing_addresses = addresses_rnd_1 - addresses_rnd_2
        if missing_addresses:
            if restaurant not in missing_restaurants_ubereats:
                missing_restaurants_ubereats[restaurant] = []
            missing_restaurants_ubereats[restaurant].extend(missing_addresses)
        
        # Calculate the percentage of missing addresses for the current restaurant
        total_addresses_rnd_1 = len(addresses_rnd_1)
        if total_addresses_rnd_1 != 0:
            percent_missing = (len(missing_addresses) / total_addresses_rnd_1) * 100
            print(f"Percentage of missing addresses for {restaurant}: {percent_missing:.2f}%")

# Print the final dictionary of missing addresses
print("\nFinal dictionary of missing addresses by restaurant name:")
for restaurant, addresses in missing_restaurants_ubereats.items():
    print(f"{restaurant}: {addresses}")

Percentage of missing addresses for Burger King: 17.49%
Percentage of missing addresses for Carls Jr: 31.25%
Percentage of missing addresses for Five Guys: 20.23%
Percentage of missing addresses for Hardees: 1.98%
Percentage of missing addresses for Jack in the Box: 0.86%
Percentage of missing addresses for McDonald: 2.24%
Percentage of missing addresses for Shake Shack: 26.51%
Percentage of missing addresses for Sonic: 33.85%
Percentage of missing addresses for The Habit: 14.29%
Percentage of missing addresses for Wendy: 27.03%

Final dictionary of missing addresses by restaurant name:
Burger King: ['3155 s padre island dr, corpus christi, tx, 78415, us', '17000 n rm 620, round rock, tx, 78681, us', '810 north main street, oregon, wi, 53575, us', '4535 highway 58, chattanooga, tn, 37416, us', '4905 airport blvd, austin, tx, 78751, us', '737 n. main street, layton, ut, 84041, us', '1432 s peoria ave, tulsa, ok, 74120, us', '301 west 3rd street, austin, tx, 78701, us', '3130 south memor

In [62]:
missing_restaurants_ubereats

{'Burger King': [nan,
  '3155 s padre island dr, corpus christi, tx, 78415, us',
  '17000 n rm 620, round rock, tx, 78681, us',
  '810 north main street, oregon, wi, 53575, us',
  '4535 highway 58, chattanooga, tn, 37416, us',
  '4905 airport blvd, austin, tx, 78751, us',
  '737 n. main street, layton, ut, 84041, us',
  '1432 s peoria ave, tulsa, ok, 74120, us',
  '301 west 3rd street, austin, tx, 78701, us',
  '1821 n pointe drive, durham, nc, 27705, us',
  '3130 south memorial drive, tulsa, ok, 74145, us',
  '1627 university ave, lubbock, tx, 79401, us',
  '3980 william penn highway, monroeville, pa, 15146, us',
  '1533 n. peoria, tulsa, ok, 74106, us',
  '2011 north hillfield road, layton, ut, 84041, us',
  '2414 mac davis lane, lubbock, tx, 79401, us',
  '1680 n freedom blvd, provo, ut, 84604, us',
  '3215 wake forest rd, raleigh, nc, 27609, us',
  '619 n ih 35, austin, tx, 78702, us',
  '2021 morgan ave, corpus christi, tx, 78405-1541, us',
  '1109 university ave, lubbock, tx, 794

In [20]:
#Need to merge hardees and nonca together 

nonca_ff = pd.read_csv("/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/raw_prices_ubereats_nonca_ff_05162024.csv")
# hardee = pd.read_csv("/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/raw_prices_ubereats_nonca_hardees_03282024.csv")
# non_ca_rnd_1 = pd.concat([nonca_ff, hardee])
# non_ca_rnd_1.to_csv("raw_prices_ubereats_nonca_ff_hardees_032920204.csv")
nonca_ff

Unnamed: 0.1,Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,specialty_item,...,restaurant_rating first,state,zip,county_name_x,county_name_y,date,uber_eats,post_policy,fast_food,local
0,0,McDonald,"1 christy dr, chadds ford, pa, 19317, us",5.516615,3.61,4.646159,4.3,145,90,6.59,...,,pa,19317,,Chester,2024-05-16,1,1,1,0
1,1,McDonald,"100 brownswitch rd, slidell, la, 70458, us",4.788060,4.07,4.223423,4.0,134,360+,5.87,...,,la,70458,,St. Tammany,2024-05-16,1,1,1,0
2,2,McDonald,"3301 pontchartrain drive, slidell, la, 70458, us",4.927594,4.07,4.420474,4.2,133,460+,5.87,...,,la,70458,,St. Tammany,2024-05-16,1,1,1,0
3,3,Wendy,"3915 pontchartrain, slidell, la, 70458, us",5.523137,4.87,2.958956,,83,170+,6.70,...,4.2,la,70458,,St. Tammany,2024-05-16,1,1,1,0
4,4,Burger King,"185 gause blvd., slidell, la, 70458, us",7.343924,5.39,5.966670,4.1,79,260+,6.49,...,,la,70458,,St. Tammany,2024-05-16,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1787,1787,Five Guys,"springs at trussville street, center point, al...",8.464483,8.51,3.323807,4.5,29,240+,13.67,...,,al,35235,,Jefferson,2024-05-16,1,1,1,0
1788,1788,Hardee,"2450 e layton ave, saint francis, wi, 53235-60...",10.064328,9.99,5.060824,4.5,136,270+,7.61,...,,wi,53235,Milwaukee,Milwaukee,2024-05-16,1,1,1,0
1789,1789,Hardee,"2930 highway 138 sw, conyers, ga, 30094, us",7.680000,6.70,6.376385,3.9,70,130+,7.13,...,,ga,30094,Rockdale,Rockdale,2024-05-16,1,1,1,0
1790,1790,Hardee,"3062 anvil block road, ellenwood, ga, 30294, us",7.680000,6.70,6.376385,3.9,70,310+,7.13,...,,ga,30294,DeKalb,DeKalb,2024-05-16,1,1,1,0
