In [41]:
import json
import pandas as pd
import numpy as np
import re
from datetime import datetime

Find additional missing restaurants

In [42]:
def find_missing_restaurants(file_pairs, base_path):
    """
    This function finds the restaurants present in the first CSV file of each pair 
    but missing in the second CSV file of each pair.

    Parameters:
    file_pairs (list of tuples): List of pairs of file names.
    base_path (str): The base path where the CSV files are located.

    Returns:
    dict: A dictionary where keys are the first file names in the pairs and values are lists of missing restaurants.
    """
    missing_restaurants_ff_website = {}

    # Loop through each pair of file paths
    for file_pair in file_pairs:
        # Extract the file paths for rounds 1 and 2
        file_path_1 = base_path + file_pair[0]
        file_path_2 = base_path + file_pair[1]

        # Load the CSV files
        df_rnd_1 = pd.read_csv(file_path_1)
        df_rnd_2 = pd.read_csv(file_path_2)

        # Extract the restaurant addresses
        restaurants_rnd_1 = set(df_rnd_1['restaurant_address'])
        restaurants_rnd_2 = set(df_rnd_2['restaurant_address'])

        # Find restaurants that are in round 1 but not in round 2
        missing_restaurants = list(restaurants_rnd_1 - restaurants_rnd_2)

        # Store the missing restaurants in the dictionary
        missing_restaurants_ff_website[file_pair[0]] = missing_restaurants

    return missing_restaurants_ff_website

# Define the list of pairs of CSV files for corresponding rounds
file_pairs = [
    ("raw_prices_burgerking_ca_03282024.csv", "raw_prices_burgerking_ca_05152024.csv"),
    ("raw_prices_burgerking_non_ca_03302024.csv", "raw_prices_burgerking_non_ca_05152024.csv"),
    ("raw_prices_carlsjr_ca_03292024.csv", "raw_prices_carlsjr_ca_05152024.csv"),
    ("raw_prices_carlsjr_non_ca_03302024.csv", "raw_prices_carlsjr_non_ca_05162024.csv"),
    ("raw_prices_hardees_non_ca_03292024.csv", "raw_prices_hardees_non_ca_05162024.csv"),
]

# Define the base path where the CSV files are located
base_path = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/"

# Call the function and store the result
missing_restaurants_ff_website = find_missing_restaurants(file_pairs, base_path)


In [121]:
def find_missing_addresses(file_pairs, base_path):
    """
    This function finds the addresses of restaurants present in the first CSV file of each pair 
    but missing in the second CSV file of each pair.

    Parameters:
    file_pairs (list of tuples): List of pairs of file names.
    base_path (str): The base path where the CSV files are located.

    Returns:
    dict: A dictionary where keys are restaurant names and values are lists of missing addresses.
    """
    missing_restaurants_ubereats = {}

    # Loop through each pair of file paths
    for file_pair in file_pairs:
        # Extract the file paths for rounds 1 and 2
        file_path_1 = base_path + file_pair[0]
        file_path_2 = base_path + file_pair[1]

        # Load the CSV files
        df_rnd_1 = pd.read_csv(file_path_1)
        df_rnd_2 = pd.read_csv(file_path_2)

        # Extract the restaurant names and addresses as dictionaries
        restaurants_rnd_1 = df_rnd_1.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
        restaurants_rnd_2 = df_rnd_2.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()

        # Find missing addresses for each restaurant
        for restaurant, addresses_rnd_1 in restaurants_rnd_1.items():
            addresses_rnd_2 = restaurants_rnd_2.get(restaurant, set())
            missing_addresses = addresses_rnd_1 - addresses_rnd_2
            if missing_addresses:
                if restaurant not in missing_restaurants_ubereats:
                    missing_restaurants_ubereats[restaurant] = [] 
                missing_restaurants_ubereats[restaurant].extend(missing_addresses)
        
            # Calculate the percentage of missing addresses for the current restaurant
            total_addresses_rnd_1 = len(addresses_rnd_1)
            if total_addresses_rnd_1 != 0:
                percent_missing = (len(missing_addresses) / total_addresses_rnd_1) * 100
                print(f"Percentage of missing addresses for {restaurant}: {percent_missing:.2f}%")



    return missing_restaurants_ubereats

# Define the list of pairs of CSV files for corresponding rounds
file_pairs = [
    ("raw_prices_ubereats_ca_fflocal_03292024.csv", "raw_prices_ubereats_ca_fflocal05272024.csv"),
]

# Define the base path where the CSV files are located
base_path = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/"

# Call the function and store the result
missing_restaurants_ubereats = find_missing_addresses(file_pairs, base_path)
missing_restaurants_ubereats

Percentage of missing addresses for Betty Burgers: 0.00%
Percentage of missing addresses for Big Mouth Burgers: 0.00%
Percentage of missing addresses for Burger Basket: 0.00%
Percentage of missing addresses for Burger Boss: 0.00%
Percentage of missing addresses for Burger Factory: 0.00%
Percentage of missing addresses for Burger Palace: 0.00%
Percentage of missing addresses for Burger Station: 50.00%
Percentage of missing addresses for Cypress Best Burgers: 0.00%
Percentage of missing addresses for Gold Star Hamburgers: 0.00%
Percentage of missing addresses for Hook Burger: 0.00%
Percentage of missing addresses for P & G Burgers: 0.00%
Percentage of missing addresses for R Burgers: 0.00%
Percentage of missing addresses for Super Burger: 20.00%
Percentage of missing addresses for Varsity Burgers: 0.00%


{'Burger Station': ['10361 Rockingham Drive, Sacramento, CA, 95827, US'],
 'Super Burger': ['2505 3rd St, Ceres, CA, 95307, US']}

In [44]:
file_pairs = [("raw_prices_ubereats_nonca_ff_03292024.csv", "missing_nonca.csv")]
    
# Define the base path where the CSV files are located
base_path = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/"



missing_restaurants_ubereats = find_missing_addresses(file_pairs, base_path)

In [45]:
# Define the list of pairs of CSV files for corresponding rounds
file_pairs = [
    ("raw_prices_wendys_ca_03302024.csv", "raw_prices_wendys_ca_05142024.csv"),
    ("raw_prices_wendys_nonca_03302024.csv", "raw_prices_wendys_nonca_05142024.csv")
]

# Initialize a dictionary to store missing restaurant names
wendys_missing = {}

# Loop through each pair of file paths
for file_pair in file_pairs:
    # Extract the file paths for rounds 1 and 2
    file_path_1 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[0]
    file_path_2 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[1]

    # Load the CSV files
    df_rnd_1 = pd.read_csv(file_path_1)
    df_rnd_2 = pd.read_csv(file_path_2)

    # Extract the restaurant addresses
    restaurants_rnd_1 = set(df_rnd_1['address'])
    restaurants_rnd_2 = set(df_rnd_2['address'])

    # Find restaurants that are in round 1 but not in round 2
    missing_restaurants = list(restaurants_rnd_1 - restaurants_rnd_2)

    # Store the missing restaurants in the dictionary
    wendys_missing[file_pair[0]] = missing_restaurants

After Re-scraping

In [135]:
#Combine Dataset with the missing data 
missing_data = pd.read_csv("/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/missing_nonca_ff_rnd4.csv")
columns = missing_data.columns
prev_data = pd.read_csv("/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/nonca_ff_rnd3.csv")[columns]
complete_data = pd.concat([missing_data, prev_data])
complete_data['restaurant_name'] = complete_data['restaurant_name'].str.replace('Hardee', 'Hardees', case=False)
complete_data['restaurant_name'] = complete_data['restaurant_name'].str.replace('Hardeess', 'Hardees', case=False)

complete_data['restaurant_name'] = complete_data['restaurant_name'].str.replace('Hardee', 'Hardees', case=False)
complete_data['restaurant_name'] = complete_data['restaurant_name'].str.replace('Hardeess', 'Hardees', case=False)
complete_data['restaurant_name'] = complete_data['restaurant_name'].str.replace('Hardeessss', 'Hardees', case=False)



#Save Dataset as a csv to re-
complete_data.to_csv("nonca_ff_rnd4.csv")

In [136]:
file_pairs = [
    ("raw_prices_ubereats_nonca_ff_hardees_032920204.csv", "nonca_ff_rnd4.csv")
]

# Initialize a dictionary to store missing addresses by restaurant name
missing_restaurants_ubereats = {}

# Loop through each pair of file paths
for file_pair in file_pairs:
    # Extract the file paths for rounds 1 and 2
    file_path_1 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[0]
    file_path_2 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[1]

    # Load the CSV files
    df_rnd_1 = pd.read_csv(file_path_1)
    df_rnd_2 = pd.read_csv(file_path_2, low_memory=False)
    
    # Convert the addresses to lowercase
    df_rnd_1['restaurant_location'] = df_rnd_1['restaurant_location'].str.lower()
    df_rnd_2['restaurant_location'] = df_rnd_2['restaurant_location'].str.lower()
    

    # Extract the restaurant names and addresses as dictionaries
    restaurants_rnd_1 = df_rnd_1.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    restaurants_rnd_2 = df_rnd_2.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    
    # Find missing addresses for each restaurant
    for restaurant, addresses_rnd_1 in restaurants_rnd_1.items():
        addresses_rnd_2 = restaurants_rnd_2.get(restaurant, set())
        missing_addresses = addresses_rnd_1 - addresses_rnd_2
        if missing_addresses:
            if restaurant not in missing_restaurants_ubereats:
                missing_restaurants_ubereats[restaurant] = []
            missing_restaurants_ubereats[restaurant].extend(missing_addresses)
        
        # Calculate the percentage of missing addresses for the current restaurant
        total_addresses_rnd_1 = len(addresses_rnd_1)
        if total_addresses_rnd_1 != 0:
            percent_missing = (len(missing_addresses) / total_addresses_rnd_1) * 100
            print(f"Percentage of missing addresses for {restaurant}: {percent_missing:.2f}%")

# Print the final dictionary of missing addresses
print("\nFinal dictionary of missing addresses by restaurant name:")
for restaurant, addresses in missing_restaurants_ubereats.items():
    print(f"{restaurant}: {addresses}")

Percentage of missing addresses for Burger King: 16.59%
Percentage of missing addresses for Carls Jr: 31.25%
Percentage of missing addresses for Five Guys: 20.23%
Percentage of missing addresses for Hardees: 1.98%
Percentage of missing addresses for Jack in the Box: 0.86%
Percentage of missing addresses for McDonald: 2.24%
Percentage of missing addresses for Shake Shack: 26.51%
Percentage of missing addresses for Sonic: 33.33%
Percentage of missing addresses for The Habit: 14.29%
Percentage of missing addresses for Wendy: 26.58%

Final dictionary of missing addresses by restaurant name:
Burger King: ['3155 s padre island dr, corpus christi, tx, 78415, us', '17000 n rm 620, round rock, tx, 78681, us', '4535 highway 58, chattanooga, tn, 37416, us', '4905 airport blvd, austin, tx, 78751, us', '737 n. main street, layton, ut, 84041, us', '1432 s peoria ave, tulsa, ok, 74120, us', '301 west 3rd street, austin, tx, 78701, us', '3130 south memorial drive, tulsa, ok, 74145, us', '1627 universi

In [128]:
missing_restaurants_ubereats

{'Burger King': ['3155 s padre island dr, corpus christi, tx, 78415, us',
  '17000 n rm 620, round rock, tx, 78681, us',
  '810 north main street, oregon, wi, 53575, us',
  '4535 highway 58, chattanooga, tn, 37416, us',
  '4905 airport blvd, austin, tx, 78751, us',
  '737 n. main street, layton, ut, 84041, us',
  '1432 s peoria ave, tulsa, ok, 74120, us',
  '301 west 3rd street, austin, tx, 78701, us',
  '3130 south memorial drive, tulsa, ok, 74145, us',
  '1627 university ave, lubbock, tx, 79401, us',
  '3980 william penn highway, monroeville, pa, 15146, us',
  '2011 north hillfield road, layton, ut, 84041, us',
  '2414 mac davis lane, lubbock, tx, 79401, us',
  '1680 n freedom blvd, provo, ut, 84604, us',
  '619 n ih 35, austin, tx, 78702, us',
  '2021 morgan ave, corpus christi, tx, 78405-1541, us',
  '1109 university ave, lubbock, tx, 79401, us',
  '2037 s padre island drive, corpus christi, tx, 78417, us',
  '1121 e palm valley blvd, round rock, tx, 78664, us',
  '201 e franklin s

Full Service

In [None]:
rr = pd.read_csv("missing_redrobin.csv")
applebee = pd.read_csv("missing_applebee.csv")
bjs = pd.read_csv("missing_bjs.csv")
buffalo = pd.read_csv("missing_buffalo.csv")
chilis = pd.read_csv("missing_chilis.csv")
dennys = pd.read_csv("missing_dennys.csv")
outback = pd.read_csv("missing_outback.csv")
panera = pd.read_csv("missing_panera.csv")
tgif = pd.read_csv("missing_tgif.csv")


fullserv = pd.concat([rr, applebee, bjs, buffalo, chilis, dennys, outback, panera, tgif]).dropna()
fullserv

ca_fullserv = fullserv[fullserv['restaurant_location'].str.contains(', CA,')]
nonca_fullserv = fullserv[~fullserv['restaurant_location'].str.contains(', CA,')]

ca_fullserv.to_csv("missing_ca_fullserv_rnd1.csv", index = False)
nonca_fullserv.to_csv("missing_nonca_fullserv_rnd1.csv", index = False)

CA FullServ

In [108]:
#Add missing to corresponding 
#Combine Dataset with the missing data 
missing_data = pd.read_csv("missing_ca_fullserv_rnd2.csv")
columns = missing_data.columns
prev_data = pd.read_csv("/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/ca_fullserv_rnd1.csv")[columns]
complete_data = pd.concat([missing_data, prev_data])


#Save Dataset as a csv to re-
complete_data.to_csv("final_ca_fullserv.csv")
#complete_data
complete_data['restaurant_name'].unique()

array(['BJs Restaurants', 'TGI Fridays', 'Red Robin', 'Applebee', 'BJ',
       'Buffalo Wild Wings', "Chili's", 'Denny', 'Outback Steakhouse',
       'Panera Bread', 'TGI Friday'], dtype=object)

NONCA

In [106]:
file_pairs = [
    ("raw_prices_ubereats_ca_fullserv_03252024.csv", "ca_fullserv_rnd2.csv")
]

# Initialize a dictionary to store missing addresses by restaurant name
missing_restaurants_ubereats = {}

# Loop through each pair of file paths
for file_pair in file_pairs:
    # Extract the file paths for rounds 1 and 2
    file_path_1 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[0]
    file_path_2 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[1]

    # Load the CSV files
    df_rnd_1 = pd.read_csv(file_path_1)
    df_rnd_2 = pd.read_csv(file_path_2, low_memory=False)
    
    # Convert the addresses to lowercase
    df_rnd_1['restaurant_location'] = df_rnd_1['restaurant_location'].str.lower()
    df_rnd_2['restaurant_location'] = df_rnd_2['restaurant_location'].str.lower()
    
    #
    df_rnd_2['restaurant_name'] = df_rnd_2['restaurant_name'].str.replace('BJ', "BJ's Restaurants", case=False)
    df_rnd_2['restaurant_name'] = df_rnd_2['restaurant_name'].str.replace("Chili's", "Chili's Grill & Bar", case=False)
    df_rnd_2['restaurant_name'] = df_rnd_2['restaurant_name'].str.replace("TGI Friday", "TGI Fridays", case=False)




    # Extract the restaurant names and addresses as dictionaries
    restaurants_rnd_1 = df_rnd_1.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    restaurants_rnd_2 = df_rnd_2.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    
    # Find missing addresses for each restaurant
    for restaurant, addresses_rnd_1 in restaurants_rnd_1.items():
        addresses_rnd_2 = restaurants_rnd_2.get(restaurant, set())
        missing_addresses = addresses_rnd_1 - addresses_rnd_2
        if missing_addresses:
            if restaurant not in missing_restaurants_ubereats:
                missing_restaurants_ubereats[restaurant] = []
            missing_restaurants_ubereats[restaurant].extend(missing_addresses)
        
        # Calculate the percentage of missing addresses for the current restaurant
        total_addresses_rnd_1 = len(addresses_rnd_1)
        if total_addresses_rnd_1 != 0:
            percent_missing = (len(missing_addresses) / total_addresses_rnd_1) * 100
            print(f"Percentage of missing addresses for {restaurant}: {percent_missing:.2f}%")

# Print the final dictionary of missing addresses
print("\nFinal dictionary of missing addresses by restaurant name:")
for restaurant, addresses in missing_restaurants_ubereats.items():
    print(f"{restaurant}: {addresses}")

Percentage of missing addresses for Applebee: 2.56%
Percentage of missing addresses for BJ's Restaurants: 15.91%
Percentage of missing addresses for Buffalo Wild Wings: 1.39%
Percentage of missing addresses for Chili's Grill & Bar: 5.66%
Percentage of missing addresses for Denny: 0.00%
Percentage of missing addresses for Outback Steakhouse: 0.00%
Percentage of missing addresses for Panera Bread: 0.00%
Percentage of missing addresses for Red Robin: 0.00%
Percentage of missing addresses for TGI Fridays: 12.50%

Final dictionary of missing addresses by restaurant name:
Applebee: ['9105 e stockton blvd, elk grove, ca, 95624, us', '9255 winnetka ave, chatsworth, ca, 91311, us']
BJ's Restaurants: ['1200, roseville, ca, 95678, us', '460 the city drive s, orange, ca, 92868, us', '2730 e. bidwell street, folsom, ca, 95630, us', '9237 laguna springs drive, elk grove, ca, 95758, us', '13130 jamboree road, irvine, ca, 92602, us', '3531 n. freeway blvd., sacramento, ca, 95834, us', '1689 arden way,

In [139]:
#Add missing to corresponding 
#Combine Dataset with the missing data 
missing_data = pd.read_csv("missing_nonca_fullserv_rnd3.csv")
columns = missing_data.columns
prev_data = pd.read_csv("/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/nonca_fullserv_rnd2.csv")[columns]
complete_data = pd.concat([missing_data, prev_data])


#Save Dataset as a csv to re-
complete_data.to_csv("nonca_fullserv_rnd3.csv")
complete_data

Unnamed: 0.1,Unnamed: 0,restaurant_name,menu_item,menu_item_price,restaurant_location,inputted_location,restaurant_rating,number_of_ratings,restaurant_distance
0,0,Chilis,Triple Dipper®,18.59,"4600 Chapel Hill Blvd., Durham, NC, 27707-2669...","6917 fayetteville rd., durham, nc, 27713-8723, us",4.2,700+,4.6 mi
1,1,Chilis,Southwestern Eggrolls,14.59,"4600 Chapel Hill Blvd., Durham, NC, 27707-2669...","6917 fayetteville rd., durham, nc, 27713-8723, us",4.2,700+,4.6 mi
2,2,Chilis,Skillet Queso,12.09,"4600 Chapel Hill Blvd., Durham, NC, 27707-2669...","6917 fayetteville rd., durham, nc, 27713-8723, us",4.2,700+,4.6 mi
3,3,Chilis,Boneless Wings,0.00,"4600 Chapel Hill Blvd., Durham, NC, 27707-2669...","6917 fayetteville rd., durham, nc, 27713-8723, us",4.2,700+,4.6 mi
4,4,Chilis,White Skillet Queso,9.99,"4600 Chapel Hill Blvd., Durham, NC, 27707-2669...","6917 fayetteville rd., durham, nc, 27713-8723, us",4.2,700+,4.6 mi
...,...,...,...,...,...,...,...,...,...
132728,89065,Red Robin,Coleslaw,4.79,"95 N Moorland Rd, Brookfield, WI, 53005, US",Waukesha Village Hall W250 S3567 Center Road W...,4.4,470+,8 mi
132729,89066,Red Robin,Garlic Fries,4.79,"95 N Moorland Rd, Brookfield, WI, 53005, US",Waukesha Village Hall W250 S3567 Center Road W...,4.4,470+,8 mi
132730,89067,Red Robin,Yukon Chips,3.59,"95 N Moorland Rd, Brookfield, WI, 53005, US",Waukesha Village Hall W250 S3567 Center Road W...,4.4,470+,8 mi
132731,89068,Red Robin,Garlic Parmesan Broccoli,4.79,"95 N Moorland Rd, Brookfield, WI, 53005, US",Waukesha Village Hall W250 S3567 Center Road W...,4.4,470+,8 mi


In [140]:
file_pairs = [
    ("raw_prices_ubereats_nonca_fullserv_03252024.csv", "nonca_fullserv_rnd3.csv")
]

# Initialize a dictionary to store missing addresses by restaurant name
missing_restaurants_ubereats = {}

# Loop through each pair of file paths
for file_pair in file_pairs:
    # Extract the file paths for rounds 1 and 2
    file_path_1 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[0]
    file_path_2 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[1]

    # Load the CSV files
    df_rnd_1 = pd.read_csv(file_path_1)
    df_rnd_2 = pd.read_csv(file_path_2, low_memory=False)
    
    # Convert the addresses to lowercase
    df_rnd_1['restaurant_location'] = df_rnd_1['restaurant_location'].str.lower()
    df_rnd_2['restaurant_location'] = df_rnd_2['restaurant_location'].str.lower()
    
    #Replace Restaurant Names
    df_rnd_2['restaurant_name'] = df_rnd_2['restaurant_name'].str.replace('BJ', "BJ's Restaurants", case=False)
    df_rnd_2['restaurant_name'] = df_rnd_2['restaurant_name'].str.replace("Chili's", "Chili's Grill & Bar", case=False)
    df_rnd_2['restaurant_name'] = df_rnd_2['restaurant_name'].str.replace("TGI Friday", "TGI Fridays", case=False)




    # Extract the restaurant names and addresses as dictionaries
    restaurants_rnd_1 = df_rnd_1.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    restaurants_rnd_2 = df_rnd_2.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    
    # Find missing addresses for each restaurant
    for restaurant, addresses_rnd_1 in restaurants_rnd_1.items():
        addresses_rnd_2 = restaurants_rnd_2.get(restaurant, set())
        missing_addresses = addresses_rnd_1 - addresses_rnd_2
        if missing_addresses:
            if restaurant not in missing_restaurants_ubereats:
                missing_restaurants_ubereats[restaurant] = []
            missing_restaurants_ubereats[restaurant].extend(missing_addresses)
        
        # Calculate the percentage of missing addresses for the current restaurant
        total_addresses_rnd_1 = len(addresses_rnd_1)
        if total_addresses_rnd_1 != 0:
            percent_missing = (len(missing_addresses) / total_addresses_rnd_1) * 100
            print(f"Percentage of missing addresses for {restaurant}: {percent_missing:.2f}%")

# Print the final dictionary of missing addresses
print("\nFinal dictionary of missing addresses by restaurant name:")
for restaurant, addresses in missing_restaurants_ubereats.items():
    print(f"{restaurant}: {addresses}")

Percentage of missing addresses for Applebee: 3.95%
Percentage of missing addresses for BJ's Restaurants: 7.41%
Percentage of missing addresses for Buffalo Wild Wings: 4.40%
Percentage of missing addresses for Chili's Grill & Bar: 13.10%
Percentage of missing addresses for Denny: 3.36%
Percentage of missing addresses for Outback Steakhouse: 0.00%
Percentage of missing addresses for Panera Bread: 0.00%
Percentage of missing addresses for Red Robin: 0.00%
Percentage of missing addresses for TGI Fridays: 70.73%

Final dictionary of missing addresses by restaurant name:
Applebee: ['24041 southland dr, hayward, ca, 94545, us', '4353 lawrenceville hwy., tucker, ga, 30084, us', '4808, dublin, ca, 94568, us', '1041 admiral callaghan lane, vallejo, ca, 94591, us', '5010 w hwy 290 service rd, austin, tx, 78735, us', '614 north valley mills, waco, tx, 76710, us']
BJ's Restaurants: ['515 w. bay area boulevard, webster, tx, 77598, us', '3620 fallon road, dublin, ca, 94568, us']
Buffalo Wild Wings: 

In [138]:
missing_restaurants_ubereats

{'Applebee': ['24041 southland dr, hayward, ca, 94545, us',
  '4353 lawrenceville hwy., tucker, ga, 30084, us',
  '4808, dublin, ca, 94568, us',
  '1041 admiral callaghan lane, vallejo, ca, 94591, us',
  '5010 w hwy 290 service rd, austin, tx, 78735, us',
  '614 north valley mills, waco, tx, 76710, us'],
 "BJ's Restaurants": ['515 w. bay area boulevard, webster, tx, 77598, us',
  '3620 fallon road, dublin, ca, 94568, us'],
 'Buffalo Wild Wings': ['3712, dublin, ca, 94568, us',
  '7604 n i-35, austin, tx, 78752, us',
  '2720 north grandview boulevard, waukesha, wi, 53188, us',
  '4415 ambassador caffery pkwy ste 700, lafayette, la, 70508, us',
  '32135 union lndg, union city, ca, 94587, us',
  '5b serramonte center serramonte center, daly city, ca, 94015, us',
  '2315 w interstate 20, grand prairie, tx, 75052, us'],
 "Chili's Grill & Bar": ['6917 fayetteville rd., durham, nc, 27713-8723, us',
  '4111 roswell road ne., marietta, ga, 30062-6242, us',
  '108 marketplace drive, lexington, k

In [124]:
#Add missing to corresponding 
#Combine Dataset with the missing data 
missing_data = pd.read_csv("missing_ca_localff.csv")
columns = missing_data.columns
prev_data = pd.read_csv("/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/raw_prices_ubereats_ca_fflocal05272024.csv")[columns]
complete_data = pd.concat([missing_data, prev_data])


#Save Dataset as a csv to re-
complete_data.to_csv("final_ca_localff.csv")
# complete_data

complete_data

Unnamed: 0.1,Unnamed: 0,restaurant_name,menu_item,menu_item_price,restaurant_location,inputted_location,restaurant_rating,number_of_ratings,restaurant_distance
0,0,Burger Station,Veggie Delight Salad,13.45,"10361 Rockingham Drive, Sacramento, CA, 95827, US","10361 Rockingham Drive, Sacramento, CA, 95827, US",0 mi,,•
1,1,Burger Station,Chicken Caesar Salad,14.10,"10361 Rockingham Drive, Sacramento, CA, 95827, US","10361 Rockingham Drive, Sacramento, CA, 95827, US",0 mi,,•
2,2,Burger Station,Green Salad,12.15,"10361 Rockingham Drive, Sacramento, CA, 95827, US","10361 Rockingham Drive, Sacramento, CA, 95827, US",0 mi,,•
3,3,Burger Station,Chef's Salad,14.10,"10361 Rockingham Drive, Sacramento, CA, 95827, US","10361 Rockingham Drive, Sacramento, CA, 95827, US",0 mi,,•
4,4,Burger Station,Oriental Salad,14.10,"10361 Rockingham Drive, Sacramento, CA, 95827, US","10361 Rockingham Drive, Sacramento, CA, 95827, US",0 mi,,•
...,...,...,...,...,...,...,...,...,...
9331,9331,R Burgers,Buffalo Dipping Sauce,0.25,"2590 Geer Rd, Turlock, CA, 95382, US","2505 3rd St, Ceres, CA, 95307, US",4.3,62,8 mi
9332,9332,R Burgers,Honey Mustard Dipping Sauce,0.25,"2590 Geer Rd, Turlock, CA, 95382, US","2505 3rd St, Ceres, CA, 95307, US",4.3,62,8 mi
9333,9333,R Burgers,Ranch Dipping Sauce,0.25,"2590 Geer Rd, Turlock, CA, 95382, US","2505 3rd St, Ceres, CA, 95307, US",4.3,62,8 mi
9334,9334,R Burgers,Sweet &amp; Sour Dipping Sauce,0.25,"2590 Geer Rd, Turlock, CA, 95382, US","2505 3rd St, Ceres, CA, 95307, US",4.3,62,8 mi


In [None]:
"raw_prices_ubereats_ca_fflocal_03292024.csv", "raw_prices_ubereats_ca_fflocal05272024.csv"),

In [126]:
file_pairs = [
    ("raw_prices_ubereats_ca_fflocal_03292024.csv", "final_ca_localff.csv")
]

# Initialize a dictionary to store missing addresses by restaurant name
missing_restaurants_ubereats = {}

# Loop through each pair of file paths
for file_pair in file_pairs:
    # Extract the file paths for rounds 1 and 2
    file_path_1 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[0]
    file_path_2 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[1]

    # Load the CSV files
    df_rnd_1 = pd.read_csv(file_path_1)
    df_rnd_2 = pd.read_csv(file_path_2, low_memory=False)
    
    # Convert the addresses to lowercase
    df_rnd_1['restaurant_location'] = df_rnd_1['restaurant_location'].str.lower()
    df_rnd_2['restaurant_location'] = df_rnd_2['restaurant_location'].str.lower()
    
    #Replace Restaurant Names
    df_rnd_2['restaurant_name'] = df_rnd_2['restaurant_name'].str.replace('BJ', "BJ's Restaurants", case=False)
    df_rnd_2['restaurant_name'] = df_rnd_2['restaurant_name'].str.replace("Chili's", "Chili's Grill & Bar", case=False)
    df_rnd_2['restaurant_name'] = df_rnd_2['restaurant_name'].str.replace("TGI Friday", "TGI Fridays", case=False)




    # Extract the restaurant names and addresses as dictionaries
    restaurants_rnd_1 = df_rnd_1.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    restaurants_rnd_2 = df_rnd_2.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    
    # Find missing addresses for each restaurant
    for restaurant, addresses_rnd_1 in restaurants_rnd_1.items():
        addresses_rnd_2 = restaurants_rnd_2.get(restaurant, set())
        missing_addresses = addresses_rnd_1 - addresses_rnd_2
        if missing_addresses:
            if restaurant not in missing_restaurants_ubereats:
                missing_restaurants_ubereats[restaurant] = []
            missing_restaurants_ubereats[restaurant].extend(missing_addresses)
        
        # Calculate the percentage of missing addresses for the current restaurant
        total_addresses_rnd_1 = len(addresses_rnd_1)
        if total_addresses_rnd_1 != 0:
            percent_missing = (len(missing_addresses) / total_addresses_rnd_1) * 100
            print(f"Percentage of missing addresses for {restaurant}: {percent_missing:.2f}%")

# Print the final dictionary of missing addresses
print("\nFinal dictionary of missing addresses by restaurant name:")
for restaurant, addresses in missing_restaurants_ubereats.items():
    print(f"{restaurant}: {addresses}")

Percentage of missing addresses for Betty Burgers: 0.00%
Percentage of missing addresses for Big Mouth Burgers: 0.00%
Percentage of missing addresses for Burger Basket: 0.00%
Percentage of missing addresses for Burger Boss: 0.00%
Percentage of missing addresses for Burger Factory: 0.00%
Percentage of missing addresses for Burger Palace: 0.00%
Percentage of missing addresses for Burger Station: 0.00%
Percentage of missing addresses for Cypress Best Burgers: 0.00%
Percentage of missing addresses for Gold Star Hamburgers: 0.00%
Percentage of missing addresses for Hook Burger: 0.00%
Percentage of missing addresses for P & G Burgers: 0.00%
Percentage of missing addresses for R Burgers: 0.00%
Percentage of missing addresses for Super Burger: 0.00%
Percentage of missing addresses for Varsity Burgers: 0.00%

Final dictionary of missing addresses by restaurant name:
