In [1]:
import json
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [28]:
def find_missing_restaurants(file_pairs, base_path):
    """
    This function finds the restaurants present in the first CSV file of each pair 
    but missing in the second CSV file of each pair.

    Parameters:
    file_pairs (list of tuples): List of pairs of file names.
    base_path (str): The base path where the CSV files are located.

    Returns:
    dict: A dictionary where keys are the first file names in the pairs and values are lists of missing restaurants.
    """
    missing_restaurants_ff_website = {}

    # Loop through each pair of file paths
    for file_pair in file_pairs:
        # Extract the file paths for rounds 1 and 2
        file_path_1 = base_path + file_pair[0]
        file_path_2 = base_path + file_pair[1]

        # Load the CSV files
        df_rnd_1 = pd.read_csv(file_path_1)
        df_rnd_2 = pd.read_csv(file_path_2)

        # Extract the restaurant addresses
        restaurants_rnd_1 = set(df_rnd_1['restaurant_address'])
        restaurants_rnd_2 = set(df_rnd_2['restaurant_address'])

        # Find restaurants that are in round 1 but not in round 2
        missing_restaurants = list(restaurants_rnd_1 - restaurants_rnd_2)

        # Store the missing restaurants in the dictionary
        missing_restaurants_ff_website[file_pair[0]] = missing_restaurants

    return missing_restaurants_ff_website

# Define the list of pairs of CSV files for corresponding rounds
file_pairs = [
    ("raw_prices_burgerking_ca_03282024.csv", "raw_prices_burgerking_ca_05152024.csv"),
    ("raw_prices_burgerking_non_ca_03302024.csv", "raw_prices_burgerking_non_ca_05152024.csv"),
    ("raw_prices_carlsjr_ca_03292024.csv", "raw_prices_carlsjr_ca_05152024.csv"),
    ("raw_prices_carlsjr_non_ca_03302024.csv", "raw_prices_carlsjr_non_ca_05162024.csv"),
    ("raw_prices_hardees_non_ca_03292024.csv", "raw_prices_hardees_non_ca_05162024.csv"),
]

# Define the base path where the CSV files are located
base_path = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/"

# Call the function and store the result
missing_restaurants_ff_website = find_missing_restaurants(file_pairs, base_path)


In [27]:
def find_missing_addresses(file_pairs, base_path):
    """
    This function finds the addresses of restaurants present in the first CSV file of each pair 
    but missing in the second CSV file of each pair.

    Parameters:
    file_pairs (list of tuples): List of pairs of file names.
    base_path (str): The base path where the CSV files are located.

    Returns:
    dict: A dictionary where keys are restaurant names and values are lists of missing addresses.
    """
    missing_restaurants_ubereats = {}

    # Loop through each pair of file paths
    for file_pair in file_pairs:
        # Extract the file paths for rounds 1 and 2
        file_path_1 = base_path + file_pair[0]
        file_path_2 = base_path + file_pair[1]

        # Load the CSV files
        df_rnd_1 = pd.read_csv(file_path_1)
        df_rnd_2 = pd.read_csv(file_path_2)

        # Extract the restaurant names and addresses as dictionaries
        restaurants_rnd_1 = df_rnd_1.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
        restaurants_rnd_2 = df_rnd_2.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()

        # Find missing addresses for each restaurant
        for restaurant, addresses_rnd_1 in restaurants_rnd_1.items():
            addresses_rnd_2 = restaurants_rnd_2.get(restaurant, set())
            missing_addresses = addresses_rnd_1 - addresses_rnd_2
            if missing_addresses:
                if restaurant not in missing_restaurants_ubereats:
                    missing_restaurants_ubereats[restaurant] = []
                missing_restaurants_ubereats[restaurant].extend(missing_addresses)

    return missing_restaurants_ubereats

# Define the list of pairs of CSV files for corresponding rounds
file_pairs = [
    ("raw_prices_ubereats_ca_ff_03222024.csv", "raw_prices_ubereats_ca_ff_05142024.csv"),
    ("raw_prices_ubereats_ca_fflocal_03292024.csv", "raw_prices_ubereats_ca_fflocal05272024.csv"),
    ("raw_prices_ubereats_ca_fullserv_03252024.csv", "raw_prices_ubereats_ca_ffullserv_05142024.csv"),
    ("raw_prices_ubereats_nonca_ff_03292024.csv", "raw_prices_ubereats_nonca_ff_05162024.csv"),
    ("raw_prices_ubereats_nonca_fullserv_03252024.csv", "raw_prices_ubereats_nonca_fullserv_05162024.csv"),
]

# Define the base path where the CSV files are located
base_path = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/"

# Call the function and store the result
missing_restaurants_ubereats = find_missing_addresses(file_pairs, base_path)


In [5]:
# Define the list of pairs of CSV files for corresponding rounds
file_pairs = [
    ("raw_prices_wendys_ca_03302024.csv", "raw_prices_wendys_ca_05142024.csv"),
    ("raw_prices_wendys_nonca_03302024.csv", "raw_prices_wendys_nonca_05142024.csv")
]

# Initialize a dictionary to store missing restaurant names
wendys_missing = {}

# Loop through each pair of file paths
for file_pair in file_pairs:
    # Extract the file paths for rounds 1 and 2
    file_path_1 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[0]
    file_path_2 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[1]

    # Load the CSV files
    df_rnd_1 = pd.read_csv(file_path_1)
    df_rnd_2 = pd.read_csv(file_path_2)

    # Extract the restaurant addresses
    restaurants_rnd_1 = set(df_rnd_1['address'])
    restaurants_rnd_2 = set(df_rnd_2['address'])

    # Find restaurants that are in round 1 but not in round 2
    missing_restaurants = list(restaurants_rnd_1 - restaurants_rnd_2)

    # Store the missing restaurants in the dictionary
    wendys_missing[file_pair[0]] = missing_restaurants

In [6]:
wendys_missing

{'raw_prices_wendys_ca_03302024.csv': [],
 'raw_prices_wendys_nonca_03302024.csv': ['2218 Cloverdale Avenue, Winston Salem, NC, 27103, US']}

Concatenating missing with old data

In [7]:
#Separate CA and NONCA 

missing_ff = pd.read_csv("missing_data_ff.csv").dropna()
df_ca = missing_ff[missing_ff['restaurant_location'].str.contains('CA')]
df_nonca = missing_ff[~missing_ff['restaurant_location'].str.contains('CA')]


# #Combine with old round 2 
ca_rnd_2 = pd.read_csv("/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/raw_prices_ubereats_ca_ff_05142024.csv")
ca_rnd_2_complete = pd.concat([df_ca, ca_rnd_2])

nonca_rnd_2 = pd.read_csv("/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/raw_prices_ubereats_nonca_ff_05162024.csv")
nonca_rnd_2_complete = pd.concat([df_nonca, nonca_rnd_2])

# #Save as csv for further processing 
ca_rnd_2_complete.to_csv("ubereats_ca_ff_complete.csv")
nonca_rnd_2_complete.to_csv("ubereats_nonca_ff_complete.csv")

In [8]:
missing_ff = pd.read_csv("missing_data_ff_ca_rnd2.csv").dropna()
final_ca = pd.concat([ca_rnd_2_complete, missing_ff])
final_ca.to_csv("final_ca_rnd2.csv")

In [55]:
missing_ff = pd.read_csv("missing_data_ff_nonca_rnd2.csv").dropna()
final_nonca = pd.concat([nonca_rnd_2_complete, missing_ff])
final_nonca.to_csv("final_nonca_rnd2.csv")

In [9]:
file_pairs = [
    ("raw_prices_ubereats_ca_ff_03222024.csv", "final_ca_rnd2.csv")
]

# Initialize a dictionary to store missing addresses by restaurant name
missing_restaurants_ubereats = {}

# Loop through each pair of file paths
for file_pair in file_pairs:
    # Extract the file paths for rounds 1 and 2
    file_path_1 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[0]
    file_path_2 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[1]

    # Load the CSV files
    df_rnd_1 = pd.read_csv(file_path_1)
    df_rnd_2 = pd.read_csv(file_path_2, low_memory=False)
    
    # Convert the addresses to lowercase
    df_rnd_1['restaurant_location'] = df_rnd_1['restaurant_location'].str.lower()
    df_rnd_2['restaurant_location'] = df_rnd_2['restaurant_location'].str.lower()

    # Extract the restaurant names and addresses as dictionaries
    restaurants_rnd_1 = df_rnd_1.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    restaurants_rnd_2 = df_rnd_2.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    
    # Find missing addresses for each restaurant
    for restaurant, addresses_rnd_1 in restaurants_rnd_1.items():
        addresses_rnd_2 = restaurants_rnd_2.get(restaurant, set())
        missing_addresses = addresses_rnd_1 - addresses_rnd_2
        if missing_addresses:
            if restaurant not in missing_restaurants_ubereats:
                missing_restaurants_ubereats[restaurant] = []
            missing_restaurants_ubereats[restaurant].extend(missing_addresses)
        
        # Calculate the percentage of missing addresses for the current restaurant
        total_addresses_rnd_1 = len(addresses_rnd_1)
        if total_addresses_rnd_1 != 0:
            percent_missing = (len(missing_addresses) / total_addresses_rnd_1) * 100
            print(f"Percentage of missing addresses for {restaurant}: {percent_missing:.2f}%")

# Print the final dictionary of missing addresses
print("\nFinal dictionary of missing addresses by restaurant name:")
for restaurant, addresses in missing_restaurants_ubereats.items():
    print(f"{restaurant}: {addresses}")

Percentage of missing addresses for Burger King: 1.80%
Percentage of missing addresses for Carls Jr: 2.04%
Percentage of missing addresses for Five Guys: 0.00%
Percentage of missing addresses for Jack in the Box: 0.98%
Percentage of missing addresses for McDonald: 0.44%
Percentage of missing addresses for Shake Shack: 0.00%
Percentage of missing addresses for Sonic: 0.00%
Percentage of missing addresses for The Habit: 6.52%
Percentage of missing addresses for Wendy: 0.74%

Final dictionary of missing addresses by restaurant name:
Burger King: ['702 north wilson way, stockton, ca, 95205, us', '972 el camino real, south san francisco, ca, 94080, us', '8304 el camino real, atascadero, ca, 93422, us']
Carls Jr: ['5501 freeport ave, sacramento, ca, 95822, us', '1999 camden ave, san jose, ca, 95124, us']
Jack in the Box: ['1100 el camino real, san carlos, ca, 94070, us', '1900 ramada drive, paso robles, ca, 93446, us']
McDonald: ['40465 winchester rd, temecula, ca, 92591, us']
The Habit: ['1

In [20]:
#Need to merge hardees and nonca together 

nonca_ff = pd.read_csv("/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/raw_prices_ubereats_nonca_ff_05162024.csv")
# hardee = pd.read_csv("/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/raw_prices_ubereats_nonca_hardees_03282024.csv")
# non_ca_rnd_1 = pd.concat([nonca_ff, hardee])
# non_ca_rnd_1.to_csv("raw_prices_ubereats_nonca_ff_hardees_032920204.csv")
nonca_ff

Unnamed: 0.1,Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,specialty_item,...,restaurant_rating first,state,zip,county_name_x,county_name_y,date,uber_eats,post_policy,fast_food,local
0,0,McDonald,"1 christy dr, chadds ford, pa, 19317, us",5.516615,3.61,4.646159,4.3,145,90,6.59,...,,pa,19317,,Chester,2024-05-16,1,1,1,0
1,1,McDonald,"100 brownswitch rd, slidell, la, 70458, us",4.788060,4.07,4.223423,4.0,134,360+,5.87,...,,la,70458,,St. Tammany,2024-05-16,1,1,1,0
2,2,McDonald,"3301 pontchartrain drive, slidell, la, 70458, us",4.927594,4.07,4.420474,4.2,133,460+,5.87,...,,la,70458,,St. Tammany,2024-05-16,1,1,1,0
3,3,Wendy,"3915 pontchartrain, slidell, la, 70458, us",5.523137,4.87,2.958956,,83,170+,6.70,...,4.2,la,70458,,St. Tammany,2024-05-16,1,1,1,0
4,4,Burger King,"185 gause blvd., slidell, la, 70458, us",7.343924,5.39,5.966670,4.1,79,260+,6.49,...,,la,70458,,St. Tammany,2024-05-16,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1787,1787,Five Guys,"springs at trussville street, center point, al...",8.464483,8.51,3.323807,4.5,29,240+,13.67,...,,al,35235,,Jefferson,2024-05-16,1,1,1,0
1788,1788,Hardee,"2450 e layton ave, saint francis, wi, 53235-60...",10.064328,9.99,5.060824,4.5,136,270+,7.61,...,,wi,53235,Milwaukee,Milwaukee,2024-05-16,1,1,1,0
1789,1789,Hardee,"2930 highway 138 sw, conyers, ga, 30094, us",7.680000,6.70,6.376385,3.9,70,130+,7.13,...,,ga,30094,Rockdale,Rockdale,2024-05-16,1,1,1,0
1790,1790,Hardee,"3062 anvil block road, ellenwood, ga, 30294, us",7.680000,6.70,6.376385,3.9,70,310+,7.13,...,,ga,30294,DeKalb,DeKalb,2024-05-16,1,1,1,0


In [24]:
file_pairs = [
    ("raw_prices_ubereats_nonca_ff_hardees_032920204.csv", "raw_prices_ubereats_nonca_ff_05162024.csv")
]

# Initialize a dictionary to store missing addresses by restaurant name
missing_restaurants_ubereats = {}

# Loop through each pair of file paths
for file_pair in file_pairs:
    # Extract the file paths for rounds 1 and 2
    file_path_1 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[0]
    file_path_2 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/" + file_pair[1]

    # Load the CSV files
    df_rnd_1 = pd.read_csv(file_path_1)
    df_rnd_2 = pd.read_csv(file_path_2, low_memory=False)
    
    # Convert the addresses to lowercase
    df_rnd_1['restaurant_location'] = df_rnd_1['restaurant_location'].str.lower()
    df_rnd_2['restaurant_location'] = df_rnd_2['restaurant_location'].str.lower()
    
    df_rnd_2['restaurant_name'] = df_rnd_2['restaurant_name'].str.replace('Hardee', 'Hardees', case=False)


    # Extract the restaurant names and addresses as dictionaries
    restaurants_rnd_1 = df_rnd_1.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    restaurants_rnd_2 = df_rnd_2.groupby('restaurant_name')['restaurant_location'].apply(set).to_dict()
    
    # Find missing addresses for each restaurant
    for restaurant, addresses_rnd_1 in restaurants_rnd_1.items():
        addresses_rnd_2 = restaurants_rnd_2.get(restaurant, set())
        missing_addresses = addresses_rnd_1 - addresses_rnd_2
        if missing_addresses:
            if restaurant not in missing_restaurants_ubereats:
                missing_restaurants_ubereats[restaurant] = []
            missing_restaurants_ubereats[restaurant].extend(missing_addresses)
        
        # Calculate the percentage of missing addresses for the current restaurant
        total_addresses_rnd_1 = len(addresses_rnd_1)
        if total_addresses_rnd_1 != 0:
            percent_missing = (len(missing_addresses) / total_addresses_rnd_1) * 100
            print(f"Percentage of missing addresses for {restaurant}: {percent_missing:.2f}%")

# Print the final dictionary of missing addresses
print("\nFinal dictionary of missing addresses by restaurant name:")
for restaurant, addresses in missing_restaurants_ubereats.items():
    print(f"{restaurant}: {addresses}")

Percentage of missing addresses for Burger King: 38.57%
Percentage of missing addresses for Carls Jr: 93.75%
Percentage of missing addresses for Five Guys: 51.45%
Percentage of missing addresses for Hardees: 87.13%
Percentage of missing addresses for Jack in the Box: 26.72%
Percentage of missing addresses for McDonald: 45.38%
Percentage of missing addresses for Shake Shack: 33.73%
Percentage of missing addresses for Sonic: 58.85%
Percentage of missing addresses for The Habit: 38.10%
Percentage of missing addresses for Wendy: 42.34%

Final dictionary of missing addresses by restaurant name:
Burger King: [nan, '1702 memorial blvd, murfreesboro, tn, 37129, us', '3900 geary boulevard, san francisco, ca, 94118, us', '810 north main street, oregon, wi, 53575, us', '3500 south meridian avenue, wichita, ks, 67217, us', '1726 bluff ridge drive, syracuse, ut, 84075, us', '493 old hickory blvd, brentwood, tn, 37027, us', '3155 s padre island dr, corpus christi, tx, 78415, us', '301 west 3rd stree