In [1]:
import json
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [2]:
file_path = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/raw_prices_ubereats_ca_ff_03222024.csv"
ca_ff = pd.read_csv(file_path)
file_path_2 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/uszips.csv"
ca_zip_count = pd.read_csv(file_path_2)
file_path_3 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/processed_prices_ubereats_ca_ff_03222024.csv"
example = pd.read_csv(file_path_3)

In [3]:
#Nan values 
nan_count = ca_ff.isnull().sum()
print(nan_count)

Unnamed: 0               0
restaurant_name          0
menu_item                0
menu_item_price          0
restaurant_location    250
inputted_location        0
restaurant_rating        0
number_of_ratings        0
restaurant_distance    235
dtype: int64


In [4]:
#Drop all the columns we don't need 
ca_ff_ = ca_ff.drop(columns=['Unnamed: 0', 'inputted_location','restaurant_distance'])
ca_ff_ = ca_ff_.dropna(subset=['restaurant_location'])

In [5]:
ca_zip_count = ca_zip_count[['zip', 'county_name']]

Exploring missing values 

In [6]:
#restaurant_rating cleaning 
# Ensure the column is of string type using .loc
ca_ff_.loc[:, 'restaurant_rating'] = ca_ff_['restaurant_rating'].astype(str)

# Count rows containing 'mi'
rows_with_mi = ca_ff_['restaurant_rating'].str.contains('mi').sum()
print("Number of rows with 'mi' in restaurant rating:", rows_with_mi)

# Replace invalid ratings ending with 'mi' with '0' using .loc
ca_ff_.loc[:, 'restaurant_rating'] = ca_ff_['restaurant_rating'].str.replace(r'.*mi$', '0', regex=True)

Number of rows with 'mi' in restaurant rating: 3742


In [7]:
#converting data types 
ca_ff_['restaurant_name'] = ca_ff_['restaurant_name'].astype('string')
ca_ff_['menu_item'] = ca_ff_['menu_item'].astype('string')
ca_ff_['menu_item'] = ca_ff_['menu_item'].str.replace(r'\s+', ' ', regex=True)
ca_ff_['restaurant_location'] = ca_ff_['restaurant_location'].astype('string')
ca_ff_['restaurant_rating'] = ca_ff_['restaurant_rating'].str.strip().astype(float)

In [8]:
#cleaning up string columns 

ca_ff_['menu_item'] = ca_ff_['menu_item'].str.lower()
ca_ff_['restaurant_location'] = ca_ff_['restaurant_location'].str.lower()
ca_ff_['restaurant_name'] = ca_ff_['restaurant_name'].str.replace('_', ' ')

#remove special characters
ca_ff_['menu_item'] = ca_ff_['menu_item'].apply(lambda x: ''.join(ch for ch in x if ch.isalnum() or ch.isspace()))
ca_ff_

Unnamed: 0,restaurant_name,menu_item,menu_item_price,restaurant_location,restaurant_rating,number_of_ratings
0,McDonald,medium french fries,4.69,"1330 jackson st, oakland, ca, 94612, us",4.6,100+
1,McDonald,10 pc chicken mcnuggets,6.69,"1330 jackson st, oakland, ca, 94612, us",4.6,100+
2,McDonald,big mac,6.79,"1330 jackson st, oakland, ca, 94612, us",4.6,100+
3,McDonald,2 cheeseburger meal,10.99,"1330 jackson st, oakland, ca, 94612, us",4.6,100+
4,McDonald,cheeseburger,3.49,"1330 jackson st, oakland, ca, 94612, us",4.6,100+
...,...,...,...,...,...,...
148708,The Habit,ranch,0.63,"1855 e daily dr, camarillo, ca, 93010, us",4.8,29
148709,The Habit,bbq sauce,0.63,"1855 e daily dr, camarillo, ca, 93010, us",4.8,29
148710,The Habit,teriyaki sauce,0.63,"1855 e daily dr, camarillo, ca, 93010, us",4.8,29
148711,The Habit,spicy red pepper sauce,0.63,"1855 e daily dr, camarillo, ca, 93010, us",4.8,29


In [9]:
def price_list(x):
    return list(x)

In [10]:
def mean_non_zero(x):
    return np.mean(x[x != 0]) if np.any(x != 0) else 0

def median_non_zero(x):
    return np.median(x[x != 0]) if np.any(x != 0) else 0

def std_non_zero(x):
    return np.std(x[x != 0]) if np.any(x != 0) else 0

McDonald's 

In [47]:
#Filter to just McDonalds
ca_ff_mcd = ca_ff_[ca_ff_['restaurant_name'] == 'McDonald']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_mcd = ca_ff_mcd.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_mcd.columns = [' '.join(col).strip() for col in grouped_mcd.columns.values]


#Second part of grouping 
mcd_lst = ['big mac', 'big mac meal', 'cheeseburger', 'hamburger', 'medium french fries']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_mcd = ca_ff_mcd[ca_ff_mcd['menu_item'].isin(mcd_lst)].sort_values('menu_item')
menu_items_mcd = menu_items_mcd.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_mcd_2 = menu_items_mcd.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_mcd_2[['specialty_item', 'combo', 'cheeseburger', 'hamburger', 'fries']] = grouped_mcd_2['menu_item_price'].apply(pd.Series)
grouped_mcd_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_mcd = pd.merge(grouped_mcd, grouped_mcd_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_mcd

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,specialty_item,combo,cheeseburger,hamburger,fries
0,McDonald,"1 christy dr, chadds ford, pa, 19317, us",5.516615,3.61,4.646159,4.3,145,90,6.59,11.69,3.09,2.49,3.99
1,McDonald,"100 brownswitch rd, slidell, la, 70458, us",4.788060,4.07,4.223423,4.0,134,360+,5.87,10.43,2.39,2.27,2.87
2,McDonald,"100 w parkwood ave, friendswood, tx, 77546, us",4.906496,3.69,3.657060,4.6,152,1,5.59,8.89,1.79,2.39,3.19
3,McDonald,"1000 commerce st, dallas, tx, 75202, us",4.806615,3.99,2.961904,4.4,146,4,5.89,11.19,2.79,2.19,4.09
4,McDonald,"1001 n 9th street, reading, pa, 19604, us",6.607355,4.69,4.840492,4.1,143,2,7.99,13.29,3.89,3.59,5.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,McDonald,"bryan, usa, bryan, tx, 77802, us",5.169764,4.09,4.200394,4.4,144,600+,5.11,8.19,2.35,2.15,3.11
439,McDonald,"i-45 at fm 646, league city, tx, 77539, us",5.033023,3.89,3.201751,4.3,151,180+,5.59,10.29,1.89,1.79,3.19
440,McDonald,"one poplar st, pittsburgh, pa, 15205, us",6.018992,4.66,4.304436,4.5,143,2,7.00,12.72,3.36,2.84,4.27
441,McDonald,"rt 420 &amp; 13, prospect park, pa, 19076, us",6.022083,3.84,4.671284,4.6,142,2,6.39,11.89,3.19,2.75,4.23


Jack in the Box 

In [48]:
#Filter to just Jack in the Box
ca_ff_jack = ca_ff_[ca_ff_['restaurant_name'] == 'Jack in the Box']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_jack = ca_ff_jack.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_jack.columns = [' '.join(col).strip() for col in grouped_jack.columns.values]

# #Second part of grouping 
jack_lst = ['jr jumbo jack', 'jr jumbo jack cheeseburger', 'jumbo jack', 'large french fry', 'large jumbo jack combo']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_jack = ca_ff_jack[ca_ff_jack['menu_item'].isin(jack_lst)].sort_values('menu_item')
menu_items_jack = menu_items_jack.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_jack_2 = menu_items_jack.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_jack_2[['hamburger', 'cheeseburger', 'specialty_item', 'fries', 'combo']] = grouped_jack_2['menu_item_price'].apply(pd.Series)
grouped_jack_2.drop(columns=['menu_item_price'], inplace=True)

# #Merging the grouped dfs together 
merged_jack = pd.merge(grouped_jack, grouped_jack_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_jack

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,hamburger,cheeseburger,specialty_item,fries,combo
0,Jack in the Box,"1000 east 41st street unit k, austin, tx, 7875...",8.531185,7.86,4.214700,4.3,135,1,3.74,4.11,6.11,5.36,12.35
1,Jack in the Box,"10004 telephone rd, houston, tx, 77075, us",7.440199,6.92,3.490341,4.5,151,370+,3.01,3.39,6.29,4.52,10.81
2,Jack in the Box,"1001 leander rd, georgetown, tx, 78628, us",8.661769,7.86,4.100395,4.4,147,1,3.74,4.11,6.11,5.36,12.35
3,Jack in the Box,"1001 south fwy, fort worth, tx, 76104, us",6.832810,6.24,3.350544,4.4,153,500+,2.86,3.24,4.99,3.36,9.98
4,Jack in the Box,"10014 s memorial dr e, tulsa, ok, 74133, us",6.932566,6.24,3.317524,4.2,152,250+,2.99,3.36,4.99,3.99,9.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,Jack in the Box,"905 legacy dr, plano, tx, 75023, us",7.234000,6.61,3.389221,4.5,150,1,3.49,4.11,5.36,4.36,8.98
162,Jack in the Box,"9052 w fairview ave, boise, id, 83704, us",7.673529,6.86,3.758633,4.3,153,800+,3.74,4.36,6.86,4.11,12.49
163,Jack in the Box,"9101 johnson dr, merriam, ks, 66202, us",7.128933,6.61,3.423880,4.1,150,600+,3.11,3.49,5.49,3.61,10.85
164,Jack in the Box,"916 n expressway, brownsville, tx, 78521, us",7.072857,6.24,3.452936,4.4,161,230+,2.99,3.36,5.36,4.36,10.23


In [49]:
ca_ff_jack

Unnamed: 0,restaurant_name,menu_item,menu_item_price,restaurant_location,restaurant_rating,number_of_ratings
30503,Jack in the Box,spicy sauced amp loaded popcorn chicken big bo...,13.74,"1124 s vista ave, boise, id, 83705, us",4.6,600+
30504,Jack in the Box,large spicy sauced amp loaded popcorn chicken ...,10.61,"1124 s vista ave, boise, id, 83705, us",4.6,600+
30505,Jack in the Box,spicy sauced amp loaded popcorn chicken,8.11,"1124 s vista ave, boise, id, 83705, us",4.6,600+
30506,Jack in the Box,classic sauced amp loaded popcorn chicken big ...,13.74,"1124 s vista ave, boise, id, 83705, us",4.6,600+
30507,Jack in the Box,large classic sauced amp loaded popcorn chicke...,10.61,"1124 s vista ave, boise, id, 83705, us",4.6,600+
...,...,...,...,...,...,...
218842,Jack in the Box,bacon breakfast jack,4.11,"11432 district drive, south jordan, ut, 84095, us",4.0,120+
218843,Jack in the Box,sausage breakfast jack,4.11,"11432 district drive, south jordan, ut, 84095, us",4.0,120+
218844,Jack in the Box,breakfast jack,3.74,"11432 district drive, south jordan, ut, 84095, us",4.0,120+
218845,Jack in the Box,mini pancakes wsyrup,3.74,"11432 district drive, south jordan, ut, 84095, us",4.0,120+


Wendy's

In [50]:
#Filter to just Wendy's
ca_ff_wendy = ca_ff_[ca_ff_['restaurant_name'] == 'Wendy']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'first', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_wendy = ca_ff_wendy.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_wendy.columns = [' '.join(col).strip() for col in grouped_wendy.columns.values]

# #Second part of grouping 
wendy_lst = ['daves combo', 'daves single', 'french fries', 'jr cheeseburger', 'jr hamburger']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_wendy = ca_ff_wendy[ca_ff_wendy['menu_item'].isin(wendy_lst)].sort_values('menu_item')
menu_items_wendy = menu_items_wendy.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_wendy_2 = menu_items_wendy.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_wendy_2[['combo', 'specialty_item', 'fries', 'cheeseburger', 'hamburger']] = grouped_wendy_2['menu_item_price'].apply(pd.Series)
grouped_wendy_2.drop(columns=['menu_item_price'], inplace=True)

merged_wendy = pd.merge(grouped_wendy, grouped_wendy_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_wendy


Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating first,menu_item count,number_of_ratings first,combo,specialty_item,fries,cheeseburger,hamburger
0,Wendy,"100 scarlet dr., chapel hill, nc, 27517, us",5.400192,4.625,3.344126,4.4,88,420+,0.0,6.33,0.0,2.43,1.82
1,Wendy,"1000 aerial center parkway, morrisville, nc, 2...",5.400192,4.625,3.344126,4.0,88,190+,0.0,6.33,0.0,2.43,1.82
2,Wendy,"1000 east 41st street, austin, tx, 78751, us",5.854151,4.870,2.861297,4.4,88,1,0.0,6.94,0.0,3.16,2.79
3,Wendy,"1000 w walnut hill lane, irving, tx, 75038, us",5.793846,4.990,2.939536,4.4,87,600+,0.0,6.70,0.0,2.92,2.67
4,Wendy,"1004 north loop 340, bellmead, tx, 76705, us",5.739423,4.990,2.939614,4.1,87,200+,0.0,6.70,0.0,2.92,2.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,Wendy,"9955 bluebonnet rd, baton rouge, la, 70810, us",5.639423,4.870,2.805198,3.9,86,220+,0.0,6.94,0.0,3.04,2.67
400,Wendy,"997 north state street, orem, ut, 84057, us",5.762745,4.870,2.944657,4.5,85,200+,0.0,6.94,0.0,2.67,2.31
401,Wendy,"n 1100 w block park lane, farmington, ut, 8402...",5.805882,4.870,2.962655,4.4,85,68,0.0,6.70,0.0,2.79,2.31
402,Wendy,"ocean highway east, leland, nc, 28451, us",5.381923,4.620,3.117716,4.2,86,100+,0.0,6.33,0.0,2.43,2.18


Burger King

In [51]:
#Filter to just Burger King 
ca_ff_bk = ca_ff_[ca_ff_['restaurant_name'] == 'Burger King']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_bk = ca_ff_bk.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_bk.columns = [' '.join(col).strip() for col in grouped_bk.columns.values]

# #Second part of grouping 
#they don't have a plain hamburger FOR NOW using whopper jr 
bk_lst = ['cheeseburger', 'french fries', 'whopper', 'whopper jr', 'whopper meal']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_bk = ca_ff_bk[ca_ff_bk['menu_item'].isin(bk_lst)].sort_values('menu_item')
menu_items_bk = menu_items_bk.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_bk_2 = menu_items_bk.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_bk_2[['cheeseburger', 'fries', 'specialty_item', 'hamburger', 'combo']] = grouped_bk_2['menu_item_price'].apply(pd.Series)
grouped_bk_2.drop(columns=['menu_item_price'], inplace=True)

merged_bk = pd.merge(grouped_bk, grouped_bk_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_bk

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,cheeseburger,fries,specialty_item,hamburger,combo
0,Burger King,"100 legends road, myrtle beach, sc, 29579, us",13.679296,13.740,8.316944,4.2,154,1,3.11,,,,
1,Burger King,"100 river oaks cove, georgetown, tx, 78626, us",7.673038,6.590,5.985244,4.3,79,340+,2.69,3.39,7.39,3.59,12.19
2,Burger King,"1001 east ben white boulevard, austin, tx, 787...",7.673038,6.590,5.985244,4.3,79,1,2.69,3.39,7.39,3.59,12.19
3,Burger King,"1005 south military avenue, green bay, wi, 543...",8.376489,6.815,6.084001,4.3,94,500+,2.98,4.28,8.95,5.18,15.59
4,Burger King,"101 commerce piace west, okatie, sc, 29909, us",7.325190,5.790,5.984162,4.3,79,140+,2.49,3.69,6.89,3.39,12.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,Burger King,"925 winchester road, huntsville, al, 35811, us",5.991807,6.100,2.899433,4.4,83,100+,2.69,3.79,6.60,4.15,9.30
340,Burger King,"9523 north lamar boulevard, austin, tx, 78753, us",7.673038,6.590,5.985244,4.2,79,600+,2.69,3.39,7.39,3.59,12.19
341,Burger King,"9533 corporation drive, indianapolis, in, 4625...",7.467568,6.240,5.947237,4.3,74,320+,2.39,3.49,7.09,4.99,11.39
342,Burger King,"974 south academy street, greenville, sc, 2960...",5.962289,6.100,2.914555,4.5,83,280+,2.45,3.79,6.60,3.79,9.30


Shake Shack

In [52]:
#Filter to just Shake Shack
ca_ff_shake = ca_ff_[ca_ff_['restaurant_name'] == 'Shake Shack']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_shake = ca_ff_shake.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_shake.columns = [' '.join(col).strip() for col in grouped_shake.columns.values]


#Second part of grouping 
shake_lst = ['cheeseburger', 'fries', 'hamburger', 'shackburger']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_shake = ca_ff_shake[ca_ff_shake['menu_item'].isin(shake_lst)].sort_values('menu_item')
menu_items_shake = menu_items_shake.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_shake_2 = menu_items_shake.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_shake_2[['cheeseburger', 'fries', 'hamburger', 'specialty_item']] = grouped_shake_2['menu_item_price'].apply(pd.Series)
grouped_shake_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_shake = pd.merge(grouped_shake, grouped_shake_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_shake['combo'] = np.nan

Sonic

In [53]:
#FLAG many missing prices for fries and 
#Filter to just Sonic 
ca_ff_sonic = ca_ff_[ca_ff_['restaurant_name'] == 'Sonic']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_sonic = ca_ff_sonic.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_sonic.columns = [' '.join(col).strip() for col in grouped_sonic.columns.values]


#Second part of grouping 
sonic_lst = ['fries', 'quarter pound double cheeseburger', 'supersonic double cheeseburger', 'supersonic double cheeseburger combo']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_sonic = ca_ff_sonic[ca_ff_sonic['menu_item'].isin(sonic_lst)].sort_values('menu_item')
menu_items_sonic = menu_items_sonic.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_sonic_2 = menu_items_sonic.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_sonic_2[['fries', 'cheeseburger', 'specialty_item', 'combo']] = grouped_sonic_2['menu_item_price'].apply(pd.Series)
grouped_sonic_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_sonic = pd.merge(grouped_sonic, grouped_sonic_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_sonic['hamburger'] = np.nan
merged_sonic

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,fries,cheeseburger,specialty_item,combo,hamburger
0,Sonic,"1001 tunnel road, asheville, nc, 28805, us",2.935946,2.890,1.189730,4.7,37,32,1.29,2.59,5.29,,
1,Sonic,"1005 south riverfront, dallas, tx, 75207, us",4.991316,4.505,2.846447,4.5,88,1,0.00,4.26,7.80,0.0,
2,Sonic,"10480 west ustick road, boise, id, 83704, us",5.251333,4.625,2.242706,4.3,58,600+,0.00,4.26,8.77,,
3,Sonic,"10570 perkins road, baton rouge, la, 70810, us",4.826053,3.890,2.960645,4.0,70,17,0.00,4.26,7.92,,
4,Sonic,"109 jones franklin road, raleigh, nc, 27606, us",4.963514,4.380,2.957876,4.5,172,700+,0.00,4.38,7.80,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,Sonic,"931 folly road, james island, sc, 29412, us",5.406000,5.175,3.000602,4.6,58,360+,0.00,4.26,7.80,,
151,Sonic,"940 ambassador caffery parkway, lafayette, la,...",4.887297,3.890,2.976965,4.2,65,37,0.00,4.26,7.92,,
152,Sonic,"951 montlimar dr, mobile, al, 36609, us",5.339737,4.560,3.143574,3.8,88,500+,0.00,4.38,9.38,0.0,
153,Sonic,"971 n main st, spanish fork, ut, 84660, us",4.782632,4.255,2.874614,4.6,88,68,0.00,3.50,8.18,0.0,


Five Guys 

In [54]:
#Filter for Five Guys
ca_ff_five = ca_ff_[ca_ff_['restaurant_name'] == 'Five Guys']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_five = ca_ff_five.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_five.columns = [' '.join(col).strip() for col in grouped_five.columns.values]


#Second part of grouping 
five_lst = ['cheeseburger', 'little cheeseburger', 'little hamburger', 'regular fries']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_five = ca_ff_five[ca_ff_five['menu_item'].isin(five_lst)].sort_values('menu_item')
menu_items_five = menu_items_five.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_five_2 = menu_items_five.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_five_2[['specialty_item', 'cheeseburger', 'hamburger', 'fries']] = grouped_five_2['menu_item_price'].apply(pd.Series)
grouped_five_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_five = pd.merge(grouped_five, grouped_five_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_five['combo'] = np.nan
merged_five

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,specialty_item,cheeseburger,hamburger,fries,combo
0,Five Guys,"100 moon shot dr suite 110, madison, al, 35758...",6.838276,6.590,3.056315,4.8,29,240+,11.39,8.87,8.27,5.99,
1,Five Guys,"1019 fording island rd., suite 101g, bluffton,...",8.344483,8.390,3.323807,4.7,29,700+,13.55,10.55,9.23,7.43,
2,Five Guys,"1051 south 750 east, orem, ut, 84097, us",7.567143,7.370,2.928996,4.6,28,700+,12.23,10.19,8.99,7.19,
3,Five Guys,"10645 broadway st., suite #120, pearland, tx, ...",8.516786,8.925,3.207797,4.6,28,1,13.45,10.51,9.17,7.33,
4,Five Guys,"1075 w baltimore pike, media, usa, middletown,...",8.352759,8.510,3.360459,4.7,29,430+,13.67,10.43,9.35,7.31,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,Five Guys,"9635 riverside pkwy, tulsa, ok, 74137, us",8.456207,8.630,3.245679,4.6,29,380+,13.55,10.55,9.23,7.43,
147,Five Guys,"9749 northlake center parkway; suite e, charlo...",8.344483,8.390,3.323807,4.6,29,700+,13.55,10.55,9.23,7.43,
148,Five Guys,"9826 gilead rd., suite c-106, huntersville, nc...",8.116897,7.910,3.233925,4.8,29,1,13.07,10.07,8.87,7.19,
149,Five Guys,"east town center, lincoln hwy. east, lancaster...",8.824483,9.230,3.374954,4.6,29,350+,13.91,11.27,9.71,7.19,


The Habit

In [55]:
ca_ff_habit = ca_ff_[ca_ff_['restaurant_name'] == 'The Habit']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_habit = ca_ff_habit.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_habit.columns = [' '.join(col).strip() for col in grouped_habit.columns.values]


#Second part of grouping 
habit_lst = ['2 original double char meal', 'charburger', 'charburger with cheese', 'double char', 'french fries']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_habit = ca_ff_habit[ca_ff_habit['menu_item'].isin(habit_lst)].sort_values('menu_item')
menu_items_habit = menu_items_habit.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_habit_2 = menu_items_habit.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_habit_2[['combo', 'hamburger', 'cheeseburger', 'specialty_item','fries']] = grouped_habit_2['menu_item_price'].apply(pd.Series)
grouped_habit_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_habit = pd.merge(grouped_habit, grouped_habit_2, on=['restaurant_name', 'restaurant_location'], how='inner')
#merged_five['combo'] = np.nan
merged_habit

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,combo,hamburger,cheeseburger,specialty_item,fries
0,The Habit,"1122 west wade hampton boulevard, greer, sc, 2...",10.368974,7.835,9.666494,4.5,156,41,15.87,6.74,7.86,8.74,3.74
1,The Habit,"120 village drive, king of prussia, usa, upper...",10.354615,7.835,9.672048,4.7,312,430+,15.87,6.74,7.86,8.74,3.74
2,The Habit,"1489 w. o. ezell boulevard, spartanburg, sc, 2...",10.0765,7.365,10.693992,3.5,60,16,15.87,6.74,7.86,8.74,3.74
3,The Habit,"16291 n marketplace blvd, nampa, id, 83687, us",10.517821,7.835,9.701138,4.4,78,600+,16.24,6.74,7.86,8.74,4.11
4,The Habit,"2050 skibo rd, fayetteville, nc, 28314, us",10.899231,8.74,9.761548,4.4,78,700+,17.24,6.99,8.11,9.24,4.36
5,The Habit,"2206 n eagle rd, meridian, id, 83646, us",10.354615,7.835,9.672048,4.4,78,900+,15.87,6.74,7.86,8.74,3.74
6,The Habit,"260 w union ave, farmington, ut, 84025, us",10.403671,7.81,9.692118,4.6,79,100+,16.24,6.74,7.86,8.74,4.11
7,The Habit,"335 12th street, ogden, ut, 84404, us",10.419351,7.86,9.718882,4.7,77,310+,15.87,6.74,7.86,8.74,3.74
8,The Habit,"3703 battleground avenue, greensboro, nc, 2741...",10.368974,7.835,9.666494,4.6,156,67,15.87,6.74,7.86,8.74,3.74
9,The Habit,"379 west plaza drive, mooresville, nc, 28117, us",10.368974,7.835,9.666494,4.4,78,30,15.87,6.74,7.86,8.74,3.74


Carl's Jr.

In [56]:
#Filter to Carl's Jr 

ca_ff_carls = ca_ff_[ca_ff_['restaurant_name'] == 'Carls Jr']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_carls = ca_ff_carls.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_carls.columns = [' '.join(col).strip() for col in grouped_carls.columns.values]


#Second part of grouping 
carls_lst = ['california classic double cheeseburger', 'naturalcut french fries', 'single big carl', 'single big carl combo']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_carls = ca_ff_carls[ca_ff_carls['menu_item'].isin(carls_lst)].sort_values('menu_item')
menu_items_carls = menu_items_carls.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_carls_2 = menu_items_carls.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_carls_2[['cheeseburger', 'fries', 'specialty_item','combo']] = grouped_carls_2['menu_item_price'].apply(pd.Series)
grouped_carls_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_carls = pd.merge(grouped_carls, grouped_carls_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_carls['hamburger'] = np.nan
merged_carls

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,cheeseburger,fries,specialty_item,combo,hamburger
0,Carls Jr,"10620 montana ave, el paso, tx, 79935, us",5.813077,4.84,2.754061,4.6,92,500+,4.19,2.99,7.49,0.0,
1,Carls Jr,"1120 e charles page blvd, sand springs, ok, 74...",8.267407,8.8,4.384035,4.3,55,50,5.11,3.11,7.49,11.99,
2,Carls Jr,"1124 mcrae blvd, el paso, tx, 79925, us",5.813077,4.84,2.754061,4.5,92,340+,4.19,2.99,7.49,0.0,
3,Carls Jr,"1155 w, riverdale, ut, 84405, us",7.159091,6.39,4.160458,4.3,95,270+,5.09,3.79,7.69,0.0,
4,Carls Jr,"1320 n eagle rd, meridian, id, 83642, us",6.453462,5.34,3.048141,4.5,92,230+,4.99,3.79,6.29,0.0,
5,Carls Jr,"1331 n state st, orem, ut, 84057, us",7.135455,6.39,4.185595,4.4,95,170+,5.09,3.79,7.69,0.0,
6,Carls Jr,"1609 s entertainment ave, boise, id, 83709, us",6.453462,5.34,3.048141,4.5,92,89,4.99,3.79,6.29,0.0,
7,Carls Jr,"1815 s meridian rd, meridan, id, 83642, us",6.453462,5.34,3.048141,4.8,92,240+,4.99,3.79,6.29,0.0,
8,Carls Jr,"1868 w 1800 n, clinton, ut, 84015, us",7.159091,6.39,4.160458,4.5,95,130+,5.09,3.79,7.69,0.0,
9,Carls Jr,"1898 n lee trevino dr, el paso, tx, 79936, us",5.813077,4.84,2.754061,4.4,92,150+,4.19,2.99,7.49,0.0,


In [76]:
#Filter to Hardees's Jr 

ca_ff_hardee = ca_ff_[ca_ff_['restaurant_name'] == 'Hardees']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_hardee = ca_ff_hardee.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_hardee.columns = [' '.join(col).strip() for col in grouped_hardee.columns.values]


#Second part of grouping 
carls_lst = ['big cheeseburger', 'famous star', 'famous star combo', 'naturalcut french fries']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_hardee = ca_ff_hardee[ca_ff_hardee['menu_item'].isin(carls_lst)].sort_values('menu_item')
menu_items_hardee = menu_items_hardee.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_hardee_2 = menu_items_hardee.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_hardee_2[['cheeseburger', 'specialty_item', 'combo','fries']] = grouped_hardee_2['menu_item_price'].apply(pd.Series)
grouped_hardee_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_hardee = pd.merge(grouped_hardee, grouped_hardee_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_hardee['hamburger'] = np.nan
merged_hardee

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,cheeseburger,specialty_item,combo,fries,hamburger
0,Hardees,"1 gateway blvd s, savannah, ga, 31419-7551, us",8.857500,7.485,7.239080,4.2,67,10,5.74,7.99,0.00,3.24,
1,Hardees,"10 e clemmonsville rd, winston salem, nc, 2712...",7.503617,7.130,5.341806,4.9,75,16,5.19,7.61,0.00,2.77,
2,Hardees,"10 hwy 17 n, surfside beach, sc, 29575, us",7.441702,6.760,5.362602,4.7,75,10,5.19,7.61,0.00,2.77,
3,Hardees,"100 vandora springs rd., garner, nc, 27529, us",8.155185,7.490,4.388304,3.2,82,10,5.36,6.61,10.99,,
4,Hardees,"1000 acorn dr, nashville, tn, 37210, us",6.779796,6.060,5.292074,4.2,79,42,5.07,7.13,0.00,2.77,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,Hardees,"9020 east 21st street, indianapolis, in, 46229...",6.876190,5.875,4.879454,0.0,134,2,5.63,7.67,0.00,2.87,
93,Hardees,"915 army post rd, des moines, ia, 50315, us",8.299184,6.860,7.098840,3.6,76,30,5.36,7.49,0.00,3.24,
94,Hardees,"917 allison-bonnett memorial dr, hueytown, al,...",7.540851,6.060,5.794948,0.0,74,15,5.19,7.25,0.00,2.77,
95,Hardees,"940 thornton rd., lithia springs, ga, 30122, us",7.817955,6.885,6.171346,4.5,71,24,6.16,7.13,0.00,3.38,


In [77]:
#Stack all restaurants
uber_eats_ff_rnd1_prices = merged_hardee #pd.concat([merged_mcd, merged_jack, merged_wendy, merged_shake, merged_bk, merged_sonic, merged_carls, merged_habit, merged_five, merged_hardee]).reset_index(drop=True)

In [63]:
#If there are bad addresses, replace them with the actual address here 

uber_eats_ff_rnd1_prices.loc[1012, 'restaurant_location'] = "s64w15924 commerce center parkway, muskego, wi, 53150"
uber_eats_ff_rnd1_prices.loc[1755, 'restaurant_location'] = "860 peachtree rd ne, atlanta, ga, 30308"

Add location columns 

In [78]:
pattern = r",\s*([a-zA-Z]{2})\s*,?\s*(\d{5}(?:-\d{4})?)"

def extract_state_zip(address):
    match = re.search(pattern, address)
    if match:
        state, zip_code = match.groups()
        return state, zip_code
    else:
        return None, None

# Apply the function to extract state and zip code
uber_eats_ff_rnd1_prices[['state', 'zip']] = uber_eats_ff_rnd1_prices['restaurant_location'].apply(lambda x: pd.Series(extract_state_zip(x)))
uber_eats_ff_rnd1_prices['zip'] = uber_eats_ff_rnd1_prices['zip'].str.split('-').str[0].astype(int)

#Get county 
uber_eats_ff_rnd1_prices = uber_eats_ff_rnd1_prices.merge(ca_zip_count, on = 'zip')

Dataset info

In [79]:
specific_date = datetime.strptime('05142024', '%m%d%Y')
# Assign the datetime object to the entire 'date' column
uber_eats_ff_rnd1_prices['date'] = specific_date
uber_eats_ff_rnd1_prices['uber_eats'] = 1
uber_eats_ff_rnd1_prices['post_policy'] = 1
uber_eats_ff_rnd1_prices['fast_food'] = 0
uber_eats_ff_rnd1_prices['local'] = 0 

In [80]:
#Save as csv 
uber_eats_ff_rnd1_prices.to_csv('processed_prices_ubereats_ca_ffullserv_05142024.csv', index = True)