In [2]:
import json
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [3]:
file_path = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/raw_prices_ubereats_ca_ff_03222024.csv"
ca_ff = pd.read_csv(file_path, low_memory = False)
file_path_2 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/uszips.csv"
ca_zip_count = pd.read_csv(file_path_2)

In [4]:
ca_ff[ca_ff['restaurant_name'] == 'Hardees']

Unnamed: 0.1,Unnamed: 0,restaurant_name,menu_item,menu_item_price,restaurant_location,inputted_location,restaurant_rating,number_of_ratings,restaurant_distance


In [5]:
#Nan values 
nan_count = ca_ff.isnull().sum()
print(nan_count)

Unnamed: 0               0
restaurant_name          0
menu_item                0
menu_item_price          0
restaurant_location    250
inputted_location        0
restaurant_rating        0
number_of_ratings        0
restaurant_distance    235
dtype: int64


In [6]:
#Drop all the columns we don't need 
ca_ff_ = ca_ff.drop(columns=['Unnamed: 0', 'inputted_location','restaurant_distance'])
ca_ff_ = ca_ff_.dropna(subset=['restaurant_location'])

In [7]:
ca_zip_count = ca_zip_count[['zip', 'county_name']]

Exploring missing values 

In [23]:
#restaurant_rating cleaning 
# Ensure the column is of string type using .loc
ca_ff_.loc[:, 'restaurant_rating'] = ca_ff_['restaurant_rating'].astype(str)

# Count rows containing 'mi'
rows_with_mi = ca_ff_['restaurant_rating'].str.contains('mi').sum()
print("Number of rows with 'mi' in restaurant rating:", rows_with_mi)

# Replace invalid ratings ending with 'mi' with '0' using .loc
ca_ff_.loc[:, 'restaurant_rating'] = ca_ff_['restaurant_rating'].str.replace(r'.*mi$', '0', regex=True)
ca_ff_ = ca_ff_.dropna()
ca_ff_ = ca_ff_[ca_ff_['menu_item_price'] != 0]

Number of rows with 'mi' in restaurant rating: 0


In [24]:
#converting data types 
ca_ff_['restaurant_name'] = ca_ff_['restaurant_name'].astype('string')
ca_ff_['menu_item'] = ca_ff_['menu_item'].astype('string')
ca_ff_['menu_item'] = ca_ff_['menu_item'].str.replace(r'\s+', ' ', regex=True)
ca_ff_['restaurant_location'] = ca_ff_['restaurant_location'].astype('string')
ca_ff_['restaurant_rating'] = ca_ff_['restaurant_rating'].str.strip().astype(float)

In [25]:
#cleaning up string columns 

ca_ff_['menu_item'] = ca_ff_['menu_item'].str.lower()
ca_ff_['restaurant_location'] = ca_ff_['restaurant_location'].str.lower()
ca_ff_['restaurant_name'] = ca_ff_['restaurant_name'].str.replace('_', ' ')

#remove special characters
ca_ff_['menu_item'] = ca_ff_['menu_item'].apply(lambda x: ''.join(ch for ch in x if ch.isalnum() or ch.isspace()))

In [26]:
def price_list(x):
    return list(x)

In [27]:
def mean_non_zero(x):
    return np.mean(x[x != 0]) if np.any(x != 0) else 0

def median_non_zero(x):
    return np.median(x[x != 0]) if np.any(x != 0) else 0

def std_non_zero(x):
    return np.std(x[x != 0]) if np.any(x != 0) else 0

Summary Stats

McDonald's 

In [67]:
#Filter to just McDonalds
ca_ff_mcd = ca_ff_[ca_ff_['restaurant_name'] == 'McDonald']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_mcd = ca_ff_mcd.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_mcd.columns = [' '.join(col).strip() for col in grouped_mcd.columns.values]


#Second part of grouping 
mcd_lst = ['big mac', 'big mac meal', 'cheeseburger', 'hamburger', 'medium french fries']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_mcd = ca_ff_mcd[ca_ff_mcd['menu_item'].isin(mcd_lst)].sort_values('menu_item')
menu_items_mcd = menu_items_mcd.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_mcd_2 = menu_items_mcd.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_mcd_2[['specialty_item', 'combo', 'cheeseburger', 'hamburger', 'fries']] = grouped_mcd_2['menu_item_price'].apply(pd.Series)
grouped_mcd_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_mcd = pd.merge(grouped_mcd, grouped_mcd_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_mcd

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,specialty_item,combo,cheeseburger,hamburger,fries
0,McDonald,"1 christy dr, chadds ford, pa, 19317, us",5.516615,3.61,4.646159,4.3,145,90,6.59,11.69,3.09,2.49,3.99
1,McDonald,"100 brownswitch rd, slidell, la, 70458, us",4.788060,4.07,4.223423,4.0,134,360+,5.87,10.43,2.39,2.27,2.87
2,McDonald,"100 w parkwood ave, friendswood, tx, 77546, us",4.906496,3.69,3.657060,4.6,152,1,5.59,8.89,1.79,2.39,3.19
3,McDonald,"1000 commerce st, dallas, tx, 75202, us",4.806615,3.99,2.961904,4.4,146,4,5.89,11.19,2.79,2.19,4.09
4,McDonald,"1001 n 9th street, reading, pa, 19604, us",6.607355,4.69,4.840492,4.1,143,2,7.99,13.29,3.89,3.59,5.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...
581,McDonald,"bryan, usa, bryan, tx, 77802, us",5.169764,4.09,4.200394,4.4,144,600+,5.11,8.19,2.35,2.15,3.11
582,McDonald,"i-45 at fm 646, league city, tx, 77539, us",5.033023,3.89,3.201751,4.3,151,180+,5.59,10.29,1.89,1.79,3.19
583,McDonald,"one poplar st, pittsburgh, pa, 15205, us",6.018992,4.66,4.304436,4.5,143,2,7.00,12.72,3.36,2.84,4.27
584,McDonald,"rt 420 &amp; 13, prospect park, pa, 19076, us",6.022083,3.84,4.671284,4.6,142,2,6.39,11.89,3.19,2.75,4.23


Jack in the Box 

In [68]:
#Filter to just Jack in the Box
ca_ff_jack = ca_ff_[ca_ff_['restaurant_name'] == 'Jack in the Box']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_jack = ca_ff_jack.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_jack.columns = [' '.join(col).strip() for col in grouped_jack.columns.values]

# #Second part of grouping 
jack_lst = ['jr jumbo jack', 'jr jumbo jack cheeseburger', 'jumbo jack', 'large french fry', 'large jumbo jack combo']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_jack = ca_ff_jack[ca_ff_jack['menu_item'].isin(jack_lst)].sort_values('menu_item')
menu_items_jack = menu_items_jack.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_jack_2 = menu_items_jack.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_jack_2[['hamburger', 'cheeseburger', 'specialty_item', 'fries', 'combo']] = grouped_jack_2['menu_item_price'].apply(pd.Series)
grouped_jack_2.drop(columns=['menu_item_price'], inplace=True)

# #Merging the grouped dfs together 
merged_jack = pd.merge(grouped_jack, grouped_jack_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_jack

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,hamburger,cheeseburger,specialty_item,fries,combo
0,Jack in the Box,"1000 east 41st street unit k, austin, tx, 7875...",8.531185,7.86,4.214700,4.3,135,1,3.74,4.11,6.11,5.36,12.35
1,Jack in the Box,"10004 telephone rd, houston, tx, 77075, us",7.440199,6.92,3.490341,4.5,151,370+,3.01,3.39,6.29,4.52,10.81
2,Jack in the Box,"1001 leander rd, georgetown, tx, 78628, us",8.661769,7.86,4.100395,4.4,147,1,3.74,4.11,6.11,5.36,12.35
3,Jack in the Box,"1001 south fwy, fort worth, tx, 76104, us",6.832810,6.24,3.350544,4.4,153,500+,2.86,3.24,4.99,3.36,9.98
4,Jack in the Box,"10014 s memorial dr e, tulsa, ok, 74133, us",6.932566,6.24,3.317524,4.2,152,250+,2.99,3.36,4.99,3.99,9.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,Jack in the Box,"9101 johnson dr, merriam, ks, 66202, us",7.128933,6.61,3.423880,4.1,150,600+,3.11,3.49,5.49,3.61,10.85
197,Jack in the Box,"916 n expressway, brownsville, tx, 78521, us",7.072857,6.24,3.452936,4.4,161,230+,2.99,3.36,5.36,4.36,10.23
198,Jack in the Box,"925 n loop 12, irving, tx, 75061, us",7.252129,6.61,3.617776,4.4,155,1,3.11,3.74,5.36,3.74,10.99
199,Jack in the Box,"94 e. crosstimbers st, houston, tx, 77022, us",8.285130,6.86,4.633759,4.4,154,900+,5.36,4.11,10.49,,


In [69]:
ca_ff_jack

Unnamed: 0,Unnamed: 0.1,restaurant_name,menu_item,menu_item_price,restaurant_location,restaurant_rating,number_of_ratings
41306,24852,Jack in the Box,cubes munchie meal,15.00,"3426 main st ste 102, pearland, tx, 77581, us",4.5,260+
41307,24853,Jack in the Box,chickntater melt,8.11,"3426 main st ste 102, pearland, tx, 77581, us",4.5,260+
41308,24854,Jack in the Box,large 12pc classic crispy chicken wings combo,25.11,"3426 main st ste 102, pearland, tx, 77581, us",4.5,260+
41309,24855,Jack in the Box,12pc classic crispy chicken wings,19.99,"3426 main st ste 102, pearland, tx, 77581, us",4.5,260+
41310,24856,Jack in the Box,large 12pc buffalo crispy chicken wings combo,25.11,"3426 main st ste 102, pearland, tx, 77581, us",4.5,260+
...,...,...,...,...,...,...,...
339522,323068,Jack in the Box,bacon breakfast jack,4.11,"11432 district drive, south jordan, ut, 84095, us",4.0,120+
339523,323069,Jack in the Box,sausage breakfast jack,4.11,"11432 district drive, south jordan, ut, 84095, us",4.0,120+
339524,323070,Jack in the Box,breakfast jack,3.74,"11432 district drive, south jordan, ut, 84095, us",4.0,120+
339525,323071,Jack in the Box,mini pancakes wsyrup,3.74,"11432 district drive, south jordan, ut, 84095, us",4.0,120+


Wendy's

In [70]:
#Filter to just Wendy's
ca_ff_wendy = ca_ff_[ca_ff_['restaurant_name'] == 'Wendy']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'first', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_wendy = ca_ff_wendy.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_wendy.columns = [' '.join(col).strip() for col in grouped_wendy.columns.values]

# #Second part of grouping 
wendy_lst = ['daves combo', 'daves single', 'french fries', 'jr cheeseburger', 'jr hamburger']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_wendy = ca_ff_wendy[ca_ff_wendy['menu_item'].isin(wendy_lst)].sort_values('menu_item')
menu_items_wendy = menu_items_wendy.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_wendy_2 = menu_items_wendy.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_wendy_2[['combo', 'specialty_item', 'fries', 'cheeseburger', 'hamburger']] = grouped_wendy_2['menu_item_price'].apply(pd.Series)
grouped_wendy_2.drop(columns=['menu_item_price'], inplace=True)

merged_wendy = pd.merge(grouped_wendy, grouped_wendy_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_wendy


Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating first,menu_item count,number_of_ratings first,combo,specialty_item,fries,cheeseburger,hamburger
0,Wendy,"100 s. greensboro st., carrboro, nc, 27510, us",5.466368,4.750,3.299955,4.1,348,800+,0.0,6.33,0.0,2.67,1.94
1,Wendy,"100 scarlet dr., chapel hill, nc, 27517, us",5.400192,4.625,3.344126,4.4,88,420+,0.0,6.33,0.0,2.43,1.82
2,Wendy,"1000 aerial center parkway, morrisville, nc, 2...",5.400192,4.625,3.344126,4.0,88,190+,0.0,6.33,0.0,2.43,1.82
3,Wendy,"1000 east 41st street, austin, tx, 78751, us",5.854151,4.870,2.861297,4.4,88,1,0.0,6.94,0.0,3.16,2.79
4,Wendy,"1000 w walnut hill lane, irving, tx, 75038, us",5.793846,4.990,2.939536,4.4,87,600+,0.0,6.70,0.0,2.92,2.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,Wendy,"9955 bluebonnet rd, baton rouge, la, 70810, us",5.639423,4.870,2.805198,3.9,86,220+,0.0,6.94,0.0,3.04,2.67
453,Wendy,"997 north state street, orem, ut, 84057, us",5.762745,4.870,2.944657,4.5,85,200+,0.0,6.94,0.0,2.67,2.31
454,Wendy,"n 1100 w block park lane, farmington, ut, 8402...",5.805882,4.870,2.962655,4.4,85,68,0.0,6.70,0.0,2.79,2.31
455,Wendy,"ocean highway east, leland, nc, 28451, us",5.381923,4.620,3.117716,4.2,86,100+,0.0,6.33,0.0,2.43,2.18


Burger King

In [71]:
#Filter to just Burger King 
ca_ff_bk = ca_ff_[ca_ff_['restaurant_name'] == 'Burger King']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_bk = ca_ff_bk.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_bk.columns = [' '.join(col).strip() for col in grouped_bk.columns.values]

# #Second part of grouping 
#they don't have a plain hamburger FOR NOW using whopper jr 
bk_lst = ['cheeseburger', 'french fries', 'whopper', 'whopper jr', 'whopper meal']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_bk = ca_ff_bk[ca_ff_bk['menu_item'].isin(bk_lst)].sort_values('menu_item')
menu_items_bk = menu_items_bk.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_bk_2 = menu_items_bk.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_bk_2[['cheeseburger', 'fries', 'specialty_item', 'hamburger', 'combo']] = grouped_bk_2['menu_item_price'].apply(pd.Series)
grouped_bk_2.drop(columns=['menu_item_price'], inplace=True)

merged_bk = pd.merge(grouped_bk, grouped_bk_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_bk

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,cheeseburger,fries,specialty_item,hamburger,combo
0,Burger King,"100 legends road, myrtle beach, sc, 29579, us",13.679296,13.740,8.316944,4.2,154,1,3.11,,,,
1,Burger King,"100 river oaks cove, georgetown, tx, 78626, us",7.673038,6.590,5.985244,4.3,79,340+,2.69,3.39,7.39,3.59,12.19
2,Burger King,"1001 east ben white boulevard, austin, tx, 787...",7.639136,6.590,5.747541,4.3,359,1,2.69,3.39,7.39,3.59,12.19
3,Burger King,"1004 north memorial parkway, huntsville, al, 3...",4.538644,3.590,2.096201,4.5,99,170+,2.75,3.47,7.07,4.49,0.00
4,Burger King,"1005 south military avenue, green bay, wi, 543...",8.376489,6.815,6.084001,4.3,94,500+,2.98,4.28,8.95,5.18,15.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,Burger King,"925 winchester road, huntsville, al, 35811, us",5.991807,6.100,2.899433,4.4,83,100+,2.69,3.79,6.60,4.15,9.30
398,Burger King,"9523 north lamar boulevard, austin, tx, 78753, us",7.673038,6.590,5.985244,4.2,79,600+,2.69,3.39,7.39,3.59,12.19
399,Burger King,"9533 corporation drive, indianapolis, in, 4625...",7.467568,6.240,5.947237,4.3,74,320+,2.39,3.49,7.09,4.99,11.39
400,Burger King,"974 south academy street, greenville, sc, 2960...",5.962289,6.100,2.914555,4.5,83,280+,2.45,3.79,6.60,3.79,9.30


Shake Shack

In [72]:
#Filter to just Shake Shack
ca_ff_shake = ca_ff_[ca_ff_['restaurant_name'] == 'Shake Shack']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_shake = ca_ff_shake.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_shake.columns = [' '.join(col).strip() for col in grouped_shake.columns.values]


#Second part of grouping 
shake_lst = ['cheeseburger', 'fries', 'hamburger', 'shackburger']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_shake = ca_ff_shake[ca_ff_shake['menu_item'].isin(shake_lst)].sort_values('menu_item')
menu_items_shake = menu_items_shake.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_shake_2 = menu_items_shake.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_shake_2[['cheeseburger', 'fries', 'hamburger', 'specialty_item']] = grouped_shake_2['menu_item_price'].apply(pd.Series)
grouped_shake_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_shake = pd.merge(grouped_shake, grouped_shake_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_shake['combo'] = np.nan

Sonic

In [73]:
#FLAG many missing prices for fries and 
#Filter to just Sonic 
ca_ff_sonic = ca_ff_[ca_ff_['restaurant_name'] == 'Sonic']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_sonic = ca_ff_sonic.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_sonic.columns = [' '.join(col).strip() for col in grouped_sonic.columns.values]


#Second part of grouping 
sonic_lst = ['fries', 'quarter pound double cheeseburger', 'supersonic double cheeseburger', 'supersonic double cheeseburger combo']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_sonic = ca_ff_sonic[ca_ff_sonic['menu_item'].isin(sonic_lst)].sort_values('menu_item')
menu_items_sonic = menu_items_sonic.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_sonic_2 = menu_items_sonic.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_sonic_2[['fries', 'cheeseburger', 'specialty_item', 'combo']] = grouped_sonic_2['menu_item_price'].apply(pd.Series)
grouped_sonic_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_sonic = pd.merge(grouped_sonic, grouped_sonic_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_sonic['hamburger'] = np.nan
merged_sonic

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,fries,cheeseburger,specialty_item,combo,hamburger
0,Sonic,"100 n nichols st, fort worth, tx, 76102, us",4.633333,3.830,2.955107,4.475,348,600+,3.59,7.79,0.00,,
1,Sonic,"1001 tunnel road, asheville, nc, 28805, us",2.935946,2.890,1.189730,4.700,37,32,1.29,2.59,5.29,,
2,Sonic,"1005 south riverfront, dallas, tx, 75207, us",5.002273,4.505,2.884443,4.500,262,1,0.00,4.26,7.80,0.0,
3,Sonic,"10075 w. 75th street, overland park, ks, 66204...",5.282222,4.870,2.942151,4.300,87,260+,4.87,8.28,0.00,,
4,Sonic,"101 n. university, lubbock, tx, 79415, us",5.116667,4.070,3.198826,4.400,435,500+,4.79,9.47,0.00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,Sonic,"951 montlimar dr, mobile, al, 36609, us",5.339737,4.560,3.143574,3.800,88,500+,0.00,4.38,9.38,0.0,
219,Sonic,"9704 sc-707, myrtle beach, sc, 29588, us",5.382414,4.870,3.064439,4.500,59,240+,4.26,7.80,,,
220,Sonic,"971 n main st, spanish fork, ut, 84660, us",4.782632,4.255,2.874614,4.600,88,68,0.00,3.50,8.18,0.0,
221,Sonic,"9829 parkway e, birmingham, al, 35215, us",4.661111,4.010,3.082963,4.100,86,41,4.50,8.04,0.00,,


Five Guys 

In [74]:
#Filter for Five Guys
ca_ff_five = ca_ff_[ca_ff_['restaurant_name'] == 'Five Guys']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_five = ca_ff_five.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_five.columns = [' '.join(col).strip() for col in grouped_five.columns.values]


#Second part of grouping 
five_lst = ['cheeseburger', 'little cheeseburger', 'little hamburger', 'regular fries']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_five = ca_ff_five[ca_ff_five['menu_item'].isin(five_lst)].sort_values('menu_item')
menu_items_five = menu_items_five.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_five_2 = menu_items_five.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_five_2[['specialty_item', 'cheeseburger', 'hamburger', 'fries']] = grouped_five_2['menu_item_price'].apply(pd.Series)
grouped_five_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_five = pd.merge(grouped_five, grouped_five_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_five['combo'] = np.nan
merged_five

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,specialty_item,cheeseburger,hamburger,fries,combo
0,Five Guys,"100 moon shot dr suite 110, madison, al, 35758...",6.838276,6.59,3.056315,4.8,29,240+,11.39,8.87,8.27,5.99,
1,Five Guys,"1000 easton rd., wyncote, pa, 19095, us",8.352759,8.51,3.360459,4.7,29,490+,13.67,10.43,9.35,7.31,
2,Five Guys,"10000 research blvd, austin, tx, 78759, us",8.805714,9.11,3.218897,4.7,140,3,13.67,10.67,9.35,7.67,
3,Five Guys,"1019 fording island rd., suite 101g, bluffton,...",8.344483,8.39,3.323807,4.7,29,700+,13.55,10.55,9.23,7.43,
4,Five Guys,"10495 highway 49, gulfport, ms, 39503, us",7.574483,7.38,2.698121,4.7,29,700+,11.82,9.42,8.58,7.02,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,Five Guys,"9749 northlake center parkway; suite e, charlo...",8.344483,8.39,3.323807,4.6,29,700+,13.55,10.55,9.23,7.43,
194,Five Guys,"9826 gilead rd., suite c-106, huntersville, nc...",8.116897,7.91,3.233925,4.8,29,1,13.07,10.07,8.87,7.19,
195,Five Guys,"east town center, lincoln hwy. east, lancaster...",8.824483,9.23,3.374954,4.6,29,350+,13.91,11.27,9.71,7.19,
196,Five Guys,"galleria mall, 13350, dallas, tx, 75240, us",8.672857,9.11,3.332831,4.6,84,1,13.67,10.67,9.35,7.67,


The Habit

In [75]:
ca_ff_habit = ca_ff_[ca_ff_['restaurant_name'] == 'The Habit']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_habit = ca_ff_habit.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_habit.columns = [' '.join(col).strip() for col in grouped_habit.columns.values]


#Second part of grouping 
habit_lst = ['2 original double char meal', 'charburger', 'charburger with cheese', 'double char', 'french fries']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_habit = ca_ff_habit[ca_ff_habit['menu_item'].isin(habit_lst)].sort_values('menu_item')
menu_items_habit = menu_items_habit.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_habit_2 = menu_items_habit.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_habit_2[['combo', 'hamburger', 'cheeseburger', 'specialty_item','fries']] = grouped_habit_2['menu_item_price'].apply(pd.Series)
grouped_habit_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_habit = pd.merge(grouped_habit, grouped_habit_2, on=['restaurant_name', 'restaurant_location'], how='inner')
#merged_five['combo'] = np.nan
merged_habit

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,combo,hamburger,cheeseburger,specialty_item,fries
0,The Habit,"1080 monroe st, albany, ca, 94706, us",11.874026,8.99,10.985474,4.5,77,1,18.24,7.49,8.61,9.74,4.74
1,The Habit,"1122 west wade hampton boulevard, greer, sc, 2...",10.368974,7.835,9.666494,4.5,156,41,15.87,6.74,7.86,8.74,3.74
2,The Habit,"120 village drive, king of prussia, usa, upper...",10.354615,7.835,9.672048,4.7,312,430+,15.87,6.74,7.86,8.74,3.74
3,The Habit,"1255 s california blvd, walnut creek, ca, 9459...",11.886842,8.865,10.904491,4.5,76,600+,18.24,7.49,8.61,9.74,4.74
4,The Habit,"1412, pinole, ca, 94564, us",11.690641,8.74,10.831685,4.5,78,1,18.24,7.49,8.61,9.74,4.74
5,The Habit,"1489 w. o. ezell boulevard, spartanburg, sc, 2...",10.0765,7.365,10.693992,3.5,60,16,15.87,6.74,7.86,8.74,3.74
6,The Habit,"16291 n marketplace blvd, nampa, id, 83687, us",10.517821,7.835,9.701138,4.4,78,600+,16.24,6.74,7.86,8.74,4.11
7,The Habit,"2050 skibo rd, fayetteville, nc, 28314, us",10.899231,8.74,9.761548,4.4,78,700+,17.24,6.99,8.11,9.24,4.36
8,The Habit,"2206 n eagle rd, meridian, id, 83646, us",10.354615,7.835,9.672048,4.4,78,900+,15.87,6.74,7.86,8.74,3.74
9,The Habit,"2245 n university pkwy, provo, ut, 84604, us",10.354615,7.835,9.672048,4.2,78,330+,15.87,6.74,7.86,8.74,3.74


Carl's Jr.

In [76]:
#Filter to Carl's Jr 

ca_ff_carls = ca_ff_[ca_ff_['restaurant_name'] == 'Carls Jr']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_carls = ca_ff_carls.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_carls.columns = [' '.join(col).strip() for col in grouped_carls.columns.values]


#Second part of grouping 
carls_lst = ['california classic double cheeseburger', 'naturalcut french fries', 'single big carl', 'single big carl combo']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_carls = ca_ff_carls[ca_ff_carls['menu_item'].isin(carls_lst)].sort_values('menu_item')
menu_items_carls = menu_items_carls.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_carls_2 = menu_items_carls.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_carls_2[['cheeseburger', 'fries', 'specialty_item','combo']] = grouped_carls_2['menu_item_price'].apply(pd.Series)
grouped_carls_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_carls = pd.merge(grouped_carls, grouped_carls_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_carls['hamburger'] = np.nan
merged_carls

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,cheeseburger,fries,specialty_item,combo,hamburger
0,Carls Jr,"1 hallidie plaza, san francisco, ca, 94102, us",9.49283,7.85,4.711938,4.5,87,4,7.02,5.7,11.78,0.0,
1,Carls Jr,"1039 w university ave, georgetown, tx, 78628, us",6.537963,5.8,2.991468,4.6,95,480+,5.15,3.6,7.0,0.0,
2,Carls Jr,"10620 montana ave, el paso, tx, 79935, us",5.813077,4.84,2.754061,4.6,92,500+,4.19,2.99,7.49,0.0,
3,Carls Jr,"10770 macarthur blvd, oakland, ca, 94605, us",8.688679,7.17,4.22068,4.5,87,600+,6.39,5.19,9.89,0.0,
4,Carls Jr,"1120 e charles page blvd, sand springs, ok, 74...",8.267407,8.8,4.384035,4.3,55,50,5.11,3.11,7.49,11.99,
5,Carls Jr,"1124 mcrae blvd, el paso, tx, 79925, us",5.813077,4.84,2.754061,4.5,92,340+,4.19,2.99,7.49,0.0,
6,Carls Jr,"1155 w, riverdale, ut, 84405, us",6.951207,5.59,4.195929,4.374599,374,280+,5.09,3.79,7.69,0.0,
7,Carls Jr,"1320 n eagle rd, meridian, id, 83642, us",6.453462,5.34,3.048141,4.5,92,230+,4.99,3.79,6.29,0.0,
8,Carls Jr,"1331 n state st, orem, ut, 84057, us",7.135455,6.39,4.185595,4.4,95,170+,5.09,3.79,7.69,0.0,
9,Carls Jr,"1550 fitzgerald drive, pinole, ca, 94564, us",10.557059,10.45,5.405885,4.6,91,500+,6.0,4.55,8.05,13.45,


In [85]:
#Filter to Hardees's Jr 

ca_ff_hardee = ca_ff_[ca_ff_['restaurant_name'] == 'Hardees']

#First part of grouping 

agg_funcs = {
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],  # calculate the average, median, and standard dev PRICE
    'restaurant_rating': 'mean', # calculate the average RATING 
    'menu_item' : 'count',
    'number_of_ratings': 'first'
}

grouped_hardee = ca_ff_hardee.groupby(['restaurant_name','restaurant_location']).agg(agg_funcs).reset_index()
grouped_hardee.columns = [' '.join(col).strip() for col in grouped_hardee.columns.values]


#Second part of grouping 
carls_lst = ['big cheeseburger', 'famous star', 'famous star combo', 'naturalcut french fries']

# Filter rows where 'menu_item' contains any item in mcd_lst
menu_items_hardee = ca_ff_hardee[ca_ff_hardee['menu_item'].isin(carls_lst)].sort_values('menu_item')
menu_items_hardee = menu_items_hardee.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

grouped_hardee_2 = menu_items_hardee.groupby(['restaurant_name', 'restaurant_location'])['menu_item_price'].agg(price_list).reset_index()

grouped_hardee_2[['cheeseburger', 'specialty_item', 'combo','fries']] = grouped_hardee_2['menu_item_price'].apply(pd.Series)
grouped_hardee_2.drop(columns=['menu_item_price'], inplace=True)

#Merging the grouped dfs together 
merged_hardee = pd.merge(grouped_hardee, grouped_hardee_2, on=['restaurant_name', 'restaurant_location'], how='inner')
merged_hardee['hamburger'] = np.nan
merged_hardee

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,cheeseburger,specialty_item,combo,fries,hamburger
0,Hardees,"1 gateway blvd s, savannah, ga, 31419-7551, us",7.473636,4.99,6.925371,4.200000,83,81,5.74,7.99,0.00,3.24,
1,Hardees,"10 e clemmonsville rd, winston salem, nc, 2712...",5.760429,3.62,4.972781,4.439394,99,120+,5.19,7.61,0.00,2.89,
2,Hardees,"10 hwy 17 n, surfside beach, sc, 29575, us",5.753291,4.10,4.665328,4.500000,120,160+,5.19,7.61,0.00,2.89,
3,Hardees,"100 governor treutlen rd, pooler, ga, 31322, us",8.641628,6.24,7.401294,3.700000,69,21,5.74,7.99,0.00,3.24,
4,Hardees,"100 vandora springs rd., garner, nc, 27529, us",6.846340,6.24,3.915802,4.400000,156,340+,5.49,6.74,11.11,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,Hardees,"917 allison-bonnett memorial dr, hueytown, al,...",7.444545,5.49,6.032903,3.800000,73,53,5.19,7.25,0.00,2.77,
101,Hardees,"921 east washington street, indianapolis, in, ...",5.037448,3.11,4.213473,4.100000,199,280+,5.63,7.67,0.00,2.87,
102,Hardees,"940 thornton rd., lithia springs, ga, 30122, us",7.680000,6.70,6.376385,4.100000,70,500+,6.16,7.13,0.00,3.38,
103,Hardees,"975 glynn street north, fayetteville, ga, 3021...",10.087541,9.78,6.092056,4.600000,124,440+,6.31,7.30,11.52,3.46,


In [88]:
#Stack all restaurants
uber_eats_ff_rnd1_prices = pd.concat([merged_mcd, merged_jack, merged_wendy, merged_shake, merged_bk, merged_sonic, merged_carls, merged_habit, merged_five, merged_hardee]).reset_index(drop=True)

In [95]:
#If there are bad addresses, replace them with the actual address here 

uber_eats_ff_rnd1_prices.loc[1243, 'restaurant_location'] = "s64w15924 commerce center parkway, muskego, wi, 53150"
uber_eats_ff_rnd1_prices.loc[1998, 'restaurant_location'] = "1052 northside dr. nw, atlanta, ga, 30318"
uber_eats_ff_rnd1_prices.loc[2175, 'restaurant_location'] = "860 peachtree rd ne, atlanta, ga, 30308"

Add location columns 

In [96]:
pattern = r",\s*([a-zA-Z]{2})\s*,?\s*(\d{5}(?:-\d{4})?)"

def extract_state_zip(address):
    match = re.search(pattern, address)
    if match:
        state, zip_code = match.groups()
        return state, zip_code
    else:
        return None, None

# Apply the function to extract state and zip code
uber_eats_ff_rnd1_prices[['state', 'zip']] = uber_eats_ff_rnd1_prices['restaurant_location'].apply(lambda x: pd.Series(extract_state_zip(x)))
uber_eats_ff_rnd1_prices['zip'] = uber_eats_ff_rnd1_prices['zip'].str.split('-').str[0].astype(int)

#Get county 
uber_eats_ff_rnd1_prices = uber_eats_ff_rnd1_prices.merge(ca_zip_count, on = 'zip')

In [94]:
nan_zip_rows = uber_eats_ff_rnd1_prices[uber_eats_ff_rnd1_prices['zip'].isna()]
nan_zip_rows

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,specialty_item,combo,cheeseburger,hamburger,fries,restaurant_rating first,state,zip
1243,Wendy,"s64w15924 commerce center city pwky, muskego, ...",6.053725,4.99,3.480474,,86,35,7.31,0.0,2.67,2.43,0.0,4.4,,
1998,Five Guys,"1052 northside dr. nw, atlanta, ga, 346, us",8.887143,9.11,3.308438,4.4,28,59,13.91,,10.91,9.59,7.79,,,
2175,Five Guys,"860 peachtree st ne, atlanta, ga, us, us",8.228462,7.73,3.337332,4.5,52,7,13.31,,10.43,9.11,7.19,,,


Dataset info

In [55]:
specific_date = datetime.strptime('05162024', '%m%d%Y')
# Assign the datetime object to the entire 'date' column
uber_eats_ff_rnd1_prices['date'] = specific_date
uber_eats_ff_rnd1_prices['uber_eats'] = 1
uber_eats_ff_rnd1_prices['post_policy'] = 1
uber_eats_ff_rnd1_prices['fast_food'] = 1
uber_eats_ff_rnd1_prices['local'] = 0 

In [97]:
#Save as csv 
uber_eats_ff_rnd1_prices.to_csv('final_processed_prices_ubereats_nonca_ff_05162024.csv', index = True)