In [124]:
import json
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [159]:
# Input files for this run.
# NOTE(review): these are absolute paths on a single machine; the notebook
# cannot be re-run elsewhere without editing them.
file_path = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/raw_prices_ubereats_nonca_fullserv_03252024.csv"
file_path_2 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/uszips.csv"
file_path_3 = "/Users/alyssanguyen/Desktop/IRLE_scraping/csv_files/processed_prices_ubereats_ca_ff_03222024.csv"

# ca_ff: raw scraped prices; ca_zip_count: zip-code table (reduced to
# zip/county below); example: an earlier processed file, not referenced
# again in the visible cells.
ca_ff, ca_zip_count, example = (
    pd.read_csv(csv_path) for csv_path in (file_path, file_path_2, file_path_3)
)

In [160]:
# Remove the columns that are not needed, then discard rows that have no
# restaurant address (later cells group and parse on that column).
unused_columns = ['Unnamed: 0', 'inputted_location', 'restaurant_distance']
ca_ff_ = (
    ca_ff
    .drop(columns=unused_columns)
    .dropna(subset=['restaurant_location'])
)

In [161]:
# Only the zip -> county_name mapping is used from the zip-code table.
ca_zip_count = ca_zip_count.loc[:, ['zip', 'county_name']]

In [162]:
#restaurant_rating cleaning
# Store the ratings as text so the .str accessor can be used on them.
ca_ff_.loc[:, 'restaurant_rating'] = ca_ff_['restaurant_rating'].astype(str)
rating_text = ca_ff_['restaurant_rating']

# Report how many ratings contain the substring 'mi'.
rows_with_mi = rating_text.str.contains('mi').sum()
print("Number of rows with 'mi' in restaurant rating:", rows_with_mi)

# Any rating whose text ends in 'mi' is not a real rating; overwrite it with
# the placeholder '0' (this becomes 0.0 once the column is cast to float).
ca_ff_.loc[:, 'restaurant_rating'] = rating_text.str.replace(r'.*mi$', '0', regex=True)

Number of rows with 'mi' in restaurant rating: 2018


In [163]:
#converting data types
# Text columns use the pandas 'string' dtype.
for text_column in ('restaurant_name', 'menu_item', 'restaurant_location'):
    ca_ff_[text_column] = ca_ff_[text_column].astype('string')

# Collapse every run of whitespace inside a menu item name to a single space.
ca_ff_['menu_item'] = ca_ff_['menu_item'].str.replace(r'\s+', ' ', regex=True)

# Ratings are still text at this point; trim them and convert to numbers.
ca_ff_['restaurant_rating'] = ca_ff_['restaurant_rating'].str.strip().astype(float)

In [164]:
#cleaning up string columns

# Lower-case the item names and addresses so later exact matches are case-insensitive.
for lowered_column in ('menu_item', 'restaurant_location'):
    ca_ff_[lowered_column] = ca_ff_[lowered_column].str.lower()

# Restaurant names use '_' where a space belongs.
ca_ff_['restaurant_name'] = ca_ff_['restaurant_name'].str.replace('_', ' ')

#remove special characters
# Keep only letters, digits and whitespace in menu item names. Punctuation is
# deleted, not replaced, which is why the item lists further down use
# spellings such as 'allamerican cheeseburger' and 'reds cheeseburger'.
def _strip_special_characters(item_name):
    kept = [ch for ch in item_name if ch.isalnum() or ch.isspace()]
    return ''.join(kept)

ca_ff_['menu_item'] = ca_ff_['menu_item'].apply(_strip_special_characters)
ca_ff_

Unnamed: 0,restaurant_name,menu_item,menu_item_price,restaurant_location,restaurant_rating,number_of_ratings
0,Applebee,2 for 2x,31.29,"2409 south mckenzie street, foley, al, 36535, us",3.6,45
1,Applebee,bourbon street steak,24.39,"2409 south mckenzie street, foley, al, 36535, us",3.6,45
2,Applebee,shrimp n parmesan sirloin,26.29,"2409 south mckenzie street, foley, al, 36535, us",3.6,45
3,Applebee,classic broccoli chicken alfredo,19.39,"2409 south mckenzie street, foley, al, 36535, us",3.6,45
4,Applebee,chicken quesadilla,13.79,"2409 south mckenzie street, foley, al, 36535, us",3.6,45
...,...,...,...,...,...,...
139281,Red Robin,coleslaw,3.59,"7575 edgerton ave, greenfield, wi, 53220, us",4.8,16
139282,Red Robin,sweet potato fries,3.59,"7575 edgerton ave, greenfield, wi, 53220, us",4.8,16
139283,Red Robin,steamed broccoli,2.39,"7575 edgerton ave, greenfield, wi, 53220, us",4.8,16
139284,Red Robin,yukon chips,2.39,"7575 edgerton ave, greenfield, wi, 53220, us",4.8,16


In [165]:
def price_list(x):
    """Return the values of ``x`` as a plain Python list, in iteration order."""
    return [value for value in x]

def mean_non_zero(x):
    """Mean of the non-zero entries of ``x``.

    ``x`` must support element-wise comparison and boolean-mask indexing
    (a pandas Series or NumPy array). Returns 0 when ``x`` has no non-zero
    entry, including when it is empty.
    """
    nonzero_mask = x != 0
    if not np.any(nonzero_mask):
        return 0
    return np.mean(x[nonzero_mask])

def median_non_zero(x):
    """Median of the non-zero entries of ``x``, ignoring NaN.

    ``x`` must support element-wise comparison and boolean-mask indexing
    (a pandas Series or NumPy array). Returns 0 when ``x`` has no non-zero
    entry, including when it is empty.

    ``np.nanmedian`` is used instead of ``np.median`` so that one missing
    price does not turn the whole group's median into NaN. ``np.mean`` and
    ``np.std`` already skip NaN when handed a pandas Series (they delegate to
    the Series methods); ``np.median`` does not.
    """
    if not np.any(x != 0):
        return 0
    return np.nanmedian(x[x != 0])

def std_non_zero(x):
    """Standard deviation (via ``np.std``) of the non-zero entries of ``x``.

    ``x`` must support element-wise comparison and boolean-mask indexing
    (a pandas Series or NumPy array). Returns 0 when ``x`` has no non-zero
    entry, including when it is empty.
    """
    nonzero_mask = x != 0
    if not np.any(nonzero_mask):
        return 0
    return np.std(x[nonzero_mask])

In [166]:
#Outback Steakhouse

# Rows scraped for this chain only.
is_outback = ca_ff_['restaurant_name'] == 'Outback Steakhouse'
ca_ff_outback = ca_ff_[is_outback]

#First part of grouping: one summary row per restaurant location

agg_funcs = {
    # mean / median / standard deviation of the non-zero menu prices
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],
    # average rating over the location's rows
    'restaurant_rating': 'mean',
    # number of menu item rows
    'menu_item': 'count',
    # first number_of_ratings value in the group
    'number_of_ratings': 'first',
}

grouped_outback = (
    ca_ff_outback
    .groupby(['restaurant_name', 'restaurant_location'])
    .agg(agg_funcs)
    .reset_index()
)
# Flatten the two-level column index into '<column> <statistic>' names;
# the group-key columns have an empty second level, hence the strip().
grouped_outback.columns = [f'{top} {sub}'.strip() for top, sub in grouped_outback.columns]



In [167]:
#Second part of grouping
# Benchmark items (as spelled after menu_item cleaning) -> output column.
outback_cols = {
    'aussie fries': 'fries',
    'boomerang cheeseburger': 'cheeseburger',
    'the outbacker burger': 'hamburger',
}
outback_lst = list(outback_cols)

# Keep rows whose 'menu_item' exactly equals an item in outback_lst, one row
# per (location, item). The stable sort makes "first duplicate kept" deterministic.
menu_items_outback = ca_ff_outback[ca_ff_outback['menu_item'].isin(outback_lst)].sort_values('menu_item', kind='mergesort')
menu_items_outback = menu_items_outback.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

# Pivot by item label rather than by position in a sorted price list, so a
# location that is missing one item gets NaN in that column instead of having
# its remaining prices shifted into the wrong columns.
grouped_outback_2 = (
    menu_items_outback
    .set_index(['restaurant_name', 'restaurant_location', 'menu_item'])['menu_item_price']
    .unstack('menu_item')
    .reindex(columns=outback_lst)
    .rename(columns=outback_cols)
    .rename_axis(columns=None)
    .reset_index()
)

#Merging the grouped dfs together
# Inner join: locations with none of the benchmark items are dropped.
merged_outback = pd.merge(grouped_outback, grouped_outback_2, on=['restaurant_name', 'restaurant_location'], how='inner')
# Placeholder columns with no value for this chain.
merged_outback['combo'] = np.nan
merged_outback['specialty_item'] = np.nan
merged_outback

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,fries,cheeseburger,hamburger,combo,specialty_item
0,Outback Steakhouse,"100 buckwalter parkway, bluffton, sc, 29910, us",17.777700,13.585,13.064541,4.1,205,36,4.99,9.99,13.99,,
1,Outback Steakhouse,"100 n pointe blvd, lancaster, pa, 17601, us",16.612051,12.340,12.913717,3.5,80,23,4.99,9.99,14.99,,
2,Outback Steakhouse,"10001 westheimer road # 1010, houston, tx, 770...",17.719055,13.990,13.001316,4.3,413,100+,4.99,9.99,14.99,,
3,Outback Steakhouse,"10220 north michigan road, carmel, in, 46077, us",17.638350,13.180,13.159800,4.0,106,40,4.99,9.99,13.99,,
4,Outback Steakhouse,"10901 university avenue, clive, ia, 50325, us",17.526452,12.990,13.613577,4.4,95,39,4.99,9.99,14.99,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,Outback Steakhouse,"9498 brownsboro road, louisville, ky, 40241, us",17.532019,13.585,12.734948,4.6,107,50+,4.99,9.99,13.99,,
141,Outback Steakhouse,"9501 quivira drive, overland park, ks, 66214, us",18.005895,13.180,13.394069,4.6,97,45,4.99,9.99,14.99,,
142,Outback Steakhouse,"9753 fm 1960 bypass, humble, tx, 77338, us",17.640784,13.990,12.766347,4.2,315,97,4.99,9.99,13.99,,
143,Outback Steakhouse,"missingaddress, madison, wi, 53717, us",18.470816,13.990,13.314379,3.8,101,28,4.99,9.99,13.99,,


In [168]:
#Red Robin

# Rows scraped for this chain only.
is_red_robin = ca_ff_['restaurant_name'] == 'Red Robin'
ca_ff_rr = ca_ff_[is_red_robin]

#First part of grouping: one summary row per restaurant location

agg_funcs = {
    # mean / median / standard deviation of the non-zero menu prices
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],
    # average rating over the location's rows
    'restaurant_rating': 'mean',
    # number of menu item rows
    'menu_item': 'count',
    # first number_of_ratings value in the group
    'number_of_ratings': 'first',
}

grouped_rr = (
    ca_ff_rr
    .groupby(['restaurant_name', 'restaurant_location'])
    .agg(agg_funcs)
    .reset_index()
)
# Flatten the two-level column index into '<column> <statistic>' names;
# the group-key columns have an empty second level, hence the strip().
grouped_rr.columns = [f'{top} {sub}'.strip() for top, sub in grouped_rr.columns]

In [169]:
#Second part of grouping
# Benchmark items (as spelled after menu_item cleaning) -> output column.
rr_cols = {
    'reds cheeseburger': 'cheeseburger',
    'steak fries': 'fries',
}
rr_lst = list(rr_cols)

# Keep rows whose 'menu_item' exactly equals an item in rr_lst, one row per
# (location, item). The stable sort makes "first duplicate kept" deterministic.
menu_items_rr = ca_ff_rr[ca_ff_rr['menu_item'].isin(rr_lst)].sort_values('menu_item', kind='mergesort')
menu_items_rr = menu_items_rr.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

# Pivot by item label rather than by position in a sorted price list, so a
# location that is missing one item gets NaN in that column instead of having
# its remaining price land in the wrong column.
grouped_rr_2 = (
    menu_items_rr
    .set_index(['restaurant_name', 'restaurant_location', 'menu_item'])['menu_item_price']
    .unstack('menu_item')
    .reindex(columns=rr_lst)
    .rename(columns=rr_cols)
    .rename_axis(columns=None)
    .reset_index()
)

#Merging the grouped dfs together
# Inner join: locations with none of the benchmark items are dropped.
merged_rr = pd.merge(grouped_rr, grouped_rr_2, on=['restaurant_name', 'restaurant_location'], how='inner')
# Placeholder columns with no value for this chain.
merged_rr['combo'] = np.nan
merged_rr['specialty_item'] = np.nan
merged_rr['hamburger'] = np.nan
merged_rr

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,cheeseburger,fries,combo,specialty_item,hamburger
0,Red Robin,"10 cabela drive, garner, nc, 27529, us",11.806102,10.99,7.555994,4.6,119,47,8.39,3.59,,,
1,Red Robin,"100, columbia, sc, 29212, us",11.570909,10.79,7.722017,4.4,222,18,8.39,3.59,,,
2,Red Robin,"1000 greengate centre blvd, greensburg, pa, 15...",11.767311,10.79,7.535969,4.5,120,24,8.39,3.59,,,
3,Red Robin,"101 east brannon rd, nicholasville, ky, 40356, us",11.329640,9.59,7.662360,4.3,112,15,8.39,3.59,,,
4,Red Robin,"10211, collierville, tn, 38017, us",11.873000,10.99,7.812734,4.5,101,48,8.39,3.59,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,Red Robin,"95 n moorland rd, brookfield, wi, 53005, us",11.766256,10.59,7.879995,4.2,221,10,8.39,3.59,,,
79,Red Robin,"9810 riverside pkwy, tulsa, ok, 74137, us",11.474259,10.69,7.667466,0.0,109,10,8.39,3.59,,,
80,Red Robin,"9870 von allmen court, louisville, ky, 40241, us",11.814771,10.79,7.875105,4.3,110,24,8.39,3.59,,,
81,Red Robin,"9990 e 13th st, wichita, ks, 67206, us",10.785050,11.85,5.478729,4.0,101,33,7.11,3.60,,,


In [170]:
#Panera

# Rows scraped for this chain only.
is_panera = ca_ff_['restaurant_name'] == 'Panera Bread'
ca_ff_panera = ca_ff_[is_panera]

#First part of grouping: one summary row per restaurant location

agg_funcs = {
    # mean / median / standard deviation of the non-zero menu prices
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],
    # average rating over the location's rows
    'restaurant_rating': 'mean',
    # number of menu item rows
    'menu_item': 'count',
    # first number_of_ratings value in the group
    'number_of_ratings': 'first',
}

grouped_panera = (
    ca_ff_panera
    .groupby(['restaurant_name', 'restaurant_location'])
    .agg(agg_funcs)
    .reset_index()
)
# Flatten the two-level column index into '<column> <statistic>' names;
# the group-key columns have an empty second level, hence the strip().
grouped_panera.columns = [f'{top} {sub}'.strip() for top, sub in grouped_panera.columns]

In [171]:
# No benchmark items are looked up for Panera; add the item-price columns as
# NaN so the frame has the same columns as the other chains when stacked.
for placeholder_column in ('cheeseburger', 'hamburger', 'specialty_item', 'combo', 'fries'):
    grouped_panera[placeholder_column] = np.nan

In [172]:
# Display the Panera per-location summary (item-price columns are NaN placeholders).
grouped_panera

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,cheeseburger,hamburger,specialty_item,combo,fries
0,Panera Bread,"103 west farm to market 544, suite 101, murphy...",8.828621,7.49,7.146035,4.4,480,39,,,,,
1,Panera Bread,"11244 tara blvd, suite 100, hampton, ga, 30228...",8.182157,6.49,6.849918,4.5,504,61,,,,,
2,Panera Bread,"120 outfield drive, madison, al, 35758, us",9.976503,7.79,10.243768,4.8,178,12,,,,,
3,Panera Bread,"1410 ne douglas street, lee's summit, mo, 6408...",8.145474,6.89,6.29269,4.4,147,12,,,,,
4,Panera Bread,"16209 west 87th street parkway, lenexa, ks, 66...",8.525811,7.44,6.503731,4.8,163,21,,,,,
5,Panera Bread,"1968 peachtree rd nw, atlanta, ga, 30309, us",9.909512,7.79,9.357851,4.5,895,100,,,,,
6,Panera Bread,"204 shoemaker road, pottstown, pa, 19464, us",8.494027,6.89,6.946989,4.9,328,16,,,,,
7,Panera Bread,"2135 state street, new albany, in, 47150, us",8.613649,7.09,7.123755,0.0,163,6,,,,,
8,Panera Bread,"3140 north pleasantburg drive, greenville, sc,...",8.712297,7.09,7.120724,4.6,326,32,,,,,
9,Panera Bread,"4222 century farms terrace suite 101, antioch,...",8.726054,7.09,7.163266,4.7,486,61,,,,,


In [173]:
#Denny

# Rows scraped for this chain only.
is_denny = ca_ff_['restaurant_name'] == 'Denny'
ca_ff_denny = ca_ff_[is_denny]

#First part of grouping: one summary row per restaurant location

agg_funcs = {
    # mean / median / standard deviation of the non-zero menu prices
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],
    # average rating over the location's rows
    'restaurant_rating': 'mean',
    # number of menu item rows
    'menu_item': 'count',
    # first number_of_ratings value in the group
    'number_of_ratings': 'first',
}

grouped_denny = (
    ca_ff_denny
    .groupby(['restaurant_name', 'restaurant_location'])
    .agg(agg_funcs)
    .reset_index()
)
# Flatten the two-level column index into '<column> <statistic>' names;
# the group-key columns have an empty second level, hence the strip().
grouped_denny.columns = [f'{top} {sub}'.strip() for top, sub in grouped_denny.columns]

In [174]:
#Second part of grouping
# Benchmark items (as spelled after menu_item cleaning) -> output column.
denny_cols = {
    'seasoned fries': 'fries',
    'single cheeseburger': 'cheeseburger',
}
denny_lst = list(denny_cols)

# Keep rows whose 'menu_item' exactly equals an item in denny_lst, one row per
# (location, item). The stable sort makes "first duplicate kept" deterministic.
menu_items_denny = ca_ff_denny[ca_ff_denny['menu_item'].isin(denny_lst)].sort_values('menu_item', kind='mergesort')
menu_items_denny = menu_items_denny.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

# Pivot by item label rather than by position in a sorted price list, so a
# location that is missing one item gets NaN in that column instead of having
# its remaining price land in the wrong column.
grouped_denny_2 = (
    menu_items_denny
    .set_index(['restaurant_name', 'restaurant_location', 'menu_item'])['menu_item_price']
    .unstack('menu_item')
    .reindex(columns=denny_lst)
    .rename(columns=denny_cols)
    .rename_axis(columns=None)
    .reset_index()
)

#Merging the grouped dfs together
# Inner join: locations with none of the benchmark items are dropped.
merged_denny = pd.merge(grouped_denny, grouped_denny_2, on=['restaurant_name', 'restaurant_location'], how='inner')
# Placeholder columns with no value for this chain.
merged_denny['combo'] = np.nan
merged_denny['specialty_item'] = np.nan
merged_denny['hamburger'] = np.nan

merged_denny

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,fries,cheeseburger,combo,specialty_item,hamburger
0,Denny,"1-35 north parmer lane, austin, tx, 78753, us",8.910233,7.105,5.680176,4.5,279,72,4.66,12.57,,,
1,Denny,"101 n. 1200th east, lehi, ut, 84043, us",9.546703,6.740,5.888592,3.2,198,13,5.18,13.19,,,
2,Denny,"1015 spur 350 west, euless, tx, 76040, us",10.584022,7.315,6.682475,3.5,100,13,5.81,15.90,,,
3,Denny,"1020 tanger dr., locust grove, ga, 30248, us",10.115816,6.740,6.285091,4.1,106,14,5.39,13.19,,,
4,Denny,"10433 n. central exprwy, dallas, tx, 75230, us",10.626630,7.315,6.639116,4.5,200,41,5.81,15.58,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,Denny,"9650 s 20th st, oak creek, wi, 53154, us",9.579890,6.740,5.887468,4.5,198,29,5.18,13.19,,,
144,Denny,"975 s main st, kernersville, nc, 27284, us",10.065222,9.395,6.284071,3.7,192,18,5.18,12.78,,,
145,Denny,"9766 katy fwy ste b, houston, tx, 77055, us",9.087727,6.840,5.560023,4.6,288,49,4.99,12.49,,,
146,Denny,"9810 gulf freeway, houston, tx, 77034, us",9.298864,6.740,5.815026,4.2,295,72,5.18,12.78,,,


In [175]:
#Applebee

# Rows scraped for this chain only.
is_applebee = ca_ff_['restaurant_name'] == 'Applebee'
ca_ff_apple = ca_ff_[is_applebee]

#First part of grouping: one summary row per restaurant location

agg_funcs = {
    # mean / median / standard deviation of the non-zero menu prices
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],
    # average rating over the location's rows
    'restaurant_rating': 'mean',
    # number of menu item rows
    'menu_item': 'count',
    # first number_of_ratings value in the group
    'number_of_ratings': 'first',
}

grouped_apple = (
    ca_ff_apple
    .groupby(['restaurant_name', 'restaurant_location'])
    .agg(agg_funcs)
    .reset_index()
)
# Flatten the two-level column index into '<column> <statistic>' names;
# the group-key columns have an empty second level, hence the strip().
grouped_apple.columns = [f'{top} {sub}'.strip() for top, sub in grouped_apple.columns]

In [179]:
#Second part of grouping
# Benchmark items (as spelled after menu_item cleaning) -> output column.
apple_cols = {
    'classic burger': 'hamburger',
    'classic cheeseburger': 'cheeseburger',
}
apple_lst = list(apple_cols)

# Keep rows whose 'menu_item' exactly equals an item in apple_lst, one row per
# (location, item). The stable sort makes "first duplicate kept" deterministic.
menu_items_apple = ca_ff_apple[ca_ff_apple['menu_item'].isin(apple_lst)].sort_values('menu_item', kind='mergesort')
menu_items_apple = menu_items_apple.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

# Pivot by item label rather than by position in a sorted price list, so a
# location that is missing one item gets NaN in that column instead of having
# its remaining price land in the wrong column.
grouped_apple_2 = (
    menu_items_apple
    .set_index(['restaurant_name', 'restaurant_location', 'menu_item'])['menu_item_price']
    .unstack('menu_item')
    .reindex(columns=apple_lst)
    .rename(columns=apple_cols)
    .rename_axis(columns=None)
    .reset_index()
)

#Merging the grouped dfs together
# Inner join: locations with none of the benchmark items are dropped.
merged_apple = pd.merge(grouped_apple, grouped_apple_2, on=['restaurant_name', 'restaurant_location'], how='inner')
# Placeholder columns with no value for this chain.
merged_apple['combo'] = np.nan
merged_apple['specialty_item'] = np.nan
merged_apple['fries'] = np.nan


merged_apple

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,hamburger,cheeseburger,combo,specialty_item,fries
0,Applebee,"101 northern way, york, pa, 17402, us",11.387895,12.79,5.703683,3.9,98,29,12.19,13.39,,,
1,Applebee,"1020 e state hwy 152, mustang, ok, 73064, us",10.748654,11.79,5.957907,4.2,107,15,12.39,12.99,,,
2,Applebee,"1041 admiral callaghan lane, vallejo, ca, 9459...",13.601111,15.39,7.668075,4.2,400,39,15.39,16.69,,,
3,Applebee,"1063 east street road, upper southampton, pa, ...",13.220769,14.69,7.629524,4.7,107,41,14.99,15.59,,,
4,Applebee,"11013 lakeline mall drive, cedar park, tx, 786...",13.599804,15.64,7.225194,3.8,106,72,14.99,16.29,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,Applebee,"9702 airline hwy, baton rouge, la, 70809, us",11.619245,12.39,6.561796,4.5,109,77,12.99,13.49,,,
145,Applebee,"995 i-35, new braunfels, tx, 78130, us",13.062043,13.79,6.551719,4.5,96,33,13.79,14.99,,,
146,Applebee,"greentree mall, clarksville, in, 47129, us",10.935000,11.79,5.880857,4.3,103,50+,12.39,12.99,,,
147,Applebee,"indian brook dr, dover, nh, 03820, us",12.543774,12.49,7.142118,4.4,107,35,14.49,15.79,,,


In [180]:
#TGI Fridays

# Rows scraped for this chain only.
is_tgi = ca_ff_['restaurant_name'] == 'TGI Fridays'
ca_ff_tgi = ca_ff_[is_tgi]

#First part of grouping: one summary row per restaurant location

agg_funcs = {
    # mean / median / standard deviation of the non-zero menu prices
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],
    # average rating over the location's rows
    'restaurant_rating': 'mean',
    # number of menu item rows
    'menu_item': 'count',
    # first number_of_ratings value in the group
    'number_of_ratings': 'first',
}

grouped_tgi = (
    ca_ff_tgi
    .groupby(['restaurant_name', 'restaurant_location'])
    .agg(agg_funcs)
    .reset_index()
)
# Flatten the two-level column index into '<column> <statistic>' names;
# the group-key columns have an empty second level, hence the strip().
grouped_tgi.columns = [f'{top} {sub}'.strip() for top, sub in grouped_tgi.columns]

In [181]:
#Second part of grouping
# Benchmark items (as spelled after menu_item cleaning) -> output column.
tgi_cols = {
    'cheeseburger': 'cheeseburger',
    'seasoned fries': 'fries',
}
tgi_lst = list(tgi_cols)

# Keep rows whose 'menu_item' exactly equals an item in tgi_lst, one row per
# (location, item). The stable sort makes "first duplicate kept" deterministic.
menu_items_tgi = ca_ff_tgi[ca_ff_tgi['menu_item'].isin(tgi_lst)].sort_values('menu_item', kind='mergesort')
menu_items_tgi = menu_items_tgi.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

# Pivot by item label rather than by position in a sorted price list, so a
# location that is missing one item gets NaN in that column instead of having
# its remaining price land in the wrong column.
grouped_tgi_2 = (
    menu_items_tgi
    .set_index(['restaurant_name', 'restaurant_location', 'menu_item'])['menu_item_price']
    .unstack('menu_item')
    .reindex(columns=tgi_lst)
    .rename(columns=tgi_cols)
    .rename_axis(columns=None)
    .reset_index()
)

#Merging the grouped dfs together
# Inner join: locations with none of the benchmark items are dropped.
merged_tgi = pd.merge(grouped_tgi, grouped_tgi_2, on=['restaurant_name', 'restaurant_location'], how='inner')
# Placeholder columns with no value for this chain.
merged_tgi['combo'] = np.nan
merged_tgi['specialty_item'] = np.nan
merged_tgi['hamburger'] = np.nan
merged_tgi

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,cheeseburger,fries,combo,specialty_item,hamburger
0,TGI Fridays,"10 n west end blvd, quakertown, pa, 18951, us",24.691736,16.435,27.610381,4.5,149,20,13.56,3.21,,,
1,TGI Fridays,"1100 timber dr e, garner, nc, 27529, us",24.063706,16.09,25.968927,4.0,296,42,13.56,3.21,,,
2,TGI Fridays,"1114 woodruff road, greenville, sc, 29607, us",24.640863,16.43,26.677954,4.1,288,19,13.33,3.09,,,
3,TGI Fridays,"115 hendersonvile rd, asheville, nc, 28803, us",24.063706,16.09,25.968927,3.8,148,17,13.56,3.21,,,
4,TGI Fridays,"1201 manhattan blvd, harvey, la, 70058, us",21.504545,11.83,25.66101,3.9,340,94,11.95,2.86,,,
5,TGI Fridays,"12811 s tryon st, charlotte, nc, us, us",24.465971,16.66,26.228879,3.5,144,26,13.56,3.21,,,
6,TGI Fridays,"1516 s willow st, manchester, nh, 03103, us",24.35295,16.43,26.252838,4.1,144,33,13.33,3.09,,,
7,TGI Fridays,"15190 crossroads pkwy, gulfport, ms, 39503, us",22.077576,12.87,25.997556,4.2,170,22,12.87,2.86,,,
8,TGI Fridays,"1835 n highway 17, mount pleasant, sc, 29464, us",22.712303,13.44,26.322835,4.2,170,34,13.56,3.21,,,
9,TGI Fridays,"1881 mount zion rd, morrow, ga, 30260, us",24.759496,17.24,26.158862,3.8,432,81,14.02,3.32,,,


In [182]:
#Buffalo Wild Wings

# Rows scraped for this chain only.
is_buffalo = ca_ff_['restaurant_name'] == 'Buffalo Wild Wings'
ca_ff_buff = ca_ff_[is_buffalo]

#First part of grouping: one summary row per restaurant location

agg_funcs = {
    # mean / median / standard deviation of the non-zero menu prices
    'menu_item_price': [mean_non_zero, median_non_zero, std_non_zero],
    # average rating over the location's rows
    'restaurant_rating': 'mean',
    # number of menu item rows
    'menu_item': 'count',
    # first number_of_ratings value in the group
    'number_of_ratings': 'first',
}

grouped_buff = (
    ca_ff_buff
    .groupby(['restaurant_name', 'restaurant_location'])
    .agg(agg_funcs)
    .reset_index()
)
# Flatten the two-level column index into '<column> <statistic>' names;
# the group-key columns have an empty second level, hence the strip().
grouped_buff.columns = [f'{top} {sub}'.strip() for top, sub in grouped_buff.columns]

In [183]:
#Second part of grouping
# Benchmark items (as spelled after menu_item cleaning) -> output column.
buff_cols = {
    'allamerican cheeseburger': 'cheeseburger',
    'regular french fries': 'fries',
}
buff_lst = list(buff_cols)

# Keep rows whose 'menu_item' exactly equals an item in buff_lst, one row per
# (location, item). The stable sort makes "first duplicate kept" deterministic.
menu_items_buff = ca_ff_buff[ca_ff_buff['menu_item'].isin(buff_lst)].sort_values('menu_item', kind='mergesort')
menu_items_buff = menu_items_buff.drop_duplicates(subset=['restaurant_name', 'restaurant_location', 'menu_item'])

# Pivot by item label rather than by position in a sorted price list, so a
# location that is missing one item gets NaN in that column instead of having
# its remaining price land in the wrong column.
grouped_buff_2 = (
    menu_items_buff
    .set_index(['restaurant_name', 'restaurant_location', 'menu_item'])['menu_item_price']
    .unstack('menu_item')
    .reindex(columns=buff_lst)
    .rename(columns=buff_cols)
    .rename_axis(columns=None)
    .reset_index()
)

#Merging the grouped dfs together
# Inner join: locations with none of the benchmark items are dropped.
merged_buff = pd.merge(grouped_buff, grouped_buff_2, on=['restaurant_name', 'restaurant_location'], how='inner')
# Placeholder columns with no value for this chain.
merged_buff['combo'] = np.nan
merged_buff['specialty_item'] = np.nan
merged_buff['hamburger'] = np.nan
merged_buff

Unnamed: 0,restaurant_name,restaurant_location,menu_item_price mean_non_zero,menu_item_price median_non_zero,menu_item_price std_non_zero,restaurant_rating mean,menu_item count,number_of_ratings first,cheeseburger,fries,combo,specialty_item,hamburger
0,Buffalo Wild Wings,"1000 bower pkwy, columbia, sc, 29212, us",7.653878,6.24,8.908161,3.8,214,39,13.29,4.79,,,
1,Buffalo Wild Wings,"1007 w university ave ste 1, georgetown, tx, 7...",6.134494,4.49,5.854237,4.3,96,80,13.29,4.79,,,
2,Buffalo Wild Wings,"1018 crossings blvd, spring hill, tn, 37174, us",7.488478,5.04,9.060482,4.3,101,27,13.29,4.79,,,
3,Buffalo Wild Wings,"103 quarry rd, downingtown, pa, 19335, us",7.867128,6.49,8.904293,4.8,103,35,14.79,5.29,,,
4,Buffalo Wild Wings,"10497 gateway blvd w, el paso, tx, 79925, us",7.411075,5.29,8.996329,4.3,101,76,13.29,4.79,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,Buffalo Wild Wings,"92 n 1200 e, lehi, ut, 84043, us",7.692653,6.24,8.925354,3.6,107,34,13.29,4.79,,,
151,Buffalo Wild Wings,"920 baltimore pike, glen mills, pa, 19342, us",7.815000,6.49,8.729356,3.5,428,54,14.79,5.29,,,
152,Buffalo Wild Wings,"9402 northlake west dr, charlotte, nc, 28216, us",7.814082,6.49,9.003215,4.0,107,100+,13.79,4.79,,,
153,Buffalo Wild Wings,"9701 roosevelt blvd, philadelphia, pa, 19114, us",7.813196,6.49,8.739313,4.4,315,57,14.49,4.79,,,


In [184]:
#Stack all restaurants
# Row order follows this list; the result gets a fresh 0..n-1 index.
per_chain_frames = [
    merged_outback,
    grouped_panera,
    merged_rr,
    merged_buff,
    merged_tgi,
    merged_apple,
    merged_denny,
]
ubereats_fullserv_prices = pd.concat(per_chain_frames, ignore_index=True)

In [187]:
#If there are bad addresses, replace them with the actual address here
# Keys are row labels of ubereats_fullserv_prices (the 0..n-1 index created
# when the per-chain frames were stacked), so they are only valid for this
# exact input file and stacking order.
address_fixes = {
    403: "12811 s tryon st, charlotte, nc, 28273",
    408: "20430 us-59, humble, tx, 77338",
    608: "13635 san pedro ave, san antonio, tx, 78232",
    640: "2500 sm 291 hwy, independence, mo, 64055",
    725: "9001 shawnee mission parkway, merriam, ks, 66202",
}

for row_label, fixed_address in address_fixes.items():
    if row_label not in ubereats_fullserv_prices.index:
        # Assigning through .loc to a label that does not exist would append
        # a new row that is NaN everywhere except the address; skip instead.
        print(f"Address fix skipped: row label {row_label} is not in ubereats_fullserv_prices")
        continue
    ubereats_fullserv_prices.loc[row_label, 'restaurant_location'] = fixed_address

In [188]:
# A comma, a two-letter state code, an optional comma, then a 5-digit zip
# with an optional '-NNNN' suffix. Group 1 is the state, group 2 the zip.
pattern = r",\s*([a-zA-Z]{2})\s*,?\s*(\d{5}(?:-\d{4})?)"

def extract_state_zip(address):
    """Return ``(state, zip_code)`` from the first state/zip match in ``address``.

    Both values are returned as they appear in the text (the zip keeps any
    '-NNNN' suffix). Returns ``(None, None)`` when the address has no match.
    """
    found = re.search(pattern, address)
    if found is None:
        return None, None
    return found.groups()

# Apply the function to extract state and zip code
# Build the two columns in one go instead of creating a pd.Series per row.
state_zip_pairs = [extract_state_zip(address) for address in ubereats_fullserv_prices['restaurant_location']]
state_zip = pd.DataFrame(state_zip_pairs, index=ubereats_fullserv_prices.index, columns=['state', 'zip'])

# An address without a recognisable state/zip would otherwise fail below with
# an opaque integer-conversion error; name the offending rows instead so they
# can be corrected by hand.
unparsed = state_zip['zip'].isna()
if unparsed.any():
    raise ValueError(
        "Could not extract state/zip from these restaurant_location values:\n"
        + ubereats_fullserv_prices.loc[unparsed, 'restaurant_location'].to_string()
    )

ubereats_fullserv_prices[['state', 'zip']] = state_zip
# Drop any '-NNNN' suffix and store the zip as an integer for the merge below.
ubereats_fullserv_prices['zip'] = ubereats_fullserv_prices['zip'].str.split('-').str[0].astype(int)

#Get county
rows_before_merge = len(ubereats_fullserv_prices)
ubereats_fullserv_prices = ubereats_fullserv_prices.merge(ca_zip_count, on = 'zip')
# The default inner join drops rows whose zip is missing from ca_zip_count and
# duplicates rows whose zip appears there more than once; report either case.
if len(ubereats_fullserv_prices) != rows_before_merge:
    print(f"County merge changed the row count: {rows_before_merge} -> {len(ubereats_fullserv_prices)}")

In [189]:
# Date stamp applied to every row, parsed from an MMDDYYYY string.
specific_date = datetime.strptime('03252024', '%m%d%Y')
ubereats_fullserv_prices['date'] = specific_date

# Indicator columns that take the same value on every row of this file.
indicator_values = {
    'uber_eats': 1,
    'post_policy': 0,
    'fast_food': 0,
    'local': 0,
}
for indicator_name, indicator_value in indicator_values.items():
    ubereats_fullserv_prices[indicator_name] = indicator_value

In [190]:
#Save as csv
# Written relative to the kernel's working directory; the row index is
# included as the first column.
output_csv = 'processed_prices_ubereats_nonca_fullserv_03252024.csv'
ubereats_fullserv_prices.to_csv(output_csv, index=True)