In [16]:
import pandas as pd
import json
import os

# Background

Post-scraping, we have two types of data: basic restaurant details and the corresponding review data. Let's clean and flatten the resulting data frames so we can export them as reusable CSVs.

# Restaurant Reviews Data

In [17]:
def load_reviews(file_name):
    '''
    Load restaurant reviews from a JSON file into one flat pandas DataFrame.

    The JSON document is read as a mapping whose keys are restaurant
    identifiers and whose values are the review records for that restaurant.
    Nested fields in each record are flattened into dotted column names by
    pd.json_normalize, and every row is tagged with its restaurant key.

    :param file_name: path of the JSON file to read
    :return: DataFrame with one row per review and a 'restaurant' column;
             an empty DataFrame when the file contains no restaurants
    '''
    # Context manager guarantees the handle is closed even if parsing fails
    with open(file_name, encoding='utf-8') as f:
        data = json.load(f)

    df_list = []
    for restaurant, reviews in data.items():
        normalized_df = pd.json_normalize(reviews)
        normalized_df['restaurant'] = restaurant
        # Drop columns that hold no value at all for this restaurant
        df_list.append(normalized_df.dropna(axis=1, how='all'))

    # pd.concat raises on an empty list, so return an empty frame instead
    if not df_list:
        return pd.DataFrame()

    return pd.concat(df_list, ignore_index=True)

# Load restaurant reviews data from web scraping: every file found in the
# "reviews" folder of the working directory is parsed into its own DataFrame.
reviews = []
path = os.path.join(os.getcwd(), "reviews")
for file_name in os.listdir(path):
    # load_reviews opens (and closes) the file itself, so no extra open() is needed here
    reviews.append(load_reviews(os.path.join(path, file_name)))

# Stack the per-file frames into one frame with a fresh 0..n-1 index
df_reviews = pd.concat(reviews, ignore_index=True)

In [18]:
# Reviews with no photoCount value are treated as having 0 photos.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection can operate on a temporary copy and leave df_reviews unchanged.
df_reviews['photoCount'] = df_reviews['photoCount'].fillna(0)

# drop_duplicates() returns a new frame; rebind it so the duplicates are
# actually removed for the cells that follow (original index labels are kept).
df_reviews = df_reviews.drop_duplicates()
df_reviews

Unnamed: 0,photoCount,reviewCount,eliteYear,localizedDate,rating,comment.text,comment.language,restaurant
0,1012.0,321,,11/2/2018,5,*Tartinery is one of the food vendors at Hudso...,en,tartinery-new-york-15
1,190.0,1421,,8/6/2023,3,Overall the service is very slow. We ordered 2...,en,tartinery-new-york-15
2,117.0,294,2023.0,10/17/2022,4,Yum I love a place that has interesting drinks...,en,tartinery-new-york-15
3,154.0,171,2023.0,11/29/2021,5,I always look for a comfortable go to bar/rest...,en,tartinery-new-york-15
4,6668.0,2143,2023.0,4/6/2022,3,Tartinery is where you go for happy hour wine ...,en,tartinery-new-york-15
...,...,...,...,...,...,...,...,...
331688,301.0,77,2023.0,5/21/2022,4,Great brunch spot. Cute decor. The coffee was ...,en,brownstone-pancake-factory-edgewater
331689,324.0,114,2023.0,10/1/2022,4,An &#34;All American Dinner&#34; that is affor...,en,brownstone-pancake-factory-edgewater
331690,24.0,10,,9/10/2023,5,Went today and got the triple d buffalo chicke...,en,brownstone-pancake-factory-edgewater
331691,5.0,35,,5/7/2023,5,We came to Brownstone as a group of 30 on a bu...,en,brownstone-pancake-factory-edgewater


In [19]:
# Display the combined reviews DataFrame (rendered as this cell's output).
df_reviews

Unnamed: 0,photoCount,reviewCount,eliteYear,localizedDate,rating,comment.text,comment.language,restaurant
0,1012.0,321,,11/2/2018,5,*Tartinery is one of the food vendors at Hudso...,en,tartinery-new-york-15
1,190.0,1421,,8/6/2023,3,Overall the service is very slow. We ordered 2...,en,tartinery-new-york-15
2,117.0,294,2023.0,10/17/2022,4,Yum I love a place that has interesting drinks...,en,tartinery-new-york-15
3,154.0,171,2023.0,11/29/2021,5,I always look for a comfortable go to bar/rest...,en,tartinery-new-york-15
4,6668.0,2143,2023.0,4/6/2022,3,Tartinery is where you go for happy hour wine ...,en,tartinery-new-york-15
...,...,...,...,...,...,...,...,...
331688,301.0,77,2023.0,5/21/2022,4,Great brunch spot. Cute decor. The coffee was ...,en,brownstone-pancake-factory-edgewater
331689,324.0,114,2023.0,10/1/2022,4,An &#34;All American Dinner&#34; that is affor...,en,brownstone-pancake-factory-edgewater
331690,24.0,10,,9/10/2023,5,Went today and got the triple d buffalo chicke...,en,brownstone-pancake-factory-edgewater
331691,5.0,35,,5/7/2023,5,We came to Brownstone as a group of 30 on a bu...,en,brownstone-pancake-factory-edgewater


In [20]:
# Export the flattened reviews to a CSV in the working directory so that
# other analyses can reuse them.
current_directory = os.getcwd()
file_path = os.path.join(
    current_directory,
    'restaurant_reviews_michelin_stars.csv',
)

# index=True writes the DataFrame's row index as the first CSV column.
df_reviews.to_csv(path_or_buf=file_path, index=True)

# Restaurant Basics Data

N.B.: The review data is complex to handle due to its scale: since we pulled 50 reviews per Manhattan restaurant for ~10,877 properties, GitHub's file-size restrictions won't allow us to upload the resulting > 100 MB file.

Instead, we scraped review data alphabetically (reviews for Masa go into the M folder in a single M file, reviews for Rubirosa into the R folder in a single R file, etc.).

Now, we clean this data and amalgamate ALL reviews into a single dataframe and a subsequent CSV.

In [23]:
def load_restaurant_data(file_name):
    '''
    Load restaurant data (from web scraping) out of a JSON file into a pandas DataFrame.

    The JSON document is read as a mapping of restaurant key -> restaurant
    info. Each entry is flattened with pd.json_normalize, tagged with its key
    in a 'restaurant' column, and the combined frame is passed through
    flatten_category so the category titles become their own columns.

    :param file_name: path of the JSON file to read
    :return: DataFrame of restaurant details with added category_<i> columns
    '''
    # Context manager guarantees the handle is closed even if parsing fails
    with open(file_name, encoding='utf-8') as f:
        data = json.load(f)

    df_list = []
    for restaurant, info in data.items():
        normalized_df = pd.json_normalize(info)
        normalized_df['restaurant'] = restaurant
        # Drop columns that hold no value at all for this restaurant
        df_list.append(normalized_df.dropna(axis=1, how='all'))

    # Concatenate all per-restaurant frames, then expand the category titles
    combined = pd.concat(df_list, ignore_index=True)
    return flatten_category(combined)


def extract_titles(row):
    '''
    Extract the 'title' of every category entry in one row.

    Cells that are not dictionaries (None, NaN or any other placeholder for a
    missing category) yield None instead of raising.

    :param row: 1 row of restaurant category data (a pandas Series or any
                iterable of category dictionaries / missing values)
    :return: list with one title (or None) per cell, in the row's order
    '''
    # Iterate over the values directly rather than indexing by position, and
    # only subscript real dictionaries so NaN padding cannot raise a TypeError
    return [cell['title'] if isinstance(cell, dict) else None for cell in row]

def flatten_category(df):
    '''
    Add one category_<i> column per category slot to the restaurant dataframe.

    The 'categories' column is normalized into one column per slot, the
    titles are pulled out of each slot with extract_titles, and the resulting
    columns are appended to the right of the input frame.

    :param feature: pd dataframe containing a 'categories' column
    :return: the input dataframe with the category_<i> columns appended
    '''
    # One column per category slot (cells are presumably category dicts or
    # missing values - verify against the scraped JSON)
    nested_categories = pd.json_normalize(df['categories'])
    slot_count = nested_categories.shape[1]

    # Replace every slot with its title, keeping one output column per slot
    titles = nested_categories.apply(extract_titles, axis=1, result_type='expand')
    titles.columns = ['category_{}'.format(slot) for slot in range(slot_count)]

    # Append the title columns to the original columns
    return pd.concat([df, titles], axis=1)

# Build the restaurant-details DataFrame from the scraped JSON file; the path is
# relative, so the file must sit in the notebook's working directory.
df_restaurants = load_restaurant_data('restaurants_michelin_stars.json')

JSONDecodeError: Expecting value: line 2 column 1 (char 1)

In [None]:
# Read the Michelin alias mapping; the context manager closes the file handle.
with open("michelin_alias_michelin_stars.json", encoding="utf-8") as michelin_file:
    michelin = json.load(michelin_file)

# Build the lookup set once so each membership test is a hash lookup instead of
# a scan over michelin.values() for every restaurant row.
michelin_names = set(michelin.values())
df_restaurants['is_michelin'] = [1 if name in michelin_names else 0 for name in df_restaurants['restaurant']]

In [None]:
# Exclude 8 brooklyn restaurants
# NOTE(review): the sanity check below is disabled - it is wrapped in a string
# literal, so none of it executes. When enabled it prints how many restaurants
# are flagged is_michelin, how many Michelin aliases exist, and which aliases
# are missing from df_restaurants (presumably the 8 Brooklyn restaurants
# mentioned above - confirm).

'''
print(df_restaurants['is_michelin'].sum())
print(len(michelin.values()))

set_data = set(df_restaurants[df_restaurants['is_michelin']==1]['restaurant'])
set_michelin = set(michelin.values())

print(set_michelin-set_data) 
print(len(set_michelin-set_data))
print(len(set_michelin))
print(len(set_data))
'''

In [None]:
# Preview the first rows of the restaurant details, including the is_michelin flag.
df_restaurants.head()

Now, we feature engineer to derive deeper insights & metrics.

In [None]:
# Per-restaurant review statistics, computed in a single groupby pass.
df_reviews_stat = df_reviews.groupby('restaurant').agg(
    # size counts every review row, so the total does not depend on whether
    # any particular column (e.g. photoCount) happens to be missing
    reviews_cnt=('rating', 'size'),
    # count tallies only the non-null eliteYear entries
    elite_reviews_cnt=('eliteYear', 'count'),
    user_photo_cnt_avg=('photoCount', 'mean'),
    user_review_cnt_avg=('reviewCount', 'mean'),
    rating_avg=('rating', 'mean'),
    rating_med=('rating', 'median'),
    rating_std=('rating', 'std'),
)

# Share of a restaurant's reviews that carry an eliteYear value
df_reviews_stat['elite_reviews_perc'] = df_reviews_stat['elite_reviews_cnt'] / df_reviews_stat['reviews_cnt']
df_reviews_stat['rating_std'] = df_reviews_stat['rating_std'].fillna(0) # if we only get one review, then the std should be 0

# Keep the established column order
df_reviews_stat = df_reviews_stat[[
    'reviews_cnt',
    'elite_reviews_cnt',
    'elite_reviews_perc',
    'user_photo_cnt_avg',
    'user_review_cnt_avg',
    'rating_avg',
    'rating_med',
    'rating_std',
]]
df_reviews_stat

In [None]:
# Attach the per-restaurant review statistics to the restaurant details; the
# inner join keeps only restaurants that appear in both frames.
df_final_restaurants = df_restaurants.merge(
    df_reviews_stat,
    on='restaurant',
    how='inner',
)

In [None]:
# Display the merged restaurant details and review statistics.
df_final_restaurants

In [None]:
"""
CAVEAT: This CSV file is over 100MG; Github cannot support it, so we don't directly upload it. However, we use it in our analyses so if you are attempting to run code download this file by running the cells in this section. 
"""

# Export the merged restaurant details to a CSV in the working directory.
current_directory = os.getcwd()
file_path = os.path.join(
    current_directory,
    'restaurant_details_michelin_stars.csv',
)

# index=True writes the DataFrame's row index as the first CSV column.
df_final_restaurants.to_csv(path_or_buf=file_path, index=True)