In [1]:
import pandas as pd
import plotly.express as px

# Import Data

In [2]:
# Review records are stored as JSON Lines (one JSON object per line),
# hence lines=True when parsing.
review_file_path = '../data/review-New_York_10.json'
df_review_data = pd.read_json(
    path_or_buf=review_file_path,
    lines=True,
)

In [3]:
# Business metadata is stored as JSON Lines (one JSON object per line),
# hence lines=True when parsing.
metadata_file_path = '../data/meta-New_York.json'
df_ratings = pd.read_json(
    path_or_buf=metadata_file_path,
    lines=True,
)

# Data Processing

In [4]:

# Narrow the metadata to restaurants and keep a single row per business.
#  1. The category value is stringified so the substring test works on any
#     cell type; the match on "restaurant" is case-insensitive.
#  2. Repeated gmap_id rows are dropped (pandas keeps the first by default).
df_ratings = (
    df_ratings
    .loc[lambda meta: meta['category'].astype(str).str.contains('restaurant', case=False, na=False)]
    .drop_duplicates(subset='gmap_id')
)

In [5]:
# Attach the business metadata to each review through the shared gmap_id key.
# Inner join: reviews whose gmap_id is not present in df_ratings are dropped.
df_merged = df_review_data.merge(df_ratings, how='inner', on='gmap_id')

In [6]:
def _count_pictures(pics):
    """Return how many pictures are attached to a single review.

    Only list/tuple values are counted. Anything else -- None, or the float
    NaN pandas uses when a record has no 'pics' key -- means "no pictures"
    and yields 0 instead of raising TypeError from len().
    """
    return len(pics) if isinstance(pics, (list, tuple)) else 0


# Build a two-column frame: the review's star rating and its picture count.
# Columns are selected by label so that a missing 'rating' or 'pics' column
# raises KeyError right here rather than silently becoming an all-NaN column.
picture_data_frame = (
    df_merged[['rating', 'pics']]
    .assign(pics=lambda reviews: reviews['pics'].map(_count_pictures))
    .rename(columns={'rating': 'Rating', 'pics': 'Num Pictures'})
)

picture_data_frame.head()

Unnamed: 0,Rating,Num Pictures
0,4,0
1,5,1
2,3,0
3,5,0
4,1,0


In [7]:
# Number of reviews per rating, ordered by rating value
rating_counts = (
    picture_data_frame['Rating']
    .value_counts()
    .sort_index()
)

# Mean number of pictures per rating
avg_pictures_per_rating = (
    picture_data_frame
    .groupby('Rating')['Num Pictures']
    .mean()
)

# Combine both series (aligned on the rating index) into one table and
# expose the rating as a regular 'Rating' column.
summary_data_frame = (
    pd.DataFrame({'Count': rating_counts, 'Avg. Num Pictures': avg_pictures_per_rating})
    .rename_axis('Rating')
    .reset_index()
)

summary_data_frame.head()

Unnamed: 0,Rating,Count,Avg. Num Pictures
0,1,355620,0.051409
1,2,298870,0.06366
2,3,907020,0.080425
3,4,2114082,0.119489
4,5,4566344,0.142817


In [8]:
# Bar chart of review volume: one bar per rating, bar height = 'Count'.
# update_layout returns the figure, so it is chained onto the constructor.
fig1 = px.bar(
    data_frame=summary_data_frame,
    x='Rating',
    y='Count',
    title='Total Count of Each Rating',
).update_layout(
    xaxis_title="Rating",
    yaxis_title="Total Count",
    uniformtext_mode='hide',
)

fig1.show()

In [9]:
# Bar chart of picture usage: one bar per rating, bar height = 'Avg. Num Pictures'.
# update_layout returns the figure, so it is chained onto the constructor.
fig2 = px.bar(
    data_frame=summary_data_frame,
    x='Rating',
    y='Avg. Num Pictures',
    title='Average Number of Pictures per Rating',
).update_layout(
    xaxis_title="Rating",
    yaxis_title="Average Number of Pictures",
    uniformtext_mode='hide',
)

fig2.show()

In [11]:
# Display the complete per-rating summary table (review count and average
# number of pictures for each rating).
summary_data_frame

Unnamed: 0,Rating,Count,Avg. Num Pictures
0,1,355620,0.051409
1,2,298870,0.06366
2,3,907020,0.080425
3,4,2114082,0.119489
4,5,4566344,0.142817
