In [1]:
import gzip
import json
import pandas as pd
import plotly.express as px

# Import Data

In [2]:
def parse_gzipped_json_generator(zip_file_path):
    """Lazily yield one parsed JSON value per line of a gzip-compressed text file.

    The file is opened in text mode as UTF-8 and read line by line, so the
    whole file is never held in memory at once.

    Args:
        zip_file_path: Path to a gzip file containing one JSON document per line.

    Yields:
        The result of ``json.loads`` for every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened or is not valid gzip data.
    """
    with gzip.open(zip_file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record; json.loads('') would raise, so skip them.
            if not line:
                continue
            yield json.loads(line)

def gzipped_json_to_dataframe(zip_file_path):
    """Load a gzip-compressed, line-delimited JSON file into a DataFrame.

    Args:
        zip_file_path: Path handed to ``parse_gzipped_json_generator``.

    Returns:
        A ``pandas.DataFrame`` built from the records the generator yields.
    """
    # pandas consumes the generator directly, so no intermediate name is needed.
    return pd.DataFrame(parse_gzipped_json_generator(zip_file_path))

In [3]:
# Gzip-compressed reviews file, read line by line by gzipped_json_to_dataframe.
review_zip_file_path = '../data/review-District_of_Columbia_10.json.gz'
review_data_frame = gzipped_json_to_dataframe(review_zip_file_path)
# Preview the first rows as a quick check that the load worked.
review_data_frame.head()

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,104256407771930872120,Petre Ene,1551027526726,4,"the food is really good, I had a great experie...",[{'url': ['https://lh5.googleusercontent.com/p...,,0x89b7b7851b06ef6b:0x5f356b1eb1da27
1,108642560086289718425,Keshava Mysore,1510920735052,4,High prices for not so big portions! But I thi...,,,0x89b7b7851b06ef6b:0x5f356b1eb1da27
2,111842880135815090774,Carol G,1543249586598,2,"The gelato looked ""old"" and ended up having co...",,,0x89b7b7851b06ef6b:0x5f356b1eb1da27
3,110299397397951052863,翟泉,1496597796978,4,(Translated by Google) The fast food restauran...,[{'url': ['https://lh5.googleusercontent.com/p...,,0x89b7b7851b06ef6b:0x5f356b1eb1da27
4,116447973760719145759,Kitae Yim,1540701399450,1,(Translated by Google) The food is barely edib...,,,0x89b7b7851b06ef6b:0x5f356b1eb1da27


In [4]:
meta_zip_file_path = '../data/meta-District_of_Columbia.json.gz'

# Load the business metadata, keep only rows whose category text mentions
# "restaurant" (case-insensitive), then keep one row per gmap_id.
meta_data_frame = (
    gzipped_json_to_dataframe(meta_zip_file_path)
    .loc[
        lambda frame: frame['category']
        .astype(str)  # category holds lists; compare against their string form
        .str.contains('restaurant', case=False, na=False)
    ]
    .drop_duplicates(subset='gmap_id')
)
meta_data_frame.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,Cascade Café,"Cascade Café, 599 Constitution Ave. NW, Washin...",0x89b7b7851b06ef6b:0x5f356b1eb1da27,Cafeteria-style dining at the National Gallery...,38.892077,-77.019908,[American restaurant],2.6,28,,"[[Thursday, 11AM–3PM], [Friday, 11AM–3PM], [Sa...","{'Service options': ['Takeout', 'Dine-in', 'De...",Closed ⋅ Opens 11AM,"[0x89b7b79ad5a69a43:0xce2fab5ae44aaf7f, 0x89b7...",https://www.google.com/maps/place//data=!4m2!3...
7,Smoothie King,"Smoothie King, 77 H St NW Ste C, Washington, D...",0x89b7b7ce3577cbfb:0x8a6a6a3ecc20fe86,Health-conscious chain featuring blended drink...,38.90042,-77.01115,"[Juice shop, Fast food restaurant, Health food...",3.7,28,,"[[Wednesday, 7AM–9PM], [Thursday, 7AM–9PM], [F...","{'Service options': ['Delivery', 'Takeout', 'D...",Open ⋅ Closes 9PM,"[0x89b7b7b1f05ec925:0x1d2086346ec2769f, 0x89b7...",https://www.google.com/maps/place//data=!4m2!3...
8,Lincoln Park Kitchen & Wine Bar,"Lincoln Park Kitchen & Wine Bar, 106 13th St S...",0x89b7b837e7c50a6f:0xd0be494f173acb48,,38.889423,-76.988012,[Restaurant],3.9,7,,"[[Wednesday, 5–9PM], [Thursday, 5–9PM], [Frida...","{'Service options': ['Takeout', 'Delivery'], '...",Permanently closed,"[0x89b7b9ccd0369351:0x80e1de4fa628189e, 0x89b7...",https://www.google.com/maps/place//data=!4m2!3...
29,Michel Richard Citronelle,"Michel Richard Citronelle, 3000 M St NW, Washi...",0x89b7b64b9ed6a13b:0x42731936aa044d91,,38.905092,-77.05966,[Restaurant],4.8,16,$$$$,,"{'Service options': ['Delivery'], 'Highlights'...",,"[0x89b7b64c343c4465:0xe2678aca9cfe765f, 0x89b7...",https://www.google.com/maps/place//data=!4m2!3...
48,Pizza Autentica,"Pizza Autentica, 1331 L St NW #1, Washington, ...",0x89b7b7951b1125e5:0x6848eaecb44aa2cc,"Casual local chain serving Neapolitan pizza, s...",38.904082,-77.031063,"[Pizza restaurant, Italian restaurant, Neapoli...",4.0,44,$,"[[Wednesday, 11AM–9PM], [Thursday, 11AM–9PM], ...",{'Service options': ['Delivery']},Permanently closed,[0x89b7b826c616b9e3:0xb1e16c82841218f6],https://www.google.com/maps/place//data=!4m2!3...


In [5]:
# Attach the restaurant metadata to every review of that restaurant.
# The inner join drops reviews whose gmap_id is not in the filtered metadata.
# Both frames carry a `name` column, so pandas suffixes them: `name_x` comes
# from the reviews frame and `name_y` from the metadata frame.
merged_data_frame = pd.merge(
    review_data_frame,
    meta_data_frame,
    on='gmap_id',
    how='inner',
    # Each review must match at most one metadata row; raise instead of
    # silently multiplying review rows if gmap_id is not unique in the metadata.
    validate='many_to_one',
)
merged_data_frame.head()

Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,name_y,address,...,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,104256407771930872120,Petre Ene,1551027526726,4,"the food is really good, I had a great experie...",[{'url': ['https://lh5.googleusercontent.com/p...,,0x89b7b7851b06ef6b:0x5f356b1eb1da27,Cascade Café,"Cascade Café, 599 Constitution Ave. NW, Washin...",...,-77.019908,[American restaurant],2.6,28,,"[[Thursday, 11AM–3PM], [Friday, 11AM–3PM], [Sa...","{'Service options': ['Takeout', 'Dine-in', 'De...",Closed ⋅ Opens 11AM,"[0x89b7b79ad5a69a43:0xce2fab5ae44aaf7f, 0x89b7...",https://www.google.com/maps/place//data=!4m2!3...
1,108642560086289718425,Keshava Mysore,1510920735052,4,High prices for not so big portions! But I thi...,,,0x89b7b7851b06ef6b:0x5f356b1eb1da27,Cascade Café,"Cascade Café, 599 Constitution Ave. NW, Washin...",...,-77.019908,[American restaurant],2.6,28,,"[[Thursday, 11AM–3PM], [Friday, 11AM–3PM], [Sa...","{'Service options': ['Takeout', 'Dine-in', 'De...",Closed ⋅ Opens 11AM,"[0x89b7b79ad5a69a43:0xce2fab5ae44aaf7f, 0x89b7...",https://www.google.com/maps/place//data=!4m2!3...
2,111842880135815090774,Carol G,1543249586598,2,"The gelato looked ""old"" and ended up having co...",,,0x89b7b7851b06ef6b:0x5f356b1eb1da27,Cascade Café,"Cascade Café, 599 Constitution Ave. NW, Washin...",...,-77.019908,[American restaurant],2.6,28,,"[[Thursday, 11AM–3PM], [Friday, 11AM–3PM], [Sa...","{'Service options': ['Takeout', 'Dine-in', 'De...",Closed ⋅ Opens 11AM,"[0x89b7b79ad5a69a43:0xce2fab5ae44aaf7f, 0x89b7...",https://www.google.com/maps/place//data=!4m2!3...
3,110299397397951052863,翟泉,1496597796978,4,(Translated by Google) The fast food restauran...,[{'url': ['https://lh5.googleusercontent.com/p...,,0x89b7b7851b06ef6b:0x5f356b1eb1da27,Cascade Café,"Cascade Café, 599 Constitution Ave. NW, Washin...",...,-77.019908,[American restaurant],2.6,28,,"[[Thursday, 11AM–3PM], [Friday, 11AM–3PM], [Sa...","{'Service options': ['Takeout', 'Dine-in', 'De...",Closed ⋅ Opens 11AM,"[0x89b7b79ad5a69a43:0xce2fab5ae44aaf7f, 0x89b7...",https://www.google.com/maps/place//data=!4m2!3...
4,116447973760719145759,Kitae Yim,1540701399450,1,(Translated by Google) The food is barely edib...,,,0x89b7b7851b06ef6b:0x5f356b1eb1da27,Cascade Café,"Cascade Café, 599 Constitution Ave. NW, Washin...",...,-77.019908,[American restaurant],2.6,28,,"[[Thursday, 11AM–3PM], [Friday, 11AM–3PM], [Sa...","{'Service options': ['Takeout', 'Dine-in', 'De...",Closed ⋅ Opens 11AM,"[0x89b7b79ad5a69a43:0xce2fab5ae44aaf7f, 0x89b7...",https://www.google.com/maps/place//data=!4m2!3...


# Data Processing

In [6]:
# Reduce each review to its star rating and the number of attached pictures.
# `pics` holds a list of picture entries, or a missing value when the review
# has no pictures. Missing values can be None or NaN; both count as 0 here
# (calling len() on NaN would raise a TypeError).
picture_data_frame = (
    merged_data_frame[['rating', 'pics']]
    .assign(
        pics=lambda frame: frame['pics'].map(
            lambda pics: len(pics) if isinstance(pics, (list, tuple)) else 0
        )
    )
    .rename(columns={'rating': 'Rating', 'pics': 'Num Pictures'})
)
picture_data_frame.head()

Unnamed: 0,Rating,Num Pictures
0,4,3
1,4,0
2,2,0
3,4,2
4,1,0


In [7]:
# Number of reviews for each rating value, ordered by rating
rating_counts = picture_data_frame['Rating'].value_counts().sort_index()

# Mean picture count among the reviews that share a rating
avg_pictures_per_rating = picture_data_frame.groupby('Rating')['Num Pictures'].mean()

# Align both series on the rating index, then turn the rating into a column.
# The rename covers the case where the combined index comes out unnamed and
# reset_index() labels the new column 'index'.
summary_data_frame = pd.DataFrame(
    {
        'Count': rating_counts,
        'Avg. Num Pictures': avg_pictures_per_rating,
    }
)
summary_data_frame = summary_data_frame.reset_index()
summary_data_frame = summary_data_frame.rename(columns={'index': 'Rating'})

In [8]:
# Bar chart: number of reviews for each rating
fig1 = px.bar(
    summary_data_frame,
    x='Rating',
    y='Count',
    title='Total Count of Each Rating',
).update_layout(
    xaxis_title="Rating",
    yaxis_title="Total Count",
    uniformtext_mode='hide',
)
fig1.show()

In [9]:
# Bar chart: mean number of pictures attached to a review, for each rating
fig2 = px.bar(
    summary_data_frame,
    x='Rating',
    y='Avg. Num Pictures',
    title='Average Number of Pictures per Rating',
).update_layout(
    xaxis_title="Rating",
    yaxis_title="Average Number of Pictures",
    uniformtext_mode='hide',
)
fig2.show()

In [10]:
# Show the per-rating summary table (review count and mean picture count).
summary_data_frame

Unnamed: 0,Rating,Count,Avg. Num Pictures
0,1,8573,0.063572
1,2,9498,0.072752
2,3,32965,0.102776
3,4,80199,0.143269
4,5,156737,0.201152
