In [1]:
import gzip
import json
import pandas as pd

In [2]:
def parse_gzipped_json_generator(zip_file_path):
    """Lazily yield one decoded JSON value per non-blank line of a gzip file.

    The file is opened in text mode as UTF-8 and read line by line, so only
    one line is held in memory at a time.

    Args:
        zip_file_path: Path to a gzip-compressed file with one JSON document
            per line.

    Yields:
        The Python object produced by ``json.loads`` for each non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened or is not valid gzip data.
    """
    with gzip.open(zip_file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # A blank line (e.g. a trailing newline at end of file) holds no
            # record; passing it to json.loads would raise JSONDecodeError.
            if not line:
                continue
            yield json.loads(line)

def gzipped_json_to_dataframe(zip_file_path):
    """Load a gzipped JSON-lines file into a pandas DataFrame.

    Args:
        zip_file_path: Path to the gzip-compressed file, forwarded unchanged
            to ``parse_gzipped_json_generator``.

    Returns:
        A ``pd.DataFrame`` constructed from the records that the generator
        yields.
    """
    # The DataFrame constructor consumes the generator itself, so no
    # intermediate list is built here.
    return pd.DataFrame(parse_gzipped_json_generator(zip_file_path))

In [3]:
# Location of the gzipped review file, relative to the notebook's directory.
review_zip_file_path = '../data/review-New_York_10.json.gz'

# Parse the whole file into a DataFrame.
review_data_frame = gzipped_json_to_dataframe(zip_file_path=review_zip_file_path)

# Show the first rows as a quick sanity check of the loaded columns.
review_data_frame.head()

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,113722104692308235141,Alvin Martinez,1603494795361,5,I'm late to posting this but this store especi...,[{'url': ['https://lh5.googleusercontent.com/p...,,0x89c25fc9494dce47:0x6d63c807b59a55
1,107293441492109320298,Johnnie Jackson,1620157037403,1,Very dissatisfied I did not get my phone the p...,,"{'time': 1620268360920, 'text': 'We pride ours...",0x89c25fc9494dce47:0x6d63c807b59a55
2,100378585801819400296,Manie Blazer,1597431662039,5,Excellent very well done with professional car...,,,0x89c25fc9494dce47:0x6d63c807b59a55
3,114998161153019826512,Fashion Fiinds,1543773862044,5,Basing my review strictly on the service I rec...,,"{'time': 1543855317372, 'text': 'Thanks for th...",0x89c25fc9494dce47:0x6d63c807b59a55
4,117178185728422297915,Andres Rieloff,1597279097718,1,Bad! Disorganized. I'm being totally honest. I...,,,0x89c25fc9494dce47:0x6d63c807b59a55


In [4]:
# Location of the gzipped business-metadata file, relative to the notebook.
meta_zip_file_path = '../data/meta-New_York.json.gz'

meta_data_frame = (
    gzipped_json_to_dataframe(meta_zip_file_path)
    # Keep rows whose stringified `category` value mentions "restaurant"
    # (case-insensitive); astype(str) lets list-valued cells be searched as text.
    .loc[lambda frame: frame['category'].astype(str).str.contains('restaurant', case=False, na=False)]
    # Keep only the first row seen for each gmap_id.
    .drop_duplicates(subset='gmap_id')
)
meta_data_frame.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
42,Dunkin',"Dunkin', 4008 Bell Blvd, Queens, NY 11361",0x89c261f60bdf13db:0x38da730e4687a97b,Long-running chain serving signature breakfast...,40.763985,-73.77143,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",3.5,8,$,"[[Thursday, 6AM–7PM], [Friday, 6AM–7PM], [Satu...","{'Service options': ['Delivery', 'Takeout', 'D...",Open ⋅ Closes 7PM,"[0x89c3ab9229879ec3:0x3f4b2b46d7d2c503, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...
54,Raffaello Kosher Pizza,"Raffaello Kosher Pizza, 37 W 46th St, New York...",0x89c258ffaeaba947:0x8355860772a595a9,This casual joint serves up traditional Italia...,40.756872,-73.980427,[Restaurant],3.4,8,,,"{'Service options': ['Delivery'], 'Offerings':...",,"[0x89c2f41bdce4dc0f:0xf4e3f717a4950ea3, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...
68,Casa Malinche,"Casa Malinche, 3078 Coney Island Ave, Brooklyn...",0x89c244427d3e2c19:0xc1dbf7d8e71d7201,,40.581086,-73.960139,[Mexican restaurant],4.4,7,,"[[Thursday, 11AM–9:30PM], [Friday, 11AM–9:30PM...","{'Service options': ['Delivery'], 'Amenities':...",Permanently closed,"[0x89c24434f14337c9:0xbfa6dd0a67a97aa4, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...
81,Kennedy's Chicken & Sandwiches,"Kennedy's Chicken & Sandwiches, 495 E 138th St...",0x89c2f5c3caef3fb5:0xb7f855503b4bc974,,40.80801,-73.919617,[Fast food restaurant],3.7,8,,"[[Thursday, 8AM–3AM], [Friday, 8AM–3AM], [Satu...","{'Service options': ['Takeout', 'Dine-in', 'De...",Closed ⋅ Opens 8AM,"[0x89c2f5cfbcdeb7c1:0x1c3c27e3a6bcdab6, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...
98,Cafe Kristall,"Cafe Kristall, 70 Mercer St, New York, NY 10012",0x89c2598eb0b90839:0x2b3c5161280e7169,"Austrian cuisine & drinks in a small, crystal-...",40.722466,-74.000283,"[Cafe, Restaurant]",5.0,1,$$,,"{'Service options': ['Delivery'], 'Offerings':...",,"[0x89c25a18157c4c95:0x544d671533dccc36, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...


In [5]:
# Inner join on gmap_id: a review is kept only when its gmap_id is present in
# the filtered metadata frame. Both frames have a `name` column, so pandas
# emits them as `name_x` (from the reviews) and `name_y` (from the metadata).
merged_data_frame = review_data_frame.merge(
    meta_data_frame,
    how='inner',
    on='gmap_id',
)
merged_data_frame.head()

Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,name_y,address,...,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,101863461389011299215,Maria Patricia Londoño,1629077998394,4,The donuts is always a good place to buy somet...,,,0x89c261f60bdf13db:0x38da730e4687a97b,Dunkin',"Dunkin', 4008 Bell Blvd, Queens, NY 11361",...,-73.77143,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",3.5,8,$,"[[Thursday, 6AM–7PM], [Friday, 6AM–7PM], [Satu...","{'Service options': ['Delivery', 'Takeout', 'D...",Open ⋅ Closes 7PM,"[0x89c3ab9229879ec3:0x3f4b2b46d7d2c503, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...
1,101954633332535852944,Kristal,1593616678555,5,I went into this Dunkin' yesterday and got som...,[{'url': ['https://lh5.googleusercontent.com/p...,,0x89c261f60bdf13db:0x38da730e4687a97b,Dunkin',"Dunkin', 4008 Bell Blvd, Queens, NY 11361",...,-73.77143,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",3.5,8,$,"[[Thursday, 6AM–7PM], [Friday, 6AM–7PM], [Satu...","{'Service options': ['Delivery', 'Takeout', 'D...",Open ⋅ Closes 7PM,"[0x89c3ab9229879ec3:0x3f4b2b46d7d2c503, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...
2,108987684095941070060,Efrain Hernandez,1594150943631,3,Bought a Machiato and as soon as I touched the...,,,0x89c261f60bdf13db:0x38da730e4687a97b,Dunkin',"Dunkin', 4008 Bell Blvd, Queens, NY 11361",...,-73.77143,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",3.5,8,$,"[[Thursday, 6AM–7PM], [Friday, 6AM–7PM], [Satu...","{'Service options': ['Delivery', 'Takeout', 'D...",Open ⋅ Closes 7PM,"[0x89c3ab9229879ec3:0x3f4b2b46d7d2c503, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...
3,110146321170975563919,Kiyoshi Sudo,1564776340103,5,"Friendly staffs, nice donuts and muffins and c...",,,0x89c261f60bdf13db:0x38da730e4687a97b,Dunkin',"Dunkin', 4008 Bell Blvd, Queens, NY 11361",...,-73.77143,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",3.5,8,$,"[[Thursday, 6AM–7PM], [Friday, 6AM–7PM], [Satu...","{'Service options': ['Delivery', 'Takeout', 'D...",Open ⋅ Closes 7PM,"[0x89c3ab9229879ec3:0x3f4b2b46d7d2c503, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...
4,112625885159369009229,Charlotte Sheppard,1602429271233,1,They got my order wrong food wasn't done unco...,,,0x89c261f60bdf13db:0x38da730e4687a97b,Dunkin',"Dunkin', 4008 Bell Blvd, Queens, NY 11361",...,-73.77143,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",3.5,8,$,"[[Thursday, 6AM–7PM], [Friday, 6AM–7PM], [Satu...","{'Service options': ['Delivery', 'Takeout', 'D...",Open ⋅ Closes 7PM,"[0x89c3ab9229879ec3:0x3f4b2b46d7d2c503, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...


In [6]:
# Draw a reproducible random sample of up to one million merged rows and
# export it as a JSON array of records.
#
# DataFrame.sample raises ValueError when n is larger than the number of rows
# and replace=False (the default), so the request is capped at the population
# size. When the frame has at least 1,000,000 rows the result is unchanged.
sample_size = min(1000000, len(merged_data_frame))
sample_data_frame = merged_data_frame.sample(n=sample_size, random_state=69)
sample_data_frame.to_json('../data/sample_new_york_data.json', orient='records')

: 