In [3]:
import json
from datetime import datetime

import numpy as np
import pandas as pd
from IPython.display import display

In [4]:
def load_rows(filepath, nrows = None):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Path of the file to read.
    nrows : int, optional
        Maximum number of records to load. ``None`` (the default) loads the
        whole file; ``0`` or a negative value yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        One row per parsed JSON object, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    objs = []
    # Decode explicitly as UTF-8 instead of the platform default so the same
    # file parses identically on every OS.
    with open(filepath, encoding='utf-8') as json_file:
        for line in json_file:
            if nrows is not None and len(objs) >= nrows:
                break
            line = line.strip()
            # Blank lines (e.g. a trailing empty line) carry no record and
            # would make json.loads raise, so skip them.
            if not line:
                continue
            objs.append(json.loads(line))
    return pd.DataFrame(objs)

In [5]:
# Load both dumps in full (nrows is left at its default, so every line is read).
businesses = load_rows('data/business/yelp_academic_dataset_business.json')
business_count = businesses.shape[0]
print('Business objects loaded. Count = {}'.format(business_count))
reviews = load_rows('data/reviews/yelp_academic_dataset_reviews.json')

In [9]:
!cat 'data/reviews/yelp_academic_dataset_reviews.json' | wc -l
print('Review objects loaded. Count = {}'.format(reviews.shape[0]))

8021122
Review objects loaded. Count = 8021122


In [10]:
# Lookup table: one (business_id, city) pair per business.
cities_per_business = businesses.loc[:, ['business_id', 'city']]

In [11]:
# Keep only the reviews whose `funny` count is positive, trimmed to the
# columns needed for the per-city aggregation.
has_funny_count = reviews['funny'] > 0
funny_reviews = reviews.loc[has_funny_count, ['review_id', 'business_id', 'funny']]
len(funny_reviews)

1601801

In [12]:
# Attach the city of each business to its funny reviews (left join on the
# shared business_id index).
cities_by_business_id = cities_per_business.set_index('business_id')
funny_reviews_with_cities = (
    funny_reviews
    .set_index('business_id')
    .join(cities_by_business_id)
)

In [13]:
# Count funny reviews per city, then keep the ten largest counts.
funny_review_qty_by_city = (
    funny_reviews_with_cities
    .groupby('city')
    .size()
    .reset_index(name='qty')
)
top_10_funny_cities = (
    funny_review_qty_by_city
    .sort_values(by='qty', ascending=False)
    .head(10)
)

In [14]:
# Display the ten cities with the highest number of funny reviews
# (`qty` = number of reviews with funny > 0 in that city).
top_10_funny_cities

Unnamed: 0,city,qty
396,Las Vegas,513206
687,Phoenix,184937
934,Toronto,112887
848,Scottsdale,92798
138,Charlotte,64070
925,Tempe,49241
322,Henderson,49042
703,Pittsburgh,46446
155,Cleveland,40974
496,Mesa,37819


In [15]:
# Number of reviews written by each user (Series indexed by user_id).
reviews_qty = reviews.groupby(['user_id']).size()
# NOTE: despite the `gt` in the name, the threshold is inclusive (>= 50).
has_50_or_more = reviews_qty >= 50
reviews_qty_gt_50 = reviews_qty[has_50_or_more]

In [17]:
# How many users have at least 50 reviews.
reviews_qty_gt_50.shape[0]

15955

In [18]:
# Count reviews per weekday (Monday=0 ... Sunday=6).
#
# NOTE: `reviews_by_day_of_week` is an alias of `reviews`, not a copy, so the
# `day_of_week` column below is added to `reviews` itself.
reviews_by_day_of_week = reviews
# Parse the whole column at once; a per-row datetime.strptime through .apply
# is far slower on millions of rows and yields the same weekday numbers.
reviews_by_day_of_week['day_of_week'] = pd.to_datetime(
    reviews_by_day_of_week['date'], format='%Y-%m-%d %H:%M:%S'
).dt.weekday
reviews_by_day_of_week.groupby('day_of_week').size()

day_of_week
0    1183732
1    1112199
2    1124486
3    1080564
4    1090099
5    1184179
6    1245863
dtype: int64

In [19]:
# Users with at least 50 reviews, all of which are 5-star reviews.

# Number of 5-star reviews per user; users without any are absent here.
reviews_qty_5_stars = reviews[(reviews['stars'] == 5)].groupby(['user_id']).size()
# Align total and 5-star counts on user_id. Users missing from the 5-star
# Series get NaN, which is why `starts_5_qty` comes out as float.
# sort=True pins the current (sorted-index) behaviour explicitly; without it
# pandas emits a FutureWarning because the default is going to change.
reviews_qty_joined = pd.concat(
    [reviews_qty, reviews_qty_5_stars],
    axis=1,
    keys=['total_qty', 'starts_5_qty'],
    sort=True,
)
display(reviews_qty_joined.head(2))
# NaN never compares equal, so users without 5-star reviews are filtered out.
users_5_stars = reviews_qty_joined.query('(total_qty >= 50) & (total_qty == starts_5_qty)')
users_5_stars

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,total_qty,starts_5_qty
---1lKK3aKOuomHnwAkAow,131,75.0
---3o4ZsKYoBYBe7H6xG8A,1,1.0


Unnamed: 0,total_qty,starts_5_qty
2-PD6df20ge-k9SPHmcxiw,54,54.0
2bC-dBYN48wrmN-j0Bv-Jw,59,59.0
8JwSmvviX2dEAgaPRZ70nQ,195,195.0
MTl8QNjnenumWaORnXhing,55,55.0
aGwONlR86ERk450Gp6Ih5A,55,55.0
aYV4_aVwexS-Zn34k_P0VQ,60,60.0
caUyy1kvh-MxntoKoJr6kg,56,56.0
nkhu7NjlIEimaJ-QD8S3SA,64,64.0
trGQ2nySSedAbXaaHhcmUQ,66,66.0
uQ2w3yMEYxcspPrZLKObVw,80,80.0


In [21]:
import hashlib
# NOTE: this is an alias of `reviews`, not a copy, so the `text_hash` column
# is added to `reviews` as well.
reviews_with_comments = reviews
# SHA-1 hex digest of each review text, used as a compact key for detecting
# identical texts.
reviews_with_comments['text_hash'] = reviews_with_comments['text'].map(
    lambda body: hashlib.sha1(body.encode('utf8')).hexdigest()
)

In [22]:
# For every (user, text hash) pair, count how many of the user's reviews
# share that exact text.
reviews_by_user_text = (
    reviews_with_comments
    .groupby(['user_id', 'text_hash'])
    .size()
    .reset_index(name='same_texts_qty')
)
display(reviews_by_user_text.head(2))

Unnamed: 0,user_id,text_hash,same_texts_qty
0,---1lKK3aKOuomHnwAkAow,0165bb3a5fa465446d047026cb368b2f589fb34b,1
1,---1lKK3aKOuomHnwAkAow,0546415a78a54b8785d38a7eb083753246b51224,1


In [23]:
# Users with at least 5 reviews whose reviews all carry the same text:
# one (user, text) group accounts for the user's entire review count.
total_qty_by_user = reviews_qty.reset_index(name='total_qty').set_index('user_id')
same_texts_by_user = reviews_by_user_text.set_index('user_id')
reviews_qty_texts_joined = total_qty_by_user.join(same_texts_by_user)
users_same_texts_always = reviews_qty_texts_joined.query('(total_qty >= 5) & (total_qty == same_texts_qty)')
users_same_texts_always


Unnamed: 0_level_0,total_qty,text_hash,same_texts_qty
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9II6XRbZAf2koLK1lCjz6A,11,971fa13e286683f4b300cc725b870d1b7c64121b,11
ImyvYbCzWhoYnbJzEvGGgw,6,1fdcf5f12420fef03a93c9bf1b3fdb0423e1c847,6
TV5s5qQKgMGoECfDLGdTmQ,5,1f808a2dac075af57c54b92f462fe587dfbaa9d3,5
k8Hw_ua1KjCPGVkhGOk7ew,6,b832916b96512c121321a55b4da456d3cf1ed532,6


In [31]:
# Per-user review totals read from a CSV; presumably produced outside this
# notebook and loaded here for comparison -- TODO confirm the source.
own_user_id_50 = pd.read_csv("csvs/users_fifty_reviews.csv")
own_user_id_50.head(5)

Unnamed: 0,user_id,total_reviews
0,UgMW8bLE0QMJDCkQ1Ax5Mg,246
1,jOERvhmK6_lo_XGUBPws_w,105
2,A0j21z2Q1HGic7jW6e9h7A,813
3,TZQSUDDcA4ek5gBd6BzcjA,145
4,UreiTV1I9i-XF6_bJhK6Iw,59


In [26]:
# Recomputes the same per-user counts as an earlier cell of this notebook.
reviews_qty = reviews.groupby(['user_id']).size()
# NOTE: despite the `gt` in the name, the threshold is inclusive (>= 50).
has_50_or_more = reviews_qty >= 50
reviews_qty_gt_50 = reviews_qty[has_50_or_more]

In [29]:
# Users with at least 50 reviews according to the pandas computation.
reviews_qty_gt_50.shape[0]

15955

In [32]:
# Row count of the CSV, to compare against the pandas count.
own_user_id_50.shape[0]

15955

In [35]:
# Per-user 5-star review counts read from a CSV; presumably produced outside
# this notebook and loaded here for comparison -- TODO confirm the source.
own_five_stars = pd.read_csv("csvs/reviews_five_starts.csv")
own_five_stars.head(5)

Unnamed: 0,user_id,five_stars_reviews
0,V34qejxNsCbcgD8C0HVk-Q,8
1,5vD2kmE25YBrbayKhykNxQ,5
2,aq_ZxGHiri48TUXJlpRkCQ,35
3,dsd-KNYKMpx6ma_sRWCSkQ,1
4,P6apihD4ASf1vpPxHODxAQ,1


In [37]:
# Number of 5-star reviews per user, as a frame with user_id as a column.
is_five_star = reviews['stars'] == 5
reviews_five_stars = (
    reviews[is_five_star]
    .groupby(['user_id'])
    .size()
    .reset_index()
)

In [38]:
# Users with at least one 5-star review according to the pandas computation.
reviews_five_stars.shape[0]

1296614

In [39]:
# Row count of the 5-star CSV, to compare against the pandas count.
own_five_stars.shape[0]

1296614

In [40]:
# Inner-join the two CSV tables on user_id: only users present in both
# (i.e. listed in the 50-review table and having 5-star reviews) remain.
merge = own_five_stars.merge(own_user_id_50, on="user_id")
merge.head(5)

Unnamed: 0,user_id,five_stars_reviews,total_reviews
0,HLaSqQMDVvlcFPGJL_kGCA,21,125
1,U4INQZOPSUaj8hMjLlZ3KA,666,1762
2,aLqp_fe64ugZFuwMpsXLiw,39,104
3,xFSLb_pxXta5G4oaRB1ylQ,42,97
4,LmPxZshPCXBd4mrERDU8RA,30,98


In [42]:
# Users from the CSV tables whose reviews are all 5-star reviews.
# NOTE(review): the recorded output of this cell and of `users_5_stars`
# agree on nine user ids but differ on the one with 54 reviews -- verify
# which source is right.
all_reviews_are_five_star = merge["five_stars_reviews"].eq(merge["total_reviews"])
merge.loc[all_reviews_are_five_star]

Unnamed: 0,user_id,five_stars_reviews,total_reviews
550,xBxmaLiSLXN68Gqj_zdjkQ,54,54
1270,8JwSmvviX2dEAgaPRZ70nQ,195,195
2489,aYV4_aVwexS-Zn34k_P0VQ,60,60
2858,trGQ2nySSedAbXaaHhcmUQ,66,66
3446,caUyy1kvh-MxntoKoJr6kg,56,56
3840,2bC-dBYN48wrmN-j0Bv-Jw,59,59
4795,MTl8QNjnenumWaORnXhing,55,55
6231,nkhu7NjlIEimaJ-QD8S3SA,64,64
6965,aGwONlR86ERk450Gp6Ih5A,55,55
9874,uQ2w3yMEYxcspPrZLKObVw,80,80
