## Package Import 

In [1]:
import pandas as pd
import json
from tqdm import tqdm
import datetime

## Data Loading

In [2]:
line_count = len(open("./yelp_dataset/review.json").readlines())
user_ids, business_ids, stars, dates, reviews, useful, funny, cool = [], [], [], [], [], [], [], []
with open("./yelp_dataset/review.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        user_ids += [blob["user_id"]] 
        business_ids += [blob["business_id"]] 
        stars += [blob["stars"]]
        dates += [blob["date"]]
        reviews += [blob["text"]]
        useful += [blob["useful"]]
        funny += [blob['funny']]
        cool += [blob['cool']]        
ratings = pd.DataFrame({"user_id": user_ids, "business_id": business_ids, "rating": stars, 
                        "date": dates, "reviews": reviews, "useful": useful,"funny": funny, "cool": cool})

100%|██████████| 6685900/6685900 [01:23<00:00, 80157.74it/s]


In [5]:
business = pd.read_json('yelp_dataset/business.json',lines=True)

## Data Selection 

1. Select based on time

In [3]:
srating = ratings[ratings['date']>='2016']

2. Select based on users and drop closed business

In [6]:
close = list(business[business['is_open']==0]['business_id'])
nratings = srating[~srating['business_id'].isin(close)]

In [17]:
comp_counts = nratings["useful"] + nratings["funny"] + nratings["cool"]
austic_users = nratings[comp_counts > 3]["user_id"].unique().tolist()

In [18]:
len(austic_users)

155493

In [10]:
user_counts = nratings["user_id"].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()

In [15]:
len(active_users)

144090

In [20]:
users = list(set(active_users) & set(austic_users))

In [19]:
len(set(active_users) & set(austic_users))

63208

In [21]:
df = nratings[nratings['user_id'].isin(users)]

3. split train & test and check sample data

In [43]:
l = pd.DataFrame(df.groupby('user_id').max()['date'])

In [45]:
l.reset_index(inplace = True)

In [47]:
sdf = pd.merge(df,l,how='left',on= 'user_id')

In [50]:
test = sdf[sdf['date_x']==sdf['date_y']]
train = sdf[sdf['date_x']!=sdf['date_y']]

In [54]:
test.drop(columns = ['date_y'],inplace = True)
train.drop(columns = ['date_y'],inplace = True)
test.rename(columns={'date_x':'date'},inplace= True)
train.rename(columns={'date_x':'date'},inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [22]:
df.shape

(1151349, 8)

In [24]:
len(df['user_id'].unique())

63208

In [25]:
len(df['business_id'].unique())

116475

4. save the sample

In [23]:
df.to_csv('sample.csv',index = False)

In [60]:
test.to_csv('sample_test.csv',index=False)

In [None]:
train.to_csv('sample_train.csv',index=False)