In [1]:
import pandas as pd
import pickle
import numpy as np

In [2]:
# Configuration ----------------------------------------------------------------------
# Users whose review_count is less than or equal to REVIEW_DROP are discarded.
REVIEW_DROP = 0
RESTAURANTS_PATH = 'dataset/restaurants.csv'
USERS_PATH = 'dataset/users.csv'
REVIEWS_PATH = 'dataset/reviews.csv'

# Process users---------------------------------------------------------------------
print("Processing users")
# Only these two columns are used below, so skip parsing the rest of the file.
# This keeps memory down and avoids type-inference work on columns that are
# thrown away immediately afterwards.
users = pd.read_csv(USERS_PATH, usecols=['user_id', 'review_count'])
# Drop users with review count less than or equal to REVIEW_DROP.
# The explicit copy makes the column assignments below operate on an
# independent frame rather than on a slice of the raw one.
users = users[users['review_count'] > REVIEW_DROP].copy()
# Replace each string user_id by an integer category code (user_id_num).
users['user_id'] = users['user_id'].astype('category')
users['user_id_num'] = users['user_id'].cat.codes
users = users[['user_id', 'user_id_num', 'review_count']]
# Lookup table: original user_id -> integer code.
user_id_to_num = dict(zip(users['user_id'], users['user_id_num']))
print(users)

# Process restaurants---------------------------------------------------------------
print("Processing restaurants")
# business_id is the only column used below, so it is the only one parsed;
# the remaining columns of the file are never read into memory.
restaurants = pd.read_csv(RESTAURANTS_PATH, usecols=['business_id'])
# Replace each string business_id by an integer category code (business_id_num).
restaurants['business_id'] = restaurants['business_id'].astype('category')
restaurants['business_id_num'] = restaurants['business_id'].cat.codes
restaurants = restaurants[['business_id', 'business_id_num']]
# Lookup table: original business_id -> integer code.
rest_id_to_num = dict(zip(restaurants['business_id'], restaurants['business_id_num']))
print(restaurants)

# Process reviews--------------------------------------------------------------------
print("Processing reviews")
reviews = pd.read_csv(REVIEWS_PATH)
# Attach the numeric user / restaurant ids. Inner joins mean a review is kept
# only if its user_id is present in `users` and its business_id is present in
# `restaurants`.
reviews = pd.merge(reviews, users, how='inner', on='user_id')
reviews = pd.merge(reviews, restaurants, how='inner', on='business_id')
# Drop the string ids (the numeric codes stay) and the review id.
reviews = reviews.drop(columns=['user_id', 'business_id', 'review_id'])
# Convert the date column to whole seconds since the Unix epoch.
# Dividing the offset from the epoch by a one-second Timedelta is independent
# of the datetime resolution pandas picked when parsing; the previous
# `astype('int64') // 10**9` was only correct for nanosecond resolution.
# NOTE(review): assumes the parsed dates are timezone-naive and contain no
# missing values — confirm against the dataset.
review_dates = pd.to_datetime(reviews['date'])
reviews['date'] = (review_dates - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
# Keep only the numeric data columns
reviews = reviews.select_dtypes(include=[np.number])
print(reviews)

Processing users


  users = pd.read_csv(USERS_PATH)


                        user_id  user_id_num  review_count
0        qVc8ODYU5SZjKXVBgXdI7w      1693050           585
1        j14WgRoU_-2ZE1aw1dXrJg      1461848          4333
2        2WnXYQFK0hXEoTxPtV2zvg       109828           665
3        SZDeASXq7o05mMNLshsdIA       919866           224
4        hA5lMy-EnncsH4JoR-hFGQ      1404315            79
...                         ...          ...           ...
1987892  fB3jbHi3m0L2KgGOxBv6uw      1342378            23
1987893  68czcr4BxJyMQ9cJBm6C7Q       222347             1
1987894  1x3KMskYxOuJCjRz70xOqQ        92112             4
1987895  ulfGl4tdbrH05xKzh5lnog      1825896             2
1987896  wL5jPrLRVCK_Pmo4lM1zpA      1874251             2

[1987843 rows x 3 columns]
Processing restaurants


  restaurants = pd.read_csv(RESTAURANTS_PATH)


                   business_id  business_id_num
0       Pns2l4eNsfO8kk83dixA6A            62823
1       mpf3x-BjTdTEA3yCZrAYPw           119206
2       tUFrWirKiKi_TAnsVWINQQ           134797
3       MTSW4McQd7CbVtyjqoe9mw            55038
4       mWMc6_wTdE0EUBKIGXDVfA           118460
...                        ...              ...
150341  IUQopTMmYQG-qRtBk-8QnA            45798
150342  c8GjPIOTGVmIemT7j5_SyQ            94023
150343  _QAMST-NrQobXduilWEqSw            87809
150344  mtGm22y5c2UHNXDFAjaPNw           119332
150345  jV_XOycEzSlTx-65W906pg           111381

[150346 rows x 2 columns]
Processing reviews
         stars        date  funny  cool  useful  user_id_num  review_count  \
0          3.0  1531001351      0     0       0      1575913            33   
1          5.0  1325604498      0     1       1       808602            10   
2          3.0  1391632230      0     0       0       301895          1332   
3          5.0  1420329663      0     1       1      1154269      

In [3]:
# Save data----------------------------------------------------------------------------
# Context managers guarantee each pickle file is flushed and closed, instead of
# leaving the handle open until the kernel happens to garbage-collect it.
with open('./dataset/user_id_to_num.pkl', 'wb') as user_map_file:
    pickle.dump(user_id_to_num, user_map_file)
with open('./dataset/rest_id_to_num.pkl', 'wb') as rest_map_file:
    pickle.dump(rest_id_to_num, rest_map_file)
# Cast the whole review matrix to integers before saving. This applies to every
# column, so float values such as `stars` lose any fractional part.
# To keep the original float values instead, save `reviews.values` without the cast.
np.save('./dataset/data.npy', reviews.values.astype(int))
