In [1]:
import pandas as pd
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from datetime import datetime, timedelta
import ast

# unix datetime
base = pd.Timestamp("1970-01-01")
CHUNK_SIZE = 1000000
REVIEW_DROP = 0
RESTAURANTS_PATH = 'dataset/processed_rest.csv'
REVIEWS_PATH = 'dataset/reviews.csv'
USERS_PATH = 'dataset/processed_users.csv'

def get_device():
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    return device

def df_to_tensor(df):
    device = get_device()
    return torch.from_numpy(df.values).long().to(device)

# Load data files
# reviews = get_reviews()
def load_data(train_percent, val_percent, test_percent):
    print("Reading users")
    users = pd.read_csv(USERS_PATH)
    users = users[users['review_count'] > REVIEW_DROP]
    users['user_id'] = users['user_id'].astype('category')
    users['user_id_num'] = users['user_id'].cat.codes
    users = users[['user_id', 'user_id_num', 'review_count']]
    user_id_to_num = dict(zip(users['user_id'], users['user_id_num']))
    print(users)

    print("Reading businesses")
    restaurants = pd.read_csv(RESTAURANTS_PATH)
    restaurants['business_id'] = restaurants['business_id'].astype('category')
    restaurants['business_id_num'] = restaurants['business_id'].cat.codes
    restaurants = restaurants[['business_id', 'business_id_num']]
    rest_id_to_num = dict(zip(restaurants['business_id'], restaurants['business_id_num']))
    print(restaurants)

    print("Reading reviews")
    reviews = pd.read_csv(REVIEWS_PATH)

    reviews = pd.merge(reviews, users, how='inner', on='user_id')
    reviews = reviews.drop(columns='user_id')
    reviews = pd.merge(reviews, restaurants, how='inner', on='business_id')
    reviews = reviews.drop(columns='business_id')
    print("REVIEWS.HEAD() -------------------------------------------------------------------")
    print(reviews.head())
    reviews = reviews.drop(columns=reviews.columns[0], axis=1)
    print("REVIEWS.DROP() -------------------------------------------------------------------")
    print(reviews.head())
    reviews = reviews.select_dtypes(include =[np.number])
    print("reviews.select_dtypes() -------------------------------------------------------------------")
    
    print(reviews.head())

    pickle.dump(user_id_to_num, open('./dataset/user_id_to_num.pkl', 'wb'))
    pickle.dump(rest_id_to_num, open('./dataset/rest_id_to_num.pkl', 'wb'))
    np.save('./dataset/data.npy', reviews.values)

    training = reviews.sample(frac=train_percent)

    left = reviews.drop(training.index)
    validation = left.sample(frac=val_percent / (val_percent + test_percent))

    test = left.drop(validation.index)

    print("loaded")

    return df_to_tensor(training), df_to_tensor(validation), df_to_tensor(test), user_id_to_num, rest_id_to_num


if __name__ == "__main__":
    train, val, test, user, rest = load_data(0.6, 0.3, 0.1)
    print("TRAIN ----------------------------------------------")
    print(train.shape)
    print("VAL ----------------------------------------------")
    print(val.shape)
    print("TEST ----------------------------------------------")
    print(test.shape)

Reading users
                           user_id  user_id_num  review_count
0        b'qVc8ODYU5SZjKXVBgXdI7w'      1693050           585
1        b'j14WgRoU_-2ZE1aw1dXrJg'      1461848          4333
2        b'2WnXYQFK0hXEoTxPtV2zvg'       109828           665
3        b'SZDeASXq7o05mMNLshsdIA'       919866           224
4        b'hA5lMy-EnncsH4JoR-hFGQ'      1404315            79
...                            ...          ...           ...
1987892  b'fB3jbHi3m0L2KgGOxBv6uw'      1342378            23
1987893  b'68czcr4BxJyMQ9cJBm6C7Q'       222347             1
1987894  b'1x3KMskYxOuJCjRz70xOqQ'        92112             4
1987895  b'ulfGl4tdbrH05xKzh5lnog'      1825896             2
1987896  b'wL5jPrLRVCK_Pmo4lM1zpA'      1874251             2

[1987843 rows x 3 columns]
Reading businesses


  restaurants = pd.read_csv(RESTAURANTS_PATH)


                      business_id  business_id_num
0       b'Pns2l4eNsfO8kk83dixA6A'            62823
1       b'mpf3x-BjTdTEA3yCZrAYPw'           119206
2       b'tUFrWirKiKi_TAnsVWINQQ'           134797
3       b'MTSW4McQd7CbVtyjqoe9mw'            55038
4       b'mWMc6_wTdE0EUBKIGXDVfA'           118460
...                           ...              ...
150341  b'IUQopTMmYQG-qRtBk-8QnA'            45798
150342  b'c8GjPIOTGVmIemT7j5_SyQ'            94023
150343  b'_QAMST-NrQobXduilWEqSw'            87809
150344  b'mtGm22y5c2UHNXDFAjaPNw'           119332
150345  b'jV_XOycEzSlTx-65W906pg'           111381

[150346 rows x 2 columns]
Reading reviews
REVIEWS.HEAD() -------------------------------------------------------------------
                   review_id  stars  \
0  b'KU_O5udG6zpxOg-VcAEodg'    3.0   
1  b'BiTunyQ73aT9WBnpR9DZGw'    5.0   
2  b'saUsX_uimxRlCVr67Z4Jig'    3.0   
3  b'AqPFMleE6RsU23_auESxiA'    5.0   
4  b'Sx8TMOWLNuJBWer-0pcmoA'    4.0   

                           