In [1]:
import numpy as np
import json
import random
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
from typing import Tuple, List, Any
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix

%cd /content/drive/MyDrive/Neural-CF

/content/drive/MyDrive/Neural-CF


# Load Data Test

In [None]:
def load_data(
    data_folder: str,
    filenames=("yelp_academic_dataset_user.json", "yelp_academic_dataset_business.json", "yelp_academic_dataset_review.json")
    ) -> Tuple[List[Any], ...]:
    """Load one or more JSON-lines files from ``data_folder``.

    Each file is expected to hold one JSON document per line.

    Args:
        data_folder: Directory that contains the files.
        filenames: File names (relative to ``data_folder``) to read, in order.

    Returns:
        A tuple with one list per entry of ``filenames`` (same order); each
        list holds the decoded JSON documents of that file, in file order.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    def _read_json_lines(path: str) -> List[Any]:
        # The context manager closes the handle even when decoding fails;
        # iterating the handle avoids materialising every raw line first.
        with open(path, "r", encoding="utf-8") as handle:
            # Blank lines (e.g. trailing newlines) carry no record; skip them.
            return [json.loads(line) for line in handle if line.strip()]

    return tuple(
        _read_json_lines(os.path.join(data_folder, name)) for name in filenames
    )

# Read the three subset files; each name is bound to the list of decoded
# records for the corresponding file, in the order the file names are given.
subset_user_data, subset_business_data, subset_review_data = load_data(
    "./Yelp-Dataset/subset",
    ("subset_user.json", "subset_business.json", "subset_review.json"),
)

In [None]:
# Show the field names of the first record of each dataset
# (users, then businesses, then reviews).
for records in (subset_user_data, subset_business_data, subset_review_data):
    print(records[0].keys())

dict_keys(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny', 'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot', 'compliment_more', 'compliment_profile', 'compliment_cute', 'compliment_list', 'compliment_note', 'compliment_plain', 'compliment_cool', 'compliment_funny', 'compliment_writer', 'compliment_photos'])
dict_keys(['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours'])
dict_keys(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date'])


In [None]:
# Display City names in the subset data
# dict.fromkeys de-duplicates in a single pass while keeping first-seen order,
# replacing the quadratic "not in list" scan. Names are compared verbatim, so
# spelling variants of the same city remain separate entries.
city_list = list(dict.fromkeys(business['city'] for business in subset_business_data))
print(city_list)

['Philadelphia', 'Tampa', 'St Louis', 'St. Louis', 'New Orleans', 'Tucson', 'Saint Louis', 'Indianapolis', 'Ardmore', 'Collingswood', 'Dunedin', 'Tarpon Springs', 'Saint Petersburg', 'Cherry Hill', 'Nashville', 'Reno', 'St. Petersburg', 'Clayton', 'Brandon', 'East Norriton', 'Pottstown', 'Tierra Verde', 'Lutz', 'Brentwood', 'Oldsmar', 'Kennett Square', 'Media', 'King Of Prussia', 'Florissant', 'King of Prussia', 'Carmel', 'ST LOUIS', 'Conshohocken', 'Clearwater Beach', 'Wynnewood', 'Sparks', 'Cinnaminson', 'Bensalem', 'North Redington Bch', 'Clearwater', 'Alton', 'Camden', 'Plymouth Meeting', 'Westmont', 'Creve Coeur', 'Horsham', 'Palm Harbor', 'Harahan', 'Manayunk', 'Maplewood', 'Kenner', 'South Tampa', 'St Petersburg', 'Maple Shade', 'Webster Groves', 'Upper Darby', 'Fishers', 'Collegeville', 'Haddon Township', 'Downingtown', 'Chesterfield', 'Wayne', 'Plant City', 'Richmond Heights']


# Data Preprocess Test (For Debugging)
This section is for debugging only; it does not need to be run.

Ref: https://github.com/zhrlove/NCF/tree/master \
Ref2: https://github.com/hexiangnan/sigir16-eals

NCF: https://github.com/hexiangnan/neural_collaborative_filtering \
NCF (torch): https://github.com/yihong-chen/neural-collaborative-filtering/tree/master

Training input format: `userID::itemID::rating::timestamp` (the timestamp is optional)

In [None]:
# Spot check: print the name stored for one specific business id and for one
# specific user id (every matching record is printed).
for business in subset_business_data:
    if business['business_id'] != 'gGyqnAlpFrka_qzpO7j4lQ':
        continue
    print(business['name'])
for user in subset_user_data:
    if user['user_id'] != 'GcdYgbaF75vj7RO6EZhPOQ':
        continue
    print(user['name'])

Citizens Bank Park
Kathleen


In [None]:
# Reindex: map the string user/business ids onto contiguous integer ids.
user_item_interactions = subset_review_data
df = pd.DataFrame(user_item_interactions)
print("Number of Unique Users:", df['user_id'].nunique())
print("Number of Unique Businesses:", df['business_id'].nunique())

# Collapse multiple reviews of the same business by the same user into a
# single row holding the mean star rating.
df = df.groupby(['user_id', 'business_id']).agg({'stars': 'mean'}).reset_index()
print(df.head(10))

# One row per distinct user, numbered 0..n-1 in order of first appearance.
# .assign builds the id column on a fresh frame instead of writing into the
# filtered result (the former argument-less .reindex() did no re-indexing).
user_id = (
    df[['user_id']]
    .drop_duplicates()
    .assign(userId=lambda frame: np.arange(len(frame)))
)
# NOTE(review): the name `ml1m_rating` is presumably carried over from the
# MovieLens-based reference code; it holds the Yelp ratings plus `userId`.
ml1m_rating = pd.merge(df, user_id, on=['user_id'], how='left')

# Same construction for businesses, so both id tables are built identically.
item_id = (
    df[['business_id']]
    .drop_duplicates()
    .assign(itemId=lambda frame: np.arange(len(frame)))
)
yelp_rating = pd.merge(ml1m_rating, item_id, on=['business_id'], how='left')
yelp_rating = yelp_rating[['userId', 'itemId', 'stars']]
# Left merges against tables with unique keys must neither add nor drop rows.
assert len(yelp_rating) == len(df)
print(yelp_rating.head(10))
print('Range of userId is [{}, {}]'.format(yelp_rating.userId.min(), yelp_rating.userId.max()))
print('Range of itemId is [{}, {}]'.format(yelp_rating.itemId.min(), yelp_rating.itemId.max()))



Number of Unique Users: 961
Number of Unique Businesses: 1000
                  user_id             business_id  stars
0  -3s52C4zL_DHRK0ULG6qtg  -kqjc8DxxRac4cz2qTKCLw    4.0
1  -3s52C4zL_DHRK0ULG6qtg  0QYWhij_YZ7Lyk9F6213Sg    5.0
2  -3s52C4zL_DHRK0ULG6qtg  1YflE3DkiCZGgLnf3paLnA    5.0
3  -3s52C4zL_DHRK0ULG6qtg  2BMk_drsikKWslJCXmQtjQ    2.0
4  -3s52C4zL_DHRK0ULG6qtg  2IahpaBR4U2Kdy9HF28EQA    2.5
5  -3s52C4zL_DHRK0ULG6qtg  33JlrWf0kmHd2VzW58Wp0g    3.0
6  -3s52C4zL_DHRK0ULG6qtg  6t0sNev22mcbvOB4gYVVOw    3.0
7  -3s52C4zL_DHRK0ULG6qtg  89SD5fNDDnJj-ITB40hLsQ    1.0
8  -3s52C4zL_DHRK0ULG6qtg  8O35ji_yOMVJmZ6bl96yhQ    3.0
9  -3s52C4zL_DHRK0ULG6qtg  8QZJvkx29OQNZgrM53aVbw    4.0
   userId  itemId  stars
0       0       0    4.0
1       0       1    5.0
2       0       2    5.0
3       0       3    2.0
4       0       4    2.5
5       0       5    3.0
6       0       6    3.0
7       0       7    1.0
8       0       8    3.0
9       0       9    4.0
Range of userId is [0, 960]
Range of

# Training Setup

We can train NeuralMF without first training GMF and MLP. However, the authors suggest that training GMF and MLP first can lead to better performance on large-scale data.

Edit the config parameter in `train.py` to adjust any hyperparameters

In [2]:
# Change into the Torch-NCF sub-directory. The path is relative, so this
# resolves against whatever the current working directory is when the cell runs.
%cd ./Torch-NCF

/content/drive/MyDrive/Neural-CF/Torch-NCF


In [3]:
# Make directory to save the models
import os
if not os.path.exists('checkpoints'):
    os.makedirs('checkpoints')

# For running in Google Colab
!pip install tensorboardX==1.8.0

Collecting tensorboardX==1.8.0
  Downloading tensorboardX-1.8-py2.py3-none-any.whl (216 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/216.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m122.9/216.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.3/216.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-1.8


# Training GMF (Optional)
Before training, set `num_users` and `num_items` in the config so that they match the dataset.

In [None]:
# Shell out to train.py with the 'gmf' model on the subset review file.
!python train.py --data_dir '../Yelp-Dataset/subset/subset_review.json' --model 'gmf'

# Training MLP (Optional)
Set the pretrained MF path in the mlp_config in `train.py` if you set pretrain = true.


In [None]:
# Shell out to train.py with the 'mlp' model on the subset review file.
!python train.py --data_dir '../Yelp-Dataset/subset/subset_review.json' --model 'mlp'

# Train NeuralMF
Edit the pretrain setting of neumf_config in `train.py` to determine whether you want to use the pretrained MLP and GMF to train the NeuralMF or not.

For small predictive factors, running NeuMF without pre-training can achieve better performance than GMF and MLP. For large predictive factors, pre-training NeuMF can yield better performance.

In [6]:
# Shell out to train.py with the 'neumf' model.
# NOTE(review): this reads from 'subset_5k-user_cnt' whereas the GMF/MLP cells
# read from 'subset' — confirm the datasets match if pretrained GMF/MLP
# weights are used.
!python train.py --data_dir '../Yelp-Dataset/subset_5k-user_cnt/subset_review.json' --model 'neumf'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[Training Epoch 96] Batch 849, Loss 0.17721793055534363
[Training Epoch 96] Batch 850, Loss 0.1520378291606903
[Training Epoch 96] Batch 851, Loss 0.1492203027009964
[Training Epoch 96] Batch 852, Loss 0.17099374532699585
[Training Epoch 96] Batch 853, Loss 0.16935113072395325
[Training Epoch 96] Batch 854, Loss 0.14862091839313507
[Training Epoch 96] Batch 855, Loss 0.17448438704013824
[Training Epoch 96] Batch 856, Loss 0.159054696559906
[Training Epoch 96] Batch 857, Loss 0.17863057553768158
[Training Epoch 96] Batch 858, Loss 0.1656428575515747
[Training Epoch 96] Batch 859, Loss 0.18480516970157623
[Training Epoch 96] Batch 860, Loss 0.17102020978927612
[Training Epoch 96] Batch 861, Loss 0.17859633266925812
[Training Epoch 96] Batch 862, Loss 0.16704785823822021
[Training Epoch 96] Batch 863, Loss 0.16532832384109497
[Training Epoch 96] Batch 864, Loss 0.1440219283103943
[Training Epoch 96] Batch 865, Loss 0.1711380

# Evaluation with Hit Ratio (HR) and Normalized Discounted Cumulative Gain (NDCG)

In [9]:
# List processes that have port 6006 open (the port passed to %tensorboard).
!lsof -i :6006

COMMAND     PID USER   FD   TYPE  DEVICE SIZE/OFF NODE NAME
tensorboa 59547 root   14u  IPv4 1477669      0t0  TCP localhost:6006 (LISTEN)


In [10]:
!kill -9 59547

In [None]:
# Load Tensorboard
# (Re)load the tensorboard notebook extension so the %tensorboard magic is available.
%reload_ext tensorboard
# Start TensorBoard on port 6006 against the given run's log directory.
%tensorboard --logdir './runs/pretrain_neumf_epoch100_l2-0.0000001' --port 6006