In [45]:
import pandas as pd
import numpy as np
import time
from surprise import KNNWithMeans, SVD
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import train_test_split, cross_validate
from surprise.model_selection import GridSearchCV
from collections import defaultdict

In [35]:
# Load the business table from its CSV export.
# NOTE(review): the head() output shows an "Unnamed: 0" column, so the file was
# presumably written with its index included — confirm before relying on it.
businesses = pd.read_csv("businesses.csv")

In [7]:
# Load the review records (one row per user_id / business_id / stars rating).
reviews = pd.read_csv("reviews.csv")

In [8]:
# Load the user table (user_id, name, review_count).
# NOTE(review): the filename suggests it only holds users with more than 20
# reviews — confirm how this file was produced.
users = pd.read_csv("users_over_20.csv")

In [36]:
# Peek at the first rows to check which columns were loaded.
businesses.head()

Unnamed: 0,business_id,name,latitude,longitude,attributes,categories
0,Apn5Q_b6Nz61Tq4XzPdf9A,Minhas Micro Brewery,51.091813,-114.031675,"{'BikeParking': 'False', 'BusinessAcceptsCredi...","Tours, Breweries, Pizza, Restaurants, Food, Ho..."
1,AjEbIBw6ZFfln7ePHha9PA,CK'S BBQ & Catering,35.960734,-114.939821,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...","Chicken Wings, Burgers, Caterers, Street Vendo..."
2,O8S5hYJ1SMc8fA4QBtVujA,La Bastringue,45.540503,-73.5993,"{'Alcohol': 'beer_and_wine', 'Ambience': ""{'ro...","Breakfast & Brunch, Restaurants, French, Sandw..."
3,6OuOZAok8ikONMS_T3EzXg,Thai One On,43.712946,-79.632763,"{'Alcohol': 'none', 'BusinessAcceptsCreditCard...","Restaurants, Thai"
4,8-NRKkPY1UiFXW20WXKiXg,Filiberto's Mexican Food,33.448106,-112.341302,"{'Alcohol': 'none', 'Ambience': ""{'romantic': ...","Mexican, Restaurants"


In [10]:
# For now, work with a pruned dataset: keep only the first 100,000 review rows,
# then print the frame summary to confirm the row count and dtypes.
reviews = reviews.head(100000)
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
user_id        100000 non-null object
business_id    100000 non-null object
stars          100000 non-null int64
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


In [11]:
# How many distinct users, businesses and star values are in the review set?
# Only a cell's last expression is rendered, so a bare `reviews.head()` placed
# before this line is evaluated and thrown away; keep a single expression here.
reviews.nunique()

user_id        13429
business_id    41006
stars              5
dtype: int64

In [12]:
# Peek at the first rows of the user table.
users.head()

Unnamed: 0,user_id,name,review_count
0,rMkz1mjevjl8xqrypZwHzw,Daniel,84
1,mN1eee0_j-dglmm57NAf3w,j,163
2,68qmfoSUXpPUaXz_VEyqzA,Mary,32
3,ZQ8ZXepUHhT-RB-THcCzEw,M,85
4,8C5396Ory3qaO-5Lsix_CQ,jason,25


In [37]:
# Let's try this again, except this time only with reviews from Las Vegas.
# NOTE(review): both files are presumably pre-filtered exports of the full
# business/review tables — confirm how they were generated.
businesses_lv = pd.read_csv("businesses_lv.csv")
reviews_lv = pd.read_csv("reviews_lv.csv")

In [14]:
# A user must have at least this many reviews to be kept.
MIN_REVIEWS_PER_USER = 20

# Rebind `reviews` to the Las Vegas reviews so we don't have to change all the
# code further down.
reviews = reviews_lv

# Keep only the rows written by users with at least MIN_REVIEWS_PER_USER
# reviews. value_counts() gives the review count per user; map() broadcasts that
# count back onto each row, so the comparison is one vectorised mask instead of
# a Python lambda call per user group. Row order and index are preserved.
reviews_per_user = reviews['user_id'].map(reviews['user_id'].value_counts())
reviews = reviews[reviews_per_user >= MIN_REVIEWS_PER_USER]
print (len(reviews))

272159


In [15]:
# https://surprise.readthedocs.io/en/stable/getting_started.html#load-from-df-example
# Per surprise documentation, the data must be in form of user_id, item_id, rating (i.e. we use reviews)
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(reviews, reader)

# Fix the shuffle seed so the 80/20 split — and therefore every RMSE reported
# below — is the same on each Restart & Run All.
RANDOM_SEED = 42
trainset, testset = train_test_split(data, test_size=.2, random_state=RANDOM_SEED)

In [16]:
# User-based collab filtering: KNN with mean-centering, cosine similarity
# computed between users.
# perf_counter() is a monotonic clock meant for measuring intervals; time.time()
# is wall-clock time and can jump if the system clock is adjusted.
time1 = time.perf_counter()
algo_user = KNNWithMeans(sim_options={'name': 'cosine', 'user_based': True})
algo_user.fit(trainset)

# how long did this take?
time2 = time.perf_counter()
print (f"Took {time2 - time1} seconds")  # roughly 16 seconds in the recorded run

Computing the cosine similarity matrix...
Done computing similarity matrix.
Took 15.972065210342407 seconds


In [28]:
uid = "0pf5VuzE4_1pwj5NJHG5TQ"  # raw user id
iid = "vsFFbN71ehRCp46KeR5RdQ"  # raw item id
iid2 = "yGZtG3KpoXyvF6gUC0FExQ"

# Spot-check the user-based model on two hand-picked (item, rating) pairs;
# verbose=True makes predict() print each estimate. `pred` ends up holding the
# prediction for the last pair.
for raw_item_id, given_rating in ((iid, 5), (iid2, 2)):
    pred = algo_user.predict(uid, raw_item_id, r_ui=given_rating, verbose=True)

user: 0pf5VuzE4_1pwj5NJHG5TQ item: vsFFbN71ehRCp46KeR5RdQ r_ui = 5.00   est = 4.13   {'actual_k': 12, 'was_impossible': False}
user: 0pf5VuzE4_1pwj5NJHG5TQ item: yGZtG3KpoXyvF6gUC0FExQ r_ui = 2.00   est = 2.00   {'actual_k': 1, 'was_impossible': False}


In [17]:
# Predict every held-out rating in the test set with the user-based model.
test_pred_user = algo_user.test(testset)
print("User-based Model : Test Set")
# With verbose=True accuracy.rmse prints the RMSE; it also returns the value,
# which the notebook echoes as the cell output.
accuracy.rmse(test_pred_user, verbose=True)

User-based Model : Test Set
RMSE: 1.0850


1.085026131982119

In [18]:
# Item-based collab filtering: same KNN-with-means model, but cosine similarity
# is computed between items instead of users.
# perf_counter() is a monotonic clock meant for measuring intervals; time.time()
# is wall-clock time and can jump if the system clock is adjusted.
time1 = time.perf_counter()
algo_item = KNNWithMeans(sim_options={'name': 'cosine', 'user_based': False})
algo_item.fit(trainset)

# how long did this take?
time2 = time.perf_counter()
print (f"Took {time2 - time1} seconds")  # roughly 10 seconds in the recorded run

Computing the cosine similarity matrix...
Done computing similarity matrix.
Took 10.129841804504395 seconds


In [12]:
uid = "0pf5VuzE4_1pwj5NJHG5TQ"  # raw user id
iid = "vsFFbN71ehRCp46KeR5RdQ"  # raw item id
iid2 = "yGZtG3KpoXyvF6gUC0FExQ"

# Spot-check the item-based model on the same two (item, rating) pairs;
# verbose=True makes predict() print each estimate. `pred` ends up holding the
# prediction for the last pair.
# NOTE(review): the recorded output reports was_impossible=True ("User and/or
# item is unkown"), possibly from an out-of-order run — re-run after fitting
# algo_item to confirm.
for raw_item_id, given_rating in ((iid, 5), (iid2, 2)):
    pred = algo_item.predict(uid, raw_item_id, r_ui=given_rating, verbose=True)

user: 0pf5VuzE4_1pwj5NJHG5TQ item: vsFFbN71ehRCp46KeR5RdQ r_ui = 5.00   est = 3.77   {'was_impossible': True, 'reason': 'User and/or item is unkown.'}
user: 0pf5VuzE4_1pwj5NJHG5TQ item: yGZtG3KpoXyvF6gUC0FExQ r_ui = 2.00   est = 3.77   {'was_impossible': True, 'reason': 'User and/or item is unkown.'}


In [20]:
# Predict every held-out rating in the test set with the item-based model.
test_pred_item = algo_item.test(testset)
print("Item-based Model : Test Set")
# With verbose=True accuracy.rmse prints the RMSE; it also returns the value,
# which the notebook echoes as the cell output.
accuracy.rmse(test_pred_item, verbose=True)

Item-based Model : Test Set
RMSE: 1.0737


1.0736711098664622

In [21]:
# Let's try fitting item-item with a bunch of other types of similarity
# Item-based collab filtering, this time with mean-squared-difference similarity.
# Note: this rebinds `algo_item`, replacing the cosine-based model fitted earlier.
# perf_counter() is a monotonic clock meant for measuring intervals; time.time()
# is wall-clock time and can jump if the system clock is adjusted.
time1 = time.perf_counter()
algo_item = KNNWithMeans(sim_options={'name': 'msd', 'user_based': False})
algo_item.fit(trainset)

# how long did this take?
time2 = time.perf_counter()
print (f"Took {time2 - time1} seconds")  # roughly 3 seconds in the recorded run

# Score the msd model on the held-out test set.
test_pred = algo_item.test(testset)
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Took 3.32419490814209 seconds
Item-based Model : Test Set
RMSE: 1.0791


1.0790652526119944

In [22]:
def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Args:
        predictions: array-like of predicted values (list, tuple or ndarray).
        targets: array-like of true values, broadcast-compatible with
            ``predictions``.

    Returns:
        The square root of the mean squared element-wise difference, as a
        numpy float.
    """
    # Coerce to float arrays so plain Python lists work and unsigned-integer
    # inputs cannot wrap around during the subtraction.
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(((predictions - targets) ** 2).mean())

# Ensemble of the two neighbourhood models: average the user-based and the
# item-based estimate for every test rating, then score the blend with rmse().
predictions = []
targets = []
for idx, item_pred in enumerate(test_pred_item):
    user_pred = test_pred_user[idx]
    # Both lists must describe the same (user, item) pair at each position.
    assert(item_pred.uid == user_pred.uid and item_pred.iid == user_pred.iid)
    predictions.append((item_pred.est + user_pred.est) / 2)
    targets.append(item_pred.r_ui)

print (rmse(np.array(predictions), np.array(targets)))

1.0683663051350372


In [40]:
# Matrix factorization method!
# Search a small SVD hyper-parameter grid with 3-fold cross-validation and keep
# the configuration with the best RMSE.
# NOTE(review): the search is fitted on `data`, which also contains the rows
# held out in `testset`, so the later test RMSE may be slightly optimistic —
# confirm whether that is acceptable.
param_grid = {
    'n_factors': [5, 10],
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Algorithm instance configured with the best-RMSE parameters.
algo_svd = gs.best_estimator['rmse']

# Report the best cross-validated RMSE, then the parameters that produced it.
for search_result in (gs.best_score, gs.best_params):
    print(search_result['rmse'])

1.0656340416262176
{'n_factors': 5, 'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [43]:
# Matrix factorization (SVD): fit the tuned estimator on the training split.
# perf_counter() is a monotonic clock meant for measuring intervals; time.time()
# is wall-clock time and can jump if the system clock is adjusted.
time1 = time.perf_counter()
algo_svd.fit(trainset)

# how long did this take?
time2 = time.perf_counter()
print (f"Took {time2 - time1} seconds")  # roughly 4 seconds in the recorded run

# Test results
test_pred_svd = algo_svd.test(testset)
print("Matrix factorization Model : Test Set")
# Score the SVD predictions themselves; `test_pred` holds the predictions of the
# msd item-based model and would report the wrong model's RMSE here.
accuracy.rmse(test_pred_svd, verbose=True)

Took 3.8218002319335938 seconds
Matrix factorization Model : Test Set
RMSE: 1.0671


1.06714947250516

In [44]:
# Let's try an ensemble approach of averaging the results from all three methods
# (item-based KNN, user-based KNN and SVD).

# zip() stops at the shortest input, so check up front that no model is missing
# predictions.
assert len(test_pred_item) == len(test_pred_user) == len(test_pred_svd), \
    "prediction lists have different lengths"

predictions = []
targets = []
for item_pred, user_pred, svd_pred in zip(test_pred_item, test_pred_user, test_pred_svd):
    # All three models — including SVD — must be scoring the same (user, item)
    # pair at this position, otherwise the average mixes unrelated ratings.
    assert (
        (item_pred.uid, item_pred.iid)
        == (user_pred.uid, user_pred.iid)
        == (svd_pred.uid, svd_pred.iid)
    ), "predictions are not aligned on (uid, iid)"
    predictions.append((item_pred.est + user_pred.est + svd_pred.est) / 3)
    targets.append(item_pred.r_ui)

print (rmse(np.array(predictions), np.array(targets)))

1.0584211860434445
