In [3]:
import pandas as pd
import numpy as np
import time
from surprise import KNNWithMeans
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

In [4]:
# Business metadata, read from a CSV located next to the notebook.
businesses = pd.read_csv(filepath_or_buffer="businesses.csv")

In [5]:
# Review records, read from a CSV located next to the notebook.
reviews = pd.read_csv(filepath_or_buffer="reviews.csv")

In [6]:
# User records, read from a CSV located next to the notebook.
users = pd.read_csv(filepath_or_buffer="users_over_20.csv")

In [7]:
# Peek at the first five business rows.
businesses.head(n=5)

Unnamed: 0,business_id,name,latitude,longitude,attributes,categories
0,Apn5Q_b6Nz61Tq4XzPdf9A,Minhas Micro Brewery,51.0918130155,-114.031674872,"{u'BusinessParking': u""{'garage': False, 'stre...","Tours, Breweries, Pizza, Restaurants, Food, Ho..."
1,AjEbIBw6ZFfln7ePHha9PA,CK'S BBQ & Catering,35.9607337,-114.939821,"{u'RestaurantsTableService': u'False', u'GoodF...","Chicken Wings, Burgers, Caterers, Street Vendo..."
2,O8S5hYJ1SMc8fA4QBtVujA,La Bastringue,45.5405031,-73.5993003,"{u'RestaurantsTableService': u'True', u'GoodFo...","Breakfast & Brunch, Restaurants, French, Sandw..."
3,bFzdJJ3wp3PZssNEsyU23g,Geico Insurance,33.4499993,-112.0769793,,"Insurance, Financial Services"
4,8USyCYqpScwiNEb58Bt6CA,Action Engine,51.0355914,-114.0273656,{u'BusinessAcceptsCreditCards': u'True'},"Home & Garden, Nurseries & Gardening, Shopping..."


In [15]:
# For now, work with a pruned dataset: keep only the first 100,000 review rows.
reviews = reviews.iloc[:100_000]
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
user_id        100000 non-null object
business_id    100000 non-null object
stars          100000 non-null int64
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


In [24]:
# Peek at the first five review rows.
reviews.head(n=5)

Unnamed: 0,user_id,business_id,stars
0,0pf5VuzE4_1pwj5NJHG5TQ,vsFFbN71ehRCp46KeR5RdQ,5
1,0pf5VuzE4_1pwj5NJHG5TQ,yGZtG3KpoXyvF6gUC0FExQ,2
2,0pf5VuzE4_1pwj5NJHG5TQ,Jj8ubiwwuCR-rrhrrjcryw,2
3,0pf5VuzE4_1pwj5NJHG5TQ,ERCZtj8qxNxfXJrdXPdEsw,1
4,0pf5VuzE4_1pwj5NJHG5TQ,YTbKmjGTdn4YzoJXTC1u7g,3


In [9]:
# Peek at the first five user rows.
users.head(n=5)

Unnamed: 0,user_id,name,review_count
0,rMkz1mjevjl8xqrypZwHzw,Daniel,84
1,mN1eee0_j-dglmm57NAf3w,j,163
2,68qmfoSUXpPUaXz_VEyqzA,Mary,32
3,ZQ8ZXepUHhT-RB-THcCzEw,M,85
4,8C5396Ory3qaO-5Lsix_CQ,jason,25


In [17]:
# https://surprise.readthedocs.io/en/stable/getting_started.html#load-from-df-example
# Per the Surprise documentation, the frame is read positionally as
# (user id, item id, rating). Select those three columns explicitly so the load
# does not depend on the column order (or extra columns) of `reviews`.
reader = Reader(rating_scale=(1, 5))  # star ratings run from 1 to 5

data = Dataset.load_from_df(reviews[["user_id", "business_id", "stars"]], reader)

In [18]:
# Hold out 20% of the ratings for evaluation. The seed is fixed so the split,
# and therefore every metric computed from it, is reproducible on a fresh run.
trainset, testset = train_test_split(data, test_size=.2, random_state=42)

In [21]:
# User-based collaborative filtering (KNN with means, cosine similarity).
fit_started = time.time()
user_sim_options = {'name': 'cosine', 'user_based': True}
algo_user = KNNWithMeans(sim_options=user_sim_options)
algo_user.fit(trainset)
fit_finished = time.time()

# Report how long construction + fitting took.
print(f"Took {fit_finished - fit_started} seconds")

Computing the cosine similarity matrix...
Done computing similarity matrix.
Took 18.491780757904053 seconds


In [28]:
uid = "0pf5VuzE4_1pwj5NJHG5TQ"   # raw user id
iid = "vsFFbN71ehRCp46KeR5RdQ"   # raw item id
iid2 = "yGZtG3KpoXyvF6gUC0FExQ"  # second raw item id

# Spot-check a couple of predictions from the user-based model; each pair is
# (raw item id, rating passed as r_ui). `pred` ends up holding the last prediction.
for item_id, true_rating in ((iid, 5), (iid2, 2)):
    pred = algo_user.predict(uid, item_id, r_ui=true_rating, verbose=True)

user: 0pf5VuzE4_1pwj5NJHG5TQ item: vsFFbN71ehRCp46KeR5RdQ r_ui = 5.00   est = 4.13   {'actual_k': 12, 'was_impossible': False}
user: 0pf5VuzE4_1pwj5NJHG5TQ item: yGZtG3KpoXyvF6gUC0FExQ r_ui = 2.00   est = 2.00   {'actual_k': 1, 'was_impossible': False}


In [30]:
# Run the user-based model over the held-out test set (no per-prediction printing).
test_pred = algo_user.test(testset, verbose=False)

In [31]:
# Label the output, then print and return the RMSE of the user-based predictions.
print("User-based Model : Test Set")
accuracy.rmse(predictions=test_pred, verbose=True)

User-based Model : Test Set
RMSE: 1.3601


1.3600525227132993

In [None]:
# Item-based collaborative filtering (KNN with means, cosine similarity).
fit_started = time.time()
item_sim_options = {'name': 'cosine', 'user_based': False}
algo_item = KNNWithMeans(sim_options=item_sim_options)
algo_item.fit(trainset)
fit_finished = time.time()

# Report how long construction + fitting took.
print(f"Took {fit_finished - fit_started} seconds")

In [None]:
uid = "0pf5VuzE4_1pwj5NJHG5TQ"   # raw user id
iid = "vsFFbN71ehRCp46KeR5RdQ"   # raw item id
iid2 = "yGZtG3KpoXyvF6gUC0FExQ"  # second raw item id

# Spot-check a couple of predictions from the item-based model; each pair is
# (raw item id, rating passed as r_ui). `pred` ends up holding the last prediction.
for item_id, true_rating in ((iid, 5), (iid2, 2)):
    pred = algo_item.predict(uid, item_id, r_ui=true_rating, verbose=True)

In [None]:
# Evaluate the item-based model on the held-out test set. This cell previously
# re-tested `algo_user` under a "User-based" label, so the item-based model
# fitted above was never actually scored.
test_pred = algo_item.test(testset)
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)