In [141]:
import pandas as pd
import numpy as np
import time
from cuisines import cuisines
from surprise import AlgoBase, KNNWithMeans
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import train_test_split, cross_validate
from sklearn.metrics.pairwise import cosine_similarity

In [86]:
businesses = pd.read_csv('businesses_features.csv')
businesses.head()

Unnamed: 0,business_id,name,stars,latitude,longitude,attributes,categories,cuisines,price_range,noise_level,good_for_kids,good_for_groups,ambience
0,vJIuDBdu01vCA8y1fwR1OQ,CakesbyToi,1.5,36.1922841,-115.159272,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","American (Traditional), Food, Bakeries, Restau...","American (Traditional),Bakeries",0,0,0,0,0
1,kgffcoxT6BQp-gJ-UQ7Czw,Subway,2.5,36.2017936,-115.281981,"{'Alcohol': 'none', 'Ambience': ""{'romantic': ...","Fast Food, Restaurants, Sandwiches","Fast Food,Sandwiches",1,2,1,1,100000000
2,0jtRI7hVMpQHpUVtUy4ITw,Omelet House Summerlin,4.0,36.2019904392,-115.283122,"{'Alcohol': 'beer_and_wine', 'Ambience': ""{'ro...","Beer, Wine & Spirits, Italian, Food, American ...","Beer,Wine & Spirits,Italian,American (Traditio...",2,2,1,1,100000000
3,JJEx5wIqs9iGGATOagE8Sg,Baja Fresh Mexican Grill,2.0,36.271169,-115.267759,"{'HasTV': 'False', 'RestaurantsReservations': ...","Mexican, Restaurants",Mexican,0,0,0,0,0
4,coVhQD_EAnCof_a8sGM03g,Provence Bakery,2.0,36.125587,-115.211199,"{'BusinessAcceptsCreditCards': 'True', 'Busine...","Bakeries, Food",Bakeries,1,0,0,0,0


In [4]:
reviews = pd.read_csv('reviews_lv.csv')
reviews = reviews.groupby('user_id').filter(lambda x: len(x) >= 20)
reviews.head()

Unnamed: 0,user_id,business_id,stars
5,AuIK5tF2GjO7SftHawTLKw,Zrx25j1M794Nh8fUGB8E9A,4
6,AuIK5tF2GjO7SftHawTLKw,4tWF2w6jP2LNK2xTVNEMFw,4
7,AuIK5tF2GjO7SftHawTLKw,1Vn_lex3LGGwuTo-xeJnww,5
8,AuIK5tF2GjO7SftHawTLKw,Zmfq3DVhHdtVChhSxzoUHw,4
9,AuIK5tF2GjO7SftHawTLKw,IcOvxejpd5FLFIkj20Opqg,4


In [6]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(reviews, reader)
trainset, testset = train_test_split(data, test_size=.2)

In [139]:
# Hybrid algorithm using content-based item profiles to calculate similarity
class Hybrid(KNNWithMeans):

    def fit(self, trainset):
        KNNWithMeans.fit(self, trainset)

        item_features = []
        for item in trainset.all_items():
            id = trainset.to_raw_iid(item)
            stars = businesses[businesses['business_id'] == id]['stars'].item()
            item_cuisines = businesses[businesses['business_id'] == id]['cuisines'].item()
            features = []
            if not pd.isnull(item_cuisines):
                cuisine_types = item_cuisines.split(',')
                features += list(map(lambda x: stars if x in item_cuisines else 0, cuisines))
            else:
                features += [0 for _ in range(len(cuisines))]
            features.append(businesses[businesses['business_id'] == id]['price_range'].item())
            features.append(businesses[businesses['business_id'] == id]['noise_level'].item())
            features.append(businesses[businesses['business_id'] == id]['good_for_kids'].item())
            features.append(businesses[businesses['business_id'] == id]['good_for_groups'].item())
            # features += map(lambda x: int(x), list(str(businesses[businesses['business_id'] == id]['ambience'].item())))
            
            item_features.append(features)
        
        self.sim = cosine_similarity(item_features, item_features)
        return self

In [148]:
# User-based collab filtering
time1 = time.time()
algo_hybrid = Hybrid(sim_options={'name': 'msd', 'user_based': False})
algo_hybrid.fit(trainset)

# how long did this take?
time2 = time.time()
print (f"Took {time2 - time1} seconds")  # took roughly ~20 seconds on reduced training set

test_pred_hybrid = algo_hybrid.test(testset)
print("Hybrid Model : Test Set")
accuracy.rmse(test_pred_user, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Took 121.83186388015747 seconds
Hybrid Model : Test Set
RMSE: 1.0643


1.064334570897796

In [147]:
# Item-based collab filtering
time1 = time.time()
algo_item = KNNWithMeans(sim_options={'name': 'msd', 'user_based': False})
algo_item.fit(trainset)

# how long did this take?
time2 = time.time()
print (f"Took {time2 - time1} seconds")  # took roughly 3 minutes on reduced training set

test_pred_item = algo_item.test(testset)
print("Item-based Model : Test Set")
accuracy.rmse(test_pred_item, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Took 5.6237499713897705 seconds
Item-based Model : Test Set
RMSE: 1.0745


1.0744833749544198

In [149]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

# Let's try an ensemble approach of averaging the results from user and item-based filtering
predictions = []
targets = []
for i in range(len(test_pred_item)):
    assert(test_pred_item[i].uid == test_pred_hybrid[i].uid and test_pred_item[i].iid == test_pred_hybrid[i].iid)
    new_pred = (test_pred_item[i].est + test_pred_hybrid[i].est) / 2
    predictions.append(new_pred)
    targets.append(test_pred_item[i].r_ui)

print (rmse(np.array(predictions), np.array(targets)))

1.0613805774232785
