# Matrix-based approach

## Similarity by category breakdown

In [5]:
#import necessary packages

import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA 

from sklearn import ensemble
from sklearn import metrics
from sklearn.cluster import KMeans

from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
#read in cleaned csv files
businesses = pd.read_csv("businesses.csv")
reviews = pd.read_csv("reviews.csv")
users = pd.read_csv("users.csv")


In [7]:
#too many users, kernel will die, adding additional filter
limit_users = users[users['review_count'] > 150]

In [8]:
#make copies of data
business_df = businesses.copy()
review_df = reviews.copy()
user_df = limit_users.copy()

In [9]:
#only keep useful columns
#select from the unmodified source frames (not the already-trimmed copies) and
#rename through chaining, so this cell can be re-run without a KeyError on
#'name' and without calling an in-place method on a column slice
user_df = (
    limit_users[['friends', 'name', 'review_count', 'user_id']]
    .rename(columns={"name": "user_name"})
)
business_df = (
    businesses[['name', 'business_id', 'categories', 'city', 'state']]
    .rename(columns={"name": "business_name"})
)
review_df = reviews[['review_id', 'business_id', 'user_id', 'stars']].copy()


In [10]:
#join reviews to their business details, then to the reviewing user's details,
#and discard any row with a missing value
info = review_df.merge(business_df, on="business_id")
final = info.merge(user_df, on="user_id").dropna()

In [11]:
#get list of category preferences of each user
final_lst = final.assign(categories = final.categories.str.split(', '))
#explode to one row per (review, category) and collect the values per user;
#this yields the same per-user lists as summing the list cells, but avoids the
#repeated list concatenation that 'sum' performs on list-valued cells
preferences = (
    final_lst[['user_id', 'categories']]
    .explode('categories')
    .groupby('user_id')['categories']
    .agg(list)
    .to_frame()
)

In [12]:
preferences

Unnamed: 0_level_0,categories
user_id,Unnamed: 1_level_1
---1lKK3aKOuomHnwAkAow,"[Doctors, Active Life, Float Spa, Health & Med..."
--17Db1K-KujRuN7hY9Z0Q,"[Cocktail Fish & Chips, Seafood, Bars, Bagel..."
--2vR0DIsmQ6WfcSzKWigw,"[ Men's Clothing, Department Stores, Women's C..."
--3WaS23LcIXtxyFULJHTA,"[American (New), Cajun/Creole, American (New..."
--41c9Tl0C9OGewIR7Qyzg,"[ Coffee & Tea, Coffee Roasteries, Bakeries, D..."
...,...
zzhO1jW4skCDWNrWtSLbTw,"[Men's Clothing, Formal Wear, Plus Size Fashi..."
zziJLt25YU6dp01sewR-IQ,"[Bakeries, Donuts, Delis, Coffee & Tea, Am..."
zziWJMYwDjyVi7kJmgRUvg,"[Coffee & Tea, Bagels, Bakeries, Coffee & Te..."
zzr6MQqGgjkAEu6yTDK_TQ,"[Cafes, Vegan, Breakfast & Brunch]"


In [13]:
#perform some cleaning of category column
def cleaned_column(col):
    """Return a new list holding each item of `col` with surrounding spaces removed.

    Only the space character is stripped (not tabs or newlines); `col` itself
    is left unmodified.
    """
    return [entry.strip(' ') for entry in col]

#strip stray spaces from every category name
preferences['categories'] = preferences['categories'].apply(cleaned_column)
#move user_id from the index into a regular column; guarded so that re-running
#this cell does not push the positional index into a spurious 'index' column
if 'user_id' not in preferences.columns:
    preferences = preferences.reset_index()

In [14]:
#multilabelbinarizer with duplicate values
#https://stackoverflow.com/questions/65788264/multilabelbinarizer-with-duplicated-values

import itertools
import numpy as np

class MultiLabelCounter():
    """Count-based analogue of a multi-label binarizer.

    Each sample (an iterable of labels) is turned into a row of per-class
    occurrence counts, so a label repeated within a sample is counted rather
    than collapsed to a 0/1 indicator.
    """

    def __init__(self, classes=None):
        """Create a counter, optionally pre-seeded with a class list.

        classes: sequence of labels fixing the column order, or None to learn
            the classes in `fit`. When given, `transform` can be used without
            calling `fit` first.
        """
        self.classes_ = classes
        # Build the label -> column lookup up front when classes are supplied;
        # otherwise transform() would fail on a missing `mapping` attribute.
        self.mapping = None
        if classes is not None:
            self.mapping = {label: column for column, label in enumerate(classes)}

    def fit(self, y):
        """Learn the sorted set of distinct labels in `y` and return self.

        Any classes passed to the constructor are replaced by the learned ones.
        """
        self.classes_ = sorted(set(itertools.chain.from_iterable(y)))
        self.mapping = {label: column for column, label in enumerate(self.classes_)}
        return self

    def transform(self, y):
        """Return one count row (a list of ints) per sample in `y`.

        Column order follows `classes_`. A label that is not in the known
        classes raises KeyError.
        """
        width = len(self.classes_)
        rows = []
        for labels in y:
            counts = [0] * width
            for label in labels:
                counts[self.mapping[label]] += 1
            rows.append(counts)
        return rows

    def fit_transform(self, y):
        """Fit on `y`, then transform the same `y`."""
        return self.fit(y).transform(y)

In [15]:
#count how often each category occurs in every user's review history
mlc = MultiLabelCounter().fit(preferences['categories'])
mlc_output = mlc.transform(preferences['categories'])

#one row per user, one column per category
counted_preferences = pd.DataFrame(mlc_output, columns=mlc.classes_)

In [16]:
#keep only categories whose total count across all users is greater than 10,
#to narrow in on the most popular categories
is_popular = counted_preferences.sum() > 10
pop_cat = counted_preferences[counted_preferences.columns[is_popular]]

In [17]:
#perform pca
#fix the seed: depending on data size and scikit-learn version PCA may choose a
#randomized SVD solver, which would otherwise give slightly different
#components (and therefore different clusters downstream) on every run
pca = PCA(n_components=50, random_state=0)
components = pca.fit_transform(pop_cat)
print(pca.explained_variance_ratio_)

[0.60201887 0.04161291 0.02752649 0.02500066 0.02275489 0.01799728
 0.01409961 0.01208709 0.01013606 0.00871944 0.00861447 0.00721725
 0.00626163 0.00600404 0.00534891 0.00522526 0.00498598 0.00459499
 0.00443944 0.00423472 0.00373384 0.00339545 0.00334502 0.00316541
 0.00310955 0.00293424 0.00289944 0.00281431 0.00266897 0.00260592
 0.00240993 0.00236513 0.0023427  0.00229376 0.00227015 0.00217003
 0.00211416 0.00204189 0.00201808 0.00194944 0.00189616 0.001809
 0.00175862 0.00172825 0.00167932 0.00163404 0.00160173 0.00156847
 0.00149554 0.00147354]


In [18]:
sum(pca.explained_variance_ratio_)

0.9081720561476512

In [19]:
#name the columns pc1..pcN from the actual number of components, so the frame
#stays valid if n_components is changed in the PCA cell
components_df = pd.DataFrame(
    data=components,
    columns=[f'pc{i}' for i in range(1, components.shape[1] + 1)],
)

In [20]:
pca_df = components_df.copy()

In [21]:
pca_df

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,...,pc41,pc42,pc43,pc44,pc45,pc46,pc47,pc48,pc49,pc50
0,-4.777337,0.413104,0.433513,-0.218103,0.271227,0.213336,-0.304754,-0.239145,0.099209,-0.116999,...,0.360508,-0.068288,0.166718,-0.252151,0.084647,0.275587,0.494130,-0.346467,-0.067362,-0.085571
1,-2.941033,-0.050189,0.712286,0.547965,0.486559,-0.422789,0.021322,1.404651,0.019799,-0.615739,...,-0.187395,-0.197238,-0.020703,-0.389856,0.659809,-0.117468,-0.476919,-0.643666,0.135450,0.207004
2,-3.882321,0.461509,0.040408,1.679158,0.927852,0.970277,-1.324932,0.144781,0.845791,-0.275232,...,0.867832,-0.782599,-0.118694,-0.158296,0.125285,-0.755768,-0.131263,1.041142,-0.501052,0.620691
3,-0.859514,-0.296950,0.955067,-0.806596,-1.598475,1.140489,1.430270,-0.941965,-0.070862,-0.689887,...,-0.000950,0.284746,-0.022294,-0.618873,-0.046456,-0.187240,0.048921,0.001592,-0.530884,-0.226385
4,-3.615115,2.105222,0.592363,0.000930,-0.631703,-0.237371,1.003583,-0.070577,0.104154,-0.291278,...,0.103762,0.084812,-0.042970,-0.046761,-0.046768,0.003751,0.162043,0.007749,0.063658,-0.090888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49599,-4.320574,1.120762,0.251233,-0.746671,-0.119721,0.008887,-0.712636,0.565584,-0.090132,-0.763444,...,-0.395176,0.584851,0.292619,-0.278721,-0.100268,0.210170,0.170945,0.190613,-0.058126,-0.053230
49600,5.465425,-0.622428,0.242368,-2.217896,0.648517,4.089584,2.325991,-1.344567,2.390704,1.181374,...,0.167795,-1.506930,1.132210,-0.979523,0.191035,0.922475,-0.121721,0.882841,0.315231,-0.226414
49601,-0.918654,-0.718605,-3.776629,1.573610,-2.027102,0.006213,0.409672,0.548115,-0.042799,-0.947783,...,0.212466,0.082954,0.236714,-0.009898,-0.264575,0.300401,0.047904,0.393111,-0.399701,0.346351
49602,-4.559487,0.559302,0.369101,-0.283846,-0.098300,-0.108874,0.070805,-0.256962,-0.252897,-0.151130,...,0.068431,0.168291,-0.069606,-0.063228,-0.116842,-0.145480,-0.047693,0.054098,0.189275,-0.060773


In [22]:
#attach user ids to the principal components. Start again from components_df so
#that re-running this cell after user_id has become the index cannot insert a
#misaligned (all-NaN) user_id column or fail on a duplicate column
pca_df = components_df.copy()
pca_df.insert(0, 'user_id', preferences['user_id'])

In [23]:
#index the principal components by user_id for label-based lookups
pca_df = pca_df.set_index('user_id')


In [33]:
pca_df

Unnamed: 0_level_0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,...,pc41,pc42,pc43,pc44,pc45,pc46,pc47,pc48,pc49,pc50
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---1lKK3aKOuomHnwAkAow,-4.777337,0.413104,0.433513,-0.218103,0.271227,0.213336,-0.304754,-0.239145,0.099209,-0.116999,...,0.360508,-0.068288,0.166718,-0.252151,0.084647,0.275587,0.494130,-0.346467,-0.067362,-0.085571
--17Db1K-KujRuN7hY9Z0Q,-2.941033,-0.050189,0.712286,0.547965,0.486559,-0.422789,0.021322,1.404651,0.019799,-0.615739,...,-0.187395,-0.197238,-0.020703,-0.389856,0.659809,-0.117468,-0.476919,-0.643666,0.135450,0.207004
--2vR0DIsmQ6WfcSzKWigw,-3.882321,0.461509,0.040408,1.679158,0.927852,0.970277,-1.324932,0.144781,0.845791,-0.275232,...,0.867832,-0.782599,-0.118694,-0.158296,0.125285,-0.755768,-0.131263,1.041142,-0.501052,0.620691
--3WaS23LcIXtxyFULJHTA,-0.859514,-0.296950,0.955067,-0.806596,-1.598475,1.140489,1.430270,-0.941965,-0.070862,-0.689887,...,-0.000950,0.284746,-0.022294,-0.618873,-0.046456,-0.187240,0.048921,0.001592,-0.530884,-0.226385
--41c9Tl0C9OGewIR7Qyzg,-3.615115,2.105222,0.592363,0.000930,-0.631703,-0.237371,1.003583,-0.070577,0.104154,-0.291278,...,0.103762,0.084812,-0.042970,-0.046761,-0.046768,0.003751,0.162043,0.007749,0.063658,-0.090888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzhO1jW4skCDWNrWtSLbTw,-4.320574,1.120762,0.251233,-0.746671,-0.119721,0.008887,-0.712636,0.565584,-0.090132,-0.763444,...,-0.395176,0.584851,0.292619,-0.278721,-0.100268,0.210170,0.170945,0.190613,-0.058126,-0.053230
zziJLt25YU6dp01sewR-IQ,5.465425,-0.622428,0.242368,-2.217896,0.648517,4.089584,2.325991,-1.344567,2.390704,1.181374,...,0.167795,-1.506930,1.132210,-0.979523,0.191035,0.922475,-0.121721,0.882841,0.315231,-0.226414
zziWJMYwDjyVi7kJmgRUvg,-0.918654,-0.718605,-3.776629,1.573610,-2.027102,0.006213,0.409672,0.548115,-0.042799,-0.947783,...,0.212466,0.082954,0.236714,-0.009898,-0.264575,0.300401,0.047904,0.393111,-0.399701,0.346351
zzr6MQqGgjkAEu6yTDK_TQ,-4.559487,0.559302,0.369101,-0.283846,-0.098300,-0.108874,0.070805,-0.256962,-0.252897,-0.151130,...,0.068431,0.168291,-0.069606,-0.063228,-0.116842,-0.145480,-0.047693,0.054098,0.189275,-0.060773


In [24]:
#determine optimal number of clusters
# size_lst = range(2,10)
# sq_dist = []

# for cluster_size in size_lst:
#     kmeans = KMeans(n_clusters=cluster_size)
#     kmeans.fit(pca_df)
#     sq_dist.append(kmeans.inertia_)

# plt.plot(size_lst,sq_dist,'bx-')
# plt.xlabel('Values of K') 
# plt.ylabel('Sum of squared distances/Inertia') 
# plt.title('Elbow Method For Optimal k')
# plt.show()

In [25]:
#perform k-means clustering with 15 clusters
def createClusters(cluster_size, random_state=0):
    """Cluster the rows of the module-level `pca_df` with k-means.

    Parameters
    ----------
    cluster_size : int
        Number of clusters, passed to KMeans as `n_clusters`.
    random_state : int or None, default 0
        Seed for the k-means initialisation so that cluster labels are
        reproducible across runs; pass None for unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per row of `pca_df`: the index values of `pca_df` as a column,
        plus a 'label' column holding the assigned cluster number.
    """
    result = pd.DataFrame(pca_df.index)
    user_kmeans_model = KMeans(
        n_clusters=cluster_size, n_init=18, random_state=random_state
    ).fit(pca_df)
    # labels_ follows the row order of pca_df, which is also the row order of result
    result['label'] = user_kmeans_model.labels_
    return result

labeled_data = createClusters(15)

In [34]:
labeled_data

Unnamed: 0,user_id,label
0,---1lKK3aKOuomHnwAkAow,1
1,--17Db1K-KujRuN7hY9Z0Q,1
2,--2vR0DIsmQ6WfcSzKWigw,1
3,--3WaS23LcIXtxyFULJHTA,7
4,--41c9Tl0C9OGewIR7Qyzg,1
...,...,...
49599,zzhO1jW4skCDWNrWtSLbTw,1
49600,zziJLt25YU6dp01sewR-IQ,10
49601,zziWJMYwDjyVi7kJmgRUvg,7
49602,zzr6MQqGgjkAEu6yTDK_TQ,1


In [26]:
#make sure each cluster has at least 5+ users
cluster_distribution = labeled_data.groupby(['label']).count()
cluster_distribution

Unnamed: 0_level_0,user_id
label,Unnamed: 1_level_1
0,989
1,35713
2,37
3,92
4,2
5,317
6,150
7,8247
8,43
9,16


In [27]:
#create table for recommendation system
recommender_table = pop_cat.copy()
recommender_table.insert(0,'user_id', preferences['user_id'])

In [28]:
recommender_table

Unnamed: 0,user_id,Acai Bowls,Accessories,Acne Treatment,Active Life,Acupuncture,Adult,Adult Education,Adult Entertainment,Advertising,...,Wine Venues & Event Spaces,Wine Wine Tasting Room,Wine Wineries,Wine Wraps,Wineries,Women's Clothing,Wraps,Yelp Events,Yoga,Zoos
0,---1lKK3aKOuomHnwAkAow,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,--17Db1K-KujRuN7hY9Z0Q,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,--2vR0DIsmQ6WfcSzKWigw,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,--3WaS23LcIXtxyFULJHTA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,--41c9Tl0C9OGewIR7Qyzg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49599,zzhO1jW4skCDWNrWtSLbTw,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49600,zziJLt25YU6dp01sewR-IQ,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49601,zziWJMYwDjyVi7kJmgRUvg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49602,zzr6MQqGgjkAEu6yTDK_TQ,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
#make sure dataframe printout is not truncated
pd.options.display.max_colwidth = 100

In [30]:
#determine common interests between users
def common_interests(user_1, user_2, table=None, top_n=3):
    """Return the top categories that both users have a non-zero count for.

    Parameters
    ----------
    user_1, user_2 : str
        User ids to compare.
    table : pandas.DataFrame, optional
        Frame with a 'user_id' column and one count column per category.
        Defaults to the module-level `recommender_table`.
    top_n : int, default 3
        Maximum number of categories to return.

    Returns
    -------
    list of str
        Category names shared by both users, ordered by their combined count
        (highest first; ties keep the table's column order). Empty when either
        user is absent from the table or nothing is shared.
    """
    if table is None:
        table = recommender_table

    rows = table[table['user_id'].isin([user_1, user_2])]
    # Without this guard an empty selection makes `.all()` True for every
    # column, and arbitrary categories would be reported as "common".
    if not {user_1, user_2}.issubset(set(rows['user_id'])):
        return []

    # Work on the count columns only; user_id is an identifier, not an interest.
    counts = rows.drop(columns='user_id')
    shared = counts.loc[:, (counts != 0).all()]
    totals = shared.sum()

    # sorted() is stable, so equal totals stay in column order.
    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    return [name for name, _ in ranked[:top_n]]

In [31]:
#give friend recommendations
def friend_recommender(user_id, n_recommendations=5):
    """Recommend the users closest to `user_id` within the same cluster.

    Reads the module-level `labeled_data` (user_id -> cluster label) and
    `pca_df` (PCA coordinates indexed by user_id), and calls
    `common_interests` for each recommended user.

    Parameters
    ----------
    user_id : str
        The user to produce recommendations for.
    n_recommendations : int, default 5
        Maximum number of users to recommend.

    Returns
    -------
    pandas.DataFrame
        Columns 'Recommended User' and 'Common Interests', ordered from the
        nearest user outwards (Euclidean distance in PCA space).

    Raises
    ------
    KeyError
        If `user_id` is not present in `labeled_data`.
    """
    own_label = labeled_data.loc[labeled_data['user_id'] == user_id, 'label']
    if own_label.empty:
        raise KeyError(f"user_id {user_id!r} not found in labeled_data")
    cluster = int(own_label.iloc[0])

    # Exclude the user explicitly instead of assuming they sort first: another
    # user with an identical vector also has distance 0.
    in_cluster = labeled_data['label'] == cluster
    not_self = labeled_data['user_id'] != user_id
    other_users = labeled_data.loc[in_cluster & not_self, 'user_id']

    # One vectorized distance computation for the whole cluster.
    current_user = pca_df.loc[user_id].to_numpy()
    other_vectors = pca_df.loc[other_users].to_numpy()
    dists = pd.Series(
        np.linalg.norm(other_vectors - current_user, axis=1),
        index=other_users.to_numpy(),
    )

    # nsmallest keeps the first occurrence on ties, i.e. labeled_data order.
    close_friends = list(dists.nsmallest(n_recommendations).index)
    common = [common_interests(user_id, friend) for friend in close_friends]

    final_dict = {'Recommended User': close_friends, 'Common Interests': common}
    return pd.DataFrame(final_dict)
    
    

In [32]:
friend_recommender('zyV4wWbSQOJAVRZnMUslfQ')

Unnamed: 0,Recommended User,Common Interests
0,XC9nZtaQutqVuxabJppftQ,"[American (New), Active Life, Golf]"
1,NuMGh7LKV27I20MDAuVy2Q,"[American (New), Active Life, Golf]"
2,bY1svhjnaxLqoAuLSuij6g,"[American (New), Hotels, Hotels & Travel]"
3,GTzd901i1MFhuR8wbhgZdQ,"[American (New), Active Life, Golf]"
4,eF1Ku3VZtPqezbHjaGg1qg,"[American (New), Italian, Steakhouses]"


In [52]:
friend_recommender('zzhO1jW4skCDWNrWtSLbTw')

Unnamed: 0,Recommended User,Common Interests
0,dDTC6H2_yMHzEljVJOoDNA,"[Chinese, Shanghainese]"
1,-hAbdeB1C42iO93iyg-57Q,[Chinese]
2,LFSIJ3auEGvO97mBGXA9Xw,[Chinese]
3,rCgzn387hf4a2Wx-iAr5pg,"[Chinese, Dim Sum, Shanghainese]"
4,TqQtK-rfIHDhQ2dVMMVuhw,"[Chinese, Coffee & Tea]"


In [49]:
friend_recommender('--FQ61qe5wDSc6lDENlrUQ')

Unnamed: 0,Recommended User,Common Interests
0,oRafkFmvH7qp-z_rsFXdTg,"[American (Traditional), Caribbean, Laotian]"
1,tKE1-v8F1r0fgm5-vxi5ag,"[American (Traditional), Caribbean, Laotian]"
2,Q7JZ4mptLqMe0ZDSWtgjuw,"[American (Traditional), Arts & Entertainment, Music Venues]"
3,Q83R2UssCoErEJX36wrlvg,"[American (Traditional), Arts & Entertainment, Music Venues]"
4,nqUm_8yPU6a-KxojtREcww,"[American (Traditional), Arts & Entertainment, Music Venues]"
