In [None]:
## Beta recommender system

In [1]:
import pandas as pd
import numpy as np

In [3]:
%%time
# Load reviews
reviews = pd.read_csv('../data/yelp_reviews_restaurant.csv')
reviews.head()

Wall time: 50.3 s


Unnamed: 0,user_id,business_id,review_stars,useful,funny,cool,text,date,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories
0,V34qejxNsCbcgD8C0HVk-Q,HQl28KMwrEKHqhFrrDqVNQ,5,1,0,0,I love Deagan's. I do. I really do. The atmosp...,2015-12-05 03:18:11,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American
1,zFCuveEe6M-ijY1iy23IJg,HQl28KMwrEKHqhFrrDqVNQ,5,6,2,5,"We walked into Melt. ""Did you want to put your...",2011-08-25 04:24:23,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American
2,4V985R3RG-rv0B7WCPQzeQ,HQl28KMwrEKHqhFrrDqVNQ,1,1,0,0,I commented on how slow the service was last A...,2015-03-04 20:37:43,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American
3,nFGcoL6wuPQzxsNJVSfGrA,HQl28KMwrEKHqhFrrDqVNQ,4,2,0,0,We walked in off the streets on a September ni...,2014-09-10 01:38:55,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American
4,CJqgUQeWhdgbDyLAFy7xvQ,HQl28KMwrEKHqhFrrDqVNQ,4,0,0,0,Brunch on Saturday was excellent. The Bloody M...,2018-01-21 18:50:29,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American


In [None]:
# As seen is the EDA, there are a lot of users which does very few reviews. 
# That makes really difficult to classify and with the big amount of data, it is also needed to remove some values. 
# Therefore in order to be in the model, a threshold of 50 reviews/user has been selected. 

In [7]:
# Remove users that have less than 50 reviews.
grouped_users = reviews.groupby('user_id')['text'].count().reset_index()
grouped_users.head()

Unnamed: 0,user_id,text
0,---1lKK3aKOuomHnwAkAow,23
1,---94vtJ_5o_nikEs6hUjg,1
2,---RfKzBwQ8t3wu-LXvx3w,1
3,---tGbMnMitD_7srW6Nfzg,1
4,---udAKDsn0yQXmzbWQNSw,1


In [17]:
result = grouped_users.query('text <= 50')
result['text'].sum()

2551372

In [12]:
reviews.shape[0]

2841386

In [18]:
print(f'Difference: {reviews.shape[0] - result["text"].sum()}')

Difference: 290014


In [21]:
reviews_filtered = reviews[['user_id', 'name', 'review_stars']][~reviews['user_id'].isin(result['user_id'])]
reviews_filtered.shape

(290013, 3)

In [51]:
## Utility matrix for users (for restaurants take utility_matrix.T)
from scipy.sparse import csr_matrix

utility_matrix = reviews_filtered.pivot_table(index='user_id', values='review_stars', columns='name').fillna(0)
um = csr_matrix(utility_matrix) 

In [54]:
restaurant_ids = (utility_matrix.columns)
users_ids = (utility_matrix.index)

In [25]:
# Normalizing the data

sum_ratings_per_restaurant = um.sum(axis=0)
n_ratings_per_restaurant = um.getnnz(axis=0)
mean_rating_per_restaurant = sum_ratings_per_restaurant / n_ratings_per_restaurant

In [28]:
um_mean_restaurant = np.tile(mean_rating_per_restaurant, (um.shape[0],1))
um_mean_restaurant.shape

(3346, 13284)

In [30]:
um_norm = um - csr_matrix(um_mean_restaurant)

In [32]:
# Comparing after and before normalizing

um.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 4., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [33]:
um_norm.todense()

matrix([[-2.625     , -4.15126812, -3.        , ..., -4.        ,
         -5.        , -4.5       ],
        [-2.625     , -4.15126812, -3.        , ..., -4.        ,
         -5.        , -4.5       ],
        [-2.625     , -4.15126812, -3.        , ..., -4.        ,
         -5.        , -4.5       ],
        ...,
        [-2.625     , -0.15126812, -3.        , ..., -4.        ,
         -5.        , -4.5       ],
        [-2.625     , -4.15126812, -3.        , ..., -4.        ,
         -5.        , -4.5       ],
        [-2.625     , -4.15126812, -3.        , ..., -4.        ,
         -5.        , -4.5       ]])

In [38]:
# create the cosine_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(um_norm, um_norm)

In [None]:
## Restaurant recomendation (item-item) with k-NN

In [36]:
from sklearn.neighbors import NearestNeighbors

In [49]:
indices = pd.Series(reviews_filtered.index, index=reviews['name'])
print(indices)
def recommendations(restaurant, k=10):
    
    # Initialize the empty list of recommended restaurants
    recommended_restaurants = []
    
    #Get the index of the restaurant that matches with the name
    #idx = indices[indices == restaurant].index
    idx = 
    print(idx)
    
    # creating a series with the similarity scores
    scores_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
    
    #getting the indices of the most similar restaurants
    top_restaurants = list(score_series.iloc[:k].index)
    
    # creating the dataframe with the most similar restaurants
    for i in top_restaurants:
        recommended_restaurants.append(list(reviews_filtered.index)[i])
        
    return recommended_restaurants

0               1
1               4
2              27
3              29
4              34
           ...   
290008    2841352
290009    2841364
290010    2841366
290011    2841372
290012    2841374
Length: 290013, dtype: int64


In [81]:
names = reviews_filtered['name']
restaurant_ids = reviews_filtered.index

def recommendations(restaurant, distance, k=10):
    correct_name = reviews_filtered[reviews_filtered.name.str.startswith(restaurant)]['name'].values[0]
    idx = restaurant_ids.get_loc(reviews_filtered[reviews_filtered.name.str.startswith(restaurant)]['name'].index[0])
    
    sim_scores = list(enumerate(distance[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:k]
    
    similar_restaurants = [i[0] for i in sim_scores]
    
    print(f'Recommendations for {correct_name}')
    return reviews_filtered['name'].iloc[similar_restaurants]

In [80]:
recommendations('Liberty', cosine_sim)

Recommendations for Liberty Village Market & Cafe


32268             Picanha Steak Truck
19286       The Smashed Pig Gastropub
161                         Pink Taco
13043                     Sunset Room
7210              Village Pub & Poker
1975           Brevard Court Sundries
18142           Riva by Wolfgang Puck
6417                    Tam's Kitchen
3256                Cafe des Artisans
32293           Mi Casa Grill Cantina
6888                   Salty Senorita
10181             Putter's Charleston
31385                   El Pollo Loco
11199            Rimrock Bar & Grille
27584    Brooklyn V's Pizza- Chandler
12025                 Jasmine Express
20566       Dairy Queen Grill & Chill
35223                   Bombay Palace
1099       Sabor Miami Cafe & Gallery
34343                 Taqueria Mexico
Name: name, dtype: object

In [74]:
reviews_filtered[reviews_filtered.name.str.startswith('Liberty')]['name'].index[0]

1

In [73]:
restaurant_ids

Int64Index([      1,       4,      27,      29,      34,      35,      43,
                 90,     161,     167,
            ...
            2841305, 2841311, 2841321, 2841330, 2841345, 2841352, 2841364,
            2841366, 2841372, 2841374],
           dtype='int64', length=290013)