In [1]:
import pandas as pd
import numpy as np
import random
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
import itertools
from collections import defaultdict

In [2]:
!pip install pickle5
import pickle5



In [3]:
!pip install implicit
import implicit
from implicit.evaluation import train_test_split, precision_at_k, mean_average_precision_at_k



In [4]:
# user_review = pd.read_csv("/content/drive/My Drive/archive/user_review_mat_food_rest_v2.csv/")
with open('/content/drive/My Drive/archive/user_review_mat_v3.pkl','rb') as file:
    user_review = pickle5.load(file)

In [5]:
user_review.head()

Unnamed: 0,user_id,business_id,user_stars,name
0,UgMW8bLE0QMJDCkQ1Ax5Mg,IS4cv902ykd8wj1TR0N3-A,4.0,aising Cane's Chicken Finger
1,5vD2kmE25YBrbayKhykNxQ,nlxHRv1zXGT0c0K51q3jDg,5.0,Firehouse Subs
2,aq_ZxGHiri48TUXJlpRkCQ,Pthe4qk5xh4n-ef-9bvMSg,5.0,Chon Thai Food
3,P6apihD4ASf1vpPxHODxAQ,e_BiI4ej1CW1F0EyVLr-FQ,5.0,Casa Mia
4,HJECayULRM-6xh2GCCvLiA,l-nL4BmhzpZjcavooO48PQ,4.0,The Wokker Restaurant


In [6]:
## need to remove the duplicate reviews based on date and not just the last one ##
user_review = user_review.drop_duplicates(subset=['user_id', 'business_id'], keep='last')
len(user_review)

3305941

In [7]:
# Cleaning the user and business ids
# user_review['user_id'] = user_review.user_id.str[2:-1]
# user_review['user_id'] = user_review['user_id'].str.strip()
# user_review['business_id'] = user_review.business_id.str[2:-1]
# user_review['business_id'] = user_review['business_id'].str.strip( )
# user_review['name'] = user_review.name.str[2:-1]
# user_review['name'] = user_review['name'].str.strip()

## remove users having less than 10 ratings to avoid coldstarts ##
## need to try different 'n' and check accuracy ##
users = user_review[['user_id','business_id']].groupby('user_id').count().sort_values(by='business_id',ascending=False).reset_index()
users = users[users.business_id >= 25]

In [8]:
## joining with the user_review table to filter out the users with more than 15 reviews ##
user_review = pd.merge(user_review,users['user_id'],how='inner', on='user_id')

business_lookup = user_review[['business_id', 'name']].drop_duplicates()
business_lookup['business_id'] = business_lookup.business_id.astype(str)

user_review = user_review[['user_id','business_id','name','user_stars']]

In [9]:
user_review.head()

Unnamed: 0,user_id,business_id,name,user_stars
0,UgMW8bLE0QMJDCkQ1Ax5Mg,IS4cv902ykd8wj1TR0N3-A,aising Cane's Chicken Finger,4.0
1,UgMW8bLE0QMJDCkQ1Ax5Mg,QoKn3zRpDrBj3hCPYGDCSA,El Pollo Loco,3.0
2,UgMW8bLE0QMJDCkQ1Ax5Mg,_t8B7bb-Q7kmn50kvGEKzw,iller's Ale House - Henderso,4.0
3,UgMW8bLE0QMJDCkQ1Ax5Mg,VIG1MhUSl5FFBJ1KtCFqBg,apa John's Pizz,5.0
4,UgMW8bLE0QMJDCkQ1Ax5Mg,blBZJ_UbVb2ieOcktYRuZg,Smashburger,4.0


In [10]:
# Create a numeric user_id and artist_id column for row,column accessing
user_review['user_id'] = user_review['user_id'].astype("category")
user_review['business_id'] = user_review['business_id'].astype("category")
user_review['user_code'] = user_review['user_id'].cat.codes
user_review['business_code'] = user_review['business_id'].cat.codes

In [11]:
user_review.head()

Unnamed: 0,user_id,business_id,name,user_stars,user_code,business_code
0,UgMW8bLE0QMJDCkQ1Ax5Mg,IS4cv902ykd8wj1TR0N3-A,aising Cane's Chicken Finger,4.0,7294,13141
1,UgMW8bLE0QMJDCkQ1Ax5Mg,QoKn3zRpDrBj3hCPYGDCSA,El Pollo Loco,3.0,7294,18769
2,UgMW8bLE0QMJDCkQ1Ax5Mg,_t8B7bb-Q7kmn50kvGEKzw,iller's Ale House - Henderso,4.0,7294,25525
3,UgMW8bLE0QMJDCkQ1Ax5Mg,VIG1MhUSl5FFBJ1KtCFqBg,apa John's Pizz,5.0,7294,21757
4,UgMW8bLE0QMJDCkQ1Ax5Mg,blBZJ_UbVb2ieOcktYRuZg,Smashburger,4.0,7294,26792


In [12]:
# Create sparse matrix for the implicit library, one for user-business (recommending business to user) and second for business-user(finding similar businesses)
sparse_user_business = sparse.csr_matrix((user_review['user_stars'].astype(float), (user_review['user_code'], user_review['business_code'])))
sparse_business_user = sparse.csr_matrix((user_review['user_stars'].astype(float), (user_review['business_code'], user_review['user_code'])))

In [13]:
# def make_train_test(sparse_mat,pct_train):
#   '''
#   This function will take in the sparse matrices (user-business/business-user) and separate a percentage of the original ratings as test dataset

#   parameters: 

#   sparse_mat - the original ratings matrix from which we want to generate a train/test set. The new train 
#   and test datasets are in sparse csr_matrix form. 

#   pct_train - The percentage of user-business ratings that should be kept in the training data

#   returns:

#   train - The altered version of the original data with a certain percentage of the user-item pairs 
#   that originally had interaction set back to zero.

#   test - A copy of the original ratings matrix, unaltered, so it can be used to see how the rank order 
#   compares with the actual interactions.
#   '''
#   # Divide data into train and test for evaluation
#   train,test = train_test_split(sparse_mat,train_percentage=pct_train)
#   return train, test

In [14]:
def make_train_test(sparse_user_business,test_samples=10):
  '''
  This function will take in the sparse matrices (user-business/business-user) and separate a percentage of the original ratings as test dataset

  parameters: 

  sparse_mat - the original ratings matrix from which we want to generate a train/test set. The new train 
  and test datasets are in sparse csr_matrix form. 

  pct_train - The percentage of user-business ratings that should be kept in the training data

  returns:

  train - The altered version of the original data with a certain percentage of the user-item pairs 
  that originally had interaction set back to zero.

  test - A copy of the original ratings matrix, unaltered, so it can be used to see how the rank order 
  compares with the actual interactions.
  '''
  #pct_test = 0.2
  test_set = sparse_user_business.copy() # Make a copy of the original set to be the test set
  test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
  training_set = sparse_user_business.copy() # Make a copy of the original data we can alter as our training set
  nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
  #nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of user,business index into list
  nonzero_pairs = defaultdict(list)
  for a, b in zip(nonzero_inds[0], nonzero_inds[1]):
    nonzero_pairs[a].append(b)
  random.seed(0) # Set the random seed to zero for reproducibility

  #num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
  samples = defaultdict(list)
  for users in nonzero_pairs.keys():
    samples[users] = random.sample(nonzero_pairs[users],test_samples)
  # Generating all the user indices which need to be masked
  lst = range(0,list(samples.keys())[-1]+1)
  user_inds = list(itertools.chain.from_iterable(itertools.repeat(x,test_samples) for x in lst))
  # Business indices to be masked
  business_inds = []
  for key,val in samples.items():
    business_inds = business_inds + val
  training_set[user_inds, business_inds] = 0 # Assign all of the randomly chosen user-business pairs to zero
  return training_set, test_set, samples

In [15]:
def get_business_visited(user_code, train, user_review):
  '''
  This function returns the businesses rated by a specific user in the training set. 

  parameters: 

  user_id - Input the user's id number that you want to see prior visits of at least once

  train - User-business matrix with ratings in the training  data
  
  user_review - original data with user and business codes to map the business ids to business names

  returns:

  A list of business IDs and names for a particular user that were already rated in the training data
  '''

  business_data = user_review[user_review.user_code == user_code] # Dataframe with business rated by the user
  business_data[business_data['business_code'].isin(train[user_code,:].nonzero()[1].tolist())] # Ratings in the training dataset
  business_data = business_data[['business_id','name']]
  return business_data

In [16]:
def rec_business(model, user_code, sparse_user_item, num_business = 10):
  '''
  This function will return the top recommended items to our users using the recommend function

  parameters:

  user_code - The user code (substitute for long the user ids) that you want to get recommendations for

  sparse_user_item - The user_business training matrix used for matrix factorization fitting

  num_business - The number of businesses you want to recommend in order of best recommendations. Default is 10. 

  returns:

  - The top n recommendations chosen based on the user/business vectors for businesses that were never rated before
  '''

  all_recommendations = model.recommend(user_code, sparse_user_item,N=num_business)

  businesses = []
  scores = []

  # Get business names from the ids
  for rec in all_recommendations:
      idx, score = rec
      businesses.append(user_review.name.loc[user_review.business_code == idx].iloc[0])
      scores.append(score)

  recommendations = pd.DataFrame({'restaurants': businesses, 'score': scores})
  return recommendations

In [17]:
train,test,masked_ind = make_train_test(sparse_user_business,test_samples=20)

In [18]:
# Initialize the als model and fit it using the sparse item-user matrix
model = implicit.als.AlternatingLeastSquares(factors=64, regularization=0.05, iterations=50)
model.fit(train*10)
model2 = implicit.als.AlternatingLeastSquares(factors=32, regularization=0.1, iterations=20)
model2.fit(train*10)
model3 = implicit.als.AlternatingLeastSquares(factors=64, regularization=0.9, iterations=30)
model3.fit(train*5)



HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [19]:
all_preds = np.dot(model.item_factors,model.user_factors.T)
all_preds2 = np.dot(model2.item_factors,model2.user_factors.T)
all_preds3 = np.dot(model3.item_factors,model3.user_factors.T)

# Evaluation metrics

In [20]:
def mean_precision_k(all_preds,masked_ind,k=5):
  '''
  This function returns the average precision for all the users calculated on the test dataset. The top k recommended restaurants 
  are assumed to have positive predicted ratings. Precision is calculated as (actual and predicted positives)/(predicted positives).
  Highest value will be 1 
  
  parameters
  
  masked_ind: the indexes which were masked during the train and test split. These indexes will be used to compare the predictions 
  against the actual ratings

  k: Top k recommendations

  Returns

  Mean of precision array which holds precision values for all users for k recommendations

  '''
  precision = []
  test_samples = len(list(masked_ind.values())[0]) # number of ratings masked for test data per user
  for key,vals in masked_ind.items():
    user_inds = [key]*test_samples # repeat to get the same user code
    business_inds = vals
    sorted_val = np.argsort(all_preds[user_inds,business_inds])[::-1] #Sort the restaurants according to the scores
    sorted_ind = np.array(business_inds)[sorted_val][:k] # Filter top k restaurants as the recommendations
    actual_ratings = sparse_user_business[key].toarray()[0][sorted_ind] #Get the actual ratings for the top recommended restaurants
    postive_ratings = np.where(actual_ratings >= 3)[0].shape[0]  #Get the number of restaurants with actual rating >=3
    precision.append(postive_ratings/k) #Calculate precision per user and return the mean precision
  return np.mean(precision)  

In [21]:
def ndcg_k(all_preds,masked_ind, k=5):
  '''
  This function returns the mean binary NDCG for all the users calculated on the test dataset. The top k recommended restaurants 
  are assumed to have positive predicted ratings. NDCG is calculated as (relevancy)/(log2(position+1)).
  So the NDCG score will be higher if positively rated restaurants are recommended first. 
  
  parameters
  
  masked_ind: the indexes which were masked during the train and test split. These indexes will be used to compare the predictions 
  against the actual ratings

  k: Top k recommendations

  Returns

  Mean of NDCG array which holds ndcg values for all users for k recommendations

  '''  
  ndcg = []
  test_samples = len(list(masked_ind.values())[0]) # number of ratings masked for test data per user
  for key,vals in masked_ind.items():
    user_inds = [key]*test_samples # repeat to get the same user code
    business_inds = vals
    sorted_val = np.argsort(all_preds[user_inds,business_inds])[::-1] #Sort the restaurants according to the scores
    sorted_ind = np.array(business_inds)[sorted_val][:k] # Filter top k restaurants as the recommendations
    actual_ratings = sparse_user_business[key].toarray()[0][sorted_ind] #Get the actual ratings for the top recommended restaurants
    ndcg_num = np.where(actual_ratings >= 3, 1, 0)  # numerator will be 1 if the restaurant was rated postive by the user, 0 otherwise similar to relevancy
    ndcg_deno = np.log2(np.arange(k)+2) # denominator is log(position + 1) and NDCG is (relevancy)/(log2(position+1))
    ndcg.append(np.sum(ndcg_num/ndcg_deno)) # NDCG is calculated as the sum of all the recommended restaurants
  return np.mean(ndcg)

In [22]:
def mrr(all_preds,masked_ind,k=5):
  '''
  This function returns the Mean Reciprocal Rank(MRR) for all the users calculated on the test dataset. The top k recommended restaurants 
  are assumed to have positive predicted ratings. MRR is the reciprocal for the position of first positively recommended restaurant
  Highest value will be 1 
  
  parameters
  
  masked_ind: the indexes which were masked during the train and test split. These indexes will be used to compare the predictions 
  against the actual ratings

  k: Top k recommendations

  Returns

  Mean of reciprocal rank array which holds reciprocal rank values for all users for k recommendations

  '''
  reci_rank = []
  test_samples = len(list(masked_ind.values())[0]) # number of ratings masked for test data per user
  for key,vals in masked_ind.items():
    user_inds = [key]*test_samples # repeat to get the same user code
    business_inds = vals
    sorted_val = np.argsort(all_preds[user_inds,business_inds])[::-1] #Sort the restaurants according to the scores
    sorted_ind = np.array(business_inds)[sorted_val][:k] # Filter top k restaurants as the recommendations
    actual_ratings = sparse_user_business[key].toarray()[0][sorted_ind] #Get the actual ratings for the top recommended restaurants
    first_positive = np.argmax(actual_ratings >= 3)+1  # position of first positive
    reci_rank.append(1/first_positive)
  return np.mean(reci_rank)

# Metrics at 10

In [23]:
print('Precision@10: ',mean_precision_k(all_preds,masked_ind,k=10))
print('NDCG@10: ',ndcg_k(all_preds,masked_ind,k=10))
print('MRR@10: ',mrr(all_preds,masked_ind,k=10))

Precision@K:  0.8574865031094102
NDCG@K:  3.92406696130144
MRR@K:  0.9367379558488695


In [24]:
print('Precision@10: ',mean_precision_k(all_preds2,masked_ind,k=10))
print('NDCG@10: ',ndcg_k(all_preds2,masked_ind,k=10))
print('MRR@10: ',mrr(all_preds2,masked_ind,k=10))

Precision@K:  0.8610401148089936
NDCG@K:  3.9408045770128206
MRR@K:  0.9387109371186457


In [25]:
print('Precision@10: ',mean_precision_k(all_preds3,masked_ind,k=10))
print('NDCG@10: ',ndcg_k(all_preds3,masked_ind,k=10))
print('MRR@10: ',mrr(all_preds3,masked_ind,k=10))

Precision@K:  0.8586414269117748
NDCG@K:  3.9307673387294466
MRR@K:  0.9380633467081905


# Metrics at 5

In [26]:
print('Precision@5: ',mean_precision_k(all_preds,masked_ind,k=5))
print('NDCG@5: ',ndcg_k(all_preds,masked_ind,k=5))
print('MRR@5: ',mrr(all_preds,masked_ind,k=5))

Precision@K:  0.8698011344221963
NDCG@K:  2.575077418349114
MRR@K:  0.9387867605184628


In [27]:
print('Precision@5: ',mean_precision_k(all_preds2,masked_ind,k=5))
print('NDCG@5: ',ndcg_k(all_preds2,masked_ind,k=5))
print('MRR@5: ',mrr(all_preds2,masked_ind,k=5))

Precision@K:  0.8743661586824302
NDCG@K:  2.5876882197780118
MRR@K:  0.9402970454907856


In [28]:
print('Precision@5: ',mean_precision_k(all_preds3,masked_ind,k=5))
print('NDCG@5: ',ndcg_k(all_preds3,masked_ind,k=5))
print('MRR@5: ',mrr(all_preds3,masked_ind,k=5))

Precision@K:  0.8723433335611289
NDCG@K:  2.582122325827708
MRR@K:  0.9394143374564341


# Recommendations for a specific user

In [29]:
get_business_visited(user_code=10,train=train,user_review=user_review)

Unnamed: 0,business_id,name
537589,0FUtlsQrJI7LhqDPxLumEw,oe's Farm Gril
537590,kctSmjXXK_1laQV7J8-3Cg,.T. O'Sullivan'
537591,IEVrILZ7bkuJMYKsbAfWaQ,Sushi San
537592,u2q_84hHvKGl5hKnAE7zNw,Sushi Ave
537593,D2_y52mbmTrNLHux6aCrIg,Texas Roadhouse
537594,S37sKRRfkhFZRpxaYzWo_A,China Magic Noodle House
537595,ALn_0f-Usn3n0a9WBcjhhg,anny'
537596,DymZ5vpm7vkSsR__-4sflw,Roka Akor - Scottsdale
537597,BS2nj4dQKvC5oi6ikKvX0A,Korean BBQ
537598,6oKZoCI_0ePyzfRqSFMBig,oy's Restauran


In [30]:
rec_business(model3,user_code=10,sparse_user_item=sparse_user_business)

Unnamed: 0,restaurants,score
0,Bowl of Greens,0.489055
1,Red Ribbon Bakeshop,0.476898
2,JJ Madisons All American Grill,0.411548
3,Kailash Parbat,0.391263
4,Scoop and Joy Lounge,0.383012
5,Staples -Richmond Hill,0.380174
6,Shoppers Drug Mart,0.376408
7,endy',0.370781
8,Uddin Jewelry,0.366056
9,Mariscos Y Barbacoa La Bella,0.354249


# Finding similar businesses

In [25]:
# Find the 10 most similar to Deagan's Kitchen & Bar"
business_code = 7688 #some restaurant
n_similar = 10

# Get the user and item vectors from our trained model
user_vecs = model.user_factors
business_vecs = model.item_factors

# Calculate the vector norms
business_norms = np.sqrt((business_vecs * business_vecs).sum(axis=1))

In [27]:
# Calculate the similarity score, grab the top N items and
# create a list of item-score tuples of most similar artists
scores = business_vecs.dot(business_vecs[business_code]) / business_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / business_norms[business_code]), key=lambda x: -x[1])

In [27]:
# Print the names of our most similar artists
for business in similar:
    idx, score = business
    print(user_review.name.loc[user_review.business_code == idx].iloc[0])

# Adding Social context
Including the restaurants reviewed the most by the friends of the users. The chances of a user visiting the restaurants that his friends have reviewed is very high.

## Data preprocessing

In [4]:
with open('/content/drive/My Drive/archive/user_curtailed.pkl','rb') as file:
    data = pickle5.load(file)

In [None]:
data = data.assign(**{'friends':data['friends'].str.split(',')})
data = data.explode('friends')
data = data[['user_id','friends']]
# removing user without friends
data = data[~data.friends.isin(['None'])]
data['friends'] = data['friends'].str.strip()

In [None]:
### Import user reviews directly instead of business and reviews separately everytime ###
user_review = pd.read_csv('/N/u/abbane/Carbonate/Downloads/user_review_mat.csv')
user_review['user_id'] = user_review.user_id.str[2:-1]
user_review['user_id'] = user_review['user_id'].str.strip()

In [None]:
user_friends = pd.merge(data,user_review,left_on='friends',right_on='user_id')
user_friends = user_friends[['user_id_x','friends','business_id','user_stars','name']]

In [None]:
user_friends = user_friends.groupby(by=['user_id_x','business_id','name']).count()
user_friends = user_friends.reset_index()

In [None]:
## Need to reduce number of users from 800k. Pick users whose friends have rated more than 50 businesses
users = user_friends[['user_id_x','business_id']].groupby(by=['user_id_x']).count()
users = users.reset_index()
users = users[users.business_id > 50]
user_friends = pd.merge(users['user_id_x'],user_friends,left_on='user_id_x',right_on='user_id_x',how='inner')

## Train and evaluate models on restaurants reviewed by friends

In [23]:
## Import the preprocessed data instead of running the previous code snippet and save time
with open('/content/drive/My Drive/archive/user_friends_v2','rb') as file:
    user_review_friends = pickle5.load(file)

In [24]:
len(user_review_friends)

5302251

In [25]:
## Optional step since Colab would only support 85 million rows
#user_review_friends = user_review_friends[:60000000]
user_review_friends.columns = ['user_id','business_id','name','friends','user_stars']
# Cleaning the user and business ids
user_review_friends['user_id'] = user_review_friends['user_id'].str.strip()
user_review_friends['business_id'] = user_review_friends['business_id'].str.strip( )
user_review_friends['name'] = user_review_friends['name'].str.strip()

In [26]:
# Create a numeric user_id and artist_id column for row,column accessing
user_review_friends['user_id'] = user_review_friends['user_id'].astype("category")
user_review_friends['business_id'] = user_review_friends['business_id'].astype("category")
# user_review_friends['user_code'] = user_review_friends['user_id'].cat.codes
# user_review_friends['business_code'] = user_review_friends['business_id'].cat.codes

To test the  models joining the friends dataset with the existing dataset which has actual reviews. This will give us ratings that we can evaluate against. Importing already joined dataset from drive.
The dataset is huge >100 million rows and needs to be worked with on Carbonate

In [31]:
## Imorting the data instead of preprocessing
with open('/content/drive/My Drive/archive/user_review_friends','rb') as file:
    user_review_friends = pickle5.load(file)

# Create Sparse matrices and models

In [32]:
# Create sparse matrix for the implicit library, one for user-business (recommending business to user) and second for business-user(finding similar businesses)
sparse_user_business = sparse.csr_matrix((user_review_friends['friends'].astype(float), (user_review_friends['user_code'], user_review_friends['business_code'])))
sparse_business_user = sparse.csr_matrix((user_review_friends['friends'].astype(float), (user_review_friends['business_code'], user_review_friends['user_code'])))

## Simmultaneously create a sparse matrix for ratings so that we only select restaurants that have ratings in the test data
# Create sparse matrix for the implicit library, one for user-business (recommending business to user) and second for business-user(finding similar businesses)
sparse_user_business_stars = sparse.csr_matrix((user_review_friends['user_stars'].astype(float), (user_review_friends['user_code'], user_review_friends['business_code'])))
sparse_business_user_stars = sparse.csr_matrix((user_review_friends['user_stars'].astype(float), (user_review_friends['business_code'], user_review_friends['user_code'])))

## Functions for the user friends  review

In [33]:
def make_train_test_social(sparse_user_business, sparse_user_business_stars,test_samples=10):
    '''
    This function will take in the sparse matrices (user-business/business-user) and separate a percentage of the original ratings as test dataset

    parameters: 

    sparse_mat - the original ratings matrix from which we want to generate a train/test set. The new train 
    and test datasets are in sparse csr_matrix form. 

    pct_train - The percentage of user-business ratings that should be kept in the training data

    returns:

    train - The altered version of the original data with a certain percentage of the user-item pairs 
    that originally had interaction set back to zero.

    test - A copy of the original ratings matrix, unaltered, so it can be used to see how the rank order 
    compares with the actual interactions.
    '''
    test_set = sparse_user_business.copy() # Make a copy of the original set to be the test set
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    training_set = sparse_user_business_stars.copy() # Make a copy of the original data we can alter as our training set
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    #nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of user,business index into list
    nonzero_pairs = defaultdict(list)
    for a, b in zip(nonzero_inds[0], nonzero_inds[1]):
        nonzero_pairs[a].append(b)
    random.seed(0) # Set the random seed to zero for reproducibility

    #num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = defaultdict(list)
    for users in nonzero_pairs.keys():
        samples[users] = random.sample(nonzero_pairs[users],test_samples)
    # Generating all the user indices which need to be masked
    lst = range(0,list(samples.keys())[-1]+1)
    user_inds = list(itertools.chain.from_iterable(itertools.repeat(x,test_samples) for x in lst)) #repeating user ids 
    business_inds = []     # Business indices to be masked
    for key,val in samples.items():
        business_inds = business_inds + val
    training_set[user_inds, business_inds] = 0 # Assign all of the randomly chosen user-business pairs to zero
    ## New steps for social data 
    sparse_user_business[user_inds,business_inds] = 0 #Make the friend ratings for test data 0
    return sparse_user_business, test_set, samples

In [34]:
train,test,masked_ind = make_train_test_social(sparse_user_business,sparse_user_business_stars,test_samples=5)

In [35]:
# Initialize the als model and fit it using the sparse item-user matrix
model = implicit.als.AlternatingLeastSquares(factors=32, regularization=0.01, iterations=50)
model.fit(train*10)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [36]:
# All predictions
all_preds = np.dot(model.item_factors,model.user_factors.T)

## Evaluation metrics

In [37]:
def mean_precision_k_social(all_preds,masked_ind,k=5):
  '''
  This function returns the average precision for all the users calculated on the test dataset. The top k recommended restaurants 
  are assumed to have positive predicted ratings. Precision is calculated as (actual and predicted positives)/(predicted positives).
  Highest value will be 1 
  
  parameters
  
  masked_ind: the indexes which were masked during the train and test split. These indexes will be used to compare the predictions 
  against the actual ratings

  k: Top k recommendations

  Returns

  Mean of precision array which holds precision values for all users for k recommendations

  '''
  precision = []
  test_samples = len(list(masked_ind.values())[0]) # number of ratings masked for test data per user
  for key,vals in masked_ind.items():
    user_inds = [key]*test_samples # repeat to get the same user code
    business_inds = vals #businesses in test samples
    sorted_val = np.argsort(all_preds[user_inds,business_inds])[::-1] #Sort the restaurants according to the scores
    sorted_ind = np.array(business_inds)[sorted_val][:k] # Filter top k restaurants as the recommendations
    actual_ratings = sparse_user_business_stars[key].toarray()[0][sorted_ind] #Get the actual ratings for the top recommended restaurants
    postive_ratings = np.where(actual_ratings >= 3)[0].shape[0]  #Get the number of restaurants with actual rating >=3
    precision.append(postive_ratings/k) #Calculate precision per user and return the mean precision
  return np.mean(precision)  

In [38]:
def ndcg_k_social(all_preds,masked_ind, k=5):
  '''
  This function returns the mean binary NDCG for all the users calculated on the test dataset. The top k recommended restaurants 
  are assumed to have positive predicted ratings. NDCG is calculated as (relevancy)/(log2(position+1)).
  So the NDCG score will be higher if positively rated restaurants are recommended first. 
  
  parameters
  
  masked_ind: the indexes which were masked during the train and test split. These indexes will be used to compare the predictions 
  against the actual ratings

  k: Top k recommendations

  Returns

  Mean of NDCG array which holds ndcg values for all users for k recommendations

  '''  
  ndcg = []
  test_samples = len(list(masked_ind.values())[0]) # number of ratings masked for test data per user
  for key,vals in masked_ind.items():
    user_inds = [key]*test_samples # repeat to get the same user code
    business_inds = vals
    sorted_val = np.argsort(all_preds[user_inds,business_inds])[::-1] #Sort the restaurants according to the scores
    sorted_ind = np.array(business_inds)[sorted_val][:k] # Filter top k restaurants as the recommendations
    actual_ratings = sparse_user_business_stars[key].toarray()[0][sorted_ind] #Get the actual ratings for the top recommended restaurants
    ndcg_num = np.where(actual_ratings >= 3, 1, 0)  # numerator will be 1 if the restaurant was rated postive by the user, 0 otherwise similar to relevancy
    ndcg_deno = np.log2(np.arange(k)+2) # denominator is log(position + 1) and NDCG is (relevancy)/(log2(position+1))
    ndcg.append(np.sum(ndcg_num/ndcg_deno)) # NDCG is calculated as the sum of all the recommended restaurants
  return np.mean(ndcg)

In [39]:
def mrr_k_social(all_preds,masked_ind,k=5):
  '''
  This function returns the Mean Reciprocal Rank(MRR) for all the users calculated on the test dataset. The top k recommended restaurants 
  are assumed to have positive predicted ratings. MRR is the reciprocal for the position of first positively recommended restaurant
  Highest value will be 1 
  
  parameters
  
  masked_ind: the indexes which were masked during the train and test split. These indexes will be used to compare the predictions 
  against the actual ratings

  k: Top k recommendations

  Returns

  Mean of reciprocal rank array which holds reciprocal rank values for all users for k recommendations

  '''
  reci_rank = []
  test_samples = len(list(masked_ind.values())[0]) # number of ratings masked for test data per user
  for key,vals in masked_ind.items():
    user_inds = [key]*test_samples # repeat to get the same user code
    business_inds = vals
    sorted_val = np.argsort(all_preds[user_inds,business_inds])[::-1] #Sort the restaurants according to the scores
    sorted_ind = np.array(business_inds)[sorted_val][:k] # Filter top k restaurants as the recommendations
    actual_ratings = sparse_user_business_stars[key].toarray()[0][sorted_ind] #Get the actual ratings for the top recommended restaurants
    first_positive = np.argmax(actual_ratings >= 3)+1  # position of first positive
    reci_rank.append(1/first_positive)
  return np.mean(reci_rank)

In [42]:
print('Prcision@5: ',mean_precision_k_social(all_preds,masked_ind,k=5))
print('NDCG@5: ',ndcg_k_social(all_preds,masked_ind,k=5))
print('MRR@5: ',mrr_k_social(all_preds,masked_ind,k=5))

Prcision@5:  0.8882207697893973
NDCG@5:  2.632632306007746
MRR@5:  0.9515928346647302
