In [2]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt, seaborn as sns
import folium 
from sklearn.metrics.pairwise import (cosine_similarity, euclidean_distances, manhattan_distances) 
from IPython.display import display
from surprise import SVD, Dataset, NormalPredictor, Reader, accuracy, KNNBasic, evaluate, BaselineOnly
from surprise import get_dataset_dir
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV, KFold
from collections import defaultdict


%matplotlib inline

In [3]:
def _load_city_ratings(business_city, business_review_count, user_review_count):
    """Load the Yelp CSVs and return the ratings used for modelling.

    Args:
        business_city (str): Only businesses whose ``city`` equals this value
            are kept.
        business_review_count (int): Only businesses whose ``review_count`` is
            strictly greater than this value are kept.
        user_review_count (int): Only users whose ``review_count`` is strictly
            greater than this value are kept.

    Returns:
        pandas.DataFrame: One row per review with the columns ``user_id``,
        ``business_id`` and ``stars`` (in that order).
    """
    # Parse only the columns that are actually used. The original code loaded
    # every column of all three files and then dropped most of them.
    businesses = pd.read_csv('../yelp_academic_dataset/yelp_business.csv',
                             usecols=['business_id', 'city', 'review_count'])
    users = pd.read_csv('../yelp_academic_dataset/yelp_user.csv',
                        usecols=['user_id', 'review_count'])
    reviews = pd.read_csv('../yelp_academic_dataset/yelp_review.csv',
                          usecols=['user_id', 'business_id', 'stars'])

    # Businesses in the requested city with enough reviews overall.
    in_city = businesses['city'] == business_city
    well_reviewed = businesses['review_count'] > business_review_count
    kept_businesses = businesses.loc[in_city & well_reviewed, ['business_id']]

    # Users with enough reviews overall.
    active = users['review_count'] > user_review_count
    kept_users = users.loc[active, ['user_id']]

    # Inner joins keep only the reviews written by a kept user about a kept
    # business. Only the id columns are joined, so nothing has to be dropped
    # afterwards.
    ratings = (reviews[['user_id', 'business_id', 'stars']]
               .merge(kept_users, how='inner', on='user_id')
               .merge(kept_businesses, how='inner', on='business_id'))

    # Keep users with more than 20 reviews in the merged data, then businesses
    # with more than 10 of the remaining reviews. In tests this yielded the
    # most consistent results. The business filter runs after the user filter
    # and the user filter is not re-applied, so a user can end up with 20 or
    # fewer reviews in the final frame.
    ratings = ratings.groupby('user_id').filter(lambda group: len(group) > 20)
    ratings = ratings.groupby('business_id').filter(lambda group: len(group) > 10)
    return ratings


def _fit_and_report(algo, trainset, testset):
    """Fit ``algo`` on ``trainset`` and print its RMSE and MAE on ``testset``."""
    algo.fit(trainset)
    predictions = algo.test(testset)
    # accuracy.rmse / accuracy.mae already print the formatted score, so they
    # are not wrapped in print(); doing so echoed every score a second time.
    accuracy.rmse(predictions)
    accuracy.mae(predictions)


def model_stats(business_city, business_review_count, user_review_count, random_state=9):
    """Print dataset statistics and baseline-vs-SVD accuracy for one city.

    Reads the Yelp business, user and review CSVs, filters them by city and
    review counts, prints summary counts and the distribution of star ratings,
    then scores a random baseline (``NormalPredictor``) and an ``SVD`` model on
    the same 70/30 train/test split.

    Args:
        business_city (str): City whose businesses are included.
        business_review_count (int): Businesses must have strictly more
            reviews than this to be included.
        user_review_count (int): Users must have strictly more reviews than
            this to be included.
        random_state (int or None): Seed used for the train/test split and for
            the SVD initialisation. Defaults to 9, the seed previously
            hard-coded for the split.

    Returns:
        None. All results are printed.
    """
    df_merge = _load_city_ratings(business_city, business_review_count, user_review_count)

    # Print merged dataframe counts
    print("City: %s " %business_city)
    print("Number of reviews included: %s " %df_merge.shape[0])
    print("")
    print("Number of unique users included: %s" %df_merge['user_id'].unique().size)
    print("")
    print("Number of unique businesses included: %s" %df_merge['business_id'].unique().size)
    print("\n")

    # Print statistics of ratings
    print(df_merge['stars'].describe())
    print("\n")

    # Reader with the rating scale needed to import the DataFrame into Surprise.
    reader = Reader(rating_scale=(1, 5))

    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(df_merge[['user_id', 'business_id', 'stars']], reader)

    # Split once and score both algorithms on the same split.
    trainset, testset = train_test_split(data=data, test_size=.3, random_state=random_state)

    # Baseline: random ratings drawn from a normal distribution fitted to the
    # training set.
    # NOTE(review): NormalPredictor is not given a seed here, so the baseline
    # presumably still varies between runs unless NumPy's global RNG is seeded
    # by the caller - confirm against the Surprise docs.
    print("Baseline Scores using Random Ratings")
    _fit_and_report(NormalPredictor(), trainset, testset)
    print("\n")

    # Model: SVD matrix factorisation, seeded so repeated runs give the same
    # scores. (A user-based cosine KNNBasic was tried here previously.)
    print("Accuracy Score Using Model")
    _fit_and_report(SVD(random_state=random_state), trainset, testset)

In [4]:
%%time

# Evaluate on Toronto: businesses with more than 300 reviews and users with
# more than 100 reviews.
model_stats("Toronto", 300, 100)

City: Toronto 
Number of reviews included: 3224 

Number of unique users included: 119

Number of unique businesses included: 105


count    3224.000000
mean        3.754342
std         0.893219
min         1.000000
25%         3.000000
50%         4.000000
75%         4.000000
max         5.000000
Name: stars, dtype: float64


Baseline Scores using Random Ratings
RMSE: 1.2472
1.2471766155244717
MAE:  0.9965
0.9964798296857523


Accuracy Score Using Model
RMSE: 0.8426
0.8425689305418099
MAE:  0.6545
0.6545289174222104
CPU times: user 55.9 s, sys: 11.9 s, total: 1min 7s
Wall time: 1min 12s


In [5]:
# Load the raw Yelp review, user and business tables.
df_reviews = pd.read_csv('../yelp_academic_dataset/yelp_review.csv')
df_user = pd.read_csv('../yelp_academic_dataset/yelp_user.csv')
df_business = pd.read_csv('../yelp_academic_dataset/yelp_business.csv')

# Restrict businesses to Toronto with more than 300 reviews.
is_toronto = df_business['city'] == 'Toronto'
is_well_reviewed = df_business['review_count'] > 300
df_business = df_business.loc[is_toronto & is_well_reviewed]

# Restrict users to those with more than 100 reviews.
is_active_user = df_user['review_count'] > 100
df_user = df_user.loc[is_active_user]

# Narrow each table to the columns needed for the join.
df_reviews_trunc = df_reviews.loc[:, ['review_id', 'user_id', 'business_id', 'stars']]
df_user_trunc = df_user.loc[:, ['user_id', 'review_count']]
df_business_trunc = df_business.loc[:, ['business_id', 'name', 'city', 'state', 'review_count']]

# Inner-join reviews to the kept users, then to the kept businesses, and
# discard everything except the user id, business id and star rating.
reviews_with_users = df_reviews_trunc.merge(df_user_trunc, how='inner', on='user_id')
df_merge = reviews_with_users.merge(df_business_trunc, how='inner', on='business_id')
df_merge = df_merge.drop(
    ['review_id', 'review_count_x', 'name', 'city', 'state', 'review_count_y'],
    axis=1,
)

# Keep users with more than 20 reviews in the merged data, then businesses
# with more than 10 of the remaining reviews.
# In tests this yielded the most consistent results.
reviews_per_user = df_merge.groupby('user_id')['user_id'].transform('size')
df_merge = df_merge[reviews_per_user > 20]
reviews_per_business = df_merge.groupby('business_id')['business_id'].transform('size')
df_merge = df_merge[reviews_per_business > 10]

# Surprise needs a Reader carrying the rating scale, and the frame's columns
# in the order: user id, item id, rating.
rating_columns = ['user_id', 'business_id', 'stars']
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_merge[rating_columns], reader)

In [7]:
def get_top_n(predictions, n=10):
    '''Group predictions by user and keep each user's n best-rated items.

    Args:
        predictions(list of Prediction objects): Predictions as returned by
            the test method of an algorithm. Each one unpacks into five
            fields, of which the user id, item id and estimated rating
            are used.
        n(int): How many recommendations to keep per user. Default is 10.

    Returns:
    A defaultdict(list) where keys are user (raw) ids and values are lists of
        tuples [(raw item id, rating estimation), ...] ordered from highest
        to lowest estimate and holding at most n entries.
    '''

    # Collect every (item, estimate) pair under the user it was predicted for.
    per_user = defaultdict(list)
    for user_id, item_id, _actual, estimate, _details in predictions:
        per_user[user_id].append((item_id, estimate))

    # Rank each user's candidates by estimated rating and keep the n best.
    for user_id in list(per_user):
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user


# First train an SVD algorithm on every rating in the filtered Yelp dataset
# (no hold-out split: the goal here is recommendations, not evaluation).
trainset = data.build_full_trainset()

# Seed the SVD initialisation so the recommendations below are reproducible;
# 9 is the seed this notebook already uses for its train/test split.
# (A user-based cosine KNNBasic was tried here previously.)
algo = SVD(random_state=9)
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=3)

# Create a dataframe with the top 3 recommendations per user: one row per user
# holding the business_id and estimated rating of each recommendation.
ordinals = ['1st', '2nd', '3rd']
columns = ['user_id']
for ordinal in ordinals:
    columns += ['%s Recommendation' % ordinal, '%s Est Rating' % ordinal]

# Build the rows directly from the (business_id, estimate) tuples. zip() stops
# at the shorter sequence, so a user with fewer than 3 candidates simply gets
# missing values in the unused columns instead of breaking the construction.
rows = []
for uid, recos in top_n.items():
    row = {'user_id': uid}
    for ordinal, (iid, est) in zip(ordinals, recos):
        row['%s Recommendation' % ordinal] = iid
        row['%s Est Rating' % ordinal] = est
    rows.append(row)

# Passing columns explicitly fixes the column order regardless of pandas version.
top_3df = pd.DataFrame(rows, columns=columns)
top_3df.head()

Unnamed: 0,user_id,1st Recommendation,1st Est Rating,2nd Recommendation,2nd Est Rating,3rd Recommendation,3rd Est Rating
0,0uNxhZAFbkalQImzJ6UDUA,ZumOnWbstgsIE6bJlxw0_Q,4.293893,nT16Y6AsJDwEpUB1JICKzg,4.197741,nqTvE7ivdU23oUWdI01tOA,4.076923
1,-od707p4FHGul0gte29AoQ,SGP1jf6k7spXkgwBlhiUVw,4.483974,nT16Y6AsJDwEpUB1JICKzg,4.316417,nqTvE7ivdU23oUWdI01tOA,4.302581
2,VHc1yqBhsE-l3JQFXuWydQ,SGP1jf6k7spXkgwBlhiUVw,4.899382,nT16Y6AsJDwEpUB1JICKzg,4.859276,nqTvE7ivdU23oUWdI01tOA,4.565736
3,NzlqN1Ca9SW5z780thoiAg,nT16Y6AsJDwEpUB1JICKzg,4.588173,SGP1jf6k7spXkgwBlhiUVw,4.424797,pSMK_FtULKiU-iuh7SMKwg,4.358573
4,z6gseuVl0cR7tRLQa_DXuQ,SGP1jf6k7spXkgwBlhiUVw,4.188522,nT16Y6AsJDwEpUB1JICKzg,4.137975,7oEKIG7d1ttPRejppZ3WIA,4.07466
