## Capstone - Restaurant Recommender System

## This is a recommender system for Melbourne cafes/restaurants.


The Data:
- Source: Zomato
- URL: https://www.zomato.com/melbourne/great-food-no-bull
- Best of Melbourne Collection
- Retrieved 250 restaurants, and a max of 200 reviews per restaurant

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error

from geopy.geocoders import Nominatim
from geopy.distance import geodesic

from math import sqrt
from datetime import date
from time import sleep

from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import requests
import re

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style('whitegrid')

### Scrape business profiles

In [None]:
# get the url's of the businesses featured in 'best of Melbourne' collection 
def get_business_urls(html):
    """Extract the restaurant profile URLs from a collection page.

    Parameters
    ----------
    html : str
        Page source of the collection listing page.

    Returns
    -------
    list of str, or float
        The ``href`` values matched by the listing XPath (may be empty).
        ``np.nan`` is returned when the page could not be parsed or queried.
    """
    listing_xpath = '//div[@class="row col-res-list collection_listings_container"]/div/div/div/a/@href'

    try:
        url_list = Selector(text=html).xpath(listing_xpath).extract()
    except Exception as error:
        # best-effort contract kept from the original: report and return NaN
        # (np.nan rather than np.NaN, which no longer exists in NumPy 2.x)
        print('error extracting business urls: ', error)
        return np.nan

    print(url_list)
    return url_list

In [None]:
# get the reviews for a specific business
def get_reviews_per_url(html):
    """Parse every review currently rendered on a restaurant's reviews page.

    Parameters
    ----------
    html : str
        Page source of the reviews page.

    Returns
    -------
    tuple
        ``(reviews_per_business, num_reviews)``.
        ``reviews_per_business`` holds one row per review:
        ``[review_id, restaurant_id, user_id, user_name, user_review_count,
        user_followers_count, review_date, star_rating, textual_review]``.
        ``num_reviews`` is the number of review blocks found in ``html``.
        Any field that cannot be extracted is ``np.nan``.
    """
    review_root = '//div[@id="reviews-container"]/*//div[@data-snippet="restaurant-review"]'
    author_root = '//div[contains(@class,"content col-l-11")]'
    author_link = author_root + '/*/a[contains(@href,"https://www.zomato.com")]'
    author_stats = author_root + '/span[contains(@class,"grey-text")]/text()'
    rating_label = '//div[contains(@class,"rev-text")]/div[contains(@class,"ttupper fs12px")]/@aria-label'
    review_text = '//div[contains(@class,"rev-text ")]/text()'

    def first_match(selector, xpath):
        # first node matched by `xpath`; raises IndexError when nothing matches
        return selector.xpath(xpath).extract()[0]

    def value_or_nan(getter):
        # run one field extraction; a field that cannot be read becomes NaN
        try:
            return getter()
        except Exception:
            return np.nan

    page = Selector(text=html)
    reviews_per_business = []

    # review ids and the restaurant id sit on the outer review container, outside
    # the per-review segment that is parsed in the loop below
    try:
        review_id_list = page.xpath(review_root + '/@data-review_id').extract()
        print('# of review ids: ', len(review_id_list))
    except Exception as error:
        review_id_list = []
        print('error on review id list: ', error)

    try:
        restaurant_id = first_match(page, review_root + '/@data-res_id')
    except Exception as error:
        restaurant_id = np.nan
        print('error on restaurant_id: ', error)

    # get data for displayed reviews
    visible_reviews_full = page.xpath(review_root + '/div[contains(@class,"ui segment")]').extract()
    num_reviews = len(visible_reviews_full)

    # iterate through the displayed reviews
    for review_index, review in enumerate(visible_reviews_full):

        # parse the review segment once and reuse it for every field
        review_selector = Selector(text=review)

        # review id (matched to the review by position)
        try:
            review_id = review_id_list[review_index]
        except IndexError:
            # without this the row would reuse the previous review's id,
            # or fail on the very first review
            review_id = np.nan
            print('no review id at position: ', review_index)

        user_id = value_or_nan(lambda: first_match(review_selector, author_link + '/@data-entity_id'))
        user_name = value_or_nan(lambda: first_match(review_selector, author_link + '/text()').strip())

        # the stats text is split on ',' : first part -> review count, second part -> followers count
        user_review_count = value_or_nan(
            lambda: first_match(review_selector, author_stats).strip().split(',')[0].strip().split(' ')[0])
        user_followers_count = value_or_nan(
            lambda: first_match(review_selector, author_stats).strip().split(',')[1].strip().split(' ')[0])

        review_date = value_or_nan(lambda: first_match(review_selector, '//time/@datetime'))

        # second word of the aria-label holds the rating
        star_rating = value_or_nan(lambda: first_match(review_selector, rating_label).split(' ')[1])

        # textual review: skip the first text node, clean the rest and join into one string
        try:
            text_nodes = review_selector.xpath(review_text).extract()[1:]
            if text_nodes:
                text_nodes[0] = text_nodes[0].replace('\xa0\n', '')
                textual_review = ''.join(node.strip() for node in text_nodes if node.strip())
            else:
                # no text nodes at all: mark as missing like the other fields
                textual_review = np.nan
        except Exception as error:
            textual_review = np.nan
            print('error in cleaning textual_review: ', error)

        # append scraped review to list
        reviews_per_business.append([review_id, restaurant_id, user_id, user_name, user_review_count,
                                     user_followers_count, review_date, star_rating, textual_review])

    return reviews_per_business, num_reviews

In [None]:
# start of code for getting the business url's in the 'best of Melbourne' collection

# open the 'best of Melbourne' collection page
main_url = 'https://www.zomato.com/melbourne/great-food-no-bull'
driver = webdriver.Chrome(executable_path="chromedriver.exe")
try:
    try:
        sleep(5)
        driver.get(main_url)
    except Exception:
        print('error in loading driver for business url: ', main_url)
    assert "Zomato" in driver.title

    # give the page time to render before reading it
    sleep(5)
    # set HTML to the page source currently held by the browser
    html = driver.page_source

    url_list = get_business_urls(html)
finally:
    # always shut the browser session down, even when the checks above fail
    driver.quit()

In [None]:
# start of code for getting the reviews for each business featured in 'best of Melbourne' collection

# target count of reviews to extract
review_target_count = 200
# number of businesses whose reviews are written to each csv file
businesses_per_csv = 20
all_business_reviews = []
num_reviews_list = []
business_with_zero_reviews = []

monitor_count = 0
total_count = len(url_list)

# NOTE: switch_from_Popular_to_All_Reviews and display_more_reviews are defined in
# cells further down this notebook; run those cells before this one

# iterate through the url's of the businesses to extract the reviews
for business_url in url_list:

    # open business reviews url
    driver = webdriver.Chrome(executable_path="chromedriver.exe")
    try:
        try:
            driver.get(business_url + '/reviews')
        except Exception:
            print('error in loading driver for business url: ', business_url)

        assert "Zomato" in driver.title

        # if there's a Popular (Reviews) section, change and click on All Reviews section
        driver = switch_from_Popular_to_All_Reviews(driver, business_url + '/reviews')

        # display more reviews (in preparation for scraping the target # of reviews)
        driver, visible_review_count = display_more_reviews(driver, review_target_count)

        # page source with all the reviews that were made visible (for this particular business)
        html = driver.page_source
    finally:
        # shut the browser session down for this business, even if a step above failed
        driver.quit()

    # if review list is not empty
    if visible_review_count > 0:
        # scrape the displayed reviews
        reviews_per_business, num_reviews = get_reviews_per_url(html)

        num_reviews_list.append(num_reviews)

        # add reviews per business to overall list of business reviews
        all_business_reviews += reviews_per_business

    # businesses with zero reviews
    else:
        business_with_zero_reviews.append(business_url)

    print('url: ', business_url)
    monitor_count += 1
    print(str(monitor_count), ' of ', str(total_count), " businesses completed...")

    # save reviews to a csv file after every full batch of businesses, and also after the
    # very last business so that a final, partially filled batch is not lost
    is_batch_full = (monitor_count % businesses_per_csv) == 0
    is_last_business = monitor_count == total_count
    if is_batch_full or is_last_business:
        # skip batches without any review instead of writing an empty csv
        if all_business_reviews:
            batch_start = ((monitor_count - 1) // businesses_per_csv) * businesses_per_csv
            reviews_20 = pd.DataFrame(all_business_reviews)
            reviews_20.to_csv('./datasets/Reviews - Best of Melbourne ' + str(batch_start) + ' to ' + str(monitor_count-1) + '.csv')
        # clear business reviews list to make way for the next batch of businesses
        all_business_reviews = []

print('business with zero reviews: ', business_with_zero_reviews)
print('number of reviews: ', num_reviews_list)

### Scrape Business Reviews

In [None]:
# display more reviews by clicking on 'Load More' element
def display_more_reviews(driver, review_target_count):
    """Click 'Load More' until enough reviews are rendered on the page.

    Parameters
    ----------
    driver : selenium webdriver
        Browser session already showing a restaurant's reviews page.
    review_target_count : int
        Stop loading once at least this many reviews are visible.

    Returns
    -------
    tuple
        ``(driver, visible_review_count)``; the count is ``0`` when the page
        could not be parsed at all.
    """
    review_xpath = '//div[@id="reviews-container"]/*//div[@data-snippet="restaurant-review"]/div[contains(@class,"ui segment")]'
    load_more_xpath = '//div[@class="load-more bold ttupper tac cursor-pointer fontsize2"]/span[contains(@class,"zred")]'

    def count_visible_reviews():
        # number of review segments in the page as currently rendered
        return len(Selector(text=driver.page_source).xpath(review_xpath).extract())

    # defined up front so the return below works even if the first parse fails
    visible_review_count = 0

    try:
        visible_review_count = count_visible_reviews()

        # open more reviews by clicking on Load More until the desired # of reviews is displayed
        while visible_review_count < review_target_count:
            try:
                # find_element('xpath', ...) is available in Selenium 3 and 4;
                # the find_element_by_xpath shortcut no longer exists in Selenium 4.3+
                driver.find_element('xpath', load_more_xpath).click()
                sleep(5)
            except Exception:
                # no (clickable) Load More element left: everything available is displayed
                break

            # update the count of visible reviews (for this particular business)
            visible_review_count = count_visible_reviews()
    except Exception as error:
        print('ERROR IN DISPLAY_MORE_REVIEWS: ', error)

    return driver, visible_review_count

In [None]:
# Switch from Popular Reviews to All Reviews, if applicable
def switch_from_Popular_to_All_Reviews(driver, url):
    """Switch the reviews page from the 'Popular' tab to 'All Reviews', if present.

    Parameters
    ----------
    driver : selenium webdriver
        Browser session already showing a restaurant's reviews page.
    url : str
        Address of that page; only used in the progress messages.

    Returns
    -------
    selenium webdriver
        The same ``driver`` that was passed in.
    """
    popular_tab_xpath = '//div[@id="selectors"]/a[contains(@data-sort,"reviews-top")]/text()'
    all_reviews_xpath = '//div[@id="selectors"]/a[contains(@data-sort,"reviews-dd")]'

    # see if Popular Reviews tab exists
    try:
        has_popular_tab = bool(Selector(text=driver.page_source).xpath(popular_tab_xpath).extract())
    except Exception:
        has_popular_tab = False

    if not has_popular_tab:
        print('No Popular Reviews section for: ' + url)
        return driver

    try:
        # access and click on All Reviews element - to switch from Popular Reviews list
        # (find_element('xpath', ...) works in Selenium 3 and 4)
        all_reviews_tab = driver.find_element('xpath', all_reviews_xpath)
        sleep(3)
        all_reviews_tab.click()
        sleep(1)
        print('Popular Reviews section exists for: ' + url)
    except Exception:
        # print (not IPython's display) so the message also works outside a notebook
        print('All Reviews element not found, or click not successful for:' + url)

    return driver

In [None]:
# combine all subset csv's for the 'best of Melbourne' reviews into 1 csv, 'reviews - best of Melbourne.csv'
import os

# the per-batch csv's are written to ./datasets by the scraping loop, so look for them there
datasets_dir = './datasets'

# create list of all review csv's (the combined output file starts with a lowercase
# 'reviews', so it is not picked up again on a re-run)
csv_list = sorted(
    os.path.join(datasets_dir, file_name)
    for file_name in os.listdir(datasets_dir)
    if file_name.startswith("Reviews - Best of Melbourne")
)
if not csv_list:
    raise FileNotFoundError('no "Reviews - Best of Melbourne" csv files found in ' + datasets_dir)

# load csv's on to a single dataframe
reviews_df = pd.concat([pd.read_csv(csv_path) for csv_path in csv_list])

# save entire dataframe to a single csv for backup
# (the first column read back from each csv is that csv's own row index)
reviews_df.columns=['per_csv_id','review_id', 'restaurant_id', 'user_id', 'user_name', 'user_review_count', 'user_followers_count', 'review_date', 'star_rating', 'textual_review']
reviews_df.to_csv('./datasets/reviews - best of Melbourne.csv')
reviews_df.head()

### Data Preprocessing / EDA

In [None]:
# Extract restaurant details through Zomato API 
# This has to be done via Zomato API to include the latitude/longitude details of the businesses in 'best of Melbourne' collection

#api_key = <zomato-api-here>

import pickle
import requests
import pandas as pd
import json
from time import sleep

# using restaurant id's in the extracted reviews, retrieve the restaurant details via Zomato API
def extract_businesses_details(restaurant_id_list):
    """Fetch the details of each restaurant from the Zomato API and pickle them.

    Requires a module-level ``api_key`` to be defined before the call.

    Parameters
    ----------
    restaurant_id_list : iterable
        Restaurant ids to look up, one API request per id.

    Returns
    -------
    pandas.DataFrame
        One row per API response (the decoded JSON body), in request order.
        The same frame is written to './datasets/Zomato - Best of Melbourne.pkl'.
    """
    # set headers
    headers = {
        'Accept': 'application/json',
        'user-key': api_key
    }

    # collect the responses first and build the dataframe once at the end,
    # instead of re-copying a growing dataframe on every request
    records = []
    for restaurant_id in restaurant_id_list:
        response = requests.get('https://developers.zomato.com/api/v2.1/restaurant?res_id=' + str(restaurant_id), headers=headers)
        records.append(response.json())

    businesses_details_df = pd.DataFrame(records)

    # pickle dataset; the with-block closes the file even if pickling fails
    with open('./datasets/Zomato - Best of Melbourne.pkl', "wb") as save_pickle:
        pickle.dump(businesses_details_df, save_pickle)

    return businesses_details_df

In [None]:
# Load onto a dataframe the 'best of Melbourne' business profiles
# restaurant_id is NaN for reviews whose id could not be scraped; drop those so that
# no API request is made for a 'nan' id
restaurant_id_list = reviews_df.restaurant_id.dropna().unique()
businesses_df = extract_businesses_details(restaurant_id_list)

### Load Businesses and Reviews dataframes (post-scraping)

In [None]:
# load businesses and reviews onto their respective dataframes
# businesses: the pickle written by extract_businesses_details
# NOTE: only unpickle files you created yourself - unpickling untrusted data can run arbitrary code
businesses_df = pd.read_pickle('./datasets/Zomato - Best of Melbourne.pkl')
# reviews: the cleaned csv of the scraped reviews
# NOTE(review): no cell shown here writes the "- clean" file; presumably it is a cleaned copy of
# 'reviews - best of Melbourne.csv' - confirm how it is produced
reviews_df = pd.read_csv('./datasets/reviews - best of Melbourne - clean.csv')

In [None]:
# drop the first (unnamed) column, keeping every column from the second one onwards
# NOTE(review): presumably this is the row index that `to_csv` wrote into the file - confirm against the csv
reviews_df = reviews_df.iloc[:, 1:]

In [None]:
# rename id to restaurant_id, and convert it to integer
businesses_df.rename(columns={'id': 'restaurant_id'}, inplace=True)
# assign the whole column (rather than `.loc[:, ...] = `, which keeps the old dtype on
# pandas >= 2.0) so that the column really ends up with an integer dtype
businesses_df['restaurant_id'] = businesses_df['restaurant_id'].astype('int64')

In [None]:
def sns_plot(plot_type='scatter', x=None, y=None, data=None, xlabel='_', ylabel='_', title='_', bins=20, figsize=(10,5)):
    """Draw a labelled distribution or scatter plot on a new figure.

    plot_type 'dist' plots the distribution of `x` in `bins` bins; 'scatter'
    plots `x` against `y`. Any other value leaves the axes empty but still
    labelled. `data` is accepted but not used. Returns None.
    """
    figure = plt.figure(figsize=figsize)
    axes = figure.gca()

    # draw the requested kind of plot on the axes that was just created
    if plot_type == 'scatter':
        plt.scatter(x=x, y=y, alpha=0.6, edgecolors='w')
    elif plot_type == 'dist':
        sns.distplot(x, bins=bins, ax=axes)

    # label after plotting so these texts win over any defaults set while plotting
    axes.set_title(title, fontsize=20)
    axes.set_xlabel(xlabel, fontsize=18)
    axes.set_ylabel(ylabel, fontsize=19)
    return

In [None]:
# total # of unique restaurants, users, and reviews in the dataset
pd.DataFrame([[len(businesses_df.restaurant_id.unique())], [len(reviews_df.user_id.unique())], [reviews_df.review_id.count()]], columns=['Total'], index = ['# of Restaurants', '# of Users', '# of Reviews']).T

In [None]:
# list businesses that have duplicates in the dataset
businesses_df[businesses_df.restaurant_id.isin(businesses_df.restaurant_id.value_counts()[businesses_df.restaurant_id.value_counts()>1].index)]

In [None]:
# plot distribution for the # of reviews per restaurant
sns_plot(plot_type='dist', x=reviews_df.restaurant_id.value_counts(), xlabel='# of Reviews', ylabel='Frequency', title='# of Reviews Per Restaurant', bins=50)

In [None]:
# plot distribution for the # of reviews given per restaurant, when the # of reviews is less than or equal to 50
sns_plot(plot_type='dist', x=reviews_df.restaurant_id.value_counts()[reviews_df.restaurant_id.value_counts()<=50], xlabel='# of Reviews', ylabel='Frequency', title='# of Reviews Per Restaurant\nReviews Per Restaurant <= 50')

In [None]:
#reviews_df.groupby('restaurant_id')['restaurant_id'].count().sort_values()
reviews_df[reviews_df.restaurant_id==16585728]

In [None]:
# no. of reviews per restaurant
review_count_per_restaurant = reviews_df.groupby('restaurant_id')['restaurant_id'].count()
review_count_per_restaurant.describe()

In [None]:
# quantile range of review count per restaurant 
review_count_per_restaurant.quantile(np.arange(0,1,.05))

In [None]:
# no. of restaurants with exactly 200 reviews, i.e. those that reached the scraping target of 200 reviews
(review_count_per_restaurant==200).sum()

In [None]:
# plot distribution for the # of reviews given per restaurant, when the # of reviews is greater than 50
sns_plot(plot_type='dist', x=reviews_df.restaurant_id.value_counts()[reviews_df.restaurant_id.value_counts()>50], xlabel='# of Reviews', ylabel='Frequency', title='# of Reviews Per Restaurant\nReviews Per Restaurant > 50')

In [None]:
reviews_df[reviews_df.star_rating.isnull()]

In [None]:
# set review_date column to datetime
# assign the whole column: on pandas >= 2.0 `.loc[:, 'review_date'] = ...` writes into the existing
# object column, which makes the `.dt` accessor used further below fail
reviews_df['review_date'] = pd.to_datetime(reviews_df['review_date'])

In [None]:
# drop the reviews with no star ratings - these reviews date back to the 2014-2015 period, thus, relevance-wise might be worth dropping
reviews_df.drop(index=reviews_df[reviews_df.star_rating.isnull()].index, inplace=True)

In [None]:
## drop the votes column - votes column as per research is not user-generated and instead derived out of an algorithm that's exclusive to Zomato. Thus, it's worth removing.
#businesses_df.drop(columns=['votes'],inplace=True)

In [None]:
businesses_df.info()

In [None]:
# change user_followers_count from NaN to zero
reviews_df.loc[reviews_df.user_followers_count.isnull(), 'user_followers_count'] = 0

In [None]:
# # change non-numeric values in price column to zero
# businesses_df.loc[businesses_df.price == '[]', 'price'] = '[0]'
# businesses_df.price = businesses_df.price.str.replace(r"[A$\'\[\]]",'').astype('int64')

In [None]:
# # change non-numeric values in ratings column to zero
# # first change non-numeric to NaN
# businesses_df.overall_rating = pd.to_numeric(businesses_df.overall_rating, errors='coerce')
# # then change NaN to zero
# businesses_df.overall_rating.fillna(value=0, inplace=True)

In [None]:
# tally no. of restaurants with 1 review , and with less than 5 reviews
review_count_per_restaurant = reviews_df.groupby('restaurant_id')['restaurant_id'].count()
pd.DataFrame([(review_count_per_restaurant==1).sum(), (review_count_per_restaurant<5).sum()], columns=['Total'], index=['Restaurants with only 1 review', 'with less than 5 reviews']).T

In [None]:
# after above data preprocessing, show updated # of unique restaurants, users, and reviews in the dataset
pd.DataFrame([[len(businesses_df.restaurant_id.unique())], [len(reviews_df.user_id.unique())], [reviews_df.review_id.count()]], columns=['Total'], index = ['# of Restaurants', '# of Users', '# of Reviews']).T

In [None]:
# # save dataframe as csv for backup reflecting all changes to this point
# businesses_df.to_pickle('./datasets/Zomato - All Cafes in Melbourne - pass 1.pkl')
# # save dataframe as for backup reflecting all changes to this point
# reviews_df.to_pickle('./datasets/reviews - clean - pass 1.pkl')

### ---------------------
#### Load from Pickle

In [None]:
# businesses_df = pd.DataFrame()
# reviews_df = pd.DataFrame()
# # load the csv of the scraped businesses on to a dataframe
# businesses_df = pd.read_pickle('./datasets/Zomato - All Cafes in Melbourne - pass 1.pkl')
# # load the csv of the scraped reviews on to a dataframe
# reviews_df = pd.read_pickle('./datasets/reviews - clean - pass 1.pkl')

### ---------------------

In [None]:
# plot distribution for the # of reviews given per user
sns_plot(plot_type='dist', x=reviews_df.user_id.value_counts(), xlabel='# of Reviews', ylabel='Frequency', title='# of Reviews Per User', bins=200)

In [None]:
# plot distribution for the # of reviews given per user, when the # of reviews is less than or equal to 50
sns_plot(plot_type='dist', x=reviews_df.user_id.value_counts()[reviews_df.user_id.value_counts()<=50], xlabel='# of Reviews', ylabel='Frequency', title='# of Reviews Per User\nReviews Per User <= 50')

In [None]:
# plot distribution for the # of reviews given per user, when the # of reviews is greater than 50
sns_plot(plot_type='dist', x=reviews_df.user_id.value_counts()[reviews_df.user_id.value_counts()>50], xlabel='# of Reviews', ylabel='Frequency', title='# of Reviews Per User\nReviews Per User > 50')

In [None]:
# filter reviews dataset so that it only shows the 2018 reviews
reviews_df_2018 = reviews_df[reviews_df.review_date.dt.year == 2018]
reviews_df_2018 = reviews_df_2018[['restaurant_id','review_date', 'star_rating']]
reviews_df_2018.set_index('review_date',inplace=True)
reviews_df_2018.head()

In [None]:
businesses_df.name.values

In [None]:
# latest 2018 ratings for a given restaurant 
rest_id = 16578793 # Patricia Coffee Brewers
rest_name = businesses_df[businesses_df.restaurant_id==rest_id]['name'].values[0]
per_restaurant_ratings_timeseries = reviews_df_2018[reviews_df_2018.restaurant_id==rest_id]
per_restaurant_ratings_timeseries.index = per_restaurant_ratings_timeseries.index.date
# plot star_rating for a particular restaurant
sns_plot(plot_type='scatter', x=per_restaurant_ratings_timeseries.index, y=per_restaurant_ratings_timeseries.star_rating, 
         xlabel='Date', ylabel='Rating', title=rest_name + '\n' + 'Star Ratings for 2018', figsize=(15,5))
#per_restaurant_ratings_timeseries.sort_index().head(25)

In [None]:
# plot distribution for the star ratings for a restaurant in 2018
sns_plot(plot_type='dist', x=per_restaurant_ratings_timeseries.star_rating, xlabel='Star Ratings', ylabel='Frequency', title=rest_name + '\nStar Ratings Distribution for 2018',bins=10)

In [None]:
# latest 2018 ratings for a given restaurant 
rest_id = 16577492 # Chin Chin
rest_name = businesses_df[businesses_df.restaurant_id==rest_id]['name'].values[0]
per_restaurant_ratings_timeseries = reviews_df_2018[reviews_df_2018.restaurant_id==rest_id]
per_restaurant_ratings_timeseries.index = per_restaurant_ratings_timeseries.index.date
# plot star_rating for a particular restaurant
sns_plot(plot_type='scatter', x=per_restaurant_ratings_timeseries.index, y=per_restaurant_ratings_timeseries.star_rating, 
         xlabel='Date', ylabel='Rating', title=rest_name + '\n' + 'Star Ratings for 2018', figsize=(15,5))

In [None]:
# plot distribution for the star ratings for a restaurant in 2018
sns_plot(plot_type='dist', x=per_restaurant_ratings_timeseries.star_rating, xlabel='Star Ratings', ylabel='Frequency', title=rest_name + '\nStar Ratings Distribution for 2018',bins=10)

In [None]:
# checking correlation between a user's # of reviews IN the dataset and the user's average rating
review_count_per_user = reviews_df.groupby('user_id')['review_id'].count()
average_rating_per_user = reviews_df.groupby('user_id')['star_rating'].mean()
average_rating_review_count = pd.merge(pd.DataFrame(average_rating_per_user), pd.DataFrame(review_count_per_user), left_on='user_id', right_on='user_id')
sns_plot(plot_type='scatter', x=average_rating_review_count.review_id, y=average_rating_review_count.star_rating,
         xlabel='Total # of Reviews Per User', ylabel='Average Rating', title='User Review Count vs. Average Rating\nReview Count - total within Dataset', figsize=(15,5))

In [None]:
# checking correlation between a user's # of reviews (profiled as overall total in Zomato) and the user's average rating
average_rating_per_user = pd.DataFrame(reviews_df.groupby('user_id')['star_rating'].mean())
# highest review count recorded for each user; one grouped pass instead of filtering
# the whole reviews dataframe once per user (the result aligns on the user_id index)
average_rating_per_user['total_review_count'] = reviews_df.groupby('user_id')['user_review_count'].max()
sns_plot(plot_type='scatter', x=average_rating_per_user.total_review_count, y=average_rating_per_user.star_rating,
         xlabel='Total # of Reviews Per User', ylabel='Average Rating', title="User Review Count vs. Average Rating\nReview Count - User's Total in Zomato", figsize=(15,5))

In [None]:
# plot distribution for the average star ratings of all restaurants in the dataset
rating_mean_df = reviews_df.groupby('restaurant_id')['star_rating'].mean()
sns_plot(plot_type='dist', x=rating_mean_df, xlabel='Average Star Ratings per Restaurant', ylabel='Frequency', title="Average Star Ratings Distribution\n All Restaurants in 'Best of Melbourne' collection",bins=10)


In [None]:
# plot distribution for the average followers count of all users in the dataset
rating_mean_df = reviews_df.groupby('user_id')['user_followers_count'].mean()
sns_plot(plot_type='dist', x=rating_mean_df, xlabel='# of Followers', ylabel='Frequency', title="# of Followers Per User",bins=10)

In [None]:
# plot distribution for the average followers count, restricted to users with at least 1000 followers
# NOTE(review): the filter is `>= 1000` while the plot title reads "> 1000" - confirm which one is intended
followers_mean_df = reviews_df[reviews_df.user_followers_count>=1000].groupby('user_id')['user_followers_count'].mean()
sns_plot(plot_type='dist', x=followers_mean_df, xlabel='# of Followers', ylabel='Frequency', title="# of Followers > 1000 Per User",bins=10)

In [None]:
reviews_df.info()

In [None]:
# heatmap shows there is virtually no correlation between average rating per user and total review count per user, even when setting # of reviews greater than 650
sns.heatmap(average_rating_per_user[average_rating_per_user.total_review_count > 650].corr(), annot=True, cmap="coolwarm",fmt='.2f',
                 linewidths=.05)

In [None]:
# checking correlation between a user's # of followers (profiled as overall total in Zomato) and the user's average rating
average_rating_follower_per_user = pd.DataFrame(reviews_df.groupby('user_id')['star_rating'].mean())
# highest follower count recorded for each user; one grouped pass instead of filtering
# the whole reviews dataframe once per user (the result aligns on the user_id index)
average_rating_follower_per_user['total_follower_count'] = reviews_df.groupby('user_id')['user_followers_count'].max()
sns_plot(plot_type='scatter', x=average_rating_follower_per_user.total_follower_count, y=average_rating_follower_per_user.star_rating,
         xlabel='Total # of Followers Per User', ylabel='Average Rating', title="User Follower Count vs. Average Rating\nFollower Count - User's Total in Zomato", figsize=(15,5))

In [None]:
# heatmap shows a slight correlation between star rating and total follower count of a user, even when # of followers is greater than 1000
# testing the effect to star ratings provided when the # of followers is high - very slight correlation exists
sns.heatmap(average_rating_follower_per_user[average_rating_follower_per_user.total_follower_count > 1000].corr(), annot=True, cmap="coolwarm",fmt='.2f',
                 linewidths=.05)

In [None]:
# heatmap below shows good correlation between user_review_count and user_follower_count;
# it also shows no correlation between star_rating and the each of the former 2 features
sns.heatmap(reviews_df[['user_review_count','user_followers_count','star_rating']].corr(), annot=True, cmap="coolwarm",fmt='.2f',
                 linewidths=.05)

In [None]:
# combine 2 dataframes by column, retaining the index of the left dataframe
def append_dataframes_by_column_retain_left_index(df_left, df_right):
    """Place the columns of ``df_right`` next to ``df_left``, row by row.

    Rows are matched by position: the first row of ``df_right`` goes with the
    first row of ``df_left`` and so on, and the result carries the index of
    ``df_left``. Neither input frame is modified.

    Parameters
    ----------
    df_left, df_right : pandas.DataFrame
        Frames with the same number of rows.

    Returns
    -------
    pandas.DataFrame
        ``df_left`` columns followed by ``df_right`` columns.

    Raises
    ------
    ValueError
        If the two frames differ in length.
    """
    # re-label a copy of the right frame so the caller's frame keeps its own index
    realigned_right = df_right.set_index(df_left.index)
    # concatenate the 2 dataframes by the columns
    return pd.concat([df_left, realigned_right], axis=1)

In [None]:
def dummify_columns(df, columns_to_dummify_list):
    """Replace categorical columns with 0/1 dummy columns (k-1 encoding).

    Every listed column is removed and replaced by one integer column per
    class, named ``<column>_<class>``; the dummy of the last class (in sorted
    order) is left out. The dummy columns are appended at the right-hand side
    and the index of ``df`` is retained. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    columns_to_dummify_list : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns replaced by their dummies.
    """
    # for every column to dummify
    for column in columns_to_dummify_list:

        # one 0/1 column per class; get_dummies keeps the row index of df and also
        # yields one column per class for features with only two classes
        df_dummies = pd.get_dummies(df[column], prefix=column, prefix_sep='_', dtype=int)
        # drop last column for dummies (to dummify - 1)
        df_dummies = df_dummies.iloc[:, :-1]
        # drop the original column (the one dummies were based on) and append the dummies;
        # drop/concat build new frames, so the caller's dataframe is left untouched
        df = pd.concat([df.drop(columns=column), df_dummies], axis=1)

    return df

### Slicing the dataset
Trimming the dataset down may be necessary to increase the relevance of the results generated from a recommender system. 

For example, keeping only the restaurants that have at least 20 reviews (and disregarding those with fewer than 20) also reduces the number of users and reviews left in the dataset, but it allows the recommender to provide more useful recommendations. These recommendations are ideally calculated with user-item and item-item collaborative filtering methods.

In [None]:
# assess the impact of slicing the dataset by analyzing the resulting number of restaurants, users, and ratings
def assess_slice_dataset(minimum_count, slice_on_feature, reviews_df):
    """Filter the reviews by a minimum review count and report what remains.

    Keeps only the reviews whose ``slice_on_feature`` value (a restaurant or
    a user) occurs at least ``minimum_count`` times in ``reviews_df``, then
    displays the remaining number of restaurants, users and reviews, and the
    ratio of reviews to the size of the restaurant x user matrix.

    Parameters
    ----------
    minimum_count : int
        Minimum number of reviews a restaurant/user needs in order to be kept.
    slice_on_feature : str
        Either 'restaurant_id' or 'user_id'.
    reviews_df : pandas.DataFrame
        Reviews with (at least) the columns restaurant_id, user_id and review_id.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        The filtered reviews.

    Raises
    ------
    ValueError
        If ``slice_on_feature`` is not one of the two supported columns.
    """
    if slice_on_feature not in ('restaurant_id', 'user_id'):
        raise ValueError("slice_on_feature must be 'restaurant_id' or 'user_id', got: " + repr(slice_on_feature))

    # get the count of reviews per restaurant / per user
    review_count_per_feature = reviews_df[slice_on_feature].value_counts()
    # ids that satisfy the minimum # of reviews requirement
    feature_ids = review_count_per_feature[review_count_per_feature >= minimum_count].index

    # apply filter to dataset (boolean indexing returns a new dataframe)
    reviews_sliced = reviews_df[reviews_df[slice_on_feature].isin(feature_ids)]

    # every id that passed the filter still occurs in the slice, so the counts can be taken
    # from the slice for both features
    restaurant_count = reviews_sliced.restaurant_id.nunique()
    user_count = reviews_sliced.user_id.nunique()
    # count # of reviews that are left
    review_count = reviews_sliced.review_id.count()

    # share of the restaurant x user matrix that is filled with a review
    matrix_size = restaurant_count * user_count
    matrix_ratio = review_count / matrix_size if matrix_size else float('nan')

    display('Sliced by ' + slice_on_feature + '; minimum count at ' + str(minimum_count))
    display('Remaining # of restaurants: ' + str(restaurant_count))
    display('Remaining # of users: ' + str(user_count))
    display('Remaining # of reviews: ' + str(review_count))
    display('Reviews to matrix size ratio: ' + str(matrix_ratio))
    display('-' * 20)
    return reviews_sliced

In [None]:
# slice the dataset with these criteria

# each step is applied to the output of the previous step
# 1. slice dataset to filter to restaurants with 25 or more reviews
reviews_sliced = assess_slice_dataset(25, 'restaurant_id', reviews_df)
# 2. slice dataset to filter to users with 10 or more reviews in the remaining dataset
reviews_sliced = assess_slice_dataset(10, 'user_id', reviews_sliced)
# 3. slice again to restaurants that still have 5 or more reviews after the user filter
reviews_sliced = assess_slice_dataset(5, 'restaurant_id', reviews_sliced)

In [None]:
# slice the dataset with these criteria: users first, then restaurants, then users again
# (this overwrites the `reviews_sliced` produced by the previous cell)

#reviews_sliced = assess_slice_dataset(25, 'restaurant_id', reviews_df)
# 1. users with 10 or more reviews in the full dataset
reviews_sliced = assess_slice_dataset(10, 'user_id', reviews_df)
# 2. restaurants with 10 or more reviews left after the user filter
reviews_sliced = assess_slice_dataset(10, 'restaurant_id', reviews_sliced)
# 3. users that still have 5 or more reviews after the restaurant filter
reviews_sliced = assess_slice_dataset(5, 'user_id', reviews_sliced)

### Train-Dev-Test split the dataset

In [None]:
# split into train and test sets (20% of all reviews held out as test)
# NOTE(review): the split is taken from reviews_df, not from the reviews_sliced
# frame produced by the slicing cells above - confirm this is intended
train_reviews_df, test_reviews_df = train_test_split(reviews_df, test_size=0.2, random_state=42)

# split train further into train and dev sets (20% of the remaining 80%, i.e. 64% / 16% / 20% overall)
train_reviews_df, dev_reviews_df = train_test_split(train_reviews_df, test_size=0.2, random_state=42)

# display shapes
print('Full data : ',  reviews_df.shape)
print('Train set : ', train_reviews_df.shape)
print('Dev set : ',  dev_reviews_df.shape)
print('Test set : ', test_reviews_df.shape)

In [None]:
# total # of unique restaurants, users, and reviews before dataset split
stat_before_split = pd.DataFrame([[len(reviews_df.restaurant_id.unique())], [len(reviews_df.user_id.unique())], [reviews_df.review_id.count()]], columns=['Total before Split'], index = ['# of Restaurants', '# of Users', '# of Reviews']).T
display(stat_before_split.head())
# total # of unique restaurants, users, and reviews for train dataset
stat_train = pd.DataFrame([[len(train_reviews_df.restaurant_id.unique())], [len(train_reviews_df.user_id.unique())], [train_reviews_df.review_id.count()]], columns=['Total for Train set'], index = ['# of Restaurants', '# of Users', '# of Reviews']).T
display(stat_train.head())
# total # of unique restaurants, users, and reviews for dev dataset
stat_dev = pd.DataFrame([[len(dev_reviews_df.restaurant_id.unique())], [len(dev_reviews_df.user_id.unique())], [dev_reviews_df.review_id.count()]], columns=['Total for Dev set'], index = ['# of Restaurants', '# of Users', '# of Reviews']).T
display(stat_dev.head())
# total # of unique restaurants, users, and reviews for test dataset
stat_test = pd.DataFrame([[len(test_reviews_df.restaurant_id.unique())], [len(test_reviews_df.user_id.unique())], [test_reviews_df.review_id.count()]], columns=['Total for Test set'], index = ['# of Restaurants', '# of Users', '# of Reviews']).T
display(stat_test.head())

In [None]:
# check sparsity level of the reviews dataset: the share of (user, restaurant)
# cells of the user-item matrix that hold no rating
n_users = reviews_df.user_id.nunique()
n_restaurants = reviews_df.restaurant_id.nunique()
# a user reviewing the same restaurant more than once still fills only one cell
n_rated_pairs = len(reviews_df[['user_id', 'restaurant_id']].drop_duplicates())
sparsity = round(1.0 - n_rated_pairs / float(n_users * n_restaurants), 3)
# round again after scaling so the printed percentage has no floating point tail
print("The sparsity level of Zomato 'Best of Melbourne' main dataset is " + str(round(sparsity * 100, 1)) + '%')

In [None]:
# set matrix (user-item-rating) matrix for the main reviews dataframe
main_matrix = reviews_df.pivot_table(values='star_rating', columns='restaurant_id', index='user_id').fillna(2.5)
main_matrix.head()

In [None]:
# build one user x restaurant rating matrix per dataset; cells without a rating
# are filled with the 2.5 placeholder
main_matrix, train_matrix, dev_matrix, test_matrix = (
    split_df.pivot_table(index='user_id', columns='restaurant_id', values='star_rating').fillna(2.5)
    for split_df in (reviews_df, train_reviews_df, dev_reviews_df, test_reviews_df)
)

In [None]:
# summarise the shape (rows = users, columns = restaurants) of each rating matrix
matrix_shape_list = [main_matrix.shape, train_matrix.shape, dev_matrix.shape, test_matrix.shape]
# transpose the list of (rows, columns) pairs into one list per dimension
row_list, column_list = (list(dimension) for dimension in zip(*matrix_shape_list))

matrix_shapes_df = pd.DataFrame({'# of Rows': row_list, '# of Columns': column_list}, index=['Full matrix','Train matrix','Dev matrix','Test matrix'])
matrix_shapes_df.head()

In [None]:
# get the top n restaurants with the most # of m ratings, where m can be a single or a list of ratings 
def get_n_restaurants_with_m_ratings(assess_reviews_df, restaurant_count, ratings_list):
    """Return the restaurants that received the most ratings of the given value(s).

    Parameters:
        assess_reviews_df: reviews dataframe with 'restaurant_id' and 'star_rating' columns.
        restaurant_count: how many restaurants to return at most.
        ratings_list: a single rating (e.g. 5) or a list of ratings (e.g. [4, 5]) to count.

    Returns:
        Dataframe indexed by restaurant_id with the columns 'Name' and
        '<ratings> Star Rating Count' (e.g. '4/5 Star Rating Count'), sorted by the
        count in descending order. Restaurant names are read from the notebook-level
        businesses_df.
    """
    # accept a bare rating as well as a list of ratings
    if np.isscalar(ratings_list):
        ratings_list = [ratings_list]
    else:
        ratings_list = list(ratings_list)

    star_column_label = '/'.join(str(rating) for rating in ratings_list) + ' Star Rating Count'

    # number of matching reviews per restaurant, largest first, limited to the top n;
    # naming the series with the final label avoids a column that clashes with the index name
    rating_counts = (
        assess_reviews_df[assess_reviews_df.star_rating.isin(ratings_list)]
        .groupby('restaurant_id')['restaurant_id']
        .count()
        .sort_values(ascending=False)
        .head(restaurant_count)
        .rename(star_column_label)
    )

    top_n_restaurants_df = (
        businesses_df[businesses_df.restaurant_id.isin(rating_counts.index)][['restaurant_id', 'name']]
        .set_index('restaurant_id')
        .join(rating_counts, how='inner')
        .rename(columns={'name': 'Name'})
    )
    return top_n_restaurants_df.sort_values(star_column_label, ascending=False)

In [None]:
# select a random restaurant from a restaurants dataframe
def get_random_restaurant_from_df(top_n_restaurants_df):
    """Pick one restaurant at random and return its ratings column.

    The restaurant is drawn uniformly from the index of top_n_restaurants_df; the
    result is the matching column of the notebook-level user_item_matrix, as a
    one-column dataframe.
    """
    candidate_positions = np.arange(0, top_n_restaurants_df.shape[0])
    chosen_position = np.random.choice(candidate_positions, size=1)[0]
    chosen_restaurant_id = top_n_restaurants_df.index[chosen_position]
    return user_item_matrix.loc[:, chosen_restaurant_id].to_frame()

### -----------------------------------------------------------------
### Personal Favorites-Based Recommendation

In [None]:
# list # of reviews per user (to spot the most active reviewers) in the dataset
# choose one user in this list to test the personal favorites-based recommendation
reviews_df.groupby('user_id')['restaurant_id'].count().sort_values(ascending=False).head()

In [None]:
# Get the subject user's 5-star rated restaurants, sorted by review date (most recent first)
# performing this against the entire dataset, rather than limiting it to the training set,
# for the purpose of maximizing the entire list of reviews of a given user and, thus, yield more insightful results

# choose subject user
# NOTE(review): user_item_matrix is first assigned in the Memory-Based Collaborative Filtering
# section further down, so that cell has to be run before this one
subject_user = user_item_matrix.sample()
subject_user.head()
# for testing: set subject user to the one with the highest # of reviews in the dataset
# (this overrides the random sample taken above with a hard-coded user_id)
subject_user = user_item_matrix[user_item_matrix.index==37495749]

# get the restaurants rated with 5 stars by subject user
personal_favorites_df = reviews_df[(reviews_df.user_id==subject_user.index[0]) & (reviews_df.star_rating.isin([5]))][['restaurant_id','user_id','user_name','review_date','star_rating','textual_review']].sort_values('review_date',ascending=False).set_index('user_id')
# with user_id moved to the index, column position 1 is user_name;
# this line raises IndexError when the subject user has no 5-star review
user_name = personal_favorites_df.iloc[0,1]

#businesses_df[['restaurant_id', 'name']].set_index('restaurant_id')

# attach the restaurant name, then keep and relabel the columns to display
personal_favorites_df = personal_favorites_df.merge(businesses_df[['restaurant_id','name']], on='restaurant_id') #left_on=personal_favorites_df.index, right_on='restaurant_id')
personal_favorites_df = personal_favorites_df[['review_date','name','star_rating','textual_review']]
personal_favorites_df = personal_favorites_df.rename(columns={'review_date':'Review Date', 'name':'Restaurant Name', 'star_rating':'Star Rating', 'textual_review': 'Review'}).set_index('Review Date')

# show the subject user's name and up to 10 of their 5-star reviews

display('User Name: ' + str(user_name))
display('Personal Favorites: 5 Star Rated Restaurants sorted chronologically (entire dataset):')
display(personal_favorites_df.head(10))

### -----------------------------------------------------------------

### ---------------------------------------------------------------------------
### Popularity-Based Recommendation

In [None]:
# Get the top 10 restaurants in the entire dataset with the most # of 5-Star Ratings
restaurant_count=10
ratings_list=[5]
top_n_restaurants_df = get_n_restaurants_with_m_ratings(reviews_df, restaurant_count, ratings_list)
display('Popularity-Based: Top 10 Restaurants with most 5-Star Ratings (entire dataset):')
display(top_n_restaurants_df.head(restaurant_count))

In [None]:
# Get the top 10 restaurants in the train set with the most # of 5-Star Ratings
restaurant_count=10
ratings_list=[5]
top_n_restaurants_df = get_n_restaurants_with_m_ratings(train_reviews_df, restaurant_count, ratings_list)
display('Popularity-Based: Top 10 Restaurants with most 5-Star Ratings (train set):')
display(top_n_restaurants_df.head(restaurant_count))

In [None]:
# Get the top 10 restaurants in the dev set with the most # of 5-Star Ratings
restaurant_count=10
ratings_list=[5]
top_n_restaurants_df = get_n_restaurants_with_m_ratings(dev_reviews_df, restaurant_count, ratings_list)
display('Popularity-Based: Top 10 Restaurants with most 5-Star Ratings (dev set):')
display(top_n_restaurants_df.head(restaurant_count))

In [None]:
# Get the top 10 restaurants in the test set with the most # of 5-Star Ratings
restaurant_count=10
ratings_list=[5]
top_n_restaurants_df = get_n_restaurants_with_m_ratings(test_reviews_df, restaurant_count, ratings_list)
display('Popularity-Based: Top 10 Restaurants with most 5-Star Ratings (test set):')
display(top_n_restaurants_df.head(restaurant_count))

### ---------------------------------------------------------------------------

### --------------------------------------------------------------------------
### Content-Based Filtering

In [None]:
# unpack the location dictionary into one 'location_<key>' column per key
for location_key in ('address', 'locality', 'city', 'latitude', 'longitude', 'zipcode', 'country_id'):
    businesses_df['location_' + location_key] = businesses_df['location'].apply(lambda location: location[location_key])
# the nested location column is no longer needed once it has been unpacked
businesses_df.drop(columns='location', inplace=True)

In [None]:
# obtain the latitude/longitude for a given address
def get_lat_long_given_address(current_address):
    """Geocode an address with Nominatim and return it as (latitude, longitude).

    Parameters:
        current_address: free-text address to look up.

    Returns:
        Tuple (latitude, longitude) of the geocoding result.

    Raises:
        ValueError: if the geocoder finds no match for the address.
    """
    geolocator = Nominatim(user_agent="zmatorecsys")
    location = geolocator.geocode(current_address)
    # geocode() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on the attribute access below
    if location is None:
        raise ValueError('Could not geocode address: ' + repr(current_address))
    return (location.latitude, location.longitude)

In [None]:
# compare distances from current location to each of the restaurants in the given dataframe, get top n closest
def get_distance_to_restaurants(assess_businesses_df, current_lat_long):
    """Compute the geodesic distance from a location to every restaurant in a dataframe.

    Parameters:
        assess_businesses_df: dataframe with the columns 'restaurant_id', 'name',
            'location_latitude' and 'location_longitude'.
        current_lat_long: (latitude, longitude) of the starting point.

    Returns:
        Dataframe indexed by restaurant_id with the columns 'Name' and
        'Distance(km)' (rounded to 2 decimals), one row per input row and in the
        same order as the input.
    """
    # one distance per input row; building the result row-aligned avoids the
    # self-merge on restaurant_id, which multiplied rows whenever an id repeated
    distance_to_restaurants_list = [
        round(geodesic(current_lat_long, (latitude, longitude)).kilometers, 2)
        for latitude, longitude in zip(assess_businesses_df.location_latitude,
                                       assess_businesses_df.location_longitude)
    ]

    distance_to_restaurants_df = assess_businesses_df[['restaurant_id', 'name']].copy()
    distance_to_restaurants_df['Distance(km)'] = distance_to_restaurants_list
    distance_to_restaurants_df = distance_to_restaurants_df.rename(columns={'name': 'Name'})
    return distance_to_restaurants_df.set_index('restaurant_id')

In [None]:
# Google Maps API - get distance, duration of travel, duration of travel with traffic
# use Google Maps API sparingly, limit it to the 'last leg' of the distance computation, due to its pay-after-n-calls
# once the top results are obtained through geopy's geocode, recompute the Google Map Client's geocode
import googlemaps
import datetime
# NOTE(review): the api_key assignment below is commented out, so api_key has to be
# defined before this cell runs - otherwise the Client(...) line raises NameError
#api_key=<google-maps-api-here>
gm_client = googlemaps.Client(key=api_key)
# NOTE(review): current_lat_long is assigned in a later cell, and the only visible assignment
# of dest_loc is local to get_distance_to_restaurants - confirm both exist before running this
gm_client.distance_matrix(current_lat_long, dest_loc, departure_time=datetime.datetime.now())

In [None]:
# Google Maps API - test only - get lat lon coordinates given the address
gm_geocode = gm_client.geocode('45 William St Melbourne')
# geocode lat: -37.8185548  lon: 144.9590755
print(gm_geocode)

In [None]:
# Geocoder - test only - get current position's lat and long
# location is based on ip - if ip is not static, location may refer back to an exchange/node or address related to the ISP
import geocoder
g = geocoder.ip('me')
print(g.latlng)

In [None]:
# set current address
current_address = "45 William St Melbourne"
#current_address = "328 Swanston St, Melbourne"
#current_address = "330 Collins St Melbourne"

# get current location coordinates
current_lat_long = get_lat_long_given_address(current_address)

# get distances from current address to all restaurants in the dataset
distance_to_restaurants_df = get_distance_to_restaurants(businesses_df, current_lat_long)

display('Current Location: ' + current_address)
display('Content-Based Filtering: "Restaurants that are Nearby"')
distance_to_restaurants_df.sort_values('Distance(km)').head(10)

In [None]:
# To-Do: enhance Content-Based Recommendation by incorporating price range  
businesses_df[businesses_df.price_range==4][['price_range', 'average_cost_for_two']].sort_values('average_cost_for_two')
#businesses_df.price_range, businesses_df.average_cost_for_two

### --------------------------------------------------------------------------

### --------------------------------------------------------------------------
### Memory-Based Collaborative Filtering

In [None]:
user_item_matrix = train_matrix.copy()
user_item_matrix.head()

In [None]:
# get pairwise distances for user-item similarity using cosine as metric
user_similarity = pairwise_distances(user_item_matrix, metric='cosine')
# get pairwise distances for item-item similarity using cosine as metric
item_similarity = pairwise_distances(user_item_matrix.T, metric='cosine')

In [None]:
user_similarity.shape

### --------------------------------------------------------------------------
### User-Item Collaborative Filtering

In [None]:
# choose subject user
subject_user = user_item_matrix.sample()
subject_user.head()

In [None]:
# list the subject user's reviews
train_reviews_df[train_reviews_df.user_id == subject_user.index[0]]

In [None]:
# perform pairwise distance - subject user and user-item matrix
per_user_similarity = pairwise_distances(subject_user.values.reshape(1,-1), user_item_matrix,  metric='cosine')

In [None]:
# list and sort (by highest to lowest similarity) the users relative to the subject user
per_user_similarity_series = pd.Series(per_user_similarity.flatten(), index=user_item_matrix.index)

In [None]:
# get mean value for each column
def get_mean_per_column(x):
    """Return the mean of the actual ratings in a column, ignoring unrated cells.

    Parameters:
        x: pandas Series of ratings in which 2.5 marks "not rated".

    Returns:
        Sum of the rated values divided by their count, or 0 when the column
        has no rated value. The input series is left unmodified.
    """
    # 2.5 is the "not rated" placeholder; zeros are left out as well, matching the
    # previous behaviour where 2.5 was turned into 0 and zeros were not counted
    rated = x[(x != 2.5) & (x != 0)]
    if len(rated) > 0:
        return rated.sum() / len(rated)
    return 0

In [None]:
# prepare businesses key columns, for merging with recommender df
# take an explicit copy so that adding a column below does not write into a slice of businesses_df
restaurant_names_df = businesses_df[['restaurant_id','name']].copy()
# pull the aggregate rating out of the user_rating dictionary of each restaurant
restaurant_names_df['overall_rating'] = businesses_df.user_rating.apply(lambda x: x['aggregate_rating'])
restaurant_names_df.head()

In [None]:
# choose top n most similar users to subject user
# per_user_similarity_series holds cosine distances (from pairwise_distances), so the
# ascending sort puts the closest users first; the first 100 are kept
top_similar_users = per_user_similarity_series.sort_values().iloc[:100]

# get their restaurant ratings
top_similar_users = user_item_matrix[user_item_matrix.index.isin(top_similar_users.index)]

# filter to those restaurants that subject user hasn't rated
user_item_recommended_restaurants_df = top_similar_users.loc[:, (subject_user == 2.5).values.flatten()]

# convert 2.5's to zero and get the mean of the ratings per restaurant that subject user hasn't tried/rated
user_item_recommended_restaurants_df = pd.DataFrame(user_item_recommended_restaurants_df.apply(get_mean_per_column), columns=['Average Rating(Similar Users)'])

# append restaurant name and overall rating, then show the 10 restaurants with the highest average rating
# NOTE(review): the index name below says 'Top 5' although head(10) is displayed
user_item_recommended_restaurants_df = user_item_recommended_restaurants_df.merge(restaurant_names_df, left_on=user_item_recommended_restaurants_df.index, right_on='restaurant_id')
user_item_recommended_restaurants_df.set_index('restaurant_id',inplace=True)
user_item_recommended_restaurants_df.index.name = 'User-Item CF: Top 5 Restaurants'
user_item_recommended_restaurants_df = user_item_recommended_restaurants_df[['name', 'overall_rating', 'Average Rating(Similar Users)']]
user_item_recommended_restaurants_df.rename(columns={'name': 'Name', 'overall_rating': 'Restaurant Overall Rating'}, inplace=True)
user_item_recommended_restaurants_df.sort_values('Average Rating(Similar Users)', ascending=False).head(10)

### Item-Item Collaborative Filtering

In [None]:
user_item_matrix.shape

In [None]:
item_similarity.shape

In [None]:
# choose subject user
subject_user = user_item_matrix.sample()
subject_user.head()

In [None]:
# reviews posted, or restaurants rated, by the user
train_reviews_df[train_reviews_df.user_id == subject_user.index[0]]

In [None]:
# get list of restaurants that the subject user rated with 4 stars or 5 stars (basically, its the restaurants the subject user has rated positively)
subject_user_high_rated_restaurants = subject_user.T
# build an explicit 1-D row mask from the single ratings column, rather than indexing
# the dataframe with a 2-D boolean array
is_high_rated = subject_user_high_rated_restaurants.iloc[:, 0].isin([4, 5])
subject_user_high_rated_restaurants = subject_user_high_rated_restaurants[is_high_rated]

# if user has rated multiple restaurants with a 4-star or a 5-star rating, randomly select one among the list
if len(subject_user_high_rated_restaurants) > 1:
    # do random choice between the high rated restaurants
    print('user has more than 1 high rated restaurant')
    random_index = np.random.choice(np.arange(0, len(subject_user_high_rated_restaurants)), size=1)
    subject_restaurant = pd.DataFrame(user_item_matrix.loc[:,subject_user_high_rated_restaurants.index[random_index[0]]])

# if user has rated only 1 restaurant with a 4-star or a 5-star rating, choose this 1 restaurant
elif len(subject_user_high_rated_restaurants) == 1:
    # choose the restaurant as the subject restaurant
    print('user has 1 high rated restaurant')
    subject_restaurant = pd.DataFrame(user_item_matrix.loc[:,subject_user_high_rated_restaurants.index[0]])

# if user hasn't rated any restaurant with a 4-star or a 5-star rating, choose randomly among the most popular restaurants based on other people's ratings
else:
    # return a random restaurant out of list of most popular restaurants
    # get a random restaurant out of the 10 most popular in Zomato's Best of Melbourne' collection
    # (most popular in this case refers to the restaurant with the highest # of 4 or 5 star ratings)
    print('user has no high rated restaurant')
    restaurant_count=10
    ratings_list=[4,5]
    top_n_restaurants_df = get_n_restaurants_with_m_ratings(train_reviews_df, restaurant_count, ratings_list)
    subject_restaurant = get_random_restaurant_from_df(top_n_restaurants_df)

In [None]:
# show subject user's list of restaurants rated as 4 or 5 stars, if there's any
subject_user_high_rated_restaurants.head()

In [None]:
# show subject restaurant to be used for item-item similarity
subject_restaurant.head()

In [None]:
# out of the user's high-rated restaurants, or show the chosen subject restaurant out of the high-rated restaurants by the subject user, if there's any
businesses_df[businesses_df.restaurant_id == subject_restaurant.columns[0]][['restaurant_id', 'name']]

In [None]:
# perform pairwise distance - subject restaurant and user-item matrix
per_item_similarity = pairwise_distances(subject_restaurant.values.reshape(1,-1), user_item_matrix.T,  metric='cosine')
per_item_similarity

In [None]:
# list and sort by highest to lowest similarity the restaurants relative to the subject restaurant
per_item_similarity_series = pd.Series(per_item_similarity.flatten(), index=user_item_matrix.columns)
#per_item_similarity_series.sort_values()

In [None]:
# show top 5 recommended restaurants

# filter to the restaurants that the subject user has not rated
restaurants_not_tried_list = subject_user[subject_user == 2.5].dropna(axis=1).columns
item_item_recommended_restaurants_df = pd.DataFrame(round(per_item_similarity_series[per_item_similarity_series.index.isin(restaurants_not_tried_list)], 3), columns=['Similarity (High to Low)'])
item_item_recommended_restaurants_df = item_item_recommended_restaurants_df.merge(restaurant_names_df, left_on=item_item_recommended_restaurants_df.index, right_on='restaurant_id')
item_item_recommended_restaurants_df.set_index('restaurant_id',inplace=True)
item_item_recommended_restaurants_df.index.name ='Item-Item CF: Top 10 Restaurants'
item_item_recommended_restaurants_df = item_item_recommended_restaurants_df[['name','overall_rating','Similarity (High to Low)']]
item_item_recommended_restaurants_df.rename(columns={'name': 'Name', 'overall_rating': 'Restaurant Overall Rating'}, inplace=True)
item_item_recommended_restaurants_df.sort_values('Similarity (High to Low)').head(10)

### --------------------------------------------------------------------------

### -----------------------------------------------------------------------------------------------------------------------------------
### Hybrid Recommender - User-Item CF & Content-Based

In [None]:
# Content-Based component: we refer to the location attribute of the restaurants
# Collaborative Filtering component: User-Item

# set current address
current_address = "45 William St Melbourne"
#current_address = "328 Swanston St, Melbourne"
#current_address = "330 Collins St Melbourne"

# get current location coordinates
current_lat_long = get_lat_long_given_address(current_address)

# format the user-item cf dataframe to calculate distance to restaurants
hybrid_user_item_df = user_item_recommended_restaurants_df.merge(businesses_df[['restaurant_id','location_latitude','location_longitude']],
                                                       left_on=user_item_recommended_restaurants_df.index, right_on='restaurant_id').rename(columns={'Name': 'name'})

# get distances from current address to the top 10 restaurants in the user-item CF dataframe
distance_to_restaurants_df = get_distance_to_restaurants(hybrid_user_item_df.sort_values('Average Rating(Similar Users)', ascending=False).head(10), current_lat_long)

display('Current Location: ' + current_address)
display('Hybrid Recommender (User-Item CF & Content-Based) : "Restaurants that similar users liked, and are Nearby"')

# merge the User-Item CF dataframe with distance-to-restaurants dataframe to include the relevant columns
hybrid_user_item_df = hybrid_user_item_df.set_index('restaurant_id').join(distance_to_restaurants_df, how='inner').drop(columns=['Name','location_latitude','location_longitude'], axis=1)
hybrid_user_item_df = hybrid_user_item_df.rename(columns={'name': 'Name'})
hybrid_user_item_df.sort_values('Distance(km)').head(10)

### --------------------------------------------------------------------------

### -----------------------------------------------------------------------------------------------------------------------------------
### Hybrid Recommender - Item-Item CF & Content-Based

In [None]:
# Content-Based component: we refer to the location attribute of the restaurants
# Collaborative Filtering component: Item-Item

# set current address
current_address = "45 William St Melbourne"
#current_address = "328 Swanston St, Melbourne"
#current_address = "330 Collins St Melbourne"

# get current location coordinates
current_lat_long = get_lat_long_given_address(current_address)

# format the item-item cf dataframe to calculate distance to restaurants
# ('Name' is renamed back to 'name' because get_distance_to_restaurants reads the 'name' column)
hybrid_item_item_df = item_item_recommended_restaurants_df.merge(businesses_df[['restaurant_id','location_latitude','location_longitude']],
                                                       left_on=item_item_recommended_restaurants_df.index, right_on='restaurant_id').rename(columns={'Name': 'name'})

# get distances from current address to the top 10 restaurants in the item-item CF dataframe
# (ascending sort: the column holds cosine distances, so the smallest values are the most similar)
distance_to_restaurants_df = get_distance_to_restaurants(hybrid_item_item_df.sort_values('Similarity (High to Low)').head(10), current_lat_long)

display('Current Location: ' + current_address)
display('Hybrid Recommender (Item-Item CF & Content-Based) : "Restaurants that received similar ratings, and are Nearby"')

# merge the Item-Item CF dataframe with distance-to-restaurants dataframe to include the relevant columns
hybrid_item_item_df = hybrid_item_item_df.set_index('restaurant_id').join(distance_to_restaurants_df, how='inner').drop(columns=['Name','location_latitude','location_longitude'], axis=1)
hybrid_item_item_df = hybrid_item_item_df.rename(columns={'name': 'Name'})
hybrid_item_item_df.sort_values('Distance(km)').head(10)

### --------------------------------------------------------------------------

### -----------------------------------------------------------------------------------------------------------------------------------
### User-Item, Item-Item Collaborative Filtering - using pairwise-normalized dot product method

In [None]:
train_matrix.shape

In [None]:
test_matrix.shape

In [None]:
dev_matrix.shape

In [None]:
# serialize the restaurant_id's and user_id's, to prepare for setting up the matrices for prediction/evaluation
main_restaurant_id_list = pd.Series(main_matrix.columns.unique())
main_user_id_list = pd.Series(main_matrix.index)

In [None]:
reviews_df[reviews_df.star_rating.isnull()]

In [None]:
# build train/dev/test rating matrices that all share the shape and the row/column
# order of main_matrix, so that they can be compared cell by cell
def _build_rating_matrix(split_reviews_df, fill_value=2.5):
    """Return a main_matrix-shaped array holding the ratings of split_reviews_df.

    Cells without a review in split_reviews_df are set to fill_value. Rows follow
    main_matrix.index (user_id) and columns follow main_matrix.columns (restaurant_id).

    Raises:
        KeyError: if a review refers to a user or restaurant missing from main_matrix.
    """
    # NOTE(review): the previous loop read tuple positions 3, 2 and 8 of each review;
    # these are taken to be user_id, restaurant_id and star_rating - confirm
    matrix = np.full(main_matrix.shape, fill_value, dtype=float)
    # resolve every id to its position in one vectorised lookup instead of one scan per review
    row_index = main_matrix.index.get_indexer(split_reviews_df['user_id'])
    column_index = main_matrix.columns.get_indexer(split_reviews_df['restaurant_id'])
    # get_indexer marks unknown ids with -1, which would otherwise silently address the last row/column
    if (row_index < 0).any() or (column_index < 0).any():
        raise KeyError('review refers to a user_id or restaurant_id that is not in main_matrix')
    matrix[row_index, column_index] = split_reviews_df['star_rating'].values
    return matrix

train_matrix_for_prediction = _build_rating_matrix(train_reviews_df)
dev_matrix_for_prediction = _build_rating_matrix(dev_reviews_df)
test_matrix_for_prediction = _build_rating_matrix(test_reviews_df)

In [None]:
# create dummy matrix with extreme values, to test prediction against train matrix' cosine similarity, as well as RMSE evaluation
dummy_matrix_for_prediction = np.zeros((main_matrix.shape[0], main_matrix.shape[1]))
dummy_matrix_for_prediction[dummy_matrix_for_prediction == 0] = 10
for line in train_reviews_df.itertuples():
    row_index = main_user_id_list.loc[(main_user_id_list == line[3])].index[0]
    column_index = main_restaurant_id_list.loc[(main_restaurant_id_list == line[2])].index[0]
    dummy_matrix_for_prediction[row_index, column_index] = line[8] + 30


In [None]:
from sklearn.metrics.pairwise import pairwise_distances
# pairwise_distances(metric='cosine') returns the cosine *distance* (1 - cosine similarity);
# convert it to a similarity so that more alike users/items get the larger weight in make_prediction
user_similarity = 1 - pairwise_distances(train_matrix_for_prediction, metric='cosine')
item_similarity = 1 - pairwise_distances(train_matrix_for_prediction.T, metric='cosine')

In [None]:
# predict rating for a matrix or an indidivual user or restaurant
def make_prediction(ratings, similarity, cf_type='user', original_matrix_shape=None):
    """Predict ratings as a similarity-weighted combination of known ratings.

    Parameters:
        ratings: numpy array of ratings (users x items), or a single row of it.
        similarity: square numpy similarity matrix (user-user for cf_type='user'
            with a ratings matrix, item-item for cf_type='item').
        cf_type: 'user' or 'item'.
        original_matrix_shape: optional shape the ratings matrix is expected to have
            for the matrix form of cf_type='user'; None skips the check.

    Returns:
        numpy array of predicted ratings.

    Raises:
        ValueError: for an unknown cf_type, or when a 'user' ratings matrix does not
            match original_matrix_shape.
    """
    if cf_type == 'item':
        # weighted sum over items, normalised by the total similarity per item
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

    if cf_type != 'user':
        raise ValueError("cf_type must be 'user' or 'item', got " + repr(cf_type))

    # ratings of an individual user, given as a single-row matrix
    # NOTE(review): this multiplies the row by `similarity` on the right, so it only
    # works when similarity is item x item - confirm the intended formula
    if ratings.shape[0] == 1:
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

    # ratings given as a matrix: its shape has to match the matrix the similarity was built from
    if original_matrix_shape is not None and tuple(ratings.shape) != tuple(original_matrix_shape):
        raise ValueError('ratings shape ' + str(ratings.shape) + ' does not match expected shape ' + str(tuple(original_matrix_shape)))

    # predict each user's deviation from their own mean rating, then add the mean back
    mean_user_rating = ratings.mean(axis=1)
    ratings_diff = ratings - mean_user_rating[:, np.newaxis]
    return mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T

In [None]:
item_similarity.shape, user_similarity.shape, train_matrix_for_prediction.shape

In [None]:
train_item_prediction = make_prediction(train_matrix_for_prediction, item_similarity, 'item', train_matrix_for_prediction.shape)
train_user_prediction = make_prediction(train_matrix_for_prediction, user_similarity, 'user', train_matrix_for_prediction.shape)
#train_item_prediction

In [None]:
single_train_item_prediction = make_prediction(train_matrix_for_prediction[0], item_similarity, 'item', train_matrix_for_prediction.shape)

In [None]:
np.array_equal(single_train_item_prediction, train_item_prediction[0])

In [None]:
train_matrix_for_prediction.shape

In [None]:
dev_matrix_for_prediction.shape

In [None]:
test_matrix_for_prediction.shape

In [None]:
dev_item_prediction = make_prediction(dev_matrix_for_prediction, item_similarity, 'item', train_matrix_for_prediction.shape)
dev_user_prediction = make_prediction(dev_matrix_for_prediction, user_similarity, 'user',train_matrix_for_prediction.shape)
dev_item_prediction

In [None]:
test_item_prediction = make_prediction(test_matrix_for_prediction, item_similarity, 'item', train_matrix_for_prediction.shape)
test_user_prediction = make_prediction(test_matrix_for_prediction, user_similarity, 'user', train_matrix_for_prediction.shape)
test_item_prediction

In [None]:
dummy_item_prediction = make_prediction(dummy_matrix_for_prediction, item_similarity, 'item', train_matrix_for_prediction.shape)
dummy_user_prediction = make_prediction(dummy_matrix_for_prediction, user_similarity, 'user', train_matrix_for_prediction.shape)
dummy_item_prediction

In [None]:
# perform evaluation of prediction through RMSE
def rmse(prediction, ground_truth):
    """Return the root-mean-squared error between prediction and ground_truth.

    Every cell of both inputs takes part in the error; nothing is masked out.
    """
    squared_error_mean = mean_squared_error(prediction, ground_truth)
    return sqrt(squared_error_mean)

In [None]:
print('Train Set Evaluation:')
print('User-based CF RMSE: ' + str(rmse(train_user_prediction, train_matrix_for_prediction)))
print('Item-based CF RMSE: ' + str(rmse(train_item_prediction, train_matrix_for_prediction)))

In [None]:
print('\n\nDev Set Evaluation:\n')
print('User-based CF RMSE: ' + str(rmse(dev_user_prediction, dev_matrix_for_prediction)))
print('Item-based CF RMSE: ' + str(rmse(dev_item_prediction, dev_matrix_for_prediction)))

In [None]:
print('\n\nTest Set Evaluation:\n')
print('User-based CF RMSE: ' + str(rmse(test_user_prediction, test_matrix_for_prediction)))
print('Item-based CF RMSE: ' + str(rmse(test_item_prediction, test_matrix_for_prediction)))

In [None]:
print('\n\nDummy Set Evaluation:\n')
print('User-based CF RMSE: ' + str(rmse(dummy_user_prediction, dummy_matrix_for_prediction)))
print('Item-based CF RMSE: ' + str(rmse(dummy_item_prediction, dummy_matrix_for_prediction)))

### -----------------------------------------------------------------------------------------------------------------

### -----------------------------------------------------------------------------------------------------------------
### Matrix Factorization - Singular Value Decomposition (SVD) - using scipy's svds

In [None]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

# Get SVD components from the train matrix, keeping k singular values/vectors.
# k = 20 is a hand-picked setting.
u, s, vt = svds(train_matrix_for_prediction, k = 20)
s_diag_matrix=np.diag(s)
# Rank-k reconstruction of the train matrix, used as the predicted ratings.
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
# The same train-derived reconstruction is scored against every split.
# Labels name what is actually reported: RMSE of the SVD reconstruction
# (they previously read "User-based CF - MSE").
print('SVD RMSE - train matrix: ' + str(rmse(X_pred, train_matrix_for_prediction)))
print('SVD RMSE - dev matrix: ' + str(rmse(X_pred, dev_matrix_for_prediction)))
print('SVD RMSE - test matrix: ' + str(rmse(X_pred, test_matrix_for_prediction)))
print('SVD RMSE - dummy matrix: ' + str(rmse(X_pred, dummy_matrix_for_prediction)))



### --------------------------------------------------------------------------

### ----------------------------------------------------------------------------------------------------------------------
### Matrix Factorization - Singular Value Decomposition (SVD) - using Surpriselib SVD

In [None]:
from surprise import SVD
from surprise import Dataset, Reader
# NOTE(review): this rebinds the name `train_test_split`, which the imports at
# the top of the notebook bind to scikit-learn's function. Any cell executed
# after this one gets Surprise's version under that name.
from surprise.model_selection import cross_validate, train_test_split

# setup Reader - range of ratings
# rating_scale=(1, 5) is the (lowest, highest) rating handed to Surprise;
# presumably this matches the range of star_rating - TODO confirm.
reader = Reader(rating_scale=(1, 5))

In [None]:
# Keep only the three columns passed on to Surprise (user, item, rating - in
# that order) for each of the train / dev / test review frames.
svd_train_reviews_df, svd_dev_reviews_df, svd_test_reviews_df = (
    reviews_df[['user_id', 'restaurant_id', 'star_rating']]
    for reviews_df in (train_reviews_df, dev_reviews_df, test_reviews_df)
)

In [None]:
# Wrap the train reviews in a Surprise Dataset so they can be passed to
# Surprise's train_test_split in the next cell.
# The DataFrame needs to have 3 columns in this specific order: [user_id, product_id, rating]
# (here: user_id, restaurant_id, star_rating). `reader` supplies the rating scale.
data = Dataset.load_from_df(svd_train_reviews_df, reader)

In [None]:
# split data through Surprise's train_test_split (temporarily set split size to .01 as this set has already been previously split as a train set)
# Imported under an explicit alias: the bare name `train_test_split` is also
# bound to scikit-learn's function by the imports at the top of the notebook,
# so which one is in scope would otherwise depend on cell execution order.
from surprise.model_selection import train_test_split as surprise_train_test_split

trainset, testset = surprise_train_test_split(data, test_size=.01)

In [None]:
# Train Surprise's SVD on the Surprise trainset.
# n_factors=100 (number of latent features) is an arbitrary starting value,
# not a tuned one.
model = SVD(n_factors=100)
# Last expression of the cell, so the notebook also displays fit()'s return value.
model.fit(trainset)

In [None]:
# Normalization: scale every row of model.qi to unit L2 norm, in place.
# The first row's sum of squares is displayed before and after as a check.
# NOTE(review): this overwrites the fitted model's qi; if model.predict()
# reads qi, predictions made after this cell are affected - confirm intended.
display('before normalized: ', pd.DataFrame(model.qi).iloc[0].pow(2).sum())
row_norms = np.linalg.norm(model.qi, ord=2, axis=1, keepdims=True)
model.qi /= row_norms
display('after normalized: ', pd.DataFrame(model.qi).iloc[0].pow(2).sum())

In [None]:
# View the 10 restaurants with the lowest mean star rating in the train set.
# ascending=True puts the lowest averages first; use ascending=False to see
# the highest-rated restaurants instead.
train_reviews_df.groupby('restaurant_id')['star_rating'].mean().sort_values(ascending=True).head(10)

In [None]:
# Count how many times each star rating occurs for one specific restaurant
# (restaurant_id 16579337) in the train set.
train_reviews_df.loc[train_reviews_df['restaurant_id'] == 16579337, 'star_rating'].value_counts()

In [None]:
# Predict the rating one user would give one restaurant.
# NOTE(review): the ids below are presumably a user from the train set and a
# restaurant that user has not rated - verify against train_reviews_df.
a_user = 28165833
a_product = 16571144
# Prints whatever prediction object model.predict returns for this pair.
print(model.predict(a_user, a_product))

In [None]:
# Predict the rating that the author of each of the first 20 train-set review
# rows would give one particular restaurant. These are the first 20 rows, not
# 20 distinct users, so a user_id can repeat.
a_product = 16571144
# Iterate the Series directly: the index was never used, and
# Series.iteritems() no longer exists in pandas 2.x.
for user in train_reviews_df.user_id.iloc[:20]:
    # Shows the Python type of the id passed to predict().
    print(type(user))
    print(model.predict(user, a_product))

### --------------------------------------------------------------------------