In [None]:
import numpy as np 
import pandas as pd 
import os
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import math
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.externals import joblib
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import warnings; warnings.simplefilter('ignore')
%matplotlib inline



In [None]:
df_electronics=pd.read_csv("ratings_Electronics.csv",names=['userId', 'productId','Rating','timestamp'])

In [None]:
# Display the data
df_electronics.head()

In [None]:
#Shape of the data
df_electronics.shape

In [None]:
#Taking subset of the dataset as the data is huge
df_electronics=df_electronics.iloc[:1048576,0:]

In [None]:
df_electronics.info()

In [None]:
# Summary 
df_electronics.describe()['Rating'].T

In [None]:
#Understanding the range of ratings
print('Minimum rating is: ', (df_electronics.Rating.min()))
print('Maximum rating is: ', (df_electronics.Rating.max()))

In [None]:
#looking for missing values 
print('Number of missing values across columns: \n',df_electronics.isnull().sum())

In [None]:
# Rating Distribution
# seaborn renamed factorplot to catplot; use the new name when it exists and
# fall back to the old one on installations that predate the rename.
count_plot = getattr(sns, "catplot", None) or sns.factorplot
# 'black' is not a seaborn axes style (valid: darkgrid, whitegrid, dark, white,
# ticks) and raises ValueError, so the dark built-in style is used instead.
with sns.axes_style('dark'):
    # One bar per distinct rating value, bar height = number of ratings.
    g = count_plot(x="Rating", data=df_electronics, aspect=2.0, kind='count')
    g.set_ylabels(" Checking Total number of ratings")


In [None]:
# Headline sizes of the working data: rows, distinct users, distinct products
total_ratings = len(df_electronics)
distinct_users = np.unique(df_electronics.userId).size
distinct_products = np.unique(df_electronics.productId).size

print("Total data ")
print("\nNo of ratings :", total_ratings)
print(" No of Users   :", distinct_users)
print(" No of products  :", distinct_products)


In [None]:
#Dropping irrelevant columns
# Rebind the name instead of dropping in place, and tolerate an already-missing
# column, so re-running this cell does not raise KeyError.
df_electronics = df_electronics.drop(columns=['timestamp'], errors='ignore')

In [None]:
#Analysis of rating given by the user 

rate_products_eachUser = df_electronics.groupby(by='userId')['Rating'].count().sort_values(ascending=False)

rate_products_eachUser.head()

In [None]:
rate_products_eachUser.describe()

In [None]:
# Per-user rating count at every percentile (0.00, 0.01, ..., 1.00).
# The original line contained this statement twice back to back, which is a SyntaxError.
check_quantile = rate_products_eachUser.quantile(np.arange(0,1.01,0.01), interpolation='higher')
plt.figure(figsize=(12,11))
plt.title("check_quantile & Values")
check_quantile.plot()
# check_quantile with 0.10 difference
# x and y must use the same stride; index[::5] against values[::10] has mismatched lengths.
plt.scatter(x=check_quantile.index[::10], y=check_quantile.values[::10], c='orange', label="check_quantile with 0.10 intervals")
# check_quantile with 0.30 difference
plt.scatter(x=check_quantile.index[::30], y=check_quantile.values[::30], c='m', label = "check_quantile with 0.30 intervals")
plt.ylabel('No of ratings by user')
# The x axis carries the quantile itself; the value at that quantile is on the y axis.
plt.xlabel('Quantile')
# Without a legend the scatter labels defined above are never shown.
plt.legend(loc='best')
plt.show()

In [None]:
# Count the users who rated at least 50 products. The threshold is inclusive (>=),
# and the number printed is a count of users, so the message says exactly that.
# Series.sum() adds the boolean mask up without a Python-level loop.
heavy_rater_count = (rate_products_eachUser >= 50).sum()
print('\n No of users who rated 50 or more products : {}\n'.format(heavy_rater_count))

Demand Based Model


In [None]:
#Getting the new dataframe which contains only the products that received 50 or more ratings

# transform('count') gives every row the non-null rating count of its product, so the
# comparison yields a row mask. It keeps the same rows, in the same order, as
# groupby("productId").filter(lambda x: x['Rating'].count() >= 50), without
# calling a Python lambda once per product.
ratings_for_row_product = df_electronics.groupby("productId")['Rating'].transform('count')
df_filtered = df_electronics[ratings_for_row_product >= 50]

In [None]:
# Ratings received by each retained product, from most rated to least rated
no_of_ratings_per_product = (
    df_filtered
    .groupby('productId')['Rating']
    .count()
    .sort_values(ascending=False)
)

# Figure twice as wide as tall; x is the product's position in the sorted series
fig, ax = plt.subplots(figsize=plt.figaspect(.5))
ax.plot(no_of_ratings_per_product.values)
ax.set_xlabel('Product')
ax.set_ylabel('No of ratings per product')

plt.show()

In [None]:
#Average rating of the product 
df_filtered.groupby('productId')['Rating'].mean().head()

In [None]:
# Average highest rating of the products
df_filtered.groupby('productId')['Rating'].mean().sort_values(ascending=False).head()

In [None]:
#Total number of rating for product
df_filtered.groupby('productId')['Rating'].count().sort_values(ascending=False).head()

In [None]:
ratings_mean = pd.DataFrame(df_filtered.groupby('productId')['Rating'].mean())

In [None]:
ratings_mean['rating_counts'] = pd.DataFrame(df_filtered.groupby('productId')['Rating'].count())

In [None]:
# Max number of ratings
ratings_mean['rating_counts'].max()

In [None]:
# Histogram showing rating counts
plt.figure(figsize=(8,6))
plt.rcParams['patch.force_edgecolor'] = True
ratings_mean['rating_counts'].hist(bins=50)

In [None]:
# Histogram showing the ratings
plt.figure(figsize=(8,6))
plt.rcParams['patch.force_edgecolor'] = True
ratings_mean['Rating'].hist(bins=50)

In [None]:
# Comparing the rating counts over ratings
plt.rcParams['patch.force_edgecolor'] = True
# jointplot builds its own figure, so a preceding plt.figure() only produced an
# empty extra figure in the output and never affected the joint plot's size.
sns.jointplot(x='Rating', y='rating_counts', data=ratings_mean, alpha=0.4)

In [None]:
# Rating count per product; the count is stored under the 'Rating' column name
demanded_products = df_filtered.groupby('productId')['Rating'].count().to_frame()
# Most-rated products first
most_popular = demanded_products.sort_values(by='Rating', ascending=False)
# Bar chart of the 30 most-rated products
most_popular.head(30).plot.bar()

Collaborative filtering (Item-Item recommendation)

In [None]:
"""
Surprise is a Python scikit for building and analyzing recommender systems that deals with explicit rating data.
"""

from surprise import SVD,  SlopeOne
from surprise import KNNBaseline, KNNBasic

from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

In [None]:
#Reading the dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_filtered,reader)

In [None]:
#Splitting the dataset, going with the traditional 70-30 split
trainSet, testSet = train_test_split(data, test_size=0.3,random_state=10)

In [None]:
# Baseline and similarity settings shared by the k-NN models in this notebook.
# Documentation: https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#similarity-measures-configuration
# Baseline estimates: ALS, 5 epochs, regularisation 12 for users and 5 for items
bsl_options = dict(method='als', n_epochs=5, reg_u=12, reg_i=5)
# Pearson-baseline similarity; user_based=False selects item-based filtering
sim_options = dict(name='pearson_baseline', user_based=False)

In [None]:
## 1st approach with basic KNN

# k-NN with 5 neighbours; sim_options carries the user_based flag that switches
# between user-based and item-based collaborative filtering
mod_KNNBasic = KNNBasic(
    k=5,
    sim_options=sim_options,
    bsl_options=bsl_options,
).fit(trainSet)
# Score the held-out ratings and report the RMSE
predictions_KNNBasic = mod_KNNBasic.test(testSet)
rmse_KNNBasic = accuracy.rmse(predictions_KNNBasic)

### A basic collaborative filtering algorithm taking into account a baseline rating.


In [None]:
## 2nd approach using a baseline rating

# Same neighbourhood size and options as the basic model, but with KNNBaseline
mod_KNNBaseline = KNNBaseline(
    k=5,
    sim_options=sim_options,
    bsl_options=bsl_options,
).fit(trainSet)
# Score the held-out ratings and report the RMSE
predictions_KNNBaseline = mod_KNNBaseline.test(testSet)
rmse_KNNBaseline = accuracy.rmse(predictions_KNNBaseline)

Model-based collaborative filtering system

In [None]:
# Matrix Factorization-based SVD
# SVD is already imported with the other Surprise names, so no in-cell import is needed.

# SVD training is randomised; fixing the seed (the same value used for the
# train/test split) makes the reported RMSE reproducible across runs.
mod_SVD = SVD(random_state=10)
predictions_SVD = mod_SVD.fit(trainSet).test(testSet)
rmse_SVD = accuracy.rmse(predictions_SVD)

In [None]:
# SlopeOne collaborative filtering (not a matrix-factorisation method)
# SlopeOne is already imported with the other Surprise names.

mod_SlopeOne = SlopeOne().fit(trainSet)
# Score the held-out ratings and report the RMSE
predictions_SlopeOne = mod_SlopeOne.test(testSet)
rmse_SlopeOne = accuracy.rmse(predictions_SlopeOne)

Item-based methods vs Model-Based methods

For item-based models, algo.get_neighbors() returns the nearest neighbors of a product; once a user has purchased that product, its neighbors can be recommended to that user.

For model-based methods, algo.predict(user_id, product_id) can be used to create a list of recommended items.

The two approaches have different use cases and are applied accordingly.


In [None]:
# List of all unique users and products

unique_user_ids = list(df_filtered['userId'].unique())
unique_products = list(df_filtered['productId'].unique())

unique_user_ids[:5]
unique_products[:5]

In [None]:
# Number of unique users present in the dataset
len(unique_user_ids)

In [None]:
# Choosing a random index for a particular user to generate recommended items list

user_index = 100
uid = unique_user_ids[user_index]

In [None]:
# Applying KNNBasic
# Listing items already purchased by user chosen above

# trainSet.ur maps an inner user id to a list of (inner item id, rating) pairs.
# to_inner_uid raises ValueError if the user has no ratings in the training split.
items_purchased = trainSet.ur[trainSet.to_inner_uid(uid)]
purchased_inner_ids = [inner_iid for inner_iid, _rating in items_purchased]

print("User " + str(uid) +  " has previously purchased items with productId: ")
# Iterate over every purchased item. The original looped over the first
# (item, rating) pair, so it also passed the rating value to to_raw_iid.
# Fitted Surprise models expose `trainset`, not `trainSet`, so use the notebook's variable.
for inner_iid in purchased_inner_ids:
    print(trainSet.to_raw_iid(inner_iid))

#getting K Nearest Neighbors for first item purchased by our user
KNN__rec_Product = mod_KNNBasic.get_neighbors(purchased_inner_ids[0], 13)

# Skip neighbours the user already owns; compare against all purchased item ids.
already_purchased = set(purchased_inner_ids)
recommended_products = [
    trainSet.to_raw_iid(product_iid)
    for product_iid in KNN__rec_Product
    if product_iid not in already_purchased
]
print("Items recommended for user " + str(uid) + " by KNNBasic \n"  , recommended_products)

In [None]:
# Applying KNNBaseline

# trainSet.ur maps an inner user id to a list of (inner item id, rating) pairs.
# to_inner_uid raises ValueError if the user has no ratings in the training split.
items_purchased = trainSet.ur[trainSet.to_inner_uid(uid)]
purchased_inner_ids = [inner_iid for inner_iid, _rating in items_purchased]


print("User " + str(uid) +  " has previously purchased items with productId: ")
# Iterate over every purchased item. The original looped over the first
# (item, rating) pair, so it also passed the rating value to to_raw_iid.
# Fitted Surprise models expose `trainset`, not `trainSet`, so use the notebook's variable.
for inner_iid in purchased_inner_ids:
    print(trainSet.to_raw_iid(inner_iid))



#getting K Nearest Neighbors for first item purchased by the chosen user
KNN__rec_Product = mod_KNNBaseline.get_neighbors(purchased_inner_ids[0], 13)

# Skip neighbours the user already owns; compare against all purchased item ids.
already_purchased = set(purchased_inner_ids)
recommended_products = [
    trainSet.to_raw_iid(product_iid)
    for product_iid in KNN__rec_Product
    if product_iid not in already_purchased
]
print("Items recommended for user " + str(uid) + " by KNNBaseline \n"  , recommended_products)

In [None]:
# Applying SVD

# trainSet.ur maps an inner user id to a list of (inner item id, rating) pairs.
# to_inner_uid raises ValueError if the user has no ratings in the training split.
items_purchased = trainSet.ur[trainSet.to_inner_uid(uid)]
# Raw product ids of everything the user rated in the training split, in order.
purchased_raw_ids = [trainSet.to_raw_iid(inner_iid) for inner_iid, _rating in items_purchased]


print("User " + str(uid) +  " has previously purchased items with productId: ")
for raw_iid in purchased_raw_ids:
    print(raw_iid)


UserID = unique_user_ids[user_index]

# Number of products to recommend.
N_RECOMMENDATIONS = 10

# Estimate a rating for every product the user has not purchased yet. No r_ui is
# passed (the true rating is unknown) and verbose stays off, so the cell no
# longer prints one line per product.
already_purchased = set(purchased_raw_ids)
candidate_predictions = [
    mod_SVD.predict(UserID, product_id)
    for product_id in unique_products
    if product_id not in already_purchased
]

# Keep the products with the highest estimated rating, best first.
top_predictions = sorted(candidate_predictions, key=lambda pred: pred.est, reverse=True)[:N_RECOMMENDATIONS]
recommended_products = [pred.iid for pred in top_predictions]

print("Items recommended for user " + str(uid) + " by SVD \n", recommended_products)

## Analysis 

In [None]:
#Predictions for KNNBasic

pred_KNNBasic = pd.DataFrame(predictions_KNNBasic, columns=['uid', 'iid', 'rui', 'est', 'details'])
pred_KNNBasic['Iu'] = pred_KNNBasic.uid.apply(get_Iu)
pred_KNNBasic['Ui'] = pred_KNNBasic.iid.apply(get_Ui)
pred_KNNBasic['err'] = abs(pred_KNNBasic.est - pred_KNNBasic.rui)

In [None]:
pred_KNNBasic.head()

In [None]:
best_predictions = pred_KNNBasic.sort_values(by='err')[:10]
best_predictions

In [None]:
print("For KNNBasic: \n")
print("\nTotal no of ratings :",pred_KNNBasic.shape[0])
print("Total No of Users   :", len(np.unique(pred_KNNBasic.uid)))
print("Total No of products  :", len(np.unique(pred_KNNBasic.iid)))


In [None]:
#Predictions for KNNBaseline

pred_KNNBaseline = pd.DataFrame(predictions_KNNBaseline, columns=['uid', 'iid', 'rui', 'est', 'details'])
pred_KNNBaseline['Iu'] = pred_KNNBaseline.uid.apply(get_Iu)
pred_KNNBaseline['Ui'] = pred_KNNBaseline.iid.apply(get_Ui)
pred_KNNBaseline['err'] = abs(pred_KNNBaseline.est - pred_KNNBaseline.rui)

In [None]:
pred_KNNBaseline.head()

In [None]:
best_predictions = pred_KNNBaseline.sort_values(by='err')[:10]
best_predictions

In [None]:
print("For KNNBaseline: \n")
print("\nTotal no of ratings :",pred_KNNBaseline.shape[0])
print("Total No of Users   :", len(np.unique(pred_KNNBaseline.uid)))
print("Total No of products  :", len(np.unique(pred_KNNBaseline.iid)))


In [None]:
#Predictions for SVD

pred_SVD = pd.DataFrame(predictions_SVD, columns=['uid', 'iid', 'rui', 'est', 'details'])
pred_SVD['Iu'] = pred_SVD.uid.apply(get_Iu)
pred_SVD['Ui'] = pred_SVD.iid.apply(get_Ui)
pred_SVD['err'] = abs(pred_SVD.est - pred_SVD.rui)

pred_SVD.head()

In [None]:
best_predictions = pred_SVD.sort_values(by='err')[:10]
best_predictions

In [None]:
print("For SVD: \n")
print("\nTotal no of ratings :",pred_SVD.shape[0])
print("Total No of Users   :", len(np.unique(pred_SVD.uid)))
print("Total No of products  :", len(np.unique(pred_SVD.iid)))

In [None]:
#Predictions for SlopeOne

pred_SlopeOne = pd.DataFrame(predictions_SlopeOne, columns=['uid', 'iid', 'rui', 'est', 'details'])
pred_SlopeOne['Iu'] = pred_SlopeOne.uid.apply(get_Iu)
pred_SlopeOne['Ui'] = pred_SlopeOne.iid.apply(get_Ui)
pred_SlopeOne['err'] = abs(pred_SlopeOne.est - pred_SlopeOne.rui)

pred_SlopeOne.head()

In [None]:
best_predictions = pred_SlopeOne.sort_values(by='err')[:10]
best_predictions

In [None]:
print("For SlopeOne: \n")
print("\nTotal no of ratings :",pred_SlopeOne.shape[0])
print("Total No of Users   :", len(np.unique(pred_SlopeOne.uid)))
print("Total No of products  :", len(np.unique(pred_SlopeOne.iid)))