## Collaborative filtering Recommendation system 

In [1]:
# pip install pymongo

In [2]:
# Utilities
import math, random, warnings
from time import time
from datetime import datetime
from collections import defaultdict
from IPython.core.interactiveshell import InteractiveShell
from tabulate import tabulate
from IPython.display import display
# Mathematical calculation
import numpy as np
from scipy.sparse.linalg import svds
from sklearn import model_selection
from sklearn.metrics.pairwise import cosine_similarity

# Data handling
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

## Fake Rates

In [3]:
fake_rates = pd.read_csv("FakeRates.csv")
fake_rates.tail()

Unnamed: 0,userId,productId,rating
33032,A2LF16F0KX9L7P,661ec89148881b3a8fbed0c1,5
33033,A2LF16F0KX9L7P,661ec89148881b3a8fbed0c3,5
33034,A2LF16F0KX9L7P,661ec89148881b3a8fbed0c7,5
33035,A2EF8N7ZE7ONWX,661ec89148881b3a8fbed0c8,5
33036,A379DLIAF0MFM5,661ec89148881b3a8fbed0ca,5


## Rates from DB

In [4]:
import pymongo
import pandas as pd

In [5]:
client = pymongo.MongoClient("mongodb+srv://Reem:xsHvzkOTrpUukM42@cluster0.5lyger6.mongodb.net/project")
dbs = databases = client.list_database_names()
print(dbs)

['project', 'admin', 'local']


In [6]:
db = client.get_database("project")
collections = db.list_collection_names()
print(collections)

['recommendations', 'users', 'reviews', 'CF_recom', 'Products']


In [7]:
collection = db["reviews"]
cursor = collection.find({})
data = list(cursor)
ratings = pd.DataFrame(data)
ratings = ratings.drop(['review', 'createdAt', '_id', '__v'], axis=1)
ratings.rename(columns={'product': 'productId'}, inplace=True)
ratings.rename(columns={'user': 'userId'}, inplace=True)
ratings

Unnamed: 0,rating,productId,userId
0,2,661ec86248881b3a8fbe93dd,65b8f42b2572d52578387b47
1,4,661ec86248881b3a8fbe93db,66204cdf816a3707f0d9a3f8
2,3,661ec86248881b3a8fbe93e2,66204cdf816a3707f0d9a3f8
3,3,661ec86248881b3a8fbe93e2,65b8f42b2572d52578387b47
4,2,661ec86248881b3a8fbe93da,662809c45cd5ce502490963c
5,5,661ec86648881b3a8fbe9ae2,66297f5e8fcb1a0c34c40ffc
6,4,661ec86f48881b3a8fbea5ee,662325fee545ac54acca523c
7,3,661ec86648881b3a8fbe9ae2,65b8f42b2572d52578387b47
8,3,661ec86648881b3a8fbe9adf,662d7a8fa5620743c0c1bed7
9,5,661ec86248881b3a8fbe9458,662325fee545ac54acca523c


In [8]:
ratings['userId'] = ratings['userId'].astype(str)

## Only recommend for users have more than X rates

In [9]:
reviewer_counts = ratings.groupby('userId').size().reset_index(name='num_ratings').sort_values(by='num_ratings', ascending=False)
reviewer_counts

Unnamed: 0,userId,num_ratings
4,66297f5e8fcb1a0c34c40ffc,13
2,662325fee545ac54acca523c,11
0,65b8f42b2572d52578387b47,3
1,66204cdf816a3707f0d9a3f8,2
3,662809c45cd5ce502490963c,1
5,662d7a8fa5620743c0c1bed7,1


In [10]:
min_no_rates = 10
reviewers_with_enough_rates = reviewer_counts[reviewer_counts['num_ratings']>=min_no_rates]
reviewers_with_enough_rates

Unnamed: 0,userId,num_ratings
4,66297f5e8fcb1a0c34c40ffc,13
2,662325fee545ac54acca523c,11


In [11]:
ratings = reviewers_with_enough_rates

## Concat Rates

In [12]:
ratings = pd.concat([fake_rates,ratings ], axis=0, ignore_index=True)
ratings.tail()

Unnamed: 0,userId,productId,rating,num_ratings
33034,A2LF16F0KX9L7P,661ec89148881b3a8fbed0c7,5.0,
33035,A2EF8N7ZE7ONWX,661ec89148881b3a8fbed0c8,5.0,
33036,A379DLIAF0MFM5,661ec89148881b3a8fbed0ca,5.0,
33037,66297f5e8fcb1a0c34c40ffc,,,13.0
33038,662325fee545ac54acca523c,,,11.0


#### Real data [userID, ProductID, rating] should be replaced

In [13]:
# Divide the dataset in 70:30 ratio
trainset, testset = model_selection.train_test_split(ratings, test_size=0.3, random_state=5)

In [14]:
trainset.head()

Unnamed: 0,userId,productId,rating,num_ratings
32078,A2XPK6XCL0UH1S,661ec88d48881b3a8fbecd48,4.0,
29043,A1VHCO8RQFIGQJ,661ec88448881b3a8fbec146,5.0,
31227,AT28RL25Q2OLK,661ec88a48881b3a8fbec94d,3.0,
23220,A1HXESZHP7SXKI,661ec87548881b3a8fbeab7e,4.0,
25759,AJELAHDCBYXD3,661ec87a48881b3a8fbeb46e,1.0,


In [15]:
testset.head()

Unnamed: 0,userId,productId,rating,num_ratings
27016,A2FG061KDA1ARB,661ec87d48881b3a8fbeb855,5.0,
1121,AMUP8DYE7EAN2,661ec86248881b3a8fbe9412,5.0,
10378,AS8C336DDLSO4,661ec86648881b3a8fbe98aa,5.0,
15262,A359MYLPLEOF7N,661ec86948881b3a8fbe9d81,5.0,
25379,AZXQ0WME7X6UT,661ec87748881b3a8fbeb2f1,1.0,


In [16]:
print('Shape of the training set  :', trainset.shape)
print('Shape of the test set      :', testset.shape)

Shape of the training set  : (23127, 4)
Shape of the test set      : (9912, 4)


In [17]:
# subset_df = ratings.iloc[33037:]
# feature_values = subset_df['userId']
# users_Ids = feature_values.unique()
# users_Ids

## Sparse Matrix

In [18]:
# Create the User-Item sparse matrix
user_item = ratings.pivot(index='userId', columns='productId', values='rating').fillna(0)
print('Shape of User-Item sparse matrix:', user_item.shape)
# user_item = user_item[user_item.index.isin(users_Ids)] 
user_item.head()

Shape of User-Item sparse matrix: (2211, 15603)


productId,NaN,661ec86248881b3a8fbe93da,661ec86248881b3a8fbe93db,661ec86248881b3a8fbe93dc,661ec86248881b3a8fbe93dd,661ec86248881b3a8fbe93de,661ec86248881b3a8fbe93df,661ec86248881b3a8fbe93e0,661ec86248881b3a8fbe93e1,661ec86248881b3a8fbe93e2,...,661ec89148881b3a8fbed0c2,661ec89148881b3a8fbed0c3,661ec89148881b3a8fbed0c4,661ec89148881b3a8fbed0c5,661ec89148881b3a8fbed0c6,661ec89148881b3a8fbed0c7,661ec89148881b3a8fbed0c8,661ec89148881b3a8fbed0c9,661ec89148881b3a8fbed0ca,661ec89148881b3a8fbed0cb
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
662325fee545ac54acca523c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66297f5e8fcb1a0c34c40ffc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100UD67AHFODS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100WO06OQR8BQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A105S56ODHGJEK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
item_user = user_item.T
item_user.head()

userId,662325fee545ac54acca523c,66297f5e8fcb1a0c34c40ffc,A100UD67AHFODS,A100WO06OQR8BQ,A105S56ODHGJEK,A105TOJ6LTVMBG,A109XLG7SJQAIA,A10AFVU66A79Y1,A10CRW7XRJBJ2G,A10G4BPT5MGBHY,...,AZFF4CX9MQ4AE,AZMY6E8B52L2T,AZNUHQSHZHSUE,AZPOUCM043IY8,AZQGJ5CEAJGXB,AZV2U6GU5QA6C,AZXQ0WME7X6UT,AZYJE40XW6MFG,AZZ5ASC403N74,AZZYW4YOE1B6E
productId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
661ec86248881b3a8fbe93da,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
661ec86248881b3a8fbe93db,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
661ec86248881b3a8fbe93dc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
661ec86248881b3a8fbe93dd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cosine Similarity

In [20]:
# Calculate the user-user similarity
# subset = user_item[user_item.index.isin(users_Ids)] 
user_similarity = cosine_similarity(user_item)
np.fill_diagonal(user_similarity, 0)
user_similarity_df = pd.DataFrame(user_similarity,index=user_item.index, columns=user_item.index)
user_similarity_df.head()

userId,662325fee545ac54acca523c,66297f5e8fcb1a0c34c40ffc,A100UD67AHFODS,A100WO06OQR8BQ,A105S56ODHGJEK,A105TOJ6LTVMBG,A109XLG7SJQAIA,A10AFVU66A79Y1,A10CRW7XRJBJ2G,A10G4BPT5MGBHY,...,AZFF4CX9MQ4AE,AZMY6E8B52L2T,AZNUHQSHZHSUE,AZPOUCM043IY8,AZQGJ5CEAJGXB,AZV2U6GU5QA6C,AZXQ0WME7X6UT,AZYJE40XW6MFG,AZZ5ASC403N74,AZZYW4YOE1B6E
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
662325fee545ac54acca523c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66297f5e8fcb1a0c34c40ffc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100UD67AHFODS,0.0,0.0,0.0,0.0,0.0,0.082058,0.0,0.0,0.0,0.0,...,0.0,0.097017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100WO06OQR8BQ,0.0,0.0,0.0,0.0,0.0,0.124534,0.0,0.0,0.09244,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A105S56ODHGJEK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Calculate the item-item similarity
item_similarity = cosine_similarity(item_user)
np.fill_diagonal(item_similarity, 0)
item_similarity_df = pd.DataFrame(item_similarity, index=item_user.index, columns=item_user.index)
np.set_printoptions(threshold=np.inf)
np.seterr(over='ignore')
item_similarity_df.head()

productId,NaN,661ec86248881b3a8fbe93da,661ec86248881b3a8fbe93db,661ec86248881b3a8fbe93dc,661ec86248881b3a8fbe93dd,661ec86248881b3a8fbe93de,661ec86248881b3a8fbe93df,661ec86248881b3a8fbe93e0,661ec86248881b3a8fbe93e1,661ec86248881b3a8fbe93e2,...,661ec89148881b3a8fbed0c2,661ec89148881b3a8fbed0c3,661ec89148881b3a8fbed0c4,661ec89148881b3a8fbed0c5,661ec89148881b3a8fbed0c6,661ec89148881b3a8fbed0c7,661ec89148881b3a8fbed0c8,661ec89148881b3a8fbed0c9,661ec89148881b3a8fbed0ca,661ec89148881b3a8fbed0cb
productId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
661ec86248881b3a8fbe93da,0.0,0.0,0.0,0.030641,0.0,0.0,0.036033,0.0,0.089282,0.105112,...,0.0,0.0,0.0,0.0,0.140414,0.0,0.0,0.0,0.0,0.0
661ec86248881b3a8fbe93db,0.0,0.0,0.0,0.017515,0.029706,0.052345,0.015257,0.033395,0.057579,0.056602,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
661ec86248881b3a8fbe93dc,0.0,0.030641,0.017515,0.0,0.0,0.0,0.08711,0.0,0.053367,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
661ec86248881b3a8fbe93dd,0.0,0.0,0.029706,0.0,0.0,0.0,0.1203,0.0,0.043445,0.036137,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Get Most Close (product - user)

In [22]:
# Method to find top N neighbors
def find_n_neighbors(df,n):
    order = np.argsort(df.values, axis=1)[:, :n]
    df = df.apply(axis=1, func=lambda x: pd.Series(x.sort_values(ascending=False).iloc[:n].index,
                                                   index=['top{}'.format(i) for i in range(1, n+1)]))
    return df

In [23]:
# Find 10 neighbors of each user
user_10_neighbors = find_n_neighbors(user_similarity_df, 10)
user_10_neighbors.head()

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
662325fee545ac54acca523c,662325fee545ac54acca523c,A3JQ58CZBV3FOZ,A3J8A5L5AF5TX9,A3JAH5WW61N2EK,A3JJ222HEKM10R,A3JL3YQYI7OR5O,A3JLOIXFM75QNV,A3JNBO7H2SPL44,A3JU16JTNLVK1J,A3KHCO3MJLKLVA
66297f5e8fcb1a0c34c40ffc,662325fee545ac54acca523c,A3JQ58CZBV3FOZ,A3J8A5L5AF5TX9,A3JAH5WW61N2EK,A3JJ222HEKM10R,A3JL3YQYI7OR5O,A3JLOIXFM75QNV,A3JNBO7H2SPL44,A3JU16JTNLVK1J,A3KHCO3MJLKLVA
A100UD67AHFODS,A2OOLI2WFY4L2,AZMY6E8B52L2T,AT53ZTTO707MB,A2XX2A4OJCDNLZ,A105TOJ6LTVMBG,A1KD8NJPZ01R37,A1AFS9M75F17IZ,ADAXXCMSLC0U9,A2J7FHZFKOKGZ6,A2W0GY64CJSV5D
A100WO06OQR8BQ,A105TOJ6LTVMBG,A3L1VJMHFWONCB,A1ZXMMQPYC3Z9I,AZBXKUH4AIW3X,A1RPJHUVVSI98A,A313DADVI76HKM,AG7EF0SVBQOUX,AAK6SOEJY30YG,A2LXX47A0KMJVX,A2X6J6AFLLYVXH
A105S56ODHGJEK,AM3XNQU9TZBD8,A2UKE7GIVC7XFW,A1WKQ94M45D8MG,A1WJ3P43SZUNDM,A30UP2KKD5IQEP,A3CBCVHIK3G76X,A3VBXQKRM7A4JR,AMRMK86X3PKXD,A2SZLNSI5KOQJT,A2NW5UDW32JSV2


In [24]:
# Find 10 neighbors of each item0
item_10_neighbors = find_n_neighbors(item_similarity_df, 10)
item_10_neighbors.head()

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10
productId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,,661ec88048881b3a8fbebc8f,661ec88048881b3a8fbebc73,661ec88048881b3a8fbebc74,661ec88048881b3a8fbebc75,661ec88048881b3a8fbebc76,661ec88048881b3a8fbebc77,661ec88048881b3a8fbebc78,661ec88048881b3a8fbebc79,661ec88048881b3a8fbebc7a
661ec86248881b3a8fbe93da,661ec86248881b3a8fbe977f,661ec86b48881b3a8fbea06b,661ec86648881b3a8fbe99be,661ec86648881b3a8fbe9998,661ec88a48881b3a8fbec841,661ec88448881b3a8fbec01f,661ec89148881b3a8fbecfb4,661ec87748881b3a8fbeb277,661ec88a48881b3a8fbec7f6,661ec88a48881b3a8fbec848
661ec86248881b3a8fbe93db,661ec86248881b3a8fbe9623,661ec87248881b3a8fbea9db,661ec86f48881b3a8fbea49f,661ec87248881b3a8fbea81a,661ec87248881b3a8fbeaae7,661ec87548881b3a8fbeac09,661ec86248881b3a8fbe940f,661ec86648881b3a8fbe99ee,661ec86648881b3a8fbe9b4a,661ec86948881b3a8fbe9f38
661ec86248881b3a8fbe93dc,661ec86f48881b3a8fbea71d,661ec86248881b3a8fbe9610,661ec86648881b3a8fbe99be,661ec86948881b3a8fbe9f5c,661ec88448881b3a8fbec19d,661ec88448881b3a8fbebee3,661ec88048881b3a8fbebed0,661ec88048881b3a8fbebecd,661ec88048881b3a8fbebebd,661ec88048881b3a8fbebebb
661ec86248881b3a8fbe93dd,661ec87248881b3a8fbeaa4d,661ec87248881b3a8fbeab16,661ec86948881b3a8fbe9ca3,661ec88748881b3a8fbec37f,661ec89148881b3a8fbed079,661ec88a48881b3a8fbec6c6,661ec88a48881b3a8fbec6d8,661ec88a48881b3a8fbec6d9,661ec88a48881b3a8fbec6df,661ec87d48881b3a8fbeb857


## Predict User Rating For all Products

In [29]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)  
        mean_user_rating = np.array(mean_user_rating)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [30]:
user_item.shape

(2211, 15603)

In [31]:
user_similarity.shape

(2211, 2211)

In [32]:
#predict user rating for all products
# user_item_subset = user_item[user_item.index.isin(users_Ids)]
user_prediction = predict(user_item, user_similarity, type='user')
user_prediction = pd.DataFrame(user_prediction, index=user_item.index, columns=user_item.columns)
user_prediction.head()

  pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T


productId,NaN,661ec86248881b3a8fbe93da,661ec86248881b3a8fbe93db,661ec86248881b3a8fbe93dc,661ec86248881b3a8fbe93dd,661ec86248881b3a8fbe93de,661ec86248881b3a8fbe93df,661ec86248881b3a8fbe93e0,661ec86248881b3a8fbe93e1,661ec86248881b3a8fbe93e2,...,661ec89148881b3a8fbed0c2,661ec89148881b3a8fbed0c3,661ec89148881b3a8fbed0c4,661ec89148881b3a8fbed0c5,661ec89148881b3a8fbed0c6,661ec89148881b3a8fbed0c7,661ec89148881b3a8fbed0c8,661ec89148881b3a8fbed0c9,661ec89148881b3a8fbed0ca,661ec89148881b3a8fbed0cb
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
662325fee545ac54acca523c,,,,,,,,,,,...,,,,,,,,,,
66297f5e8fcb1a0c34c40ffc,,,,,,,,,,,...,,,,,,,,,,
A100UD67AHFODS,0.004364,0.151258,0.078193,0.096453,0.045059,0.026081,0.122723,0.051906,0.02689,0.04153,...,0.004364,0.004364,0.004364,0.004364,0.004364,0.004364,0.004364,0.004364,0.004364,0.004364
A100WO06OQR8BQ,0.000243,0.066817,0.106212,0.063464,0.083975,0.027765,0.206537,0.018979,0.038663,0.000243,...,0.000243,0.000243,0.000243,0.000243,0.000243,0.000243,0.000243,0.000243,0.000243,0.000243
A105S56ODHGJEK,-0.001372,0.22107,0.49898,-0.001372,-0.001372,0.176087,-0.001372,0.140576,-0.001372,0.072551,...,-0.001372,-0.001372,-0.001372,-0.001372,-0.001372,-0.001372,-0.001372,-0.001372,-0.001372,-0.001372


In [33]:
item_prediction = predict(user_item, item_similarity, type='item')
item_prediction.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,15593,15594,15595,15596,15597,15598,15599,15600,15601,15602
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
662325fee545ac54acca523c,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66297f5e8fcb1a0c34c40ffc,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100UD67AHFODS,,0.009501,0.003443,0.013541,0.01127,0.008865,0.013709,0.003128,0.00557,0.003615,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100WO06OQR8BQ,,0.001544,0.004095,0.003529,0.007867,0.003639,0.010932,0.00292,0.003997,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A105S56ODHGJEK,,0.014476,0.016939,0.0,0.0,0.011954,0.0,0.010943,0.0,0.007534,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Recommend

In [34]:
# Method to Recommend the items with the highest predicted ratings
def recommend_items(userId, orig_df, preds_df, top_n):
    # Get and sort the user's ratings
    sorted_user_ratings = orig_df.loc[userId].sort_values(ascending=False) #sorted_user_ratings
    
    sorted_user_predictions = preds_df.loc[userId].sort_values(ascending=False) #sorted_user_predictions
    
    # Prepare recommendations
    recommedations = pd.concat([sorted_user_ratings, sorted_user_predictions], axis=1)
    
    recommedations.index.name = 'Recommended Items'
    
    recommedations.columns = ['user_ratings', 'user_predictions']
    
    # Take the products which user has NOT rated
    recommedations = recommedations.loc[recommedations.user_ratings == 0] 
    
    recommedations = recommedations.sort_values('user_predictions', ascending=False)
    
    #print(recommedations[:top_n])
    return recommedations.head(top_n)

In [35]:
def get_result(userID):
    ## NO. recommendations
    no_products = 11
    selected_columns = ['productId', 'name','user_ratings', 'user_predictions']
    column_mapping = {'productId': 'RProductID', 'name': 'RProductName'}
    
    recom_UBCF = recommend_items(userID, user_item, user_prediction, no_products)

    recom_IBCF = recommend_items(userID, user_item, item_prediction, no_products)

    result = pd.concat([recom_UBCF, recom_IBCF], axis=0)
    result =  pd.DataFrame(result.index)
    
    result.dropna(inplace=True)
    result.drop_duplicates(subset=['Recommended Items'], inplace=True)
    display(result)
    result = result.transpose()
    result['userId'] = userID
    result = result.set_index('userId')
    num_columns = len(result.columns)
    new_columns = ['recommend {}'.format(i) for i in range(num_columns)]
    # Rename columns
    result.columns = new_columns

    return result

In [36]:
#Find recommendation for couple of users using UBCF
users = {'662325fee545ac54acca523c'}  

for user in users:
    # UBCF, IBCF = get_result(user)
    result = get_result(user)
    # display(result)
    # print("Top recommendations for the userId: %s" %(user))
    # print("-----------------------------")
    # print(type(UBCF))
    # display(result)
    # print("----------------------------------------------------------")

Unnamed: 0,Recommended Items
1,661ec88048881b3a8fbebc8f
2,661ec88048881b3a8fbebc73
3,661ec88048881b3a8fbebc74
4,661ec88048881b3a8fbebc75
5,661ec88048881b3a8fbebc76
6,661ec88048881b3a8fbebc77
7,661ec88048881b3a8fbebc78
8,661ec88048881b3a8fbebc79
9,661ec88048881b3a8fbebc7a
10,661ec88048881b3a8fbebc7b


In [37]:
## Get unique users only from database
subset_df = ratings.iloc[33037:]
feature_values = subset_df['userId']
users_Ids = feature_values.unique()
users_Ids = pd.DataFrame(users_Ids,columns=['userId'])
users_Ids

Unnamed: 0,userId
0,66297f5e8fcb1a0c34c40ffc
1,662325fee545ac54acca523c


## Apply on all users

In [38]:
result_df = pd.DataFrame()

# Loop through each value in the column
for user in users_Ids['userId']:
    # Call your function and get the result DataFrame
    result = get_result(user)
    
    # Concatenate the result DataFrame to the result_df
    result_df = pd.concat([result_df, result])
result_df.replace(pd.NA, None, inplace=True)
result_df

Unnamed: 0,Recommended Items
1,661ec88048881b3a8fbebc8f
2,661ec88048881b3a8fbebc73
3,661ec88048881b3a8fbebc74
4,661ec88048881b3a8fbebc75
5,661ec88048881b3a8fbebc76
6,661ec88048881b3a8fbebc77
7,661ec88048881b3a8fbebc78
8,661ec88048881b3a8fbebc79
9,661ec88048881b3a8fbebc7a
10,661ec88048881b3a8fbebc7b


Unnamed: 0,Recommended Items
1,661ec88048881b3a8fbebc8f
2,661ec88048881b3a8fbebc73
3,661ec88048881b3a8fbebc74
4,661ec88048881b3a8fbebc75
5,661ec88048881b3a8fbebc76
6,661ec88048881b3a8fbebc77
7,661ec88048881b3a8fbebc78
8,661ec88048881b3a8fbebc79
9,661ec88048881b3a8fbebc7a
10,661ec88048881b3a8fbebc7b


Unnamed: 0_level_0,recommend 0,recommend 1,recommend 2,recommend 3,recommend 4,recommend 5,recommend 6,recommend 7,recommend 8,recommend 9
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
66297f5e8fcb1a0c34c40ffc,661ec88048881b3a8fbebc8f,661ec88048881b3a8fbebc73,661ec88048881b3a8fbebc74,661ec88048881b3a8fbebc75,661ec88048881b3a8fbebc76,661ec88048881b3a8fbebc77,661ec88048881b3a8fbebc78,661ec88048881b3a8fbebc79,661ec88048881b3a8fbebc7a,661ec88048881b3a8fbebc7b
662325fee545ac54acca523c,661ec88048881b3a8fbebc8f,661ec88048881b3a8fbebc73,661ec88048881b3a8fbebc74,661ec88048881b3a8fbebc75,661ec88048881b3a8fbebc76,661ec88048881b3a8fbebc77,661ec88048881b3a8fbebc78,661ec88048881b3a8fbebc79,661ec88048881b3a8fbebc7a,661ec88048881b3a8fbebc7b


### Performance Metrics

In [None]:
# def calculate_rmse(orig_df, preds_df):
#     rmse_df = pd.concat([orig_df.mean(), preds_df.mean()], axis=1)
#     rmse_df.columns = ['Avg_actual_ratings', 'Avg_predicted_ratings']
#     RMSE = round(np.sqrt(((rmse_df.Avg_actual_ratings - rmse_df.Avg_predicted_ratings) ** 2).mean()), 5)*100
#     print('\nRMSE for this recommender model = {}%\n'.format(RMSE))
#     return rmse_df.head()

In [None]:
# RMSE = calculate_rmse(user_item, user_prediction)

## Save result to DB

In [None]:
# CF_recom = db["CF_recom"]

# # Drop the collection if it exists
# if CF_recom.name in db.list_collection_names():
#     CF_recom.drop()

# data = result_df.to_dict(orient='records')

# # Insert data into MongoDB
# CF_recom.insert_many(data)
# print("DataFrame successfully saved to MongoDB collection.")