# Case Study by: Mohammed Saqlain Attar

In [1]:
import numpy as np
import pandas as pd

In [2]:
df_ratings = pd.read_csv('BX-Book-Ratings.csv', encoding='latin-1')
df_ratings

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6
...,...,...,...
1048570,250764,451410777,0
1048571,250764,452264464,8
1048572,250764,048623715X,0
1048573,250764,486256588,0


### Taking only 10k records

In [3]:
df_ratings.sort_values(by=['user_id'],inplace=True)
df_ratings = df_ratings.head(10000)
df_ratings.reset_index(inplace=True)
df_ratings.drop(columns='index',inplace=True)
df_ratings

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,user_id,isbn,rating
0,2,195153448,0
1,7,34542252,0
2,8,1567407781,6
3,8,1881320189,7
4,8,1575663937,6
...,...,...,...
9995,3728,449908119,7
9996,3728,61057819,0
9997,3728,553574671,9
9998,3728,60008776,7


### We will recommend movies based on user-user similarity and item-item similarity (Collaborative Filtering). For that, first we need to calculate the number of unique users and movies.

### Creating a user-item matrix which can be used to calculate the similarity between users and items.

In [4]:
df_ratings_pivot = df_ratings.pivot_table(index='user_id', columns='isbn',values='rating').fillna(0)
df_ratings_pivot

isbn,000636988X,000649840X,002026478X,002542730X,006000438X,006015456X,006015781X,006016185X,006017143X,006017935X,...,9997522052,B000065V20,B00008NRHQ,B00009ANY9,B0000DAPP1,B460712002,BCID694577184,NONFICTION,O76790592X,O809463121
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Calculating piarwise_distance using cosine similarity

In [5]:
from sklearn.metrics import pairwise_distances

In [6]:
user_similarity = pairwise_distances(df_ratings_pivot, metric='cosine')

In [7]:
user_similarity

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [8]:
item_similarity = pairwise_distances(df_ratings_pivot.T, metric='cosine')

In [9]:
item_similarity

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [10]:
item_similarity.shape

(8742, 8742)

In [11]:
user_similarity.shape

(1323, 1323)

### Make predictions based on user and item similarity

In [12]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

## Predictions

In [13]:
user_prediction = predict(df_ratings_pivot, user_similarity, type='user')
item_prediction = predict(df_ratings_pivot, item_similarity, type='item')

In [14]:
user_prediction

array([[ 0.00375197,  0.00375197, -0.00305075, ..., -0.00305075,
        -0.00305075,  0.00072854],
       [ 0.00375197,  0.00375197, -0.00305075, ..., -0.00305075,
        -0.00305075,  0.00072854],
       [ 0.0082194 ,  0.0082194 ,  0.00141153, ...,  0.00141153,
         0.00141153,  0.00519368],
       ...,
       [ 0.01794993,  0.01794993,  0.01114207, ...,  0.01114207,
         0.01114207,  0.01492421],
       [ 0.00638777,  0.00638777, -0.00042009, ..., -0.00042009,
        -0.00042009,  0.00336205],
       [ 0.00375197,  0.00375197, -0.00305075, ..., -0.00305075,
        -0.00305075,  0.00072854]])

In [15]:
item_prediction.values

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00448543, 0.00446813, 0.00446122, ..., 0.00446122, 0.00446122,
        0.00446224],
       ...,
       [0.01426137, 0.01420636, 0.0141844 , ..., 0.0141844 , 0.0141844 ,
        0.01418764],
       [0.00264525, 0.00263505, 0.00263098, ..., 0.00263098, 0.00263098,
        0.00263158],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [16]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(pred, test):
    pred = pred[test.nonzero()].flatten() 
    test = test[test.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, test))

In [17]:
rmse(user_prediction,df_ratings_pivot.values)

7.871811192426757

In [18]:
rmse(item_prediction.values, df_ratings_pivot.values)

7.905131468547967