# Neighborhood-Based Collaborative Filtering applied in MovieLens Dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10, 8)

### 1. Creating user_ratings - movies dataframe from MovieLens dataset

#### Loading datasets

In [2]:
# loading ratings data
df_ratings = pd.read_csv('../data/movies_small/ratings.csv')
df_ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# loading movies data
df_movies = pd.read_csv('../data/movies_small/movies.csv')
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


#### Dropping duplicates from movies dataset

In [4]:
# checking for duplicated titles in movies data 
df_movies.duplicated(subset='title').value_counts()

False    9737
True        5
dtype: int64

In [5]:
# dropping duplicates 
df_movies.drop_duplicates(subset ="title", keep = 'first', inplace = True)
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


#### Merging ratings and movies datasets

In [58]:
df_ratings['movieId'].nunique()

9724

In [59]:
df_movies['movieId'].nunique()

9737

In [6]:
# merging ratings and movies data
# not rated movies must be dropped at merging the two dataframes, cause later their NaNs cannot be filled with mean
df_merged = pd.merge(df_movies, df_ratings, on='movieId', how='right')
df_merged

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,964981247
2,6,Heat (1995),Action|Crime|Thriller,1,4.0,964982224
3,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1,5.0,964983815
4,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1,5.0,964982931
...,...,...,...,...,...,...
100831,166534,Split (2017),Drama|Horror|Thriller,610,4.0,1493848402
100832,168248,John Wick: Chapter Two (2017),Action|Crime|Thriller,610,5.0,1493850091
100833,168250,Get Out (2017),Horror,610,5.0,1494273047
100834,168252,Logan (2017),Action|Sci-Fi,610,5.0,1493846352


In [57]:
# drop 
df_merged['title'].isnull().values.any()

True

In [7]:
# dropping the columns we dont need  
df_merged.drop(columns=['timestamp', 'genres', 'movieId'], inplace=True)

Unnamed: 0,title,userId,rating
0,Toy Story (1995),1,4.0
1,Grumpier Old Men (1995),1,4.0
2,Heat (1995),1,4.0
3,Seven (a.k.a. Se7en) (1995),1,5.0
4,"Usual Suspects, The (1995)",1,5.0
...,...,...,...
100831,Split (2017),610,4.0
100832,John Wick: Chapter Two (2017),610,5.0
100833,Get Out (2017),610,5.0
100834,Logan (2017),610,5.0


In [12]:
# transposing dataframe, users to columns, movies to index 
df_user_movie = df_merged.pivot_table(values='rating', columns='userId', index='title')
df_user_movie

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),,,,,,,,,,,...,,,5.0,,,,,4.5,,
xXx (2002),,,,,,,,,1.0,,...,,,,,,,,3.5,,2.0
xXx: State of the Union (2005),,,,,,,,,,,...,,,,,,,,,,1.5
¡Three Amigos! (1986),4.0,,,,,,,,,,...,,,,,,,,,,


### 2. Creating cosine similarity matrix

In [15]:
def cosim(X, Y):
    num = np.nansum(X*Y) #np.dot(X, Y)
    denom = np.sqrt(np.nansum(X*X)*np.nansum(Y*Y)) # np.sqrt(np.dot(X, X)) * np.sqrt(np.dot(Y, Y))
    return num/denom

In [17]:
cosim_table = []
for user1 in df_user_movie.columns:
    row = []
    for user2 in df_user_movie.columns:
        row.append(cosim(df_user_movie[user1], df_user_movie[user2]))
    cosim_table.append(row)

In [19]:
df_cosim = pd.DataFrame(cosim_table, index=df_user_movie.columns, columns=df_user_movie.columns).round(2)

In [20]:
df_cosim

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.00,0.03,0.06,0.19,0.13,0.13,0.16,0.14,0.06,0.02,...,0.08,0.16,0.22,0.07,0.15,0.16,0.27,0.29,0.09,0.15
2,0.03,1.00,0.00,0.00,0.02,0.03,0.03,0.03,0.00,0.07,...,0.20,0.02,0.01,0.00,0.00,0.03,0.01,0.05,0.03,0.10
3,0.06,0.00,1.00,0.00,0.01,0.00,0.00,0.00,0.00,0.00,...,0.01,0.00,0.02,0.00,0.01,0.01,0.02,0.02,0.00,0.03
4,0.19,0.00,0.00,1.00,0.13,0.09,0.12,0.06,0.01,0.03,...,0.09,0.13,0.31,0.05,0.08,0.20,0.13,0.15,0.03,0.11
5,0.13,0.02,0.01,0.13,1.00,0.30,0.11,0.43,0.00,0.03,...,0.07,0.42,0.11,0.26,0.15,0.11,0.15,0.14,0.26,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.16,0.03,0.01,0.20,0.11,0.10,0.20,0.10,0.08,0.09,...,0.18,0.12,0.30,0.07,0.15,1.00,0.15,0.26,0.07,0.20
607,0.27,0.01,0.02,0.13,0.15,0.16,0.19,0.19,0.01,0.01,...,0.09,0.20,0.20,0.14,0.12,0.15,1.00,0.28,0.15,0.14
608,0.29,0.05,0.02,0.15,0.14,0.18,0.32,0.19,0.10,0.08,...,0.16,0.20,0.23,0.16,0.18,0.26,0.28,1.00,0.12,0.32
609,0.09,0.03,0.00,0.03,0.26,0.21,0.09,0.42,0.00,0.02,...,0.04,0.34,0.06,0.24,0.10,0.07,0.15,0.12,1.00,0.05


### 3. Making predictions

#### 3.1. Picking a target user

In [48]:
df_user = pd.DataFrame(df_user_movie[42])
df_user

Unnamed: 0_level_0,42
title,Unnamed: 1_level_1
'71 (2014),
'Hellboy': The Seeds of Creation (2004),
'Round Midnight (1986),
'Salem's Lot (2004),
'Til There Was You (1997),
...,...
eXistenZ (1999),
xXx (2002),
xXx: State of the Union (2005),
¡Three Amigos! (1986),5.0


#### 3.2. Select the movies they haven't seen

In [28]:
df_user = df_user.loc[df_user[42].isnull()]
df_user

Unnamed: 0_level_0,42
title,Unnamed: 1_level_1
'71 (2014),
'Hellboy': The Seeds of Creation (2004),
'Round Midnight (1986),
'Salem's Lot (2004),
'Til There Was You (1997),
...,...
anohana: The Flower We Saw That Day - The Movie (2013),
eXistenZ (1999),
xXx (2002),
xXx: State of the Union (2005),


#### 3.3. For each of the movies they haven't seen, get the list of users who have seen it

In [30]:
df_user2 = df_user_movie.loc[df_user_movie[42].isnull()]
df_user2.fillna(value=0, inplace=True)
df_user2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
anohana: The Flower We Saw That Day - The Movie (2013),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5


#### 3.4. Calculate the predicted rating as a cosine similarity weighted average of the ratings of the users who have seen it¶

In [202]:
# cosine similarity of the picked user with the other users
# other way: cosim_user = pd.DataFrame(df_cosim.iloc[41])
cosim_user = pd.DataFrame(df_cosim[42])
cosim_user

Unnamed: 0_level_0,42
userId,Unnamed: 1_level_1
1,0.26
2,0.04
3,0.01
4,0.15
5,0.13
...,...
606,0.21
607,0.22
608,0.34
609,0.15


In [37]:
# calculating numerator by multiplying the cosine similarity with the ratings of other users for the movies 
# which has not been seen by the user 
num = np.dot(df_user2, cosim_user)
num

array([[ 0.48 ],
       [ 0.76 ],
       [ 0.875],
       ...,
       [13.535],
       [ 1.615],
       [ 0.12 ]])

In [200]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.where.html 
# replace values where the condition is False.
# all cells will be 1 where where there is a rating
df_rating_count = df_user2.where(df_user2 == 0, 1)
df_rating_count

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
anohana: The Flower We Saw That Day - The Movie (2013),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [205]:
# calculating denominator by summing the weight of cosine similarity of users who have seen the movie
denom = np.dot(df_rating_count, cosim_user)
denom

array([[0.12],
       [0.19],
       [0.25],
       ...,
       [4.78],
       [0.82],
       [0.12]])

In [43]:
# calculating predicted ratings for the not seen movies of the picked user by dividing numerator and denominator
# sum_over_n(similarity*rating)/sum_over_n(similarity)
predicted_ratings = num/denom
predicted_ratings

  predicted_ratings = num/denom


array([[4.        ],
       [4.        ],
       [3.5       ],
       ...,
       [2.83158996],
       [1.9695122 ],
       [1.        ]])

In [44]:
# creating dataframe from the numpy array of predicted ratings
df_predicted_ratings = pd.DataFrame(predicted_ratings, columns=["predicted_ratings"])
df_predicted_ratings

Unnamed: 0,predicted_ratings
0,4.000000
1,4.000000
2,3.500000
3,5.000000
4,3.823529
...,...
9274,
9275,3.869168
9276,2.831590
9277,1.969512


In [54]:
df_predicted_ratings.isna().sum()

predicted_ratings    43
dtype: int64

In [49]:
df_recommendation = pd.DataFrame(df_user.reset_index())

In [51]:
df_recommendation['predicted_ratings'] = df_predicted_ratings['predicted_ratings']
df_recommendation

Unnamed: 0,title,42,predicted_ratings
0,'71 (2014),,4.000000
1,'Hellboy': The Seeds of Creation (2004),,4.000000
2,'Round Midnight (1986),,3.500000
3,'Salem's Lot (2004),,5.000000
4,'Til There Was You (1997),,3.823529
...,...,...,...
9714,eXistenZ (1999),,
9715,xXx (2002),,
9716,xXx: State of the Union (2005),,
9717,¡Three Amigos! (1986),5.0,


#### 3.5. Removing movies which were rated only by none or few users

In [60]:
df_rating_count

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
anohana: The Flower We Saw That Day - The Movie (2013),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [61]:
# calculating number of users who watched a movie 
count = pd.DataFrame(np.sum(df_rating_count, axis=1)).reset_index()
count.rename(columns={0 : 'count'}, inplace=True)
count

Unnamed: 0,title,count
0,'71 (2014),1.0
1,'Hellboy': The Seeds of Creation (2004),1.0
2,'Round Midnight (1986),2.0
3,'Salem's Lot (2004),1.0
4,'Til There Was You (1997),2.0
...,...,...
9274,anohana: The Flower We Saw That Day - The Movi...,1.0
9275,eXistenZ (1999),22.0
9276,xXx (2002),24.0
9277,xXx: State of the Union (2005),5.0


In [63]:
df_recommendation_filtered = pd.merge(df_recommendation, count, on='title')
df_recommendation_filtered

Unnamed: 0,title,42,predicted_ratings,count
0,'71 (2014),,4.000000,1.0
1,'Hellboy': The Seeds of Creation (2004),,4.000000,1.0
2,'Round Midnight (1986),,3.500000,2.0
3,'Salem's Lot (2004),,5.000000,1.0
4,'Til There Was You (1997),,3.823529,2.0
...,...,...,...,...
9274,anohana: The Flower We Saw That Day - The Movi...,,,1.0
9275,eXistenZ (1999),,,22.0
9276,xXx (2002),,,24.0
9277,xXx: State of the Union (2005),,,5.0


In [66]:
# most recommended movies 
df_recommendation_filtered.loc[df_recommendation_filtered['count'] > 20].sort_values(by='predicted_ratings', ascending=False).drop(columns=42).head(25)

Unnamed: 0,title,predicted_ratings,count
7446,Snow White and the Seven Dwarfs (1937),5.0,77.0
2102,"Day the Earth Stood Still, The (1951)",5.0,25.0
4442,Jurassic Park III (2001),5.0,36.0
320,All About Eve (1950),5.0,24.0
8471,Toy Story 2 (1999),5.0,97.0
8327,Three Colors: Blue (Trois couleurs: Bleu) (1993),5.0,24.0
8283,"Thing, The (1982)",5.0,45.0
3589,Harry Potter and the Deathly Hallows: Part 1 (...,5.0,47.0
2256,Dial M for Murder (1954),5.0,25.0
1745,Close Encounters of the Third Kind (1977),5.0,60.0


### 4. Recommendation with the most similar users

#### 4.1. Finding the most similar users

In [67]:
df_cosim

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.00,0.03,0.06,0.19,0.13,0.13,0.16,0.14,0.06,0.02,...,0.08,0.16,0.22,0.07,0.15,0.16,0.27,0.29,0.09,0.15
2,0.03,1.00,0.00,0.00,0.02,0.03,0.03,0.03,0.00,0.07,...,0.20,0.02,0.01,0.00,0.00,0.03,0.01,0.05,0.03,0.10
3,0.06,0.00,1.00,0.00,0.01,0.00,0.00,0.00,0.00,0.00,...,0.01,0.00,0.02,0.00,0.01,0.01,0.02,0.02,0.00,0.03
4,0.19,0.00,0.00,1.00,0.13,0.09,0.12,0.06,0.01,0.03,...,0.09,0.13,0.31,0.05,0.08,0.20,0.13,0.15,0.03,0.11
5,0.13,0.02,0.01,0.13,1.00,0.30,0.11,0.43,0.00,0.03,...,0.07,0.42,0.11,0.26,0.15,0.11,0.15,0.14,0.26,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.16,0.03,0.01,0.20,0.11,0.10,0.20,0.10,0.08,0.09,...,0.18,0.12,0.30,0.07,0.15,1.00,0.15,0.26,0.07,0.20
607,0.27,0.01,0.02,0.13,0.15,0.16,0.19,0.19,0.01,0.01,...,0.09,0.20,0.20,0.14,0.12,0.15,1.00,0.28,0.15,0.14
608,0.29,0.05,0.02,0.15,0.14,0.18,0.32,0.19,0.10,0.08,...,0.16,0.20,0.23,0.16,0.18,0.26,0.28,1.00,0.12,0.32
609,0.09,0.03,0.00,0.03,0.26,0.21,0.09,0.42,0.00,0.02,...,0.04,0.34,0.06,0.24,0.10,0.07,0.15,0.12,1.00,0.05


In [83]:
# selecting one user and finding similar users and sorting 
cosim_similar = pd.DataFrame(df_cosim[42])
cosim_similar.drop(index=42, inplace=True)
cosim_similar = cosim_user.sort_values(by=42, ascending=False).head(20)
cosim_similar

Unnamed: 0_level_0,42
userId,Unnamed: 1_level_1
45,0.37
368,0.35
608,0.34
68,0.34
453,0.33
555,0.33
414,0.33
135,0.33
64,0.33
19,0.32


In [72]:
# creating a list of similar users
similar_user = list(cosim_user.index)[:20]
similar_user

[45,
 368,
 608,
 68,
 453,
 555,
 414,
 135,
 64,
 266,
 590,
 597,
 19,
 452,
 599,
 239,
 570,
 480,
 217,
 330]

#### 4. 2. Calculating average movie ratings of the similar users

In [90]:
df_movie_user = df_merged.pivot_table(values='rating', columns='title', index='userId')

In [91]:
# df_movie_user.fillna(df_movie_user.mean(), inplace=True)
df_movie_user.fillna(0, inplace=True)

In [92]:
df_movie_user

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
df_similar = df_movie_user.loc[similar_user]
df_similar

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0
68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0
453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
555,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
414,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,3.0,0.0
135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Numerator

In [94]:
# calculating numerator by multiplying the cosine similarity with the ratings of other users for the movies 
# which has not been seen by the user 
num = np.dot(cosim_similar.T, df_similar)
num.shape

(1, 9719)

#### Denominator

In [120]:
df_rating_count = df_similar.where(df_similar == 0, 1)
df_rating_count

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
555,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
414,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
# calculating denominator by summing the cosine similarity of users who have seen the movie
denom = np.dot(cosim_similar.T, df_rating_count)
denom.shape

(1, 9719)

In [109]:
# calculating predicted ratings for the not seen movies of the picked user by dividing numerator and denominator
# sum_over_n(similarity*rating)/sum_over_n(similarity)
predicted_ratings = num/denom
predicted_ratings

  predicted_ratings = num/denom


In [118]:
df_predicted_ratings = pd.DataFrame(predicted_ratings.T, columns=["predicted_ratings"])
df_predicted_ratings

Unnamed: 0,predicted_ratings
0,
1,
2,
3,
4,
...,...
9714,3.763359
9715,3.168317
9716,
9717,2.825581


In [125]:
df_movies2 = df_rating_count.T.reset_index()
df_movies2 = pd.DataFrame(df_movies2['title'])
df_movies2 

Unnamed: 0,title
0,'71 (2014)
1,'Hellboy': The Seeds of Creation (2004)
2,'Round Midnight (1986)
3,'Salem's Lot (2004)
4,'Til There Was You (1997)
...,...
9714,eXistenZ (1999)
9715,xXx (2002)
9716,xXx: State of the Union (2005)
9717,¡Three Amigos! (1986)


In [189]:
df_recommendation2 = df_movies2.join(df_predicted_ratings).set_index('title')
# df_recommendation2.sort_values(by='predicted_ratings', ascending=False).head(20)
df_recommendation2

Unnamed: 0_level_0,predicted_ratings
title,Unnamed: 1_level_1
'71 (2014),
'Hellboy': The Seeds of Creation (2004),
'Round Midnight (1986),
'Salem's Lot (2004),
'Til There Was You (1997),
...,...
eXistenZ (1999),3.763359
xXx (2002),3.168317
xXx: State of the Union (2005),
¡Three Amigos! (1986),2.825581


#### 4.3. Removing movies which were rated only by none or few users

In [172]:
#df_rating_count

In [184]:
# calculating number of users who watched a movie, this time with axis=0
count = pd.DataFrame(np.sum(df_rating_count, axis=0)).reset_index()
count.rename(columns={0 : 'count'}, inplace=True)
count

Unnamed: 0,title,count
0,'71 (2014),0.0
1,'Hellboy': The Seeds of Creation (2004),0.0
2,'Round Midnight (1986),0.0
3,'Salem's Lot (2004),0.0
4,'Til There Was You (1997),0.0
...,...,...
9714,eXistenZ (1999),4.0
9715,xXx (2002),3.0
9716,xXx: State of the Union (2005),0.0
9717,¡Three Amigos! (1986),8.0


In [190]:
# adding count column 
df_recommendation2 = pd.merge(df_recommendation2, count, on='title')
df_recommendation2

Unnamed: 0,title,predicted_ratings,count
0,'71 (2014),,0.0
1,'Hellboy': The Seeds of Creation (2004),,0.0
2,'Round Midnight (1986),,0.0
3,'Salem's Lot (2004),,0.0
4,'Til There Was You (1997),,0.0
...,...,...,...
9714,eXistenZ (1999),3.763359,4.0
9715,xXx (2002),3.168317,3.0
9716,xXx: State of the Union (2005),,0.0
9717,¡Three Amigos! (1986),2.825581,8.0


In [192]:
df_recommendation2 = df_recommendation2.loc[df_recommendation2['count'] > 10].sort_values(by='predicted_ratings', ascending=False)
df_recommendation2 

Unnamed: 0,title,predicted_ratings,count
6808,"Princess Bride, The (1987)",4.657040,17.0
7593,"Shawshank Redemption, The (1994)",4.573634,13.0
8002,Star Wars: Episode V - The Empire Strikes Back...,4.566718,20.0
5512,"Matrix, The (1999)",4.559677,19.0
3555,Good Will Hunting (1997),4.504155,11.0
...,...,...,...
839,Batman Forever (1995),2.402893,15.0
1151,"Blair Witch Project, The (1999)",2.353774,13.0
834,Batman & Robin (1997),2.100559,11.0
1932,Coneheads (1993),1.968056,11.0


#### 4. 4. Dropping movies that the user has already seen

In [193]:
df_recommendation2 = pd.merge(df_recommendation2, df_user, on='title')
df_recommendation2

Unnamed: 0,title,predicted_ratings,count,42
0,"Princess Bride, The (1987)",4.657040,17.0,5.0
1,"Shawshank Redemption, The (1994)",4.573634,13.0,4.0
2,Star Wars: Episode V - The Empire Strikes Back...,4.566718,20.0,3.0
3,"Matrix, The (1999)",4.559677,19.0,5.0
4,Good Will Hunting (1997),4.504155,11.0,5.0
...,...,...,...,...
214,Batman Forever (1995),2.402893,15.0,2.0
215,"Blair Witch Project, The (1999)",2.353774,13.0,
216,Batman & Robin (1997),2.100559,11.0,
217,Coneheads (1993),1.968056,11.0,


In [194]:
df_recommendation2 = df_recommendation2.loc[df_recommendation2[42].isnull()]
df_recommendation2

Unnamed: 0,title,predicted_ratings,count,42
8,Raiders of the Lost Ark (Indiana Jones and the...,4.466486,17.0,
9,Memento (2000),4.457983,11.0,
12,"Lord of the Rings: The Fellowship of the Ring,...",4.398824,13.0,
14,American Beauty (1999),4.395586,18.0,
15,Apocalypse Now (1979),4.384161,13.0,
...,...,...,...,...
210,"Honey, I Shrunk the Kids (1989)",2.580282,11.0,
213,"Lost World: Jurassic Park, The (1997)",2.415816,12.0,
215,"Blair Witch Project, The (1999)",2.353774,13.0,
216,Batman & Robin (1997),2.100559,11.0,


In [195]:
# 25 most recommended movies 
df_recommendation2.sort_values(by='predicted_ratings', ascending=False).drop(columns=42).head(25)

Unnamed: 0,title,predicted_ratings,count
8,Raiders of the Lost Ark (Indiana Jones and the...,4.466486,17.0
9,Memento (2000),4.457983,11.0
12,"Lord of the Rings: The Fellowship of the Ring,...",4.398824,13.0
14,American Beauty (1999),4.395586,18.0
15,Apocalypse Now (1979),4.384161,13.0
16,Snatch (2000),4.382952,12.0
17,Trainspotting (1996),4.381797,13.0
20,Aliens (1986),4.32969,17.0
23,Léon: The Professional (a.k.a. The Professiona...,4.289541,12.0
24,Indiana Jones and the Last Crusade (1989),4.287523,17.0
