## Collaborative Filtering Recommendation Engine

In [6]:
# Import libraries
import pandas as pd
import numpy as np

In [7]:
# https://grouplens.org/datasets/movielens/latest/
movies_df = pd.read_csv('../datasets/movies.csv',
                        usecols=['movieId','title'],
                        dtype={'movieId': 'int32', 'title': 'str'})
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [8]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int32 
 1   title    9742 non-null   object
dtypes: int32(1), object(1)
memory usage: 114.3+ KB


In [9]:
ratings_df = pd.read_csv('../datasets/ratings.csv',
                         usecols=['userId', 'movieId', 'rating'],
                         dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [10]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int32  
 1   movieId  100836 non-null  int32  
 2   rating   100836 non-null  float32
dtypes: float32(1), int32(2)
memory usage: 1.2 MB


In [11]:
# Merge two dataframs based on 'movieId' column
merged_df = ratings_df.merge(movies_df, on='movieId')
merged_df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,1,3,4.0,Grumpier Old Men (1995)
2,1,6,4.0,Heat (1995)
3,1,47,5.0,Seven (a.k.a. Se7en) (1995)
4,1,50,5.0,"Usual Suspects, The (1995)"


In [12]:
merged_df.isna().sum()

userId     0
movieId    0
rating     0
title      0
dtype: int64

In [13]:
merged_df.shape

(100836, 4)

In [14]:
# Group movies by title
movie_groupby_title = merged_df.groupby(by = ['title'])
movie_groupby_title.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,1,3,4.0,Grumpier Old Men (1995)
2,1,6,4.0,Heat (1995)
3,1,47,5.0,Seven (a.k.a. Se7en) (1995)
4,1,50,5.0,"Usual Suspects, The (1995)"
...,...,...,...,...
100823,610,160836,3.0,Hazard (2005)
100825,610,161634,4.0,Don't Breathe (2016)
100827,610,163937,3.5,Blair Witch (2016)
100828,610,163981,3.5,31 (2016)


In [15]:
# Calculate total number of ratings for each movie
movie_ratingCount = movie_groupby_title.count().reset_index()
movie_ratingCount.head()

Unnamed: 0,title,userId,movieId,rating
0,'71 (2014),1,1,1
1,'Hellboy': The Seeds of Creation (2004),1,1,1
2,'Round Midnight (1986),2,2,2
3,'Salem's Lot (2004),1,1,1
4,'Til There Was You (1997),2,2,2


In [16]:
# Rename the 'rating' column and keep 'title' and 'totalRatingCount' columns
rename_dict = {'rating': 'totalRatingCount'}
movie_total_ratingCount = movie_ratingCount.rename(columns=rename_dict)[['title', 'totalRatingCount']]

movie_total_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [17]:
# Merge the dataframe
combined_rating_totalRatingCount = merged_df.merge(movie_total_ratingCount, left_on = 'title', right_on = 'title', how = 'left')

combined_rating_totalRatingCount.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,1,3,4.0,Grumpier Old Men (1995),52
2,1,6,4.0,Heat (1995),102
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),203
4,1,50,5.0,"Usual Suspects, The (1995)",204


In [18]:
totalRatingCount_threshold = 50
popular_movies = combined_rating_totalRatingCount.query('totalRatingCount >= @totalRatingCount_threshold')
popular_movies.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,1,3,4.0,Grumpier Old Men (1995),52
2,1,6,4.0,Heat (1995),102
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),203
4,1,50,5.0,"Usual Suspects, The (1995)",204


In [19]:
# Create a Pivot matrix
movies_userId_table = popular_movies.pivot_table(index='title', columns='userId',values='rating').fillna(0)

movies_userId_table.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [20]:
# Calculate cosine similarity matrix
from scipy.sparse import csr_matrix

movies_userId_matrix = csr_matrix(movies_userId_table.values)
movies_userId_matrix

<450x606 sparse matrix of type '<class 'numpy.float32'>'
	with 41360 stored elements in Compressed Sparse Row format>

In [21]:
# Use Nearest Neighbors for calculating cosine similarity
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(movies_userId_matrix)

In [22]:
# Let's choose a random movie from the matrix and see the recommendations
movie_index = np.random.choice(movies_userId_matrix.shape[0])
print(movie_index)

343


In [23]:
# Convert to dense format (NumPy array)
dense_matrix = movies_userId_matrix.toarray()

distances, indices = model_knn.kneighbors(dense_matrix[movie_index, :].reshape(1, -1), n_neighbors=6)


In [24]:
# Which movies are similar to the chosen movie?
for i in range(0, len(distances.flatten())):
    if i == 0:
        print(f'Recommendations for {movies_userId_table.index[movie_index]}:\n')
    else:
        recommended_movie_index = indices.flatten()[i]
        print(f'{i}: {movies_userId_table.index[recommended_movie_index]}, with distance of {distances.flatten()[i]:.2f}')


Recommendations for Scream (1996):

1: Blair Witch Project, The (1999), with distance of 0.50
2: Jaws (1975), with distance of 0.51
3: Face/Off (1997), with distance of 0.52
4: Indiana Jones and the Temple of Doom (1984), with distance of 0.55
5: Austin Powers: International Man of Mystery (1997), with distance of 0.56
