In this project I will explore the IMDB dataset using K-Nearest Neighbours to make recommendations

K-nearest neighbours is an unsupervised method to find similar datapoints based of the assumption that if two datapoints have similar feature vectors, then the datapoints are likely to have the same label.

This model is non-parametric. As such the time complexity of training is 0, and the space complexity is O(nd). However, the classification of a point is O(ndk) and O(nd) in space. where d is number of features, n number of datapoints, and k is the number of neighbours. K is a hyperparameter of the model. This model can be used for classification.

In [105]:
import pandas
import numpy

# load dataframe
movies_dataframe = pandas.read_csv("movies.csv")
# change dataframe index ID to movieID which is already an index
movies_dataframe.set_index("movieId", inplace=True)

movies_dataframe

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [106]:
# load in ratings Dataset
ratings_dataframe = pandas.read_csv("ratings.csv")

ratings_dataframe

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [107]:
# add how many times the movie has been rated as a new column
total_counts = ratings_dataframe["movieId"].value_counts()
movies_dataframe["ratingscount"] = total_counts
movies_dataframe

Unnamed: 0_level_0,title,genres,ratingscount
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0
2,Jumanji (1995),Adventure|Children|Fantasy,110.0
3,Grumpier Old Men (1995),Comedy|Romance,52.0
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0
5,Father of the Bride Part II (1995),Comedy,49.0
...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,1.0
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,1.0
193585,Flint (2017),Drama,1.0
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,1.0


In [108]:
# sort movies by number of ratings and show the 10 most rated movies
movies_dataframe.sort_values("ratingscount", ascending=False).head(10)

Unnamed: 0_level_0,title,genres,ratingscount
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,238.0
110,Braveheart (1995),Action|Drama|War,237.0
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0
527,Schindler's List (1993),Drama|War,220.0


In [109]:
# find the average rating of each movie ID in the ratings dataframe
average_ratings = ratings_dataframe.groupby("movieId").mean()["rating"]
average_ratings

movieId
1         3.920930
2         3.431818
3         3.259615
4         2.357143
5         3.071429
            ...   
193581    4.000000
193583    3.500000
193585    3.500000
193587    3.500000
193609    4.000000
Name: rating, Length: 9724, dtype: float64

In [110]:
#Add the average ratings to the movies dataframe as a new column
movies_dataframe["averageRatings"] = average_ratings
movies_dataframe

Unnamed: 0_level_0,title,genres,ratingscount,averageRatings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0,3.920930
2,Jumanji (1995),Adventure|Children|Fantasy,110.0,3.431818
3,Grumpier Old Men (1995),Comedy|Romance,52.0,3.259615
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0,2.357143
5,Father of the Bride Part II (1995),Comedy,49.0,3.071429
...,...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,1.0,4.000000
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,1.0,3.500000
193585,Flint (2017),Drama,1.0,3.500000
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,1.0,3.500000


In [111]:
#sort dataframe by number of ratings, then by average ratings
movies_dataframe.sort_values(["ratingscount", "averageRatings"], ascending=False).head(20)

Unnamed: 0_level_0,title,genres,ratingscount,averageRatings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0,4.164134
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0,4.197068
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0,4.16129
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0,4.192446
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0,4.231076
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,238.0,3.75
110,Braveheart (1995),Action|Drama|War,237.0,4.031646
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0,3.970982
527,Schindler's List (1993),Drama|War,220.0,4.225


In [132]:
# set threatsholds values such as min number of reviews and create a new dataframe
min_ratings = 100
min_ratings_subset = movies_dataframe.query(f"ratingscount >= {min_ratings}").sort_values("ratingscount", ascending=False)
min_ratings_subset

Unnamed: 0_level_0,title,genres,ratingscount,averageRatings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0,4.164134
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0,4.197068
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0,4.161290
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0,4.192446
...,...,...,...,...
434,Cliffhanger (1993),Action|Adventure|Thriller,101.0,3.034653
1517,Austin Powers: International Man of Mystery (1...,Action|Adventure|Comedy,100.0,3.535000
44191,V for Vendetta (2006),Action|Sci-Fi|Thriller|IMAX,100.0,3.885000
4022,Cast Away (2000),Drama,100.0,3.700000


Finding the K-nearest neighbours manually using vector lengths

In [142]:
# Finds the movie ratings of a user
def find_user_ratings(userId):
	user_ratings = ratings_dataframe.query(f"userId == {userId}")
	return user_ratings[["movieId", "rating"]].set_index("movieId")

# Find the distance between 2 users based off there movie ratings
def find_distance_between_real_users(userId_1, userId_2):
	ratings_user1 = find_user_ratings(userId_1)
	ratings_user2 = find_user_ratings(userId_2)
	ratings_compar = ratings_user1.join(ratings_user2, lsuffix="_1", rsuffix="_2").dropna()
	compar_1 = ratings_compar["rating_1"]
	compar_2 = ratings_compar["rating_2"]
	distance = numpy.linalg.norm(compar_1 - compar_2)
	return [userId_1, userId_2, distance]  # returns a list

# Find the distance a user and all other users
def find_relative_distances(userId):
	users = ratings_dataframe["userId"].unique()
	users = users[ users != userId]
	distances = [find_distance_between_real_users(userId, every_id) for every_id in users]
	return pandas.DataFrame(distances, columns= ["SingleUserId", "UserId", "Distance"])


# Number of closest users to use for recommendation
NUMBER_OF_NEIGHBORS = 5


# Finding the k nearest nerighbours
def find_k_nearest_neighbors(userId, k=NUMBER_OF_NEIGHBORS):
	distance_to_user = find_relative_distances(userId).sort_values("Distance").set_index("UserId")
	return distance_to_user.head(k)

# Recommender function for a user
def make_recommendation_with_knn(userId):
	top_k_neighbours = find_k_nearest_neighbors(userId)  # find the most similar users (k of them defult 5)
	ratings_by_index = ratings_dataframe.set_index("userId")  # set the index of the ratings df to userId
	top_similar_ratings = ratings_by_index.loc[top_k_neighbours.index]  # grab the ratings of the k most similar userers
	top_similar_ratings_avg = top_similar_ratings.groupby("movieId").mean()[["rating"]]  # group by movie, and take avg rating (the users gave)
	rec = top_similar_ratings_avg.sort_values("rating", ascending=False).head(10) # sort the avg ratings by high to low and pick top 10
	return rec.join(movies_dataframe)

In [143]:
#original Number of Users
ratings_dataframe["userId"].max()

612

In [144]:
# Make a random new user

import random

min_number = 1
max_number = movies_dataframe.shape[0]  # rows index
test_user_watched = []
test_user_ratings = []

number_of_movies = 14
for i in range(number_of_movies):
	random_movie_index = random.randint(min_number, max_number)
	test_user_watched.append(random_movie_index)
	random_rating = random.randint(0, 6)  # rating between 0 and 5 (5+1)
	test_user_ratings.append(random_rating)


user_data = [list(index) for index in zip(test_user_watched, test_user_ratings)]

def create_new_user(user_data):
	new_id = ratings_dataframe["userId"].max() + 1
	new_user_df = pandas.DataFrame(user_data, columns= ["movieId", "rating"])
	new_user_df["userId"] = new_id
	return pandas.concat([ratings_dataframe, new_user_df])

ratings_dataframe["userId"].max()

 #add the new user to the ratings_dataframe
ratings_dataframe = create_new_user(user_data)


In [145]:
# new number of users
new_user_id = ratings_dataframe["userId"].max()
new_user_id

613

In [146]:
#make a recommendation of our new user
make_recommendation_with_knn(new_user_id)

Unnamed: 0_level_0,rating,title,genres,ratingscount,averageRatings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
940,5.0,"Adventures of Robin Hood, The (1938)",Action|Adventure|Romance,8.0,4.0
1196,5.0,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,211.0,4.21564
1198,5.0,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,200.0,4.2075
3441,5.0,Red Dawn (1984),Action|Drama|War,14.0,3.428571
2899,5.0,Gulliver's Travels (1939),Adventure|Animation|Children,3.0,3.0
3448,5.0,"Good Morning, Vietnam (1987)",Comedy|Drama|War,43.0,3.802326
3450,5.0,Grumpy Old Men (1993),Comedy,29.0,3.293103
2872,5.0,Excalibur (1981),Adventure|Fantasy,25.0,3.64
1210,5.0,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,196.0,4.137755
2858,5.0,American Beauty (1999),Drama|Romance,204.0,4.056373
