# Simple Movie recommendation system using K-Nearest Neighbors algorithm

Dataset available on: https://www.kaggle.com/code/alyssonbispopereira/recomenda-o-de-filmes-ptbr/data

In [63]:
# Libraries
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [64]:
# Import file with movies
movies = pd.read_csv('movies_metadata.csv', low_memory = False)
movies.head(4)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0


In [65]:
movies['genres'].head(15)

0     [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1     [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2     [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
3     [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4                        [{'id': 35, 'name': 'Comedy'}]
5     [{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...
6     [{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...
7     [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
8     [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
9     [{'id': 12, 'name': 'Adventure'}, {'id': 28, '...
10    [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
11    [{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...
12    [{'id': 10751, 'name': 'Family'}, {'id': 16, '...
13    [{'id': 36, 'name': 'History'}, {'id': 18, 'na...
14    [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
Name: genres, dtype: object

In [66]:
# Import file with movies ratings
ratings = pd.read_csv('ratings.csv')
ratings.head(4)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546


# Pre process

In [67]:
# Filtering unecessary colums and renaming variables
movies = movies[ ['id', 'original_title', 'original_language', 'vote_count'] ]

movies.rename(columns = {'id': 'movie_id', 'original_title': 'title', 'original_language': 'language', 'vote_count': 'total_movie_ratings'}, inplace = True)

movies.head()

Unnamed: 0,movie_id,title,language,total_movie_ratings
0,862,Toy Story,en,5415.0
1,8844,Jumanji,en,2413.0
2,15602,Grumpier Old Men,en,92.0
3,31357,Waiting to Exhale,en,34.0
4,11862,Father of the Bride Part II,en,173.0


In [68]:
# Repeating the same process for ratings
ratings = ratings [ ['userId', 'movieId', 'rating'] ]

ratings.rename(columns = {'userId': 'user_id', 'movieId': 'movie_id', 'rating': 'rating'}, inplace = True)

ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [69]:
# Checking for empty values
movies.isna().sum()

movie_id                0
title                   0
language               11
total_movie_ratings     6
dtype: int64

In [70]:
# Since there are fewer values we will remove them and check if the remotion was successful
movies.dropna(inplace = True)
movies.isna().sum()

movie_id               0
title                  0
language               0
total_movie_ratings    0
dtype: int64

In [71]:
# Checking for ratings
ratings.isna().sum()

user_id     0
movie_id    0
rating      0
dtype: int64

In [72]:
# Filtering user that have rated few movies
ratings['user_id'].value_counts()

user_id
45811     18276
8659       9279
270123     7638
179792     7515
228291     7410
          ...  
30155         1
9641          1
164717        1
243426        1
234625        1
Name: count, Length: 270896, dtype: int64

In [73]:
ratings.shape[0]

26024289

In [74]:
# Since the dataset is huge and there are users that rated a lot of movies we can filter them by removing users that rated less than a minimum number of movies
minimum_ratings_amount = 1000
useful_ratings = ratings['user_id'].value_counts() > minimum_ratings_amount
y = useful_ratings[useful_ratings].index
y.shape[0]

2502

In [75]:
# Updating the ratings dataset
ratings = ratings[ratings['user_id'].isin(y)]
ratings.shape[0]

3837582

In [76]:
# Grouping movies according to language to only consider movies in english
# movies_language = movies['original_language'].value_counts() 
# movies_language.head()
# movies = movies[movies['original_language'] == 'en']

In [77]:
# Checking dataset varaible types
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45449 entries, 0 to 45465
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   movie_id             45449 non-null  object 
 1   title                45449 non-null  object 
 2   language             45449 non-null  object 
 3   total_movie_ratings  45449 non-null  float64
dtypes: float64(1), object(3)
memory usage: 1.7+ MB


In [78]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3837582 entries, 17291 to 26023521
Data columns (total 3 columns):
 #   Column    Dtype  
---  ------    -----  
 0   user_id   int64  
 1   movie_id  int64  
 2   rating    float64
dtypes: float64(1), int64(2)
memory usage: 117.1 MB


In [79]:
# Since there is a type mismatch for the movie id we need to convert the id in the movie dataset for int64 type to match ratings id type
movies['movie_id'] = movies['movie_id'].astype('int64')

In [80]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45449 entries, 0 to 45465
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   movie_id             45449 non-null  int64  
 1   title                45449 non-null  object 
 2   language             45449 non-null  object 
 3   total_movie_ratings  45449 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 1.7+ MB


In [81]:
# Concatenating the two dataframes
movies_and_ratings = ratings.merge(movies, on = 'movie_id')
movies_and_ratings.head()

Unnamed: 0,user_id,movie_id,rating,title,language,total_movie_ratings
0,229,2,3.0,Ariel,fi,44.0
1,231,2,2.0,Ariel,fi,44.0
2,741,2,3.0,Ariel,fi,44.0
3,836,2,3.5,Ariel,fi,44.0
4,1104,2,4.0,Ariel,fi,44.0


In [82]:
# Checking the amount of movies with ratings
movies_and_ratings.shape[0]

1366634

In [83]:
# Checking for null values
movies_and_ratings.isna().sum()

user_id                0
movie_id               0
rating                 0
title                  0
language               0
total_movie_ratings    0
dtype: int64

In [84]:
# Discarting duplicates for cases when an user rated the same movie more than once
movies_and_ratings.drop_duplicates(['user_id', 'movie_id'], inplace = True)
movies_and_ratings.shape[0]

1366320

In [85]:
# Since the dataframes were already merged we no longer need the movie_id column
del movies_and_ratings['movie_id']


In [86]:
movies_and_ratings.head()

Unnamed: 0,user_id,rating,title,language,total_movie_ratings
0,229,3.0,Ariel,fi,44.0
1,231,2.0,Ariel,fi,44.0
2,741,3.0,Ariel,fi,44.0
3,836,3.5,Ariel,fi,44.0
4,1104,4.0,Ariel,fi,44.0


In [87]:
# Creating a pivot since whe want each user to be a variable with the rating of each movie
movies_pivot = movies_and_ratings.pivot_table(columns = 'user_id', index = 'title', values = 'rating')

In [88]:
movies_pivot.head()

user_id,229,231,741,836,1104,1136,1243,1380,1652,1846,...,269632,269750,269913,270071,270123,270213,270237,270564,270654,270887
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!Women Art Revolution,,,,2.0,,,,3.0,,,...,,,,,2.0,,,,,
$5 a Day,,,,,,,,,,,...,,,,,2.5,2.0,,,,
'Gator Bait,,,,,,,,,,,...,,,,,,,,,,5.0
'R Xmas,,,3.5,,,,,,,,...,,,,,2.0,,,,3.0,
'Twas the Night Before Christmas,,,,,,,,,,,...,,,,,,,,,,


In [89]:
# Filling Nan values with zeros
movies_pivot.fillna(0, inplace = True)
movies_pivot.head()

user_id,229,231,741,836,1104,1136,1243,1380,1652,1846,...,269632,269750,269913,270071,270123,270213,270237,270564,270654,270887
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!Women Art Revolution,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
$5 a Day,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.5,2.0,0.0,0.0,0.0,0.0
'Gator Bait,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
'R Xmas,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0
'Twas the Night Before Christmas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
# Transforming the dataset in a sparse matrix
movies_sparse = csr_matrix(movies_pivot)

In [91]:
# Creating and training model
model = NearestNeighbors(algorithm = 'brute')
model.fit(movies_sparse)

# Testing the model

In [92]:
# Creating a function for movie recommendations
def get_movie_recommendations(movie_name):
    n_recommendations = 5
    movie = movies_pivot.loc[movie_name]
    movie = movie.values.reshape(1, -1)
    distances, indices = model.kneighbors(movie, n_neighbors = n_recommendations + 1)
    for i in range(0, len(distances.flatten())):
        if i == 0:
            print('Recommendations for {0}:\n'.format(movie_name))
        else:
            print('{0}: {1}'.format(i, movies_pivot.index[indices.flatten()[i]]))

In [93]:
get_movie_recommendations('Cidade de Deus')

Recommendations for Cidade de Deus:

1: How to Save a Marriage and Ruin Your Life
2: Gus
3: When Brendan Met Trudy
4: Orazi e Curiazi
5: Skew
