In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics.pairwise import cosine_similarity # applied later to the user ratings matrix
# Passing None removes the cap on how many columns are shown when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the credits table and show its (rows, columns) shape.
# NOTE(review): the name `credits` shadows the interactive `credits` helper that
# Python's `site` module installs; harmless here, but worth knowing.
credits = pd.read_csv('/kaggle/input/the-movies-dataset/credits.csv')
credits.shape

In [None]:
# Load the user ratings table; later cells use its userId, movieId and rating columns.
ratings = pd.read_csv('/kaggle/input/the-movies-dataset/ratings.csv')

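Convert the movieId field to a string (not a pandas categorical), so that it can be used as the join key when the ratings are merged with the movies metadata later on.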

In [None]:
# Cast movieId to str; it is the key used for the merge with the metadata subset further down.
# presumably the metadata id column is read as text — TODO confirm its dtype
ratings['movieId'] = ratings['movieId'].astype('str')
ratings.head()

In [None]:
# (rows, columns) of the raw ratings table.
ratings.shape

In [None]:
# Load the keywords table; the cell below looks rows up by its `id` column.
keywords = pd.read_csv('/kaggle/input/the-movies-dataset/keywords.csv')
# keywords.head()

In [None]:
# Show the raw `keywords` value of the first row whose id is 557.
keywords.loc[keywords['id'] == 557, 'keywords'].iloc[0]

In [None]:
# (rows, columns) of the keywords table.
keywords.shape

In [None]:
# Load the movie metadata and rename its `id` column to `movieId`, the column
# name used by the ratings table, so the two frames share a join key name.
movies_metadata = (
    pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')
    .rename(columns={'id': 'movieId'})
)
movies_metadata.head()

In [None]:
# (rows, columns) of the full metadata table.
movies_metadata.shape

In [None]:
# Peek at the first 20 movies only, ordered by budget (largest first).
# The sort key converts budget to numbers so values stored as text are compared
# numerically rather than character by character; entries that cannot be parsed
# become NaN and are placed last. The displayed values themselves are unchanged.
movies_metadata.head(20).sort_values(
    by='budget',
    key=lambda column: pd.to_numeric(column, errors='coerce'),
    ascending=False,
)

Since the movies_metadata table contains a lot of attributes, we will only use a subset of them. For the first run I'll use just the movie title (and maybe one or two other fields later). I keep movieId as well, because it is the key used to join the ratings and movie metadata dataframes; that join lets us show the movie name alongside the user ratings.

In [None]:
# Keep only the join key and the title.
# The subset is used as the lookup side of the merge with the ratings, so each
# movieId must appear once: a repeated id would duplicate every rating of that
# movie in the merged frame. The first occurrence of each id is kept.
movies_metadata_subset = (
    movies_metadata[['movieId', 'title']]
    .drop_duplicates(subset='movieId')
)
movies_metadata_subset.head()

In [None]:
# (rows, columns) of the two-column subset created above; the shape of the
# full metadata table was already displayed earlier.
movies_metadata_subset.shape

Since the ratings.csv does not contain the title of the movie, we merge the movies metadata and ratings so that the title of the movie appears in the ratings dataframe as well.

In [None]:
# Attach the movie title to every rating. This is an inner join, so ratings
# whose movieId has no match in the metadata subset are dropped.
# NOTE(review): this joins ratings.movieId directly to the metadata id column —
# confirm both files use the same id scheme (the dataset presumably ships a
# separate id-mapping file); TODO verify before trusting the titles.
movies_ratings = ratings.merge(movies_metadata_subset, how='inner', on='movieId')
movies_ratings.head()

In [None]:
# (rows, columns) of the merged ratings + title frame.
movies_ratings.shape

As you can see in the above cell, the size of this dataframe is huge. I need to find a way to reduce the size of the dataframe by minimizing the number of rows somehow.

I looked at how many ratings have been posted by each user. There are a lot of users who have rated only a few movies; the cells below use a cut-off of 30 ratings. If I filter these users out, the pivot table will be much smaller. This matters because building the pivot table on the full data crashes the notebook: I am working with limited RAM here on Kaggle.

In [None]:
# Threshold below which a user counts as having posted few ratings.
num_ratings = 30

# One row per user; the `rating` column holds how many ratings that user posted.
user_rating_count = movies_ratings.groupby('userId')['rating'].count().to_frame()

# Users whose number of ratings is strictly below the threshold.
is_below_threshold = user_rating_count['rating'] < num_ratings
user_with_less_than_num_ratings = user_rating_count[is_below_threshold]

print(f'Total users who have posted a rating: {len(user_rating_count)}')
print(f'Users with less than {num_ratings} ratings posted: {len(user_with_less_than_num_ratings)}')

In [None]:
# Number of distinct (non-missing) titles that received at least one rating row.
# Counting unique titles directly avoids building and sorting a per-title
# count table just to read its length.
total_movies = movies_ratings['title'].nunique()
print(f'Total movies for which a rating has been posted: {total_movies}')

In the cell below I do some filtering. As I said earlier, a lot of users have rated only a few movies, so we now filter out every user with fewer than 30 ratings (`min_movies`). The result is a reduced subset of my initial ratings dataframe.

In [None]:
# Minimum number of rating rows a user must have to be kept.
min_movies = 30

# For every row, the number of rows belonging to the same user. Computing this
# with a vectorised transform avoids calling a Python lambda once per user,
# which is slow on a frame of this size; the selected rows are the same.
ratings_per_user = movies_ratings.groupby('userId')['userId'].transform('size')
movies_ratings_filtered = movies_ratings[ratings_per_user >= min_movies]

In [None]:
# (rows, columns) left after removing low-activity users.
movies_ratings_filtered.shape

Below I look at the average rating of each movie. The problem is that some movies have a perfect score of 5 even though they have just one review, so we also take the number of reviews into account. Here we combine the average rating and the number of ratings for each movie.

In [None]:
# Per-movie rating statistics, indexed by title.
# Both the number of ratings and the average rating are computed from the same
# filtered frame, so the two columns describe the same set of ratings (the
# average was previously taken over the unfiltered ratings while the count
# came from the filtered ones).
rating_count = (
    movies_ratings_filtered
    .groupby('movieId')['rating']
    .agg(rating_count='count', avg_rating='mean')
    .sort_values('rating_count', ascending=False)  # most-rated movies first
    .reset_index()                                 # movieId back as a column for the merge
    .merge(movies_metadata_subset, on='movieId')   # bring in the title
    .set_index('title')
)
rating_count.head()

Now we generate our matrix

In [None]:
# Build the user x title matrix of ratings. Cells with no rating are NaN;
# several ratings falling into the same (user, title) cell are averaged.
user_ratings = pd.pivot_table(
    movies_ratings_filtered,
    values='rating',
    index='userId',
    columns='title',
    aggfunc='mean',
)

In [None]:
# First rows of the user x title ratings matrix.
user_ratings.head()

Above, we have finally got our matrix that tells us which user has watched what movies. A row is a user and a column is a movie. There are a lot of NaN values but those just tell us that a particular user has not rated a movie yet.

We will now look at generating some actual recommendations.

In [None]:
# User-to-user cosine similarity over the ratings matrix.
# The matrix holds NaN wherever a user has not rated a movie, and
# cosine_similarity rejects input containing NaN, so missing ratings are
# treated as 0 here.
# NOTE(review): the result is a dense users x users array — confirm it fits in
# memory for the number of users kept after filtering.
cs = cosine_similarity(user_ratings.fillna(0))

I want some recommendations based on the movie 'Spider-Man 2'.

In [None]:
# Seed movie for the recommendations, and the column of ratings every user
# gave it (NaN for users who did not rate it).
movie_name = 'Spider-Man 2'
rating_of_movie = user_ratings.loc[:, movie_name]
rating_of_movie.head()

In [None]:
# Correlate every movie's rating column with the seed movie's ratings and keep
# only the titles for which a correlation could be computed.
recommendations = (
    user_ratings
    .corrwith(rating_of_movie)
    .to_frame(name='Correlation')
    .dropna()
)
recommendations.head()

In [None]:
# Number of ratings per title. `rating_count` is indexed by title but built per
# movieId, so a title shared by several movies can appear more than once;
# summing per title gives a unique index and stops the join below from
# duplicating recommendation rows.
ratings_per_title = rating_count.groupby(level=0)['rating_count'].sum()

# Minimum number of ratings a title needs to be recommended. 0 disables the
# filter; raise it (e.g. to 100) to suppress titles whose high correlation
# rests on only a handful of ratings.
min_rating_count = 0

top_recommendations = (
    recommendations
    # The seed movie correlates perfectly with itself, so leave it out.
    .drop(index=movie_name, errors='ignore')
    .join(ratings_per_title)
)
if min_rating_count > 0:
    top_recommendations = top_recommendations[top_recommendations['rating_count'] >= min_rating_count]

# Strongest correlation first; ties are broken by the number of ratings.
top_recommendations = top_recommendations.sort_values(by=['Correlation', 'rating_count'], ascending=[False, False])
top_recommendations.head()