In [None]:
# import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds
import seaborn as sns
import matplotlib.pyplot as plt
import os

# dataset location and file names
data_path = '../input/the-movies-dataset/'
movies_filename = 'movies_metadata.csv'
ratings_filename = 'ratings_small.csv'

# load the movies table and discard the 'homepage' column,
# which this notebook treats as irrelevant
df_movies = (
    pd.read_csv(os.path.join(data_path, movies_filename))
    .drop(columns=['homepage'])
)

# preview the first rows
df_movies.head()

In [None]:
# load the ratings table, discard the 'timestamp' column (treated as
# irrelevant here) and store movieId as a string
df_ratings = (
    pd.read_csv(os.path.join(data_path, ratings_filename))
    .drop(columns=['timestamp'])
    .astype({'movieId': str})
)

# preview the first rows
df_ratings.head()

In [None]:
# show the df_movies row whose index label is 1234
# (.loc selects by label, not by position)
df_movies.loc[1234, :]

In [None]:
# build the user x movie rating matrix: one row per userId, one column per
# movieId; the first rating of each (user, movie) pair is kept and pairs
# without a rating are filled with 0.0
first_rating = df_ratings.groupby(['userId', 'movieId'])['rating'].first()
user_item = first_rating.unstack(fill_value=0.0)
user_item.head()

In [None]:
# inner-join movie metadata with ratings, matching the movies 'id' column
# against the ratings 'movieId' column
# NOTE(review): this presumes both columns hold the same kind of identifier,
# as strings — confirm against the dataset description
df = df_movies.merge(
    df_ratings,
    how='inner',
    left_on='id',
    right_on='movieId',
)

# preview the first rows
df.head()

In [None]:
# size of the rating matrix as (number of users, number of movies)
(len(user_item.index), len(user_item.columns))

In [None]:
# the five highest ratings given by user 32
user_item.loc[32].sort_values(ascending=False).iloc[:5]

In [None]:
# decompose the user-item matrix with a truncated SVD that keeps the k = 50
# largest singular values: U is (users x k), D holds the k singular values
# and Vt is (k x movies)
# svds is documented for ndarray / sparse matrix / LinearOperator input, so
# pass the DataFrame's underlying float array rather than the DataFrame itself
U, D, Vt = svds(user_item.to_numpy(dtype=float), k=50)

In [None]:
# dimensions of U, the user-to-concept matrix
np.shape(U)

In [None]:
# dimensions of Vt, the concept-to-movie matrix
np.shape(Vt)

In [None]:
# place the singular values on the diagonal of a square concept matrix
D_matrix = np.diagflat(D)
np.shape(D_matrix)

In [None]:
# rebuild the rating matrix as U x D x Vt and label it with the same
# users (rows) and movies (columns) as user_item
all_predicted_ratings = U @ D_matrix @ Vt
prediction_df = pd.DataFrame(
    data=all_predicted_ratings,
    index=user_item.index,
    columns=user_item.columns,
)

In [None]:
# size of the predicted ratings table as (number of users, number of movies)
(len(prediction_df.index), len(prediction_df.columns))

In [None]:
# preview the first five rows of the predicted ratings table
prediction_df.iloc[:5]

In [None]:
# the five movies with the highest predicted rating for user 32
prediction_df.loc[32].sort_values(ascending=False).iloc[:5]

In [None]:
# the five highest actual ratings given by user 32, for comparison with
# the predicted ratings
user_item.loc[32].sort_values(ascending=False).iloc[:5]

In [None]:
# user 32's row of the rating matrix: one rating per movie, 0.0 where unrated
movies_user_32 = user_item.loc[32, :]

In [None]:
# ids of the movies user 32 rated above 3
rated_above_3 = movies_user_32.gt(3)
high_rated_movies_32 = movies_user_32[rated_above_3].index
high_rated_movies_32

In [None]:
# user 32's row of the prediction table: one predicted rating per movie
recommended_movies_32 = prediction_df.loc[32, :]

In [None]:
# ids of the movies whose predicted rating for user 32 is above 3
predicted_above_3 = recommended_movies_32.gt(3)
highly_recommended_movies_32 = recommended_movies_32[predicted_above_3].index
highly_recommended_movies_32

In [None]:
# movies that have a high recommendation (> 3) but that have no rating yet
# unrated movies are stored as 0.0 in user_item, so every positive entry is a
# movie user 32 has already rated; remove all of them, not only those rated > 3
rated_movies_32 = movies_user_32[movies_user_32 > 0].index
recc = set(highly_recommended_movies_32) - set(rated_movies_32)
recc

In [None]:
# function to find recommended movies for a specific user
def get_high_recommended_movies(userId, threshold=3):
    """Return movies predicted to appeal to a user that the user has not rated.

    Reads the notebook-level ``user_item`` (observed ratings) and
    ``prediction_df`` (SVD-reconstructed ratings) tables, which share the
    same row and column labels.

    Parameters
    ----------
    userId
        Label of the user's row in ``user_item`` / ``prediction_df``.
    threshold
        Predicted rating a movie must exceed to be recommended. Defaults
        to 3, the cut-off used elsewhere in this notebook.

    Returns
    -------
    pandas.Series
        Predicted ratings indexed by movieId, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``userId`` is not a row label of the tables.
    """
    user_ratings = user_item.loc[userId]
    user_predictions = prediction_df.loc[userId]

    # Unrated movies are stored as 0.0 in user_item, so a positive value means
    # the user has already rated the movie and it must not be recommended again.
    already_rated = user_ratings > 0
    is_recommended = (user_predictions > threshold) & ~already_rated

    # Selecting with a boolean mask avoids indexing .loc with a set, which
    # pandas 2.x rejects with a TypeError.
    return user_predictions[is_recommended].sort_values(ascending=False)

In [None]:
# recommended movies for user 32, best predicted rating first
get_high_recommended_movies(userId=32)

In [None]:
# print the df_movies row whose index label is 364
# NOTE(review): .loc[364] selects by row label, not by the 'id' column —
# confirm this is the movie that was meant
print(df_movies.loc[364, :])