In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Recommend.csv',names=['user_id', 'movie_id', 'rating', 'timestamp'])
df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [3]:
# Since this is a user movie recommendation model,
# we must be aware of the number of users and the number of movies.
from sklearn.model_selection import train_test_split
n_users = df.user_id.unique().shape[0] 
n_movies = df.movie_id.unique().shape[0]
train_data, test_data = train_test_split(df, test_size=0.25)

In [4]:
# First create a zero matrix with dimensions as N_users and N_movies.
train_data_matrix = np.zeros((n_users, n_movies))
# Now that we have created train and test sets, we will populate the train and test metrics with ratings such that
# the user ID Index minus movie ID Index equals the given rating.
for line in train_data.itertuples():
    #[user_id index, movie_id index] = given rating.
    train_data_matrix[line[1]-1, line[2]-1] = line[3] 
train_data_matrix

array([[5., 0., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [5]:
# Do the precious step to the testing set also
test_data_matrix = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    #[user_id index, movie_id index] = given rating.
    test_data_matrix[line[1]-1, line[2]-1] = line[3]
test_data_matrix

array([[0., 3., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [6]:
# Now, import the pair wise distances import from sklearn.metrics to create 
# cosinesimilarity matrices for users and movies. cosinesimilarity is a 
# measure of similarity between two non zero vectors of an inner product 
# space that measures the cosineof the angle between them. The cosine is 0° is 1
# and it is less than one for any angle in the interval, zero pi radiance.
# Now, we will go ahead with the predictions based on the fact that
# user movie collaborative filtering difference from mean rating is a better indicator than absolute rating.
# Calculate user predictions such that user prediction is the sum of the mean user rating
# and the dot product of user similarity and ratings difference divided by the absolute value of the user similarity.
# The matrix we received as output describes how those unrated movies will be rated by users based on their prior rating styles.
from sklearn.metrics import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
movie_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
mean_user_rating = train_data_matrix.mean(axis=1)[:, np.newaxis] 
ratings_diff = (train_data_matrix - mean_user_rating) 
user_pred = mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
user_pred

array([[ 1.61165989,  0.58926771,  0.47956177, ...,  0.30104391,
         0.2985871 ,  0.30095742],
       [ 1.31649972,  0.28047279,  0.12858869, ..., -0.07578609,
        -0.07779922, -0.07452033],
       [ 1.36719426,  0.26420347,  0.11842345, ..., -0.09475083,
        -0.09627151, -0.09287537],
       ...,
       [ 1.26113782,  0.22051528,  0.07186974, ..., -0.12784995,
        -0.13002868, -0.12682036],
       [ 1.37911643,  0.31308818,  0.19407183, ..., -0.01282872,
        -0.01511816, -0.01177234],
       [ 1.43775857,  0.39110147,  0.29266956, ...,  0.11315369,
         0.11064232,  0.11309699]])

Now that we have approximately generated ratings for all movies,