# User movie recommendation model

consider ratings dataset below, containing the data on: UserID, MovieID, Rating and Timestamp. Each line of this files represents one rating of one movie by one user, and has the following format:
UserId::MovieId::Rating::Timestamp

Ratings are made on five star scale with half star increments. UserID represents ID of the user, movieID represents ID of the movie and Timestamp represents seconds from the midnight coordinated universal time or UTC of January 1, 1970.

In [1]:
# import required libraries
import pandas as pd
import numpy as np

In [2]:
# Import dataset
df = pd.read_csv('Recommend.csv', names=['user_id', 'movie_id', 'rating', 'timestamp'])

In [3]:
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
df.shape

(100000, 4)

In [5]:
df.isna().sum()

user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64

In [6]:
df.dropna(inplace=True)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
n_users = df.user_id.unique().shape[0]

In [9]:
n_users

943

In [10]:
n_movies= df.movie_id.unique().shape[0]

In [11]:
n_movies

1682

In [12]:
train_data, test_data = train_test_split(df, test_size= 0.25)

In [13]:
train_data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
71810,663,237,4,889492473
49286,476,648,4,883364295
69802,643,240,5,891445823
95237,682,24,4,888522575
79428,439,290,4,882894084


In [14]:
train_data_matrix = np.zeros((n_users, n_movies))

In [15]:
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]
train_data_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [16]:
test_data_matrix = np.zeros((n_users, n_movies))

In [17]:
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]
test_data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
from sklearn.metrics import pairwise_distances
user_similarities = pairwise_distances(train_data_matrix, metric='cosine')

In [19]:
type(user_similarities)

numpy.ndarray

In [20]:
user_similarities

array([[0.        , 0.85001504, 0.94320291, ..., 0.89530107, 0.86700419,
        0.71729024],
       [0.85001504, 0.        , 0.92484892, ..., 0.82202151, 0.82193653,
        0.90408736],
       [0.94320291, 0.92484892, 0.        , ..., 0.94639197, 0.93798612,
        0.9626733 ],
       ...,
       [0.89530107, 0.82202151, 0.94639197, ..., 0.        , 0.93126491,
        0.93925503],
       [0.86700419, 0.82193653, 0.93798612, ..., 0.93126491, 0.        ,
        0.83742962],
       [0.71729024, 0.90408736, 0.9626733 , ..., 0.93925503, 0.83742962,
        0.        ]])

In [21]:
movie_similarities= pairwise_distances(train_data_matrix.T, metric='cosine')

In [22]:
movie_similarities

array([[0.        , 0.64355631, 0.7660931 , ..., 1.        , 0.94577237,
        0.94577237],
       [0.64355631, 0.        , 0.83148035, ..., 1.        , 0.90975147,
        0.90975147],
       [0.7660931 , 0.83148035, 0.        , ..., 1.        , 1.        ,
        0.88796295],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [0.94577237, 0.90975147, 1.        , ..., 1.        , 0.        ,
        1.        ],
       [0.94577237, 0.90975147, 0.88796295, ..., 1.        , 1.        ,
        0.        ]])

In [23]:
mean_user_rating = train_data_matrix.mean(axis=1)[:, np.newaxis]

In [24]:
mean_user_rating

array([[0.40546968],
       [0.11058264],
       [0.06420927],
       [0.05112961],
       [0.2175981 ],
       [0.36444709],
       [0.73127229],
       [0.08858502],
       [0.04458977],
       [0.34482759],
       [0.25624257],
       [0.09750297],
       [0.89179548],
       [0.17717004],
       [0.13079667],
       [0.28478002],
       [0.04161712],
       [0.45124851],
       [0.03032105],
       [0.07491082],
       [0.20451843],
       [0.20095125],
       [0.21521998],
       [0.12128419],
       [0.13198573],
       [0.1450654 ],
       [0.03745541],
       [0.12960761],
       [0.05469679],
       [0.07728894],
       [0.05826397],
       [0.06718193],
       [0.03983353],
       [0.04161712],
       [0.02437574],
       [0.0332937 ],
       [0.08561237],
       [0.19500595],
       [0.04458977],
       [0.04102259],
       [0.07669441],
       [0.29310345],
       [0.36563615],
       [0.23365042],
       [0.08145065],
       [0.05172414],
       [0.03804994],
       [0.123

In [25]:
ratings_diff = train_data_matrix - mean_user_rating

In [26]:
ratings_diff

array([[ 4.59453032,  2.59453032,  3.59453032, ..., -0.40546968,
        -0.40546968, -0.40546968],
       [ 3.88941736, -0.11058264, -0.11058264, ..., -0.11058264,
        -0.11058264, -0.11058264],
       [-0.06420927, -0.06420927, -0.06420927, ..., -0.06420927,
        -0.06420927, -0.06420927],
       ...,
       [ 4.96313912, -0.03686088, -0.03686088, ..., -0.03686088,
        -0.03686088, -0.03686088],
       [-0.14209275, -0.14209275, -0.14209275, ..., -0.14209275,
        -0.14209275, -0.14209275],
       [-0.25267539,  4.74732461, -0.25267539, ..., -0.25267539,
        -0.25267539, -0.25267539]])

In [27]:
user_pred = mean_user_rating + user_similarities.dot(ratings_diff) / np.array([np.abs(user_similarities).sum(axis=1)]).T

In [28]:
user_pred

array([[ 1.56396958,  0.5340082 ,  0.44333826, ...,  0.25501494,
         0.25503457,  0.25481166],
       [ 1.33683512,  0.28885486,  0.15277999, ..., -0.0567805 ,
        -0.05552454, -0.05535557],
       [ 1.34084728,  0.23694568,  0.11221058, ..., -0.10427554,
        -0.10242209, -0.10242013],
       ...,
       [ 1.19130838,  0.19738023,  0.07427683, ..., -0.12749366,
        -0.12665711, -0.12652717],
       [ 1.36641753,  0.29762235,  0.19461563, ..., -0.01713053,
        -0.01643077, -0.01612315],
       [ 1.41874797,  0.36962539,  0.29288251, ...,  0.10231637,
         0.10234226,  0.10227769]])