In [1]:
import numpy as np
import pandas as pd

In [2]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', names=header)

In [3]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print('Number of users = ' , str(n_users) , ' | Number of movies = ' , str(n_items))

Number of users =  943  | Number of movies =  1682


In [4]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)



In [5]:
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
                train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
                test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [6]:
train_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
61170,174,709,4,890168554
60570,593,699,4,875671334
21742,434,369,4,886724972
91784,933,834,1,874938878
7586,197,344,4,891409070


In [7]:
test_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
37091,474,489,4,887923972
61944,716,81,4,879796475
53127,378,747,3,880055597
94929,747,403,5,888734113
99895,806,209,3,882387837


In [9]:
train_data_matrix

array([[5., 0., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [10]:
test_data_matrix

array([[0., 3., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [12]:
user_similarity

array([[0.        , 0.84002633, 0.94925803, ..., 0.83629985, 0.85578474,
        0.67280298],
       [0.84002633, 0.        , 0.92751307, ..., 0.81652697, 0.87675635,
        0.93951876],
       [0.94925803, 0.92751307, 0.        , ..., 0.88984179, 0.92646922,
        0.98432378],
       ...,
       [0.83629985, 0.81652697, 0.88984179, ..., 0.        , 0.93324984,
        0.97865404],
       [0.85578474, 0.87675635, 0.92646922, ..., 0.93324984, 0.        ,
        0.85181599],
       [0.67280298, 0.93951876, 0.98432378, ..., 0.97865404, 0.85181599,
        0.        ]])

In [18]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T   
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [20]:
user_prediction = predict(train_data_matrix, user_similarity, type='user')
item_prediction = predict(train_data_matrix, item_similarity, type='item')

In [15]:
user_prediction

array([[ 1.57647424,  0.58716197,  0.46231776, ...,  0.28712984,
         0.28703283,  0.28468412],
       [ 1.29318423,  0.29332428,  0.12016934, ..., -0.07454829,
        -0.07326958, -0.07650526],
       [ 1.33204496,  0.26682373,  0.10155193, ..., -0.10315287,
        -0.10141255, -0.10465651],
       ...,
       [ 1.19129704,  0.23037632,  0.06407253, ..., -0.12265249,
        -0.12194535, -0.12488354],
       [ 1.37924389,  0.3393183 ,  0.2029536 , ...,  0.00409809,
         0.00477693,  0.00193166],
       [ 1.41100459,  0.38732901,  0.28293954, ...,  0.10168513,
         0.10163413,  0.09920952]])

In [21]:
item_prediction

array([[0.35859302, 0.37873615, 0.39449401, ..., 0.4386564 , 0.42857246,
        0.43664485],
       [0.0760114 , 0.08927412, 0.08467236, ..., 0.0901717 , 0.09156647,
        0.09161214],
       [0.06239588, 0.06676252, 0.06407075, ..., 0.06253994, 0.06535267,
        0.06543724],
       ...,
       [0.02763692, 0.03642354, 0.03532274, ..., 0.04165546, 0.04043953,
        0.04164188],
       [0.13938788, 0.14641054, 0.15594515, ..., 0.16224979, 0.16038176,
        0.16240333],
       [0.20167565, 0.19432402, 0.22854257, ..., 0.25428798, 0.24526823,
        0.25223081]])

In [16]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
                prediction = prediction[ground_truth.nonzero()].flatten() 
                ground_truth = ground_truth[ground_truth.nonzero()].flatten()
                return sqrt(mean_squared_error(prediction, ground_truth))

In [17]:
print ('User-based CF RMSE: ' , str(rmse(user_prediction, test_data_matrix)))

User-based CF RMSE:  3.1292376489669893


In [23]:
print ('Item-based CF RMSE: ' , str(rmse(item_prediction, test_data_matrix)))

Item-based CF RMSE:  3.4574733738606946


In [24]:
import turicreate
train_data = turicreate.SFrame(train_data)
test_data = turicreate.SFrame(test_data)

  from ._conv import register_converters as _register_converters


In [25]:
popularity_model = turicreate.popularity_recommender.create(train_data, user_id='user_id', item_id='item_id', target='rating')

In [26]:
popularity_recomm = popularity_model.recommend(users=[1,2,3,4,5],k=5)
popularity_recomm.print_rows(num_rows=25)

+---------+---------+-------+------+
| user_id | item_id | score | rank |
+---------+---------+-------+------+
|    1    |   1122  |  5.0  |  1   |
|    1    |   1467  |  5.0  |  2   |
|    1    |   1293  |  5.0  |  3   |
|    1    |   1358  |  5.0  |  4   |
|    1    |   1201  |  5.0  |  5   |
|    2    |   1122  |  5.0  |  1   |
|    2    |   1467  |  5.0  |  2   |
|    2    |   1293  |  5.0  |  3   |
|    2    |   1358  |  5.0  |  4   |
|    2    |   1201  |  5.0  |  5   |
|    3    |   1122  |  5.0  |  1   |
|    3    |   1467  |  5.0  |  2   |
|    3    |   1293  |  5.0  |  3   |
|    3    |   1358  |  5.0  |  4   |
|    3    |   1201  |  5.0  |  5   |
|    4    |   1122  |  5.0  |  1   |
|    4    |   1467  |  5.0  |  2   |
|    4    |   1293  |  5.0  |  3   |
|    4    |   1358  |  5.0  |  4   |
|    4    |   1201  |  5.0  |  5   |
|    5    |   1122  |  5.0  |  1   |
|    5    |   1467  |  5.0  |  2   |
|    5    |   1293  |  5.0  |  3   |
|    5    |   1358  |  5.0  |  4   |
|

In [27]:
#Training the model
item_sim_model = turicreate.item_similarity_recommender.create(train_data, user_id='user_id', item_id='item_id', target='rating', similarity_type='cosine')

In [28]:
#Making recommendations
item_sim_recomm = item_sim_model.recommend(users=[1,2,3,4,5],k=5)
item_sim_recomm.print_rows(num_rows=25)

+---------+---------+---------------------+------+
| user_id | item_id |        score        | rank |
+---------+---------+---------------------+------+
|    1    |   204   |  0.9264227542483691 |  1   |
|    1    |   168   |  0.9017197486845036 |  2   |
|    1    |   423   |  0.7626407577574832 |  3   |
|    1    |   210   |  0.7503059211286526 |  4   |
|    1    |    96   |  0.6335611849733926 |  5   |
|    2    |   121   |  0.7967719086786595 |  1   |
|    2    |   258   |  0.7268192346503095 |  2   |
|    2    |   117   |  0.6691407503151312 |  3   |
|    2    |   181   |  0.6541731924545474 |  4   |
|    2    |   286   |  0.6335776055731425 |  5   |
|    3    |   307   |  0.4439897697146346 |  1   |
|    3    |   313   |  0.4153808980453305 |  2   |
|    3    |   347   | 0.40448152292065503 |  3   |
|    3    |   286   | 0.40058092809304957 |  4   |
|    3    |   748   | 0.35649534405731576 |  5   |
|    4    |    22   |  0.8797683748934004 |  1   |
|    4    |   100   |  0.823921

In [29]:
class MF():

    # Initializing the user-movie rating matrix, no. of latent features, alpha and beta.
    def __init__(self, R, K, alpha, beta, iterations):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    # Initializing user-feature and movie-feature matrix 
    def train(self):
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initializing the bias terms
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # List of training samples
        self.samples = [ (i, j, self.R[i, j]) for i in range(self.num_users) for j in range(self.num_items) if self.R[i, j] > 0]

        # Stochastic gradient descent for given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mse = self.mse()
            training_process.append((i, mse))
            if (i+1) % 20 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process

    # Computing total mean squared error
    def mse(self):
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    # Stochastic gradient descent to get optimized P and Q matrix
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])

    # Ratings for user i and moive j
    def get_rating(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    # Full user-movie rating matrix
    def full_matrix(self):
        return mf.b + mf.b_u[:,np.newaxis] + mf.b_i[np.newaxis:,] + mf.P.dot(mf.Q.T)

In [31]:
ratings = df
R = np.array(ratings.pivot(index = 'user_id', columns ='item_id', values = 'rating').fillna(0))

In [34]:
mf = MF(R, K=20, alpha=0.001, beta=0.01, iterations=1000)
training_process = mf.train()
print()
print("P x Q:")
print(mf.full_matrix())
print()

Iteration: 20 ; error = 296.0814
Iteration: 40 ; error = 290.8940
Iteration: 60 ; error = 286.9768
Iteration: 80 ; error = 280.4384
Iteration: 100 ; error = 271.0924
Iteration: 120 ; error = 260.8548
Iteration: 140 ; error = 250.4007
Iteration: 160 ; error = 240.4434
Iteration: 180 ; error = 231.6503
Iteration: 200 ; error = 224.2539
Iteration: 220 ; error = 218.1346
Iteration: 240 ; error = 213.0517
Iteration: 260 ; error = 208.7840
Iteration: 280 ; error = 205.1576
Iteration: 300 ; error = 202.0470
Iteration: 320 ; error = 199.3553
Iteration: 340 ; error = 197.0078
Iteration: 360 ; error = 194.9466
Iteration: 380 ; error = 193.1251
Iteration: 400 ; error = 191.5058
Iteration: 420 ; error = 190.0580
Iteration: 440 ; error = 188.7562
Iteration: 460 ; error = 187.5805
Iteration: 480 ; error = 186.5134
Iteration: 500 ; error = 185.5407
Iteration: 520 ; error = 184.6511
Iteration: 540 ; error = 183.8331
Iteration: 560 ; error = 183.0798
Iteration: 580 ; error = 182.3829
Iteration: 600 ; e