In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="notebook", style="white", palette=sns.color_palette("RdBu"))

import numpy as np
import pandas as pd
import scipy.io as sio

import sys
sys.path.append('..')

from helper import recommender as rcmd

# load data and set up

In [56]:
# Read the ratings data set from disk.
# Y and R are pulled out by key; the shapes shown below confirm they match.
# (Presumably Y holds the ratings and R flags which entries were rated --
# TODO confirm against the exercise description.)
movies = sio.loadmat('./data/ex8_movies.mat')
Y = movies.get('Y')
R = movies.get('R')

Y.shape, R.shape

((1682, 943), (1682, 943))

In [57]:
# Number of features learned per movie.
n = 10

# Y is laid out as (movies, users):
#   m -> number of movies (rows)
#   u -> number of users (columns)
m, u = Y.shape

In [58]:
# Read the parameter matrices shipped with the exercise.
# (Presumably pre-trained values -- TODO confirm.)
param_mat = sio.loadmat('./data/ex8_movieParams.mat')
theta = param_mat.get('Theta')
X = param_mat.get('X')

theta.shape, X.shape

((943, 10), (1682, 10))

# cost
<img style="float: left;" src="../img/rcmd_cost.png">

In [20]:
# Evaluate the cost on a small slice of the data, mirroring the check
# described in the exercise pdf.
users, movies, features = 4, 5, 3

# First `movies` rows / first `users` columns, restricted to `features` features.
X_sub, theta_sub = X[:movies, :features], theta[:users, :features]
Y_sub, R_sub = Y[:movies, :users], R[:movies, :users]

# Pack the sliced X and theta with rcmd.serialize before handing them to the cost.
param_sub = rcmd.serialize(X_sub, theta_sub)

rcmd.cost(param_sub, Y_sub, R_sub, features)

22.224603725685675

In [25]:
# Pack the full X and theta once via rcmd.serialize; later cells reuse `param`.
param = rcmd.serialize(X, theta)  # total real params

# Cost over the whole data set with all 10 features. Reuse `param` instead of
# serializing X and theta a second time.
rcmd.cost(param, Y, R, 10)  # this is real total cost

27918.64012454421

# gradient
<img style="float: left;" src="../img/rcmd_gradient.png">

In [26]:
# Gradient of the (unregularized) cost at the full parameters.
n_movie, n_user = Y.shape

# Use `param`, the serialized (X, theta) built earlier with rcmd.serialize.
# The previous name `params` is never defined in this notebook, so the cell
# raised NameError on a fresh kernel.
grad = rcmd.gradient(param, Y, R, 10)

# Unpack the gradient into one piece per parameter matrix; each piece is
# expected to have the same shape as X and theta respectively.
X_grad, theta_grad = rcmd.deserialize(grad, n_movie, n_user, 10)

<img style="float: left;" src="../img/rcmd_vectorized_grad.png">

In [27]:
# Sanity check: each gradient piece must have the shape of the matrix it belongs to.
assert (X_grad.shape, theta_grad.shape) == (X.shape, theta.shape)

# regularized cost

In [33]:
# The exercise script (ex8_cofi.m) checks the regularized cost on the small
# sub data set with lambda = 1.5, so do the same here.
rcmd.regularized_cost(
    param_sub, Y_sub, R_sub, features,
    l=1.5,
)

31.344056244274221

In [35]:
# Regularized cost over the complete data set (10 features, lambda = 1).
rcmd.regularized_cost(
    param, Y, R, 10,
    l=1,
)

32520.682450229557

# regularized gradient

<img style="float: left;" src="../img/rcmd_reg_grad.png">

In [37]:
# Gradient of the regularized cost at the full parameters.
n_movie, n_user = Y.shape

# Use `param`, the serialized (X, theta) built earlier with rcmd.serialize.
# The previous name `params` is never defined in this notebook, so the cell
# raised NameError on a fresh kernel.
# NOTE(review): no `l` is passed, so whatever default the helper declares is
# used -- confirm it matches the lambda used for the regularized cost.
reg_grad = rcmd.regularized_gradient(param, Y, R, 10)
X_grad, theta_grad = rcmd.deserialize(reg_grad, n_movie, n_user, 10)

# Sanity check: each gradient piece must have the shape of its parameter matrix.
assert X_grad.shape == X.shape
assert theta_grad.shape == theta.shape

# parse `movie_ids.txt`

In [52]:
# Collect one entry per line of the movie id file.
movie_list = []

with open('./data/movie_ids.txt', encoding='latin-1') as fh:
    # Drop everything up to and including the first space of each stripped
    # line (presumably a leading numeric id) and keep the remainder.
    movie_list.extend(row.strip().partition(' ')[2] for row in fh)

# reproduce my ratings

In [53]:
# My own ratings: one slot per movie row (1682 in total), 0 everywhere except
# the movies listed below. The index and score lists are position-aligned.
rated_idx = [0, 6, 11, 53, 63, 65, 68, 97, 182, 225, 354]
rated_val = [4, 3, 5, 4, 5, 3, 5, 2, 4, 5, 5]

ratings = np.zeros(1682)
ratings[rated_idx] = rated_val

# prepare data

In [59]:
# Prepend my ratings as a new first column, so I become user 0.
# NOTE: this rebinds Y to a wider array, so running the cell again prepends
# yet another column -- reload Y before re-running.
Y = np.insert(arr=Y, obj=0, values=ratings, axis=1)
Y.shape

(1682, 944)

In [60]:
# Matching mask column for user 0: true wherever my rating is non-zero.
# NOTE: like the Y cell, re-running this prepends another column to R.
rated_mask = ratings != 0
R = np.insert(arr=R, obj=0, values=rated_mask, axis=1)
R.shape

(1682, 944)

In [64]:
# Dimensions of the current Y, which now includes the extra user column.
n_movie, n_user = Y.shape

# Settings for training.
n_features = 10  # features per movie
l = 10  # presumably the regularization strength (lambda) -- TODO confirm

# normalize ratings

In [67]:
# Centre the ratings by subtracting a single scalar: the mean over every entry of Y.
# NOTE(review): this is a global mean over all entries, not a per-movie mean
# restricted to rated entries -- confirm this is the intended normalization.
global_mean = Y.mean()
Y_norm = Y - global_mean

# Should be ~0 up to floating-point error.
Y_norm.mean()

1.4975111669342889e-16

# training

In [69]:
import scipy.optimize as opt