In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat

In [2]:
data = loadmat('./data/ex8_movies.mat')
Y = data['Y']
R = data['R']

Y.shape, R.shape

((1682, 943), (1682, 943))

In [3]:
# 第一部电影的平均评分
np.sum(Y[0, :]) / np.sum(R[0, :])

3.8783185840707963

In [4]:
data_param = loadmat('./data/ex8_movieParams.mat')
X = data_param['X']   
Theta = data_param['Theta']

X.shape, Theta.shape

((1682, 10), (943, 10))

In [5]:
# 选取部分数据集
nu = 4  # user
nm = 5  # movies
nf = 3  # features

X = X[:nm, :nf]
Theta = Theta[:nu, :nf]
Y = Y[:nm, :nu]
R = R[:nm, :nu]

In [6]:
# 序列化，打平; 原因：优化函数中传入的参数是一维的
def serialize(X, Theta):
    return np.concatenate((X.ravel(), Theta.ravel()), axis=None)

# 反序列化，恢复为矩阵形式
def deserialize(param, nu, nm, nf):
    X = param[:nm*nf].reshape(nm, nf)
    Theta = param[nm*nf:].reshape(nu, nf)
    return X, Theta


In [7]:
# cost function
def cofi_cost_func(param, Y, R, nu, nm, nf, lam=0):
    X, Theta = deserialize(param, nu, nm, nf)
    term = X.dot(Theta.T)
    
    term = np.multiply(term, R) # 将未评分的元素置零
    cost = (np.sum((term - Y) ** 2)) / 2
    
    # 添加正则项
    cost += (lam / 2.0) * np.sum(np.square(Theta))
    cost += (lam / 2.0) * np.sum(np.square(X))
    return cost

In [8]:
# 计算在当前X和theta下cost
params = serialize(X, Theta)

cost = cofi_cost_func(params, Y, R, nu, nm, nf)

print('Cost:', cost)

Cost: 22.224603725685675


In [9]:
def gradient(param, Y, R, nu, nm, nf, lam=0):
    X, Theta = deserialize(param, nu, nm, nf)
    
    term = X.dot(Theta.T)
    term = np.multiply(term, R)
    term -= Y  # (nm, nu)
    X_grad = term.dot(Theta) # (nm, nu) * (nu, nf) -> (nm, nf)
    Theta_grad = term.T.dot(X) #(nu, nm) * (nm, nf) -> (nu, nf)
    
    X_grad += lam * X
    Theta_grad += lam * Theta
    
    return serialize(X_grad, Theta_grad)

In [10]:
deserialize(gradient(params, Y, R, nu, nm, nf), nu, nm, nf)

(array([[-2.52899165,  7.57570308, -1.89979026],
        [-0.56819597,  3.35265031, -0.52339845],
        [-0.83240713,  4.91163297, -0.76677878],
        [-0.38358278,  2.26333698, -0.35334048],
        [-0.80378006,  4.74271842, -0.74040871]]),
 array([[-10.5680202 ,   4.62776019,  -7.16004443],
        [ -3.05099006,   1.16441367,  -3.47410789],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ]]))

In [11]:
movies = []
with open('data/movie_ids.txt', encoding='ISO-8859-1') as f:
    for line in f:
        movies.append(' '.join(line.strip('\n').split(' ')[1:]))

movies[:10]  # 前10部电影

['Toy Story (1995)',
 'GoldenEye (1995)',
 'Four Rooms (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)',
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 'Twelve Monkeys (1995)',
 'Babe (1995)',
 'Dead Man Walking (1995)',
 'Richard III (1995)']

In [12]:
# 对一些电影评分
my_ratings = np.zeros((1682,1))
my_ratings[0]   = 4
my_ratings[97]  = 2
my_ratings[6]   = 3
my_ratings[11]  = 5
my_ratings[53]  = 4
my_ratings[63]  = 5
my_ratings[65]  = 3
my_ratings[68]  = 5
my_ratings[182] = 4
my_ratings[225] = 5
my_ratings[354] = 5

In [13]:
data = loadmat('./data/ex8_movies.mat')  #重新加载数据集，以使用全部的数据

Y = data['Y']
R = data['R']
nf = 10 # 只使用10个特征

In [14]:
# 在数据集中添加上面的评分
myR_row = my_ratings > 0
Y = np.hstack((Y,my_ratings))
R = np.hstack((R,myR_row))
nm, nu = Y.shape

In [15]:
# 均值规范化
def normalize_ratrings(Y, R):
    Ymean = np.sum(Y, axis=1) / np.sum(R, axis=1) # 行向量
    Ymean = Ymean.reshape((Ymean.shape[0], 1))
    return Y - Ymean, Ymean

In [16]:
Ynorm, Ymean = normalize_ratrings(Y,R)

Ynorm.shape, Ymean.shape

((1682, 944), (1682, 1))

In [17]:
from scipy.optimize import fmin_tnc

X = np.random.rand(nm,nf)
Theta = np.random.rand(nu,nf)
params = serialize(X, Theta)


my_lambda = 10.0

# 这里使用Y(和答案一致)，应该使用Ynorm(但只会迭代一次，出错，未知原因)
result = fmin_tnc(cofi_cost_func, x0=params, fprime=gradient, \
                               args=(Y, R, nu, nm, nf, my_lambda), \
                                disp=0)

In [18]:
final_X, final_Theta = deserialize(result[0], nu, nm, nf)

final_X.shape, final_Theta.shape

((1682, 10), (944, 10))

In [19]:
pred_matrix = final_X @ final_Theta.T

In [20]:
my_pred = pred_matrix[:, -1] + Ymean.flatten()

In [21]:
# Sort my predictions from highest to lowest
pred_idxs_sorted = np.argsort(my_pred)
pred_idxs_sorted[:] = pred_idxs_sorted[::-1]

print ("Top recommendations for you:")
for i in range(10):
    print ('Predicting rating %0.1f for movie %s.' % \
    (my_pred[pred_idxs_sorted[i]], movies[pred_idxs_sorted[i]]))
    
print ("\nOriginal ratings provided:")
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print ('Rated %d for movie %s.' % (my_ratings[i],movies[i]))

Top recommendations for you:
Predicting rating 8.6 for movie Star Wars (1977).
Predicting rating 8.4 for movie Titanic (1997).
Predicting rating 8.4 for movie Shawshank Redemption, The (1994).
Predicting rating 8.3 for movie Schindler's List (1993).
Predicting rating 8.2 for movie Raiders of the Lost Ark (1981).
Predicting rating 8.2 for movie Good Will Hunting (1997).
Predicting rating 8.1 for movie Usual Suspects, The (1995).
Predicting rating 8.1 for movie Empire Strikes Back, The (1980).
Predicting rating 8.1 for movie Godfather, The (1972).
Predicting rating 8.1 for movie Braveheart (1995).

Original ratings provided:
Rated 4 for movie Toy Story (1995).
Rated 3 for movie Twelve Monkeys (1995).
Rated 5 for movie Usual Suspects, The (1995).
Rated 4 for movie Outbreak (1995).
Rated 5 for movie Shawshank Redemption, The (1994).
Rated 3 for movie While You Were Sleeping (1995).
Rated 5 for movie Forrest Gump (1994).
Rated 2 for movie Silence of the Lambs, The (1991).
Rated 4 for movie 