# 案例：为用户推荐电影的推荐系统

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
from scipy.optimize import minimize

In [2]:
# Load the ratings .mat file and list the variables stored in it.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
mat = sio.loadmat('E:/ML_NG/8-anomaly detection and recommendation/data/ex8_movies.mat')
mat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])

In [3]:
# Y holds the ratings and R flags which entries are rated (R is later extended
# with `my_ratings!=0`). Rows are movies and columns are users, matching the
# later `nm,nu = Y.shape`.
Y,R = mat['Y'],mat['R']
Y.shape,R.shape

((1682, 943), (1682, 943))

In [7]:
# Load the parameter .mat file and list the variables stored in it.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
param_mat = sio.loadmat('E:/ML_NG/8-anomaly detection and recommendation/data/ex8_movieParams.mat')
param_mat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies', 'num_features'])

In [10]:
# Pull the two parameter matrices and the three size entries out of the loaded file.
X, Theta = param_mat['X'], param_mat['Theta']
nu, nm, nf = (param_mat[key] for key in ('num_users', 'num_movies', 'num_features'))
X.shape, Theta.shape, nu, nm, nf

((1682, 10),
 (943, 10),
 array([[943]], dtype=uint16),
 array([[1682]], dtype=uint16),
 array([[10]], dtype=uint8))

In [12]:
# The .mat file stores each count as a 1x1 array. np.squeeze reduces it to a
# 0-d value first, because calling int() directly on an array with ndim > 0 is
# deprecated since NumPy 1.25. This form also works if the cell is re-run after
# the names already hold plain ints.
nu = int(np.squeeze(nu))
nm = int(np.squeeze(nm))
nf = int(np.squeeze(nf))
nu,nm,nf

(943, 1682, 10)

## 序列化参数

In [14]:
def serialize(X,Theta):
    """Pack X and Theta into one flat 1-D vector: all of X (row-major) followed by all of Theta."""
    flat_parts = [X.ravel(), Theta.ravel()]
    return np.concatenate(flat_parts)

In [40]:
# Display the shapes of the two parameter matrices.
X.shape,Theta.shape

((1682, 10), (943, 10))

## 反序列化参数

In [26]:
def deserialize(params,nm,nu,nf):
    """Split a flat parameter vector back into X (nm x nf) and Theta (nu x nf).

    The first nm*nf entries become X; everything after them becomes Theta.
    """
    boundary = nm * nf
    head, tail = params[:boundary], params[boundary:]
    return head.reshape(nm, nf), tail.reshape(nu, nf)

## 代价函数

In [27]:
def costFunction(params,Y,R,nm,nu,nf,lamda):
    """Regularized squared-error cost for the flat parameter vector `params`.

    `params` is unpacked into X (nm x nf) and Theta (nu x nf). The error term
    is half the sum of squared differences between X @ Theta.T and Y, with each
    difference multiplied by R first. Each of X and Theta adds a penalty of
    0.5 * lamda * (sum of its squared entries).
    """
    X, Theta = deserialize(params, nm, nu, nf)
    masked_diff = (X @ Theta.T - Y) * R
    data_term = 0.5 * np.sum(masked_diff * masked_diff)
    x_penalty = 0.5 * lamda * np.sum(X * X)
    theta_penalty = 0.5 * lamda * np.sum(Theta * Theta)
    return data_term + x_penalty + theta_penalty

In [41]:
# Evaluate the cost on a small slice (5 movies, 4 users, 3 features) with no regularization.
users, movies, features = 4, 5, 3
X_sub, Theta_sub = X[:movies, :features], Theta[:users, :features]
Y_sub, R_sub = Y[:movies, :users], R[:movies, :users]

cost1 = costFunction(
    serialize(X_sub, Theta_sub),
    Y_sub, R_sub,
    movies, users, features,
    lamda=0,
)
cost1

22.224603725685675

In [42]:
# Same slice as above, now with regularization strength lamda=1.5.
cost2 = costFunction(serialize(X_sub,Theta_sub),Y_sub,R_sub,movies,users,features,lamda=1.5)
cost2

31.344056244274217

## 梯度

In [68]:
def costGtadient(params,Y,R,nm,nu,nf,lamda):
    """Gradient of the regularized cost, returned as one flat vector.

    `params` is unpacked into X and Theta. The difference X @ Theta.T - Y,
    multiplied elementwise by R, is projected onto Theta (for X) and onto X
    (for Theta), and lamda times the respective matrix is added. The two
    gradients are packed in the same X-then-Theta order as `params`.
    """
    X, Theta = deserialize(params, nm, nu, nf)
    masked_diff = (X @ Theta.T - Y) * R
    grad_X = masked_diff @ Theta + lamda * X
    grad_Theta = masked_diff.T @ X + lamda * Theta
    return serialize(grad_X, grad_Theta)

## 添加一个用户

In [45]:
# Ratings entered for the new user, keyed by movie row index; every other row stays 0.
my_ratings = np.zeros((nm,1))
for movie_idx, score in {9: 5, 66: 5, 96: 5, 121: 4, 148: 4, 285: 3,
                         490: 4, 599: 4, 643: 4, 958: 5, 1117: 3}.items():
    my_ratings[movie_idx] = score

In [46]:
# Append the new user as one extra column of Y, and mark in R which movies
# that user rated. Both matrices are rebuilt from the arrays loaded into `mat`
# rather than from the current Y/R, so re-running this cell does not append
# the same user a second time.
Y = np.c_[mat['Y'],my_ratings]
R = np.c_[mat['R'],my_ratings!=0]

In [47]:
# Display the shape of Y after the extra column was appended.
Y.shape

(1682, 944)

In [49]:
# Refresh the movie and user counts from the enlarged Y (rows = movies, columns = users).
nm,nu = Y.shape

## 均值归一化

In [69]:
def normlizeRatings(Y,R):
    """Mean-normalize each row of Y using only the entries flagged in R.

    Parameters
    ----------
    Y : ndarray
        Ratings matrix; each row is normalized independently.
    R : ndarray
        Same shape as Y; 1/True where Y holds a real rating, 0/False elsewhere.

    Returns
    -------
    Y_norm : ndarray
        (Y - Y_mean) * R, so entries not flagged in R are 0.
    Y_mean : ndarray
        Column vector (n_rows, 1) with the mean of the flagged entries of each
        row. A row with no flagged entries gets mean 0 instead of NaN.
    """
    # Sum only the flagged entries, so values stored where R is 0 cannot skew the mean.
    rated_sum = (Y * R).sum(axis=1)
    rated_count = R.sum(axis=1)
    # Divide only where at least one rating exists. A 0/0 here would produce NaN,
    # and NaN * 0 stays NaN in Y_norm, which would poison the cost downstream.
    row_mean = np.divide(rated_sum, rated_count,
                         out=np.zeros(rated_sum.shape, dtype=float),
                         where=rated_count != 0)
    Y_mean = row_mean.reshape(-1,1)
    Y_norm = (Y - Y_mean) * R
    return Y_norm,Y_mean

In [70]:
# Normalize the ratings and keep the per-movie means; they are added back to the predictions later.
Y_norm,Y_mean = normlizeRatings(Y,R)

## 参数初始化

In [52]:
# Seed the global NumPy RNG so the random starting point, and therefore the
# fitted parameters and the resulting recommendations, are the same on every run.
np.random.seed(0)
X = np.random.random((nm,nf))
Theta = np.random.random((nu,nf))
# Pack both matrices into the flat vector form the optimizer works on.
params = serialize(X,Theta)
# Regularization strength used for training.
lamda = 5

## 模型训练

In [72]:
# Fit X and Theta by minimizing the regularized cost with the TNC solver,
# supplying the analytic gradient.
# The evaluation budget is passed as `maxfun`: for TNC the `maxiter` option was
# deprecated in SciPy 1.9 and later removed, so on current SciPy it is ignored
# (with an "unknown solver options" warning) and the solver falls back to its
# much larger default budget.
# `minimize` is imported in the notebook's first (imports) cell.
res = minimize(fun = costFunction,
        x0 = params,
        args=(Y_norm,R,nm,nu,nf,lamda),
        method = 'TNC',
        jac = costGtadient,
        options = {'maxfun':100})

In [73]:
# Flat parameter vector returned by the optimizer.
params_fit = res.x

In [74]:
# Unpack the fitted vector back into the X and Theta matrices.
fit_X,fit_Theta = deserialize(params_fit,nm,nu,nf)

## 预测

In [75]:
# Predicted (mean-normalized) ratings for every movie/user pair.
Y_pred = fit_X@fit_Theta.T

In [76]:
# Take the last column (the user appended to Y earlier) and add the per-movie means back.
y_pred = Y_pred[:,-1] + Y_mean.flatten()

In [77]:
index = np.argsort(-y_pred)  # sort from largest to smallest; alternatively np.argsort(y_pred)[::-1]

In [78]:
# Row indices of the ten highest predicted ratings.
index[:10]

array([1292, 1188, 1121, 1652,  813, 1466, 1598, 1535, 1200, 1499],
      dtype=int64)

In [83]:
movies = []
# Read the movie-title file. For each line, drop everything up to and including
# the first space (the leading token) and keep the remainder.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
with open('E:/ML_NG/8-anomaly detection and recommendation/data/movie_ids.txt','r',encoding='latin 1') as f:
    movies.extend(row.strip().partition(' ')[2] for row in f)

In [84]:
# Number of titles read from the file.
len(movies)

1682

In [85]:
# Print the ten highest predictions: movie row index, title, predicted rating.
for rank in range(10):
    top = index[rank]
    print(top, movies[top], y_pred[top])

1292 Star Kid (1997) 5.000915199198762
1188 Prefontaine (1997) 5.000134007097596
1121 They Made Me a Criminal (1939) 4.999880212319454
1652 Entertaining Angels: The Dorothy Day Story (1996) 4.999878407870896
813 Great Day in Harlem, A (1994) 4.999792519190221
1466 Saint of Fort Washington, The (1993) 4.9997379495506244
1598 Someone Else's America (1995) 4.999686229160827
1535 Aiqing wansui (1994) 4.999541708075515
1200 Marlene Dietrich: Shadow and Light (1996) 4.9995197744792215
1499 Santa with Muscles (1996) 4.998871903932824
