In [1]:
import numpy as np
import random

def load_movielens(path='ml-100k'):
    # get movie titles
    movies = {}
    for line in open(path + '/u.item', encoding='latin-1'):
        id, title = line.split('|')[0:2]
        movies[id] = id
    # load data
    prefs = {}
    for line in open(path + '/u.data', encoding='latin-1'):
        user, movieid, rating, ts = line.split('\t')
        prefs.setdefault(user, {})
        prefs[user][movies[movieid]] = float(rating)
    return prefs



prefs = load_movielens()
print(prefs['88'])

{'319': 3.0, '311': 5.0, '321': 1.0, '313': 3.0, '1191': 5.0, '880': 3.0, '886': 5.0, '302': 3.0, '898': 4.0, '286': 5.0, '904': 5.0, '326': 5.0, '301': 4.0, '315': 4.0, '881': 5.0, '261': 5.0, '690': 4.0, '750': 2.0, '308': 4.0, '300': 3.0, '354': 5.0}


In [3]:
def split_data(data, M, k, seed):
    test = []
    train = []
    random.seed(seed)
    for user in data:
        if random.randint(0, M) == k:
            test.append([user, data[user]])
        else:
            train.append([user, data[user]])
    return train, test


train, test = split_data(prefs, 8, 1, 1)


def changement(data):
    x = []
    y = []
    for i in range(len(data)):
        for j in range(1682):
            x.append(data[i][1].get(str(j+1), 0.0))
        y.append(x)
        x = []
    y = np.array(y)
    return y
train = changement(train)
test = changement(test)



In [5]:
def sgd(data_matrix, k, alpha, lam, max_cycles):
    """使用梯度下降法进行矩阵分解。

    Args:
    - data_matrix: mat, 用户物品矩阵
    - k: int, 分解矩阵的参数
    - alpha: float, 学习率
    - lam: float, 正则化参数
    - max_cycles: int, 最大迭代次数

    Returns:
    p,q: mat, 分解后的矩阵
    """
    m, n = np.shape(data_matrix)
    # initiate p & q
    p = np.mat(np.random.random((m, k)))
    q = np.mat(np.random.random((k, n)))

    # start training
    for step in range(max_cycles):
        for i in range(m):
            for j in range(n):
                if data_matrix[i, j] > 0:
                    error = data_matrix[i, j]
                    for r in range(k):
                        error = error - p[i, r] * q[r, j]
                    for r in range(k):
                        p[i, r] = p[i, r] + alpha * (2 * error * q[r, j] - lam * p[i, r])
                        q[r, j] = q[r, j] + alpha * (2 * error * p[i, r] - lam * q[r, j])

        loss = 0.0
        for i in range(m):
            for j in range(n):
                if data_matrix[i, j] > 0:
                    error = 0.0
                    for r in range(k):
                        error = error + p[i, r] * q[r, j]
                    # calculate loss function
                    loss = (data_matrix[i, j] - error) * (data_matrix[i, j] - error)
                    for r in range(k):
                        loss = loss + lam * (p[i, r] * p[i, r] + q[r, j] * q[r, j]) / 2

        if loss < 0.001:
            break
        if step % 10 == 0:
            print("\titer: %d, loss: %f" % (step, loss))
    return p, q

p, q = sgd(train, 2, 0.003, 0.1, 64)

	iter: 0, loss: 6.200833
	iter: 10, loss: 4.275766
	iter: 20, loss: 2.939334
	iter: 30, loss: 2.070869
	iter: 40, loss: 1.490039
	iter: 50, loss: 1.099543
	iter: 60, loss: 0.837347


In [6]:
def prediction(data_matrix, p, q, user):
    """为用户未互动的项打分

    Args:
    - data_matrix: mat, 原始用户物品矩阵
    - p: mat, 分解后的矩阵p
    - q: mat, 分解后的矩阵q
    - user: int, 用户的id

    Returns:
    - predict: list, 推荐列表
    """
    n = np.shape(data_matrix)[1]
    predict = {}
    for j in range(n):
        if data_matrix[user, j] == 0:
            predict[j] = (p[user,] * q[:, j])[0, 0]

    # 按照打分从大到小排序
    return sorted(predict.items(), key=lambda d: d[1], reverse=True)


prediction(train, p, q, 2)

[(1448, 5.1284191442247415),
 (850, 5.0090010848997242),
 (1499, 4.9393168950181465),
 (1466, 4.8918758519901226),
 (1397, 4.7337658278508812),
 (482, 4.6566516240463436),
 (407, 4.6553618436641688),
 (1239, 4.6469261887803857),
 (1188, 4.6352166246954525),
 (168, 4.6303700888183181),
 (1557, 4.6274550873749103),
 (640, 4.6145776263266516),
 (473, 4.6029181772099017),
 (1404, 4.5940262567045167),
 (511, 4.5876588306957089),
 (479, 4.5855586285734464),
 (1511, 4.5731217596552813),
 (1367, 4.5696475460470545),
 (133, 4.5684168760958936),
 (1387, 4.5498179051135077),
 (1523, 4.5320077592658841),
 (487, 4.5305283519703154),
 (1063, 4.5291109419024664),
 (1157, 4.5267225595953571),
 (512, 4.5172442843077176),
 (118, 4.5165237179248532),
 (922, 4.5042321104926932),
 (1366, 4.5029606092119696),
 (1192, 4.4973998371894712),
 (271, 4.4963510765854693),
 (1641, 4.4893302224429794),
 (792, 4.4871046272216448),
 (957, 4.4868875767984893),
 (510, 4.4733076452975267),
 (514, 4.467827483269752),
 (11