In [1]:
import numpy as np

In [2]:
def read_train(filename='u1.base', data_path='../ml-100k/', n_items=1682, n_users=943):
    """Load a training split into a binary item-by-user interaction matrix.

    Each non-blank line of the file must hold four tab-separated fields
    ``user, movieid, rating, timestamp``. Only the two ids are used: the
    rating value is ignored and every listed (movie, user) pair is marked
    with ``1.0``.

    Args:
    - filename: str, name of the file to read.
    - data_path: str, prefix that is concatenated in front of ``filename``
      (so it must end with a path separator).
    - n_items: int, number of rows (movies) in the returned matrix.
    - n_users: int, number of columns (users) in the returned matrix.

    Returns:
    numpy.ndarray of shape ``(n_items, n_users)``; entry
    ``[movieid - 1, user - 1]`` is ``1.0`` for every pair present in the
    file and ``0.0`` elsewhere. Note the orientation: rows are movies,
    columns are users.

    Raises:
    - ValueError: if a non-blank line does not have exactly four fields or
      an id is not an integer.
    - IndexError: if an id falls outside the matrix bounds.
    """
    data = np.zeros((n_items, n_users))
    # `with` guarantees the handle is closed even if parsing fails midway.
    with open(data_path + filename, 'r') as handle:
        for line in handle:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            user, movieid, _rating, _ts = line.split('\t')
            # Ids in the file are 1-based; the matrix is 0-based.
            data[int(movieid) - 1, int(user) - 1] = 1.
    return data

In [3]:
def read_test(filename='u1.test', data_path='../ml-100k/'):
    """Load a test split, grouping movie ids by consecutive runs of one user.

    Each non-blank line of the file must hold four tab-separated fields
    ``user, movieid, rating, timestamp``; the rating and timestamp are
    ignored. Consecutive lines that share a user id are collected into one
    list ``[user, movieid_1, movieid_2, ...]``. Grouping is purely by
    adjacency: if a user id reappears later in the file after another user,
    a new list is started for it.

    Args:
    - filename: str, name of the file to read.
    - data_path: str, prefix that is concatenated in front of ``filename``
      (so it must end with a path separator).

    Returns:
    numpy.ndarray of dtype ``object`` and shape ``(n_groups,)`` whose
    elements are Python lists of ints as described above. An empty file
    yields an array of length 0.

    Raises:
    - ValueError: if a non-blank line does not have exactly four fields or
      an id is not an integer.
    """
    groups = []
    current = []
    # `with` guarantees the handle is closed even if parsing fails midway.
    with open(data_path + filename, 'r') as handle:
        for line in handle:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            user, movieid, _rating, _ts = line.split('\t')
            user, movieid = int(user), int(movieid)
            if current and user == current[0]:
                current.append(movieid)
            else:
                if current:
                    groups.append(current)
                current = [user, movieid]
    if current:
        groups.append(current)

    # The per-user lists have different lengths. np.array() on such ragged
    # input raises ValueError on recent NumPy, so fill an object array
    # element by element; this also keeps the result 1-D when all lists
    # happen to have the same length.
    result = np.empty(len(groups), dtype=object)
    for idx, group in enumerate(groups):
        result[idx] = group
    return result

In [4]:
def sgd(data_matrix, k, alpha, lam, max_cycles):
    """Factorize a matrix with stochastic gradient descent.

    Finds ``p`` (m x k) and ``q`` (k x n) such that ``p * q`` approximates
    ``data_matrix`` on its observed entries. Only entries that are strictly
    greater than 0 are treated as observed; all others are ignored.

    The objective that is minimized and reported, summed over the observed
    entries (i, j), is
    ``(x_ij - p_i . q_j)^2 + lam / 2 * (|p_i|^2 + |q_j|^2)``.

    Args:
    - data_matrix: 2-D array-like (ndarray or matrix), the matrix to factorize.
    - k: int, number of latent factors.
    - alpha: float, learning rate.
    - lam: float, regularization strength.
    - max_cycles: int, maximum number of passes over the observed entries.

    Returns:
    p, q: numpy.matrix of shape (m, k) and (k, n), the learned factors.
    """
    data = np.asarray(data_matrix, dtype=float)
    m, n = data.shape

    # Random initialisation in [0, 1); plain ndarrays are used internally
    # because element access on np.matrix is much slower.
    p = np.random.random((m, k))
    q = np.random.random((k, n))

    # Visit only the observed cells (row-major order) instead of scanning
    # the full m x n grid on every pass.
    observed = np.argwhere(data > 0)
    rows, cols = observed[:, 0], observed[:, 1]

    for step in range(max_cycles):
        for i, j in observed:
            error = data[i, j] - p[i, :].dot(q[:, j])
            # Keep the pre-update row so the q step uses the same p values
            # that produced `error`, rather than the freshly updated ones.
            p_old = p[i, :].copy()
            p[i, :] += alpha * (2 * error * q[:, j] - lam * p_old)
            q[:, j] += alpha * (2 * error * p_old - lam * q[:, j])

        # Total loss over ALL observed entries (squared error plus the
        # regularization term), so the stopping test reflects the whole fit
        # and not just the last entry visited.
        residuals = data[rows, cols] - np.sum(p[rows] * q[:, cols].T, axis=1)
        loss = np.sum(residuals ** 2)
        loss += lam * (np.sum(p[rows] ** 2) + np.sum(q[:, cols] ** 2)) / 2

        if loss < 0.001:
            break
        if step % 1000 == 0:
            print("\titer: %d, loss: %f" % (step, loss))

    # Callers rely on matrix semantics (`p[u,] * q[:, j]`), so hand back
    # np.matrix objects; asmatrix is used because np.mat no longer exists
    # in NumPy 2.x.
    return np.asmatrix(p), np.asmatrix(q)

In [5]:
def prediction(data_matrix, p, q, user):
    """Score the items a user has not interacted with and rank them.

    Args:
    - data_matrix: 2-D array-like, the original interaction matrix; row
      ``user`` is inspected and entries equal to 0 count as "not interacted".
    - p: ndarray or matrix of shape (m, k), left factor.
    - q: ndarray or matrix of shape (k, n), right factor.
    - user: int, row index into ``data_matrix`` and ``p``.

    Returns:
    - list of ``(column_index, score)`` tuples for every column where
      ``data_matrix[user, column] == 0``, sorted by score from highest to
      lowest. Columns with equal scores keep ascending index order.
    """
    # Work on plain ndarrays so both np.matrix and ndarray factors behave
    # the same way (`*` means different things for the two types).
    interactions = np.asarray(data_matrix)[user]
    # One vector-matrix product gives the scores for all columns at once.
    scores = np.asarray(p)[user].dot(np.asarray(q))

    unseen = np.flatnonzero(interactions == 0)
    ranked = [(int(j), scores[j]) for j in unseen]
    # list.sort is stable, so ties stay in ascending column order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [6]:
def train(user=1, k=100, alpha=0.01, lam=0.01, max_cycles=100):
    """Run the full pipeline: load data, factorize, and rank items for a user.

    Args:
    - user: int, row index handed to ``prediction``. It is used as a
      zero-based index, so the default ``1`` addresses the second user.
    - k: int, number of latent factors passed to ``sgd``.
    - alpha: float, learning rate passed to ``sgd``.
    - lam: float, regularization strength passed to ``sgd``.
    - max_cycles: int, maximum number of training passes passed to ``sgd``.

    Returns:
    - list of ``(item_index, score)`` tuples as produced by ``prediction``.
    """
    # read_train() fills its matrix as [movieid - 1, user - 1], i.e. rows are
    # movies. sgd() and prediction() index rows as users, so transpose to the
    # user-by-item orientation before using it.
    user_item = read_train().T

    p, q = sgd(user_item, k, alpha, lam, max_cycles)
    return prediction(user_item, p, q, user)

In [None]:
# Run the whole pipeline with its default settings; as the last expression of
# the cell, the list returned by train() is shown as the cell output.
train()