In [1]:
# coding=utf-8

######################
#
# name: lixinsong
#
# date: 2018-06-30
#
#######################

import math
import numpy as np

In [2]:
# Load the dataset and build the rating dictionaries
'''
ua.base    -- The data sets ua.base, ua.test, ub.base, and ub.test
ua.test       split the u data into a training set and a test set with
ub.base       exactly 10 ratings per user in the test set.  The sets
ub.test       ua.test and ub.test are disjoint.  These data sets can
              be generated from u.data by mku.sh.
'''
def load_movielens(path="./ml-100k"):
    """Load the ``ua`` train/test split and the counts stored in ``u.info``.

    Args:
    - path: str, directory that holds ``ua.base``, ``ua.test`` and ``u.info``

    Returns:
    - train: dict, {user: {movieid: rating}} read from ``ua.base``; user and
      movie ids are kept as the strings found in the file, ratings are floats
    - test: dict, same layout, read from ``ua.test``
    - user_counts: int, number on the ``u.info`` line mentioning "users"
      (0 when no such line exists)
    - movie_counts: int, number on the ``u.info`` line mentioning "items"
      (0 when no such line exists)
    """

    def _read_ratings(filename):
        # Parse one tab-separated ratings file into {user: {movieid: rating}}.
        ratings = {}
        # `with` closes the handle even if a malformed line raises.
        with open(path + filename, encoding='latin-1') as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank / trailing empty lines.
                    continue
                user, movieid, rating, _ts = line.split('\t')
                ratings.setdefault(user, {})[movieid] = float(rating)
        return ratings

    train = _read_ratings('/ua.base')
    test = _read_ratings('/ua.test')

    user_counts = 0
    movie_counts = 0
    with open(path + "/u.info", encoding='latin-1') as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                continue
            # Each line is "<count> <label>", e.g. "943 users".
            count, content = stripped.split(None, 1)
            if "users" in content:
                user_counts = int(count)
            elif "items" in content:
                movie_counts = int(count)
    return train, test, user_counts, movie_counts

In [3]:
# Compute recall
def recall(train, test, N, rank):
    """Compute the recall of one top-N recommendation list against the test set.

    The same ``rank`` list is scored against every evaluated user; this
    function does not look up per-user recommendations.

    Args:
    - train: dict, {user: {item: rating}}; only its keys (the users) are read
    - test: dict, {user: {item: rating}} holding the held-out ratings
    - N: int, number of leading entries of ``rank`` to evaluate
    - rank: list of (item_index, score) pairs; ``str(item_index + 1)`` is
      matched against the item keys of ``test``

    Returns:
    - float, hits divided by the total number of held-out items, or 0.0 when
      there are no held-out items at all
    """
    new_rank = top_k(rank, N)
    hit = 0
    total = 0
    for user in train.keys():
        # A user without held-out ratings cannot be evaluated; skip instead
        # of raising KeyError.
        tu = test.get(user)
        if not tu:
            continue
        for item, pui in new_rank:
            if str(item + 1) in tu:
                hit += 1
        total += len(tu)
    if total == 0:
        return 0.0
    return hit / (total * 1.0)


# Compute precision
def precision(train, test, N, rank):
    """Compute the precision of one top-N recommendation list against the test set.

    The same ``rank`` list is scored against every evaluated user; this
    function does not look up per-user recommendations.

    Args:
    - train: dict, {user: {item: rating}}; only its keys (the users) are read
    - test: dict, {user: {item: rating}} holding the held-out ratings
    - N: int, number of leading entries of ``rank`` to evaluate; also the
      per-user denominator
    - rank: list of (item_index, score) pairs; ``str(item_index + 1)`` is
      matched against the item keys of ``test``

    Returns:
    - float, hits divided by (N * number of evaluated users), or 0.0 when
      that denominator is zero
    """
    new_rank = top_k(rank, N)
    hit = 0
    total = 0
    for user in train.keys():
        # A user without held-out ratings cannot be evaluated; skip instead
        # of raising KeyError.
        tu = test.get(user)
        if not tu:
            continue
        for item, pui in new_rank:
            if str(item + 1) in tu:
                hit += 1
        total += N
    if total == 0:
        return 0.0
    return hit / (total * 1.0)


# Compute coverage
def coverage(train, test, N, rank):
    """Compute the share of training items that appear in the top-N list.

    Args:
    - train: dict, {user: {item: rating}}; its item keys form the catalogue
    - test: dict, unused; kept so all metrics share one signature
    - N: int, number of leading entries of ``rank`` to consider
    - rank: list of (item_index, score) pairs; ``str(item_index + 1)`` is
      used as the recommended item id

    Returns:
    - float, number of distinct recommended items divided by the number of
      distinct items in ``train``, or 0.0 when ``train`` has no items
    """
    all_items = set()
    for items in train.values():
        all_items.update(items.keys())
    if not all_items:
        # Nothing to cover; avoids ZeroDivisionError.
        return 0.0

    # The list does not depend on the user, so build the set once.
    recommend_items = {str(item + 1) for item, pui in top_k(rank, N)}
    return len(recommend_items) / (len(all_items) * 1.0)


# Average popularity
def popularity(train, test, N, rank):
    """Compute the mean log-popularity of the items in the top-N list.

    An item's popularity is the number of users in ``train`` that rated it.

    Args:
    - train: dict, {user: {item: rating}}
    - test: dict, unused; kept so all metrics share one signature
    - N: int, number of leading entries of ``rank`` to consider
    - rank: list of (item_index, score) pairs; ``str(item_index + 1)`` is
      looked up among the item keys of ``train``

    Returns:
    - float, average of ``log(1 + popularity)`` over every (user, recommended
      item) pair, or 0.0 when there are no such pairs
    """
    item_popularity = dict()
    for items in train.values():
        for item in items.keys():
            item_popularity[item] = item_popularity.get(item, 0) + 1

    # The list does not depend on the user, so compute it (and the log terms)
    # once. Items never seen in train count as popularity 0 instead of
    # raising KeyError.
    new_rank = top_k(rank, N)
    log_pops = [
        math.log(1 + item_popularity.get(str(item + 1), 0))
        for item, pui in new_rank
    ]

    ret = 0
    n = 0
    for user in train.keys():
        for value in log_pops:
            ret += value
            n += 1
    if n == 0:
        return 0.0
    return ret / (n * 1.0)

In [4]:
# Convert the rating dictionaries into a dense user-by-movie matrix
def data2mat(data, user_counts=943, movie_counts=1682):
    """Build a dense rating matrix from nested rating dictionaries.

    Args:
    - data: dict, {user: {movie: score}}; ``int(user) - 1`` is the row and
      ``int(movie) - 1`` is the column
    - user_counts: int, number of rows
    - movie_counts: int, number of columns

    Returns:
    - ndarray of shape (user_counts, movie_counts), dtype float; cells with
      no rating stay 0 and each score is truncated to an integer first
    """
    ratings = np.zeros((user_counts, movie_counts), dtype=float)
    for user_id, rated in data.items():
        for movie_id, score in rated.items():
            row = int(user_id) - 1
            col = int(movie_id) - 1
            # int() drops any fractional part of the score before storing.
            ratings[row, col] = float(int(score))
    return ratings


In [5]:
# Recommend the first n items to a user
def top_k(predict, n):
    """Return the first ``n`` entries of an already ordered sequence.

    Args:
    - predict: sequence, recommendation list
    - n: int, number of leading entries to keep

    Returns:
    - ``predict`` itself (not a copy) when ``n`` is at least its length,
      otherwise a new list with the entries at indices 0 .. n-1
    """
    if n >= len(predict):
        return predict
    return [predict[idx] for idx in range(n)]

In [6]:
def sgd(data_matrix, k, alpha, lam, max_cycles):
    """Factorise a rating matrix with stochastic gradient descent.

    Only entries greater than 0 are treated as observed ratings.

    Args:
    - data_matrix: mat, user-item rating matrix of shape (m, n)
    - k: int, number of latent factors
    - alpha: float, learning rate
    - lam: float, regularisation parameter
    - max_cycles: int, maximum number of passes over the observed entries

    Returns:
    - p, q: mat, factor matrices of shape (m, k) and (k, n), returned as
      ``np.matrix`` so that ``p * q`` is a matrix product
    """
    m, n = np.shape(data_matrix)
    # Train on plain ndarrays and wrap the result at the end (np.mat is not
    # available in NumPy 2.x; np.asmatrix is).
    p = np.random.random((m, k))
    q = np.random.random((k, n))

    # The set of observed entries never changes, so scan the matrix once.
    observed = [
        (i, j) for i in range(m) for j in range(n) if data_matrix[i, j] > 0
    ]

    # start training
    for step in range(max_cycles):
        for i, j in observed:
            error = data_matrix[i, j]
            for r in range(k):
                error = error - p[i, r] * q[r, j]
            for r in range(k):
                p[i, r] = p[i, r] + alpha * (2 * error * q[r, j] - lam * p[i, r])
                # Note: q is updated with the p[i, r] that was just refreshed.
                q[r, j] = q[r, j] + alpha * (2 * error * p[i, r] - lam * q[r, j])

        # Regularised squared error summed over ALL observed entries.
        loss = 0.0
        for i, j in observed:
            predicted = 0.0
            for r in range(k):
                predicted = predicted + p[i, r] * q[r, j]
            residual = data_matrix[i, j] - predicted
            loss = loss + residual * residual
            for r in range(k):
                loss = loss + lam * (p[i, r] * p[i, r] + q[r, j] * q[r, j]) / 2

        if loss < 0.001:
            break
        if step % 1000 == 0:
            print("\titer: %d, loss: %f" % (step, loss))
    return np.asmatrix(p), np.asmatrix(q)


def prediction(data_matrix, p, q, user):
    """Score every item the user has not interacted with.

    Args:
    - data_matrix: mat, original user-item matrix; a 0 entry marks an item
      without interaction
    - p: mat, factor matrix p
    - q: mat, factor matrix q
    - user: int, row index of the user

    Returns:
    - predict: list of (column_index, score) pairs, highest score first
    """
    item_total = np.shape(data_matrix)[1]
    scores = {
        col: (p[user,] * q[:, col])[0, 0]
        for col in range(item_total)
        if data_matrix[user, col] == 0
    }

    # Order by score, largest first (stable for ties).
    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [None]:
# Entry point
def main():
    """Train the factorisation on MovieLens and print the evaluation metrics.

    Loads the ``ua`` split from the default path, factorises the training
    matrix with ``sgd``, builds a top-10 list for matrix row 90 and prints
    recall, precision, coverage and popularity. The metric helpers score this
    one list against every user in the training set.
    """
    train, test, user_counts, movie_counts = load_movielens()
    # print(prefs['1'])
    print("用户数量:", user_counts, "电影数量：", movie_counts)

    # Size the matrix from u.info when it supplied both counts; otherwise
    # fall back to data2mat's built-in defaults.
    if user_counts and movie_counts:
        mat = data2mat(train, user_counts, movie_counts)
    else:
        mat = data2mat(train)

    p, q = sgd(mat, 2, 0.001, 0.01, 10)
    # 90 is a 0-based row index of the rating matrix.
    rank = top_k(prediction(mat, p, q, 90), 10)
    # The former `item_based_recommend(mat, w, 90)` call was removed: neither
    # that function nor `w` is defined in this notebook, so it raised
    # NameError and would have discarded the SGD ranking.

    print(top_k(rank, 10))
    print("召回率：", recall(train, test, 10, rank))
    print("准确率：", precision(train, test, 10, rank))
    print("覆盖率：", coverage(train, test, 10, rank))
    print("流行度：", popularity(train, test, 10, rank))


# Run the demo only when this module is the entry point.
if __name__ == "__main__":
    main()