In [1]:
# conding=utf-8

######################
#
# name: lixinsong
#
# date: 2018-06-30
#
#######################

import math
import numpy as np

In [2]:
# 加载数据集，生成字典
'''
ua.base    -- The data sets ua.base, ua.test, ub.base, and ub.test
ua.test       split the u data into a training set and a test set with
ub.base       exactly 10 ratings per user in the test set.  The sets
ub.test       ua.test and ub.test are disjoint.  These data sets can
              be generated from u.data by mku.sh.
'''
def load_movielens(path="./ml-100k"):

    train = {}
    test = {}

    for line in open(path + '/ua.base', encoding='latin-1'):
        user, movieid, rating, ts = line.split('\t')
        train.setdefault(user, {})
        train[user][movieid] = float(rating)

    for line in open(path + '/ua.test', encoding='latin-1'):
        user, movieid, rating, ts = line.split('\t')
        test.setdefault(user, {})
        test[user][movieid] = float(rating)

    user_counts = 0
    movie_counts = 0
    for line in open(path + "/u.info", encoding='latin-1'):

        count, content = line.strip().split(" ")
        if "users" in content:
            user_counts = int(count)
        elif "items" in content:
            movie_counts = int(count)
    return train, test, user_counts, movie_counts

In [3]:
# 计算召回率
def recall(train, test, N, rank):
    hit = 0
    all = 0
    new_rank = top_k(rank, N)
    for user in train.keys():
        tu = test[user]
        for item, pui in new_rank:
            if str(item+1) in tu:
                hit += 1
        all += len(tu)
    return hit / (all * 1.0)


# 计算准确率
def precision(train, test, N, rank):
    hit = 0
    all = 0
    new_rank = top_k(rank, N)
    for user in train.keys():
        tu = test[user]
        for item, pui in new_rank:
            if str(item+1) in tu:
                hit += 1
        all += N
    return hit / (all * 1.0)


# 计算覆盖率
def coverage(train, test, N, rank):
    recommend_items = set()
    all_items = set()
    new_rank = top_k(rank, N)
    for user in train.keys():
        for item in train[user].keys():
            all_items.add(item)

        for item, pui in new_rank:
            recommend_items.add(str(item+1))
    return len(recommend_items) / (len(all_items) * 1.0)


# 平均流行度
def popularity(train, test, N, rank):
    item_popularity = dict()
    for user, items in train.items():
        for item in items.keys():
            if item not in item_popularity:
                item_popularity[item] = 0
            item_popularity[item] += 1
    ret = 0
    n = 0
    for user in train.keys():
        new_rank = top_k(rank, N)
        for item, pui in new_rank:
            ret += math.log(1 + item_popularity[str(item+1)])
            n += 1
    ret /= n * 1.0
    return ret

In [4]:
# 将数据集转化为行向量
def data2mat(data, user_counts=943, movie_counts=1682):
    # user_counts = len(data)
    # print("user_counts = ", user_counts)
    # movie_counts = len(set(sum([list(x.keys()) for x in data.values()], [])))
    # print("movie_counts = ", movie_counts)
    mat = np.zeros((user_counts, movie_counts), dtype=float)
    for user, movies in data.items():
        for movie, score in movies.items():
            mat[int(user)-1][int(movie)-1] = float(int(score))

    return mat


# 根据用户物品倒排表，生成共现矩阵
def similarity(data, user_counts=943, movie_counts=1682):
    user_goods = {}
    #N = defaultdict(defaultdict)
    #C = defaultdict(defaultdict)
    C = {}
    N = {}
    for user, movies in data.items():
        for movie, score in movies.items():
            if user not in user_goods:
                user_goods[user] = set()
            user_goods[user].add(movie)

    w = np.zeros((movie_counts, movie_counts), dtype=float)

    for u, items in user_goods.items():
        for i in items:
            if i not in N:
                N[i] = 0
            N[i] += 1
            for j in items:
                if i == j:
                    continue
                if i not in C:
                    C[i] = {}
                if j not in C[i].keys():
                    C[i][j] = 0
                C[i][j] += 1

    for i, related_items in C.items():
        for j, cij in related_items.items():
            if N[i] * N[j] != 0:
                w[int(i)-1, int(j)-1] = cij/math.sqrt(N[i] * N[j])
            else:
                w[int(i)-1, int(j)-1] = 0.0
            w[int(j)-1, int(i)-1] = w[int(i)-1, int(j)-1]
    return w

In [5]:
def item_based_recommend(data, w, user):
    """
    基于物品相似度为用户 user 推荐物品

    Args:
    - data: mat, 物品用户矩阵
    - w: mat, 物品与物品之间的相似性
    - user: int, 用户编号

    :return: predict, list, 推荐列表
    """

    m, n = np.shape(data)  # m: 物品数量 n: 用户数量
    interaction = data[:, int(user)-1].T  # 用户 user 互动物品信息

    # 找到用户 user 没有互动的商品
    not_iter = []
    for i in range(m):
        if interaction[i] == 0:  # 用户 user 未打分项
            not_iter.append(i)

    # 对没有互动过的物品进行预测
    predict = {}
    for x in not_iter:
        item = np.copy(interaction)  # 获取用户 user 对物品的互动信息
        for j in range(m):   # 对每一个物品
            if item[j] != 0:  # 利用互动过的物品预测
                if x not in predict:
                    predict[x] = w[x, j] * item[j]
                else:
                    predict[x] = predict[x] + w[x, j] * item[j]
    # 按照预测的大小从大到小排序
    return sorted(predict.items(), key=lambda d: d[1], reverse=True)

In [6]:
# 为用户推荐前 n 个物品
def top_k(predict, n):

    top_recom = []
    len_result = len(predict)
    if n >= len_result:
        top_recom = predict
    else:
        for i in range(n):
            top_recom.append(predict[i])
    return top_recom

In [7]:
# 主函数
def main():
    train, test, user_counts, movie_counts = load_movielens()
    # print(prefs['1'])
    print("用户数量:", user_counts, "电影数量：", movie_counts)
    w = similarity(train)

    # print(w)
    mat = data2mat(train)
    rank = item_based_recommend(mat,w, 90)

    print(top_k(rank, 10))
    print("召回率：",recall(train, test, 10, rank))
    print("准确率：", precision(train, test, 10, rank))
    print("覆盖率：", coverage(train, test, 10, rank))
    print("流行度：", popularity(train, test, 10, rank))


if __name__ == '__main__':

    main()

用户数量: 943 电影数量： 1682
[(173, 87.12620987873795), (203, 85.80077510125328), (171, 85.70664520033061), (78, 85.53615743979402), (180, 84.71373939180117), (233, 84.40283901430493), (49, 84.33340834028431), (120, 83.85504690403941), (422, 83.82553419541021), (194, 83.77985271931861)]
召回率： 0.04316012725344645
准确率： 0.04316012725344645
覆盖率： 0.005952380952380952
流行度： 5.830478606467905
