In [10]:
import math
import numpy as np
# 加载训练集和测试集
def load_movielens(path):
    train = {} 
    test = {}
    for line in open(path + '/ua.base', encoding='latin-1'):
        user, movieid, rating, timestamp = line.split('\t')
        train.setdefault(user, {})
        train[user][movieid] = float(rating)

    for line in open(path + '/ua.test', encoding='latin-1'):
        user, movieid, rating, timestamp = line.split('\t')
        test.setdefault(user, {})
        test[user][movieid] = float(rating)

    user_counts = 0
    movie_counts = 0
    # 在u数据集中的用户、项目和评级的数量。
    for line in open(path + "/u.info", encoding='latin-1'):
        count, content = line.strip().split(" ")
        if "users" in content:
            user_counts = int(count)
        elif "items" in content:
            movie_counts = int(count)
    return train, test, user_counts, movie_counts

In [11]:
# 将数据集转化为行向量
def data2mat(data, user_counts=943, movie_counts=1682):
    mat = np.zeros((user_counts, movie_counts), dtype=float)
    for user, movies in data.items():
        for movie, score in movies.items():
            mat[int(user)-1][int(movie)-1] = float(int(score))
    return mat

In [12]:
# 创建物品倒排表，生成共现矩阵
def item_similarity(data, user_counts=943, movie_counts=1682):
    # 计算项目之间的共同评级用户。
    u_goods = {}
    for user, movies in data.items():
        for movie, score in movies.items():
            if user not in u_goods:
                u_goods[user] = set()
            u_goods[user].add(movie)

    w = np.zeros((movie_counts, movie_counts), dtype=float)
    C= {}
    N = {}
    for u, items in u_goods.items():
        for i in items:
            if i not in N:
                N[i] = 0
            N[i] += 1
            for j in items:
                if i == j:
                    continue
                if i not in C:
                    C[i] = {}
                if j not in C[i].keys():
                    C[i][j] = 0
                C[i][j] += 1

    # 计算最终相似矩阵w
    for i, related_items in C.items():
        for j, cij in related_items.items():
            if N[i] * N[j] != 0:
                w[int(i)-1, int(j)-1] = cij/math.sqrt(N[i] * N[j])
            else:
                w[int(i)-1, int(j)-1] = 0.0
            w[int(j)-1, int(i)-1] = w[int(i)-1, int(j)-1]
    return w

In [25]:
def item_based_recommend(data, w, user):
    """
    基于物品相似度为用户 user 推荐物品

    Args:
    - data: mat, 物品用户矩阵
    - w: mat, 物品与物品之间的相似性
    - user: int, 用户编号

    :return: predict, list, 推荐列表
    """

    m, n = np.shape(data)  # m: 物品数量 n: 用户数量
    interaction = data[:, user].T  # 用户 user 互动物品信息

    # 找到用户 user 没有互动的商品
    not_iter = []
    for i in range(m):
        if interaction[i] == 0:  # 用户 user 未打分项
            not_iter.append(i)

    # 对没有互动过的物品进行预测
    predict = {}
    for x in not_iter:
        item = np.copy(interaction)  # 获取用户 user 对物品的互动信息
        for j in range(m):   # 对每一个物品
            if item[j] != 0:  # 利用互动过的物品预测
                if x not in predict:
                    predict[x] = w[x, j] * item[j]
                else:
                    predict[x] = predict[x] + w[x, j] * item[j]
    # 按照预测的大小从大到小排序
    return sorted(predict.items(), key=lambda d: d[1], reverse=True)

def top_k(predict, n):
    """为用户推荐前 n 个物品

    Args:
    - predict: list, 排好序的物品列表
    - k: int, 推荐的物品个数

    :return: top_recom, list, top n 个物品
    """
    top_recom = []
    len_result = len(predict)
    if n >= len_result:
        top_recom = predict
    else:
        for i in range(n):
            top_recom.append(predict[i])
    return top_recom

In [26]:
# 计算召回率
def recall(train, test, N, rank):
    hit = 0
    all = 0
    new_rank = top_k(rank, N)
    for user in train.keys():
        tu = test[user]
        for item, pui in new_rank:
            if str(item+1) in tu:
                hit += 1
        all += len(tu)
    return hit / (all * 1.0)


# 计算准确率
def precision(train, test, N, rank):
    hit = 0
    all = 0
    new_rank = top_k(rank, N)
    for user in train.keys():
        tu = test[user]
        for item, pui in new_rank:
            if str(item+1) in tu:
                hit += 1
        all += N
    return hit / (all * 1.0)


# 计算覆盖率
def coverage(train, test, N, rank):
    recommend_items = set()
    all_items = set()
    new_rank = top_k(rank, N)
    for user in train.keys():
        for item in train[user].keys():
            all_items.add(item)

        for item, pui in new_rank:
            recommend_items.add(str(item+1))
    return len(recommend_items) / (len(all_items) * 1.0)


# 平均流行度
def popularity(train, test, N, rank):
    item_popularity = dict()
    for user, items in train.items():
        for item in items.keys():
            if item not in item_popularity:
                item_popularity[item] = 0
            item_popularity[item] += 1
    ret = 0
    n = 0
    for user in train.keys():
        new_rank = top_k(rank, N)
        for item, pui in new_rank:
            ret += math.log(1 + item_popularity[str(item+1)])
            n += 1
    ret /= n * 1.0
    return ret

In [27]:
def main():
    path = './ml-100k'
    train, test, user_counts, movie_counts = load_movielens(path)
    mat = data2mat(train)
    w = item_similarity(train)
    for i in [50,100,150,200,250]:
        rank = item_based_recommend(mat,w, i)
        print('用户：', i)
        print("准确率：", precision(train, test, 20, rank))
        print("召回率：",recall(train, test, 20, rank))
        print("覆盖率：", coverage(train, test, 20, rank))
        print("流行度：", popularity(train, test, 20, rank))
        print("------------------------")

In [28]:
if __name__ == '__main__':
    main()

用户： 50
准确率： 0.0331389183457052
召回率： 0.0662778366914104
覆盖率： 0.011904761904761904
流行度： 5.700391964696671
------------------------
用户： 100
准确率： 0.036744432661717924
召回率： 0.07348886532343585
覆盖率： 0.011904761904761904
流行度： 5.765066557840541
------------------------
用户： 150
准确率： 0.03011664899257688
召回率： 0.06023329798515376
覆盖率： 0.011904761904761904
流行度： 5.667777027581603
------------------------
用户： 200
准确率： 0.033828207847295864
召回率： 0.06765641569459173
覆盖率： 0.011904761904761904
流行度： 5.709807639629365
------------------------
用户： 250
准确率： 0.035949098621421
召回率： 0.071898197242842
覆盖率： 0.011904761904761904
流行度： 5.739234292044088
------------------------
