In [23]:
import numpy as np
import math
import random
# 加载训练集和测试集
def load_movielens(path):
    train = {} 
    test = {}
    for line in open(path + '/ua.base', encoding='latin-1'):
        user, movieid, rating, timestamp = line.split('\t')
        train.setdefault(user, {})
        train[user][movieid] = float(rating)

    for line in open(path + '/ua.test', encoding='latin-1'):
        user, movieid, rating, timestamp = line.split('\t')
        test.setdefault(user, {})
        test[user][movieid] = float(rating)

    user_counts = 0
    movie_counts = 0
    # 在u数据集中的用户、项目和评级的数量。
    for line in open(path + "/u.info", encoding='latin-1'):
        count, content = line.strip().split(" ")
        if "users" in content:
            user_counts = int(count)
        elif "items" in content:
            movie_counts = int(count)
    return train, test, user_counts, movie_counts

In [24]:
# 将数据集转化为行向量
def data2mat(data, user_counts=943, movie_counts=1682):
    mat = np.zeros((user_counts, movie_counts), dtype=float)
    for user, movies in data.items():
        for movie, score in movies.items():
            mat[int(user)-1][int(movie)-1] = float(int(score))
    return mat

In [25]:
# 余弦相似性
def cos_sim(x, y):
    """余弦相似性

    Args:
    - x: mat, 以行向量的形式存储
    - y: mat, 以行向量的形式存储

    :return: x 和 y 之间的余弦相似度
    """
    numerator = np.matmul(x, y.T)  # x 和 y 之间的内积
    denominator = np.sqrt(np.matmul(x, x.T)) * np.sqrt(np.matmul(y, y.T))
    return (numerator / denominator)

In [26]:
# 对于任意矩阵，计算任意两个行向量之间的相似度：
def similarity(data):
    """计算矩阵中任意两行之间的相似度
    Args:
    - data: mat, 任意矩阵

    :return: w, mat, 任意两行之间的相似度
    """

    m = np.shape(data)[0]  # 用户的数量
    # 初始化相似矩阵
    w = np.mat(np.zeros((m, m)))

    for i in range(m):
        for j in range(i, m):
            if not j == i:
                # 计算任意两行之间的相似度
                w[i, j] = cos_sim(data[i], data[j])
                w[j, i] = w[i, j]
            else:
                w[i, j] = 0
    return w

In [27]:
# 实现UserCF 算法：

def user_based_recommend(data, w, user):
    """基于用户相似性为用户 user 推荐物品

    Args:
    - data: mat, 用户物品矩阵
    - w: mat, 用户之间的相似度
    - user: int, 用户编号

    :return: predict, list, 推荐列表
    """
    m, n = np.shape(data)
    interaction = data[int(user)-1,]  # 用户 user 与物品信息

    # 找到用户 user 没有互动过的物品
    not_inter = []
    for i in range(n):
        if interaction[i] == 0:  # 没有互动的物品
            not_inter.append(i)

    # 对没有互动过的物品进行预测
    predict = {}
    for x in not_inter:
        item = np.copy(data[:, x])  # 找到所有用户对商品 x 的互动信息
        for i in range(m):  # 对每一个用户
            if item[i] != 0:
                if x not in predict:
                    predict[x] = w[user, i] * item[i]
                else:
                    predict[x] = predict[x] + w[user, i] + item[i]
    return sorted(predict.items(), key=lambda d: d[1], reverse=True)

In [28]:
# 为用户推荐前 $N$ 个打分最高的物品
def top_k(predict, n):
    """为用户推荐前 n 个物品

    Args:
    - predict: list, 排好序的物品列表
    - k: int, 推荐的物品个数

    :return: top_recom, list, top n 个物品
    """
    top_recom = []
    len_result = len(predict)
    if n >= len_result:
        top_recom = predict
    else:
        for i in range(n):
            top_recom.append(predict[i])
    return top_recom


In [29]:
# 计算召回率
def recall(train, test, N, predict):
    hit = 0
    all = 0
    new_predict = top_k(predict, N)
    for user in train.keys():
        tu = test[user]
        for item, pui in new_predict:
            if str(item+1) in tu:
                hit += 1
        all += len(tu)
    return hit / (all * 1.0)

# 计算准确率
def precision(train, test, N, predict):
    hit = 0
    all = 0
    new_predict = top_k(predict, N)
    for user in train.keys():
        tu = test[user]
        for item, pui in new_predict:
            if str(item+1) in tu:
                hit += 1
        all += N
    return hit / (all * 1.0)


# 计算覆盖率
def coverage(train, test, N, predict):
    recommend_items = set()
    all_items = set()
    new_predict = top_k(predict, N)
    for user in train.keys():
        for item in train[user].keys():
            all_items.add(item)

        for item, pui in new_predict:
            recommend_items.add(str(item+1))
    return len(recommend_items) / (len(all_items) * 1.0)


# 平均流行度
def popularity(train, test, N, predict):
    item_popularity = dict()
    for user, items in train.items():
        for item in items.keys():
            if item not in item_popularity:
                item_popularity[item] = 0
            item_popularity[item] += 1
    ret = 0
    n = 0
    for user in train.keys():
        new_predict = top_k(predict, N)
        for item, pui in new_predict:
            ret += math.log(1 + item_popularity[str(item+1)])
            n += 1
    ret /= n * 1.0
    return ret

In [54]:
def main():
    path = './ml-100k'
    train, test, user_counts, movie_counts = load_movielens(path)
    mat = data2mat(train)
    w = similarity(mat)
    for i in [50,100,150,200,250]:
        predict = user_based_recommend(mat,w, i)
        print('用户：', i)
        print("准确率：", precision(train, test, 20, predict))
        print("召回率：",recall(train, test, 20, predict))
        print("覆盖率：", coverage(train, test, 20, predict))
        print("流行度：", popularity(train, test, 20, predict))
        print("------------------------")

In [55]:
if __name__ == '__main__':
    main()

用户： 50
准确率： 0.056733828207847295
召回率： 0.11346765641569459
覆盖率： 0.011904761904761904
流行度： 5.879745408179353
------------------------
用户： 100
准确率： 0.05090137857900318
召回率： 0.10180275715800637
覆盖率： 0.011904761904761904
流行度： 5.8621099117134365
------------------------
用户： 150
准确率： 0.0535524920466596
召回率： 0.1071049840933192
覆盖率： 0.011904761904761904
流行度： 5.864893067568947
------------------------
用户： 200
准确率： 0.041410392364793214
召回率： 0.08282078472958643
覆盖率： 0.011904761904761904
流行度： 5.645394498830135
------------------------
用户： 250
准确率： 0.03966065747613998
召回率： 0.07932131495227995
覆盖率： 0.011904761904761904
流行度： 5.6934334140693865
------------------------
