In [1]:
import numpy as np

In [2]:
# 加载训练集、测试集数据（行为用户，列为物品，train[u][i]=rate，test同上）
def load_movielens(path='./ml-100k', k="1"):
    # get movie titles
    movies = {}
    prefs_shape = []
    for line in open(path + '/u.item', encoding='latin-1'):
        id, title = line.split('|')[0:2]
        movies[id] = title
    # load data
    for line in open(path + '/u.info', encoding='latin-1'):
        prefs_shape.append(int(line.split(' ')[0:1][0]))
    train = np.zeros(shape=prefs_shape[0:2], dtype=np.int)
    test = np.zeros(shape=prefs_shape[0:2], dtype=np.int)
    for line in open(path + '/u' + k + '.base', encoding='latin-1'):
        user, movieid, rating, ts = line.split('\t')
        train[int(user)-1][int(movieid)-1] = int(rating)
    for line in open(path + '/u' + k + '.test', encoding='latin-1'):
        user, movieid, rating, ts = line.split('\t')
        test[int(user)-1][int(movieid)-1] = int(rating)
    return train, test
train, test = load_movielens()

In [3]:
# 建立用户相似度矩阵（行列皆为用户）
def user_similarity(train):
    import math
    # 建立物品->用户倒排表（列为用户，行为物品的二阶矩阵）
    train = train.T
    # 建立用户相似兴趣数目的矩阵（行列皆为用户）
    user_simil = np.zeros(shape=[943, 943], dtype=np.int)
    for i in range(train.shape[0]):
        for u1 in range(train.shape[1]):
            for u2 in range(u1+1,train.shape[1]):
                if train[i][u1] and train[i][u2]:
                    user_simil[u1][u2] += 1
    for u1 in range(train.shape[1]):
        for u2 in range(u1+1,train.shape[1]):
            user_simil[u2][u1] = user_simil[u1][u2]
    # 由用户相似兴趣数目的矩阵得到用户的相似度矩阵（行列皆为用户）
    u_similar = np.zeros(shape=[943, 943], dtype=np.float32)
    for u1 in range(user_simil.shape[0]):
        for u2 in range(user_simil.shape[0]):
            if u1 != u2:
                u_similar[u1][u2] = user_simil[u1][u2] / math.sqrt((train.T[u1]>0).sum() * (train.T[u2]>0).sum())
    return u_similar
w = user_similarity(train)

[[ 0  6  3 ...,  7 14 35]
 [ 6  0  3 ...,  5  7  7]
 [ 3  3  0 ...,  2  4  1]
 ..., 
 [ 7  5  2 ...,  0  4  5]
 [14  7  4 ...,  4  0 19]
 [35  7  1 ...,  5 19  0]]


array([[ 0.        ,  0.08164966,  0.048795  , ...,  0.12844577,
         0.13556501,  0.23240556],
       [ 0.08164966,  0.        ,  0.08964214, ...,  0.16854997,
         0.12452441,  0.08539125],
       [ 0.048795  ,  0.08964214,  0.        , ...,  0.0805823 ,
         0.08504865,  0.0145803 ],
       ..., 
       [ 0.12844577,  0.16854997,  0.0805823 , ...,  0.        ,
         0.09594782,  0.08224396],
       [ 0.13556501,  0.12452441,  0.08504865, ...,  0.09594782,
         0.        ,  0.16492459],
       [ 0.23240556,  0.08539125,  0.0145803 , ...,  0.08224396,
         0.16492459,  0.        ]], dtype=float32)

In [6]:
# compare
import math
def cos_sim(x, y):

    numerator = x * y.T  # x 和 y 之间的内积
    a = len(numerator[(numerator>0)])  # 两行之间共同评分的电影个数
    b = x * x.T
    b = len(b[b>0])   # x行评分电影个数
    c = y * y.T
    c = len(c[c>0])   # y行电影评分个数
    denominator = math.sqrt(b*c)
    return (a / denominator)

def similarity(data):
    m = np.shape(data)[0]  # 用户的数量
    # 初始化相似矩阵
    w = np.mat(np.zeros((m, m)))

    for i in range(m):
        for j in range(i, m):
            if not j == i:
                # 计算任意两行之间的相似度
                w[i, j] = cos_sim(data[i], data[j])
                w[j, i] = w[i, j]
            else:
                w[i, j] = 0
    return w
w = similarity(train)

In [7]:
# 基于用户相似性为用户 user 推荐物品
def user_based_recommend(data, w, user):
    """基于用户相似性为用户 user 推荐物品

    Args:
    - data: mat, 用户物品矩阵
    - w: mat, 用户之间的相似度
    - user: int, 用户编号

    :return: predict, list, 推荐列表
    """
    m, n = np.shape(data)
    interaction = data[int(user)]  # 获取用户user评分信息

    # 找到用户user没互动过的物品加入not_inter
    not_inter = []
    for i in range(n):
        if interaction[i] == 0:  # 没评分即为没互动
            not_inter.append(i)
    # 对没有互动过的物品进行预测
    predict = {}
    for x in not_inter:
        item = np.copy(data[:, x])  # 找到所有用户对电影x的评分信息
        for i in range(m):  # 对每一个用户
            if item[i] != 0:
                if x not in predict:
                    predict[x] = w[int(user), i] * item[i]
                else:
                    predict[x] = predict[x] + w[int(user), i] + item[i]
    return sorted(predict.items(), key=lambda d: d[1], reverse=True)

def top_k(predict, n):
    """为用户推荐前 n 个物品

    Args:
    - predict: list, 排好序的物品列表
    - k: int, 推荐的物品个数

    :return: top_recom, list, top n 个物品
    """
    top_recom = []
    len_result = len(predict)
    if n >= len_result:
        top_recom = predict
    else:
        for i in range(n):
            top_recom.append(predict[i])
    return top_recom

In [8]:
def recall(train, test, w):
    # 由于电脑性能限制，一个用户只推荐排名靠前的5部电影
    hit = 0
    all = 0
    for u in range(train.shape[0]):
        tu = test[u]
        predict = user_based_recommend(train, w, u)
        T = (tu>0).sum()
        pre_tu = top_k(predict, 5)
        for item, _ in pre_tu:
            if item in tu:
                hit += 1
        all += T        
    return hit / (all * 1.0)
recall(train, test, w)

0.028000000000000001

In [9]:
def precision(train, test, w):
    hit = 0
    all = 0
    for u in range(train.shape[0]):
        tu = test[u]
        predict = user_based_recommend(train, w, u)
        pre_tu = top_k(predict, 5)
        for item, _ in pre_tu:
            if item in tu:
                hit += 1
        all += 5
    return hit / (all * 1.0)
precision(train, test, w)

0.11876988335100742

In [11]:
def coverage(train, test, w):
    recommend_items = set()
    all_items = set()
    for u in range(train.shape[0]):
        for i in range(train.shape[1]):
            all_items.add(i)
        predict = user_based_recommend(train, w, u)
        pre_tu = top_k(predict, 5)
        for item, _ in pre_tu:
            recommend_items.add(item)
    return len(recommend_items) / (len(all_items) * 1.0)
coverage(train, test, w)

0.029726516052318668

In [13]:
def popularity(train, test, w):
    item_popularity = dict()
    for i in range(train.shape[1]):
        if i not in item_popularity:
            item_popularity.setdefault(i, 0)
        item_popularity[i] += 1
    ret = 0
    n = 0
    for u in range(train.shape[0]):
        predict = user_based_recommend(train, w, u)
        pre_tu = top_k(predict, 5)
        for item, _ in pre_tu:
            ret += math.log(1 + item_popularity[item])
            n += 1
    ret /= n * 1.0
    return ret
popularity(train, test, w)

0.6931471805599841