In [9]:
import random
import math
import numpy as np

In [10]:
def load_movielens(path='./ml-100k'):
    """Load the MovieLens 100k ``ua`` split together with the movie titles.

    Args:
        path: Directory containing ``u.item``, ``ua.base`` and ``ua.test``.

    Returns:
        A ``(train, test, movies)`` tuple. ``train`` and ``test`` map
        user id -> {movie id -> rating as float}; ``movies`` maps
        movie id -> title. All ids are kept as the strings read from disk.
    """
    def _read_ratings(filename):
        # Each record is tab separated: user, movie id, rating, timestamp.
        ratings = {}
        with open(path + '/' + filename, encoding='latin-1') as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank / trailing lines
                user, movieid, rating, _timestamp = line.split('\t')
                ratings.setdefault(user, {})[movieid] = float(rating)
        return ratings

    # Movie titles: the first two pipe-separated fields are id and title.
    movies = {}
    with open(path + '/u.item', encoding='latin-1') as handle:
        for line in handle:
            if not line.strip():
                continue
            movie_id, title = line.split('|')[0:2]
            movies[movie_id] = title

    train = _read_ratings('ua.base')
    test = _read_ratings('ua.test')
    return train, test, movies

In [11]:
# Load the train/test rating dictionaries and the movie-id -> title map
# from the default './ml-100k' directory.
train, test, movies = load_movielens()

In [12]:
def gen_list(data, user_len=943, movie_len=1682):
    """Build a binary user x item interaction matrix.

    Args:
        data: Mapping user id -> {movie id -> rating}; ids are 1-based and
            convertible with ``int``.
        user_len: Number of rows (users) in the result.
        movie_len: Number of columns (movies) in the result.

    Returns:
        A ``(user_len, movie_len)`` float array holding 1.0 wherever the
        user has an entry for the movie and 0.0 elsewhere. Rating values
        themselves are ignored.
    """
    interactions = np.zeros((user_len, movie_len), dtype=float)
    # Shift the 1-based ids to 0-based (row, column) positions.
    cells = [
        (int(user_id) - 1, int(movie_id) - 1)
        for user_id, rated in data.items()
        for movie_id, _rating in rated.items()
    ]
    for row, col in cells:
        interactions[row, col] = 1.0
    return interactions

In [13]:
def gen_normal_list(data, user_len=943, movie_len=1682):
    """Build a user x item rating matrix.

    Args:
        data: Mapping user id -> {movie id -> rating}; ids are 1-based and
            convertible with ``int``.
        user_len: Number of rows (users) in the result.
        movie_len: Number of columns (movies) in the result.

    Returns:
        A ``(user_len, movie_len)`` float array holding each rating
        truncated to an integer value; cells without a rating stay 0.0.
    """
    ratings = np.zeros((user_len, movie_len), dtype=float)
    # Shift the 1-based ids to 0-based positions; int() drops any fraction.
    cells = [
        (int(user_id) - 1, int(movie_id) - 1, float(int(rating)))
        for user_id, rated in data.items()
        for movie_id, rating in rated.items()
    ]
    for row, col, value in cells:
        ratings[row, col] = value
    return ratings

In [14]:
def item_similarity(train):
    """Compute the item-item cosine similarity from co-occurrence counts.

    For items i and j the similarity is
    ``|users(i) & users(j)| / sqrt(|users(i)| * |users(j)|)``; it is 0 when
    either item has no users, and the diagonal is always 0.

    Args:
        train: Either the nested ``{user: {movie: rating}}`` dict (converted
            with ``gen_list``) or an already-built user x item array, in
            which any non-zero cell counts as an interaction.

    Returns:
        A symmetric ``np.matrix`` of shape ``(n_items, n_items)``.
    """
    if isinstance(train, dict):
        interactions = gen_list(train)
    else:
        # Binarize so that rating values do not leak into the counts.
        interactions = (np.asarray(train) != 0).astype(float)

    item_counts = interactions.sum(axis=0)       # users per item
    co_counts = interactions.T @ interactions    # users shared by each pair
    norms = np.sqrt(np.outer(item_counts, item_counts))

    similarity = np.zeros_like(co_counts)
    # Only divide where both items have at least one user; the rest stay 0.
    np.divide(co_counts, norms, out=similarity, where=norms > 0)
    np.fill_diagonal(similarity, 0.0)

    # np.mat was removed in NumPy 2.0; np.asmatrix returns the same type.
    return np.asmatrix(similarity)

In [15]:
# Build the item-item similarity matrix from the training interactions.
w = item_similarity(train)

In [16]:
# print(w)

[[0.         0.42701903 0.38348249 ... 0.         0.05050763 0.05050763]
 [0.42701903 0.         0.29581426 ... 0.         0.09090909 0.09090909]
 [0.38348249 0.29581426 0.         ... 0.         0.         0.10846523]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.05050763 0.09090909 0.         ... 0.         0.         0.        ]
 [0.05050763 0.09090909 0.10846523 ... 0.         0.         0.        ]]


In [17]:
def item_based_recommend(data, w, user):
    """Recommend items to ``user`` based on item-item similarity.

    Every item the user has not interacted with is scored as the sum, over
    the items the user did rate, of ``similarity * rating``.

    Args:
        data: User x item rating matrix (rows are users), or the nested
            ``{user: {movie: rating}}`` dict, which is converted with
            ``gen_normal_list``.
        w: Item-item similarity matrix, indexable as ``w[i, j]``.
        user: 1-based user id (int or numeric string).

    Returns:
        A list of ``(item_index, score)`` tuples sorted by score in
        descending order. ``item_index`` is the 0-based matrix column
        (movie id minus one). The list is empty when the user has no rated
        items.
    """
    # Use the data that was passed in instead of rebuilding the matrix from
    # a global on every call.
    if isinstance(data, dict):
        data = gen_normal_list(data)
    ratings = np.asarray(data, dtype=float)
    interaction = ratings[int(user) - 1]  # this user's ratings per item

    rated = np.flatnonzero(interaction != 0)
    if rated.size == 0:
        return []  # nothing to base a prediction on
    unrated = np.flatnonzero(interaction == 0)

    # score[x] = sum_j w[x, j] * rating[j] over the rated items j.
    similarity = np.asarray(w)
    scores = similarity[np.ix_(unrated, rated)] @ interaction[rated]

    predict = {int(item): float(score) for item, score in zip(unrated, scores)}
    # Highest predicted score first.
    return sorted(predict.items(), key=lambda d: d[1], reverse=True)

In [18]:
# Ranked (item index, score) recommendations for user '1'.
rank = item_based_recommend(train, w, '1')

In [19]:
def top_k(rank, k):
    """Return the first ``k`` entries of ``rank``.

    When ``rank`` holds ``k`` entries or fewer, the same object is returned
    unchanged; otherwise a sliced copy is returned.
    """
    return rank if len(rank) <= k else rank[:k]

In [20]:
# Show the five highest-scoring (item index, score) pairs; the index is the
# 0-based matrix column, i.e. the movie id minus one.
print(top_k(rank, 5))

[(422, 394.7821275730572), (201, 383.20902154597155), (384, 374.345544392906), (654, 373.1798888323046), (402, 373.07009197032096)]


In [21]:
def recall(train, test, N, w):
    """Compute recall@N of the item-based recommender.

    Args:
        train: ``{user: {movie: rating}}`` training interactions.
        test: ``{user: {movie: rating}}`` held-out interactions.
        N: Number of recommendations taken per user.
        w: Item-item similarity matrix.

    Returns:
        Hits divided by the total number of held-out items, as a float.
        Users without held-out items are skipped; 0.0 is returned when
        there is nothing to evaluate.
    """
    hit = 0
    total = 0
    mat_train = gen_normal_list(train)
    for user in train:
        held_out = test.get(user)
        if not held_out:
            continue  # no ground truth for this user
        rank = top_k(item_based_recommend(mat_train, w, user), N)
        # Recommendations carry 0-based indices; test keys are 1-based strings.
        hit += sum(1 for item, _score in rank if str(item + 1) in held_out)
        total += len(held_out)
    return hit / total if total else 0.0

In [22]:
# Recall@5: hits divided by the total number of held-out test items.
print("top5的召回率为：", recall(train, test, 5, w))

top5的召回率为： 0.13170731707317074


In [23]:
def precision(train, test, N, w):
    """Compute precision@N of the item-based recommender.

    Args:
        train: ``{user: {movie: rating}}`` training interactions.
        test: ``{user: {movie: rating}}`` held-out interactions.
        N: Number of recommendations taken per user.
        w: Item-item similarity matrix.

    Returns:
        Hits divided by ``N`` times the number of evaluated users, as a
        float. Users without held-out items are skipped; 0.0 is returned
        when there is nothing to evaluate.
    """
    hit = 0
    total = 0
    mat_train = gen_normal_list(train)
    for user in train:
        held_out = test.get(user)
        if not held_out:
            continue  # no ground truth for this user
        rank = top_k(item_based_recommend(mat_train, w, user), N)
        # Recommendations carry 0-based indices; test keys are 1-based strings.
        hit += sum(1 for item, _score in rank if str(item + 1) in held_out)
        total += N
    return hit / total if total else 0.0

In [26]:
# Precision@5: hits divided by the number of recommendation slots (5 per user).
print("top5的准确率为：",precision(train, test, 5, w))

top5的准确率为： 0.2634146341463415


In [27]:
def coverage(train, test, N, w):
    """Compute catalogue coverage of the top-N recommendations.

    Args:
        train: ``{user: {movie: rating}}`` training interactions.
        test: Unused; kept so all metric functions share one signature.
        N: Number of recommendations taken per user.
        w: Item-item similarity matrix.

    Returns:
        Number of distinct recommended items divided by the number of
        distinct items present in ``train``; 0.0 when ``train`` has no items.
    """
    recommend_items = set()
    all_items = set()
    mat_train = gen_normal_list(train)
    for user, items in train.items():
        all_items.update(items.keys())
        rank = top_k(item_based_recommend(mat_train, w, user), N)
        # Store recommendations in the same id space as the catalogue
        # (1-based string ids) rather than as 0-based matrix indices.
        recommend_items.update(str(item + 1) for item, _score in rank)
    return len(recommend_items) / len(all_items) if all_items else 0.0

In [28]:
# Coverage@5: distinct recommended items divided by distinct items in train.
print("top5的覆盖率为：", coverage(train, test, 5, w))

top5的覆盖率为： 0.0994047619047619


In [29]:
def popularity(train, test, N, w):
    """Average ``log(1 + 1 / popularity)`` over the top-N recommendations.

    An item's popularity is the number of users in ``train`` that have an
    entry for it, so higher return values mean less popular items are being
    recommended.

    Args:
        train: ``{user: {movie: rating}}`` training interactions.
        test: Unused; kept so all metric functions share one signature.
        N: Number of recommendations taken per user.
        w: Item-item similarity matrix.

    Returns:
        The mean score as a float; 0.0 when no recommendation could be
        scored. Recommended items with no training interactions are skipped
        because their score is undefined.
    """
    item_popularity = {}
    mat_train = gen_normal_list(train)
    for items in train.values():
        for item in items:
            item_popularity[item] = item_popularity.get(item, 0) + 1

    ret = 0.0
    n = 0
    for user in train:
        rank = top_k(item_based_recommend(mat_train, w, user), N)
        for item, _score in rank:
            # Recommendations are 0-based indices; popularity keys are
            # 1-based string ids.
            count = item_popularity.get(str(item + 1), 0)
            if count == 0:
                continue  # never seen in train: 1 / count is undefined
            ret += math.log(1 + 1 / count)
            n += 1
    return ret / n if n else 0.0

In [30]:
# Mean of log(1 + 1 / popularity) over every user's top-5 recommendations.
print('top5的新颖度为：', popularity(train, test, 5, w))

top5的新颖度为： 0.0035223833873731358
