In [1]:
# 读取数据 用电影的id来表示电影 便于计算

def load_movielens(path='ml-100k'):
    movies = {}
    for line in open(path + '/u.item', encoding='latin-1'):
        id, title = line.split('|')[0:2]
        movies[id] = id

    prefs = {}
    for line in open(path + '/u.data', encoding='latin-1'):
        user, movieid, rating, ts = line.split('\t')
        prefs.setdefault(user, {})
        prefs[user][movies[movieid]] = float(rating)
    return prefs
prefs= load_movielens()

In [2]:
# 划分数据集 按照7：1 划分数据集
import random
import numpy
def split_data(data, M, k, seed):
    
    test = []
    train = []
    random.seed(seed)
    for user in data:
        if random.randint(0, M) == k:
            test.append([user, data[user]])
        else:
            train.append([user, data[user]])
    return train, test
train,test = split_data(prefs,8,5,1)
def changenp(data):     # 把每个用户的评分放到一个大矩阵中 
    a = []
    b = []
    for i in range(len(data)):
        for j in range(1682):
            a.append(data[i][1].get(str(j+1),0.0))
        b.append(a)
        a = []
    b = numpy.array(b)
    return b

a = changenp(train)
b = changenp(test)
b[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [10]:
import numpy as np
import math
# # 计算两个物品之间的相似度
def same(x, y):

    tor = x * y.T  # x 和 y 之间的内积
    a = len(x[x>0])
    b = len(y[y>0])
    c = len(tor[tor>0])
    denominator = math.sqrt(a*b)
    if denominator>0 :
        ans = c / denominator
    else:
        ans = 0
    return ans

def similarity(data):
    """计算矩阵中任意两行之间的相似度
    Args:
    - data: mat, 任意矩阵

    :return: w, mat, 任意两行之间的相似度
    """

    m = np.shape(data)[1]  # 物品的数量
    # 初始化相似矩阵
    w = np.mat(np.zeros((m, m)))

    for i in range(m):
        for j in range(i, m):
            if not j == i:
                # 计算任意两行之间的相似度
                w[i, j] = same(data[:,i], data[:,j])
                w[j, i] = w[i, j]
            else:
                w[i, j] = 0
    return w
w = similarity(a)
w[0]



matrix([[0.        , 0.43668123, 0.39239246, ..., 0.        , 0.04950738,
         0.04950738]])

In [12]:
def item_based_recommend(data, w, user):

    n, m = np.shape(data)  # m: 物品数量 n: 用户数量
    interaction = data[user].T  # 用户 user 互动物品信息

    # 找到用户 user 没有互动的商品
    not_iter = []
    for i in range(m):
        if interaction[i] == 0:  # 用户 user 未打分项
            not_iter.append(i)

    # 对没有互动过的物品进行预测
    predict = {}
    for x in not_iter:
        item = np.copy(interaction)  # 获取用户 user 对物品的互动信息
        for j in range(m):   # 对每一个物品
            if item[j] != 0:  # 利用互动过的物品预测
                if x not in predict:
                    predict[x] = w[x, j] * item[j]
                else:
                    predict[x] = predict[x] + w[x, j] * item[j]
    # 按照预测的大小从大到小排序
    return sorted(predict.items(), key=lambda d: d[1], reverse=True)

predict = item_based_recommend(a,w,0)

In [14]:
# 为用户推荐前n个商品
def top_k(predict, n):
    top_recom = []
    len_result = len(predict)
    if n >= len_result:
        top_recom = predict
    else:
        for i in range(n):
            top_recom.append(predict[i])
    return top_recom
top = top_k(predict,5)
top

[(301, 39.154086092812946),
 (332, 37.91789942393367),
 (287, 37.50497110439719),
 (268, 36.324878042164265),
 (300, 35.266577484284774)]

In [20]:
# 召回率
def recall(train, test, N,predict):
    hit = 0
    all = 0
    for user in range(len(test)):
        tu = test[user]
        rank = top_k(predict, N)
        for item, pui in rank:
            if item in tu:
                hit += 1
        all += len(tu)
    return hit / (all * 1.0)
recall(a,b,500,predict)

0.002764259533937262

In [19]:
# 准确度
def precision(train, test, N, predict):
    hit = 0
    all = 0
    for user in range(len(test)):
        tu = test[user]
        rank = top_k(predict, N)
        for item, pui in rank:
            if item in tu:
                hit += 1
        all += N
    return hit / (all * 1.0)
precision(a,b,100,predict)

0.01

In [21]:
# 覆盖率
def coverage(train,N, predict):
    recommend_items = set()
    all_items = set()
    for user in range(len(train)):
        for item in train[user]:
            if item != 0:
                all_items.add(user)
        rank = top_k(predict, N)
        for item, pui in rank:
            recommend_items.add(item)
    return len(recommend_items) / (len(all_items) * 1.0)
coverage(a,500,predict)

0.5910165484633569

In [22]:
# 新颖度
def popularity(train, N, predict):
    item_popularity = dict()
    for user in range(len(train)):
        i = 0
        for item in train[user]:
            i = i+1
            if item != 0:
                item_popularity[str(i)]=item_popularity.get(str(i),0)
    ret = 0
    n = 0
    for user in range(len(train)):
        rank = top_k(predict, N)
        for item, pui in rank:
            ret += math.log(1 + item_popularity.get(item,0.1))
            n += 1
    ret /= n * 1.0
    return ret
popularity(a, 500, predict)

0.09531017980478937