In [1]:
import numpy as np

In [2]:
def cos_sim(x, y):
    """Return the cosine similarity between two vectors.

    Args:
    - x: array-like or mat, a vector; it is flattened to 1-D before use
    - y: array-like or mat, a vector with the same number of elements as x

    :return: float, dot(x, y) / (|x| * |y|); 0.0 when either vector has
        zero norm (the ratio is undefined there and would otherwise be NaN)
    """
    x = np.asarray(x, dtype=float).reshape(-1)
    y = np.asarray(y, dtype=float).reshape(-1)
    denominator = np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y))
    if denominator == 0:
        # An all-zero vector has no direction; treat it as "not similar"
        # instead of dividing by zero.
        return 0.0
    return np.dot(x, y) / denominator  # inner product over product of norms

In [3]:
def similarity(data):
    """Compute the cosine similarity between every pair of rows.

    Args:
    - data: mat or array-like, one row per user

    :return: w, mat of shape (m, m) where m is the number of rows;
        w[i, j] is the cosine similarity of rows i and j, the diagonal is 0,
        and any pair involving an all-zero row gets 0
    """
    rows = np.asarray(data, dtype=float)
    m = rows.shape[0]  # number of users
    if m == 0:
        return np.asmatrix(np.zeros((0, 0)))
    if rows.ndim != 2:
        rows = rows.reshape(m, -1)

    # Row norms are computed once instead of once per pair.
    norms = np.sqrt(np.einsum('ij,ij->i', rows, rows))
    inner = rows @ rows.T
    scale = np.outer(norms, norms)

    w = np.zeros((m, m))
    # Entries whose scale is 0 (an all-zero row) are left at 0, not NaN.
    np.divide(inner, scale, out=w, where=scale > 0)

    # Keep only the strict upper triangle and mirror it, so the result is
    # exactly symmetric and the diagonal (self-similarity) stays 0.
    upper = np.triu(w, 1)
    # np.asmatrix rather than np.mat: the np.mat alias no longer exists in
    # NumPy 2.0, while np.asmatrix is available in both 1.x and 2.x.
    return np.asmatrix(upper + upper.T)

In [4]:
def user_based_recommend(data, w, user):
    """Score the items that user has not interacted with.

    The score of an item is the sum, over every user who interacted with
    it, of (similarity to that user) * (that user's interaction value).

    Args:
    - data: mat or array, user-item matrix (rows are users, columns items)
    - w: mat or array, user-user similarity matrix
    - user: int, row index of the target user

    :return: predict, list of (item index, score) tuples sorted by score
        in descending order; items nobody interacted with are omitted
    """
    ratings = np.asarray(data)
    weights = np.asarray(w)[user].reshape(-1)  # similarities to `user`

    predict = {}
    # Only items the target user has not interacted with are candidates.
    for x in np.flatnonzero(ratings[user] == 0):
        column = ratings[:, x]  # every user's interaction with item x
        raters = column != 0
        if raters.any():
            # Weighted sum: each contribution is a product. The previous
            # accumulation step added the two factors instead.
            predict[int(x)] = float(np.dot(weights[raters], column[raters]))
    return sorted(predict.items(), key=lambda d: d[1], reverse=True)

In [5]:
def top_k(predict, n):
    """Keep the first n entries of an already-sorted recommendation list.

    Args:
    - predict: list, items sorted from best to worst
    - n: int, number of items to recommend

    :return: top_recom, list; predict itself when it holds n or fewer
        entries, otherwise a new list with its first n entries
    """
    if n >= len(predict):
        # Nothing to trim: hand the input back unchanged.
        return predict
    return [predict[idx] for idx in range(n)]

In [6]:
def read_train(filename='u1.base', data_path='../ml-100k/',
               n_users=943, n_items=1682):
    """Read the training set into a binary user-item matrix.

    Each non-blank line is tab-separated and starts with a 1-based user id
    followed by a 1-based movie id; any remaining fields are ignored.

    Args:
    - filename: str, name of the file to read
    - data_path: str, directory prefix that is prepended to filename
    - n_users: int, number of rows of the returned matrix
    - n_items: int, number of columns of the returned matrix

    :return: numpy.ndarray of shape (n_users, n_items) holding 1. where the
        user interacted with the movie and 0. elsewhere
    """
    data = np.zeros((n_users, n_items))
    # The context manager closes the file even if a line fails to parse.
    with open(data_path + filename, 'r') as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            fields = line.split('\t')
            # Ids in the file are 1-based; matrix indices are 0-based.
            data[int(fields[0]) - 1, int(fields[1]) - 1] = 1.
    return data

In [38]:
def read_test(filename='u1.test', data_path='../ml-100k/'):
    """Read the test set and group movie ids by user.

    Each non-blank line is tab-separated and starts with a user id followed
    by a movie id; any remaining fields are ignored. Ids are kept exactly as
    they appear in the file (they are not shifted to 0-based indices).

    Args:
    - filename: str, name of the file to read
    - data_path: str, directory prefix that is prepended to filename

    :return: test_data, dict mapping each user id (int) to the list of movie
        ids (int) on that user's lines, in file order; empty for an empty file
    """
    test_data = {}
    # Grouping straight into a dict avoids building a ragged NumPy array
    # (which recent NumPy versions reject) and also merges a user's lines
    # when they are not contiguous in the file.
    with open(data_path + filename, 'r') as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank lines
            fields = line.split('\t')
            test_data.setdefault(int(fields[0]), []).append(int(fields[1]))
    return test_data


In [44]:
# Load the train interaction matrix and the per-user test items using the
# readers' default file names.
train_data = read_train()
test_data = read_test()
    
# User-user similarity computed from the training interactions.
w = similarity(train_data)
# Score the items that the user at row index 0 has not interacted with,
# then keep the 5 highest-scored ones.
predict = user_based_recommend(train_data, w, 0)
recom = top_k(predict, 5)

<class 'list'>


In [46]:
def precision_recall(test, N):
    """Compute precision and recall of top-N recommendations.

    Recommendations are produced from the module-level train_data and w.
    A hit is a recommended item that appears in the user's test items.

    Args:
    - test: dict, maps a 1-based user id to that user's 1-based test movie ids
    - N: int, number of items recommended to each user

    :return: (precision, recall) as floats; precision is hits / (N * number
        of users), recall is hits / total number of test items, and either
        value is 0.0 when its denominator is 0
    """
    hit = 0
    n_recall = 0
    n_precision = 0
    for user, items in test.items():
        # Test ids are 1-based while the training matrix is indexed from 0,
        # so the matrix row for this user is user - 1.
        predict = user_based_recommend(train_data, w, user - 1)
        rank = top_k(predict, N)
        relevant = {int(item) for item in items}
        # Each ranked entry is (0-based item index, score); shift the index
        # by one to compare it with the 1-based ids of the test set.
        hit += sum(1 for item, _score in rank if item + 1 in relevant)
        n_recall += len(relevant)
        n_precision += N
    precision = hit / (1.0 * n_precision) if n_precision else 0.0
    recall = hit / (1.0 * n_recall) if n_recall else 0.0
    return precision, recall