In [1]:
import numpy as np

In [2]:
def cos_sim(x, y):
    """Return the cosine similarity between two vectors.

    Args:
    - x: array-like or matrix holding one vector; it is flattened before use
    - y: array-like or matrix holding one vector with as many elements as x

    :return: float, dot(x, y) / (||x|| * ||y||). When either vector has zero
        norm the ratio is undefined, so 0.0 is returned instead of NaN.
    """
    # Flatten to plain 1-D float arrays so ndarray, np.matrix and list inputs
    # all take the same code path.
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()

    denominator = np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y))
    if denominator == 0:
        # An all-zero vector has no direction; treat it as dissimilar to
        # everything rather than propagating NaN into later computations.
        return 0.0
    return float(np.dot(x, y) / denominator)

In [3]:
def similarity(data):
    """Compute the cosine similarity between every pair of rows.

    Args:
    - data: 2-D array-like or matrix; each row is one vector

    :return: w, np.matrix of shape (m, m) where m is the number of rows.
        w[i, j] is the cosine similarity between rows i and j. The diagonal
        is set to 0, and any pair involving an all-zero row also gets 0.
    """
    rows = np.asarray(data, dtype=float)

    # Scale every row to unit length; the pairwise dot products of the scaled
    # rows are then exactly the cosine similarities.
    norms = np.sqrt(np.sum(rows * rows, axis=1))
    # Dividing an all-zero row by 1 leaves it all-zero, which yields a
    # similarity of 0 instead of a 0/0 NaN.
    norms[norms == 0] = 1.0
    unit_rows = rows / norms[:, np.newaxis]

    w = np.dot(unit_rows, unit_rows.T)
    # A row is not reported as similar to itself.
    np.fill_diagonal(w, 0.0)
    # np.asmatrix keeps the matrix return type; np.mat no longer exists in
    # NumPy 2.0.
    return np.asmatrix(w)

In [4]:
def item_based_recommend(data, w, user):
    """Score the items a user has not interacted with, using item similarity.

    The score of an unseen item x is the sum over every item j the user has
    interacted with of w[x, j] * data[j, user].

    Args:
    - data: 2-D array or matrix, item-by-user interactions (rows are items)
    - w: 2-D array or matrix, item-to-item similarity
    - user: int, column index of the user in data

    :return: list of (item_index, score) tuples for the unseen items, sorted
        by score from high to low. Empty when the user has no interactions
        or has interacted with every item.
    """
    # np.asarray makes column extraction yield a 1-D vector for both ndarray
    # and np.matrix inputs.
    interaction = np.asarray(data)[:, user].ravel()

    seen = np.flatnonzero(interaction != 0)
    if seen.size == 0:
        # Nothing to base a prediction on.
        return []
    unseen = np.flatnonzero(interaction == 0)

    # Rows: candidate (unseen) items; columns: items the user interacted with.
    weights = np.asarray(w)[np.ix_(unseen, seen)]
    scores = np.dot(weights, interaction[seen])

    predict = [(int(item), float(score)) for item, score in zip(unseen, scores)]
    # sorted() is stable, so equal scores stay in ascending item order.
    return sorted(predict, key=lambda d: d[1], reverse=True)

In [5]:
def top_k(predict, n):
    """Keep the first n entries of an already-sorted recommendation list.

    Args:
    - predict: list, recommendations sorted best-first
    - n: int, number of entries to keep

    :return: list, the first n entries of predict. When n is at least the
        length of predict, predict itself is returned (not a copy).
    """
    if n >= len(predict):
        return predict
    return [predict[idx] for idx in range(n)]

In [6]:
def read_train(filename='u1.base', data_path='../ml-100k/',
               n_items=1682, n_users=943):
    """Load a tab-separated ratings file into a binary item-by-user matrix.

    Each non-blank line must hold four tab-separated fields: user id, movie
    id, rating and timestamp. Ids are 1-based in the file. The rating value
    itself is ignored; only the fact that an interaction exists is recorded.

    Args:
    - filename: str, name of the file to read
    - data_path: str, directory that contains the file
    - n_items: int, number of rows (items) in the returned matrix
    - n_users: int, number of columns (users) in the returned matrix

    :return: numpy.ndarray of shape (n_items, n_users); entry
        [movie_id - 1, user_id - 1] is 1.0 for every (user, movie) pair in
        the file and 0.0 elsewhere.
    """
    import os

    data = np.zeros((n_items, n_users))
    # The context manager closes the file even if a line fails to parse.
    with open(os.path.join(data_path, filename), 'r') as ratings_file:
        for line in ratings_file:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at the end.
                continue
            user, movieid, _rating, _ts = line.split('\t')
            data[int(movieid) - 1, int(user) - 1] = 1.
    return data

In [7]:
def read_test(filename='u1.test', data_path='../ml-100k/'):
    """Load a tab-separated ratings file grouped by consecutive user id.

    Each non-blank line must hold four tab-separated fields: user id, movie
    id, rating and timestamp. Consecutive lines with the same user id are
    merged into one group [user_id, movie_id_1, movie_id_2, ...]. A user
    whose lines are not contiguous in the file produces several groups.

    Args:
    - filename: str, name of the file to read
    - data_path: str, directory that contains the file

    :return: numpy.ndarray. When the groups have different lengths, a 1-D
        object array whose elements are the group lists. When all groups
        have the same length, a regular 2-D integer array. An empty file
        yields an empty array.
    """
    import os

    groups = []
    current = []  # [user_id, movie_id, ...] for the user being read
    # The context manager closes the file even if a line fails to parse.
    with open(os.path.join(data_path, filename), 'r') as ratings_file:
        for line in ratings_file:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at the end.
                continue
            user, movieid, _rating, _ts = line.split('\t')
            user, movieid = int(user), int(movieid)
            if current and user == current[0]:
                current.append(movieid)
            else:
                # A new user starts: flush the finished group, if any.
                if current:
                    groups.append(current)
                current = [user, movieid]
    if current:
        groups.append(current)

    if len({len(group) for group in groups}) <= 1:
        # Rectangular (or empty) data converts to a regular array directly.
        return np.array(groups)

    # Ragged lists must be stored in an explicit object array: recent NumPy
    # versions refuse to build one implicitly from np.array(groups).
    result = np.empty(len(groups), dtype=object)
    for idx, group in enumerate(groups):
        result[idx] = group
    return result

In [8]:
def train(user=0, top_n=10):
    """Build item similarities from the training set and recommend items.

    Args:
    - user: int, column index of the user to recommend for (default 0)
    - top_n: int, maximum number of recommendations to return (default 10)

    :return: list of (item_index, score) tuples, best first, at most top_n
        entries long.
    """
    # Only the training split is needed here; the test split was previously
    # loaded but never used, so that read has been dropped.
    train_data = read_train()
    w = similarity(train_data)
    predict = item_based_recommend(train_data, w, user)
    return top_k(predict, top_n)

In [9]:
# Run the recommendation pipeline; the notebook echoes the list it returns.
train()

  


[(173, 49.70162884414777),
 (68, 47.345092416709434),
 (422, 46.386432099292875),
 (97, 46.35894145892537),
 (209, 46.197999464448614),
 (55, 46.068737508167736),
 (95, 46.00672950738956),
 (81, 45.37952164828667),
 (201, 45.37580253938097),
 (185, 44.392244831745266)]