In [3]:
# 基于用户的协同过滤算法

# Author : MilesCode
# Date : 18/03/29
# Version : 0.0

In [4]:
import tensorflow as tf
import numpy as np
import math
import sys
from operator import itemgetter
import random


def load_data(filename,data):
    with open(filename,'r') as f:  
        for line in f.readlines():  
            line = line.strip()
            linelist = line.split()
#             data.append()
            data.append([linelist[0],linelist[1]])
#         print(data)
#             linelist = map(int,linestrlist)# 方法一  
#             linelist = [int(i) for i in linestrlist] # 方法二  
#             data.append(linestrlist[])
#             print(linelist)
# def ReadData(file, data):
#     ''' 读取评分数据
#         @param file  评分数据文件
#         @param data 储存评分数据的List
#     '''
#     for line in file:
#         print(line)
#         line = line.strip('\n')
#         linelist = line.split()
#         print(linelist)
#         data.append([linelist[0], linelist[1]])



data = []
filename = 'ml-100k/u.data'
load_data(filename, data)

  return f(*args, **kwds)


In [5]:
def cos_sim(x, y):
    """
    计算余弦相似度

    Args:
    - x: mat, 以行向量的形式存储
    - y: mat, 以行向量的形式存储

    :return: x 和 y 之间的余弦相似度
    """
    
    numerator = x * y.T  # x 和 y 之间的内积
    denominator = np.sqrt(x * x.T) * np.sqrt(y * y.T)
    return (numerator / denominator)[0, 0]

In [6]:
def similarity(data):
    """
    计算矩阵中任意两行之间的相似度

    Args:
    - data: mat, 任意矩阵

    :return: w, mat, 任意两行之间的相似度
    """

    m = np.shape(data)[0]  # 用户的数量
    # 初始化相似矩阵
    w = np.mat(np.zeros((m, m)))

    for i in range(m):
        for j in range(i, m):
            if not j == i:
                # 计算任意两行之间的相似度
                w[i, j] = cos_sim(data[i, ], data[j, ])
                w[j, i] = w[i, j]
            else:
                w[i, j] = 0
    return w

In [7]:
def user_based_recommend(data, w, user):
    """
    基于用户相似度为用户 user 推荐物品

    Args:
    - data: mat, 用户物品矩阵
    - w: mat, 用户之间的相似度
    - user: int, 用户编号

    :return: predict, list, 推荐列表
    """
    m, n = np.shape(data)
    interaction = data[user, ]  # 用户 user 与物品信息

    # 找到用户 user 没有互动过的物品
    not_inter = []
    for i in range(n):
        if interaction[0, i] == 0:  # 没有互动的物品
            not_inter.append(i)

    # 对没有互动过的物品进行预测
    predict = {}
    for x in not_inter:
        item = np.copy(data[:, x])  # 找到所有用户对商品 x 的互动信息
        for i in range(m):  # 对每一个用户
            if item[i, 0] != 0:
                if x not in predict:
                    predict[x] = w[user, i] * item[i, 0]
                else:
                    predict[x] = predict[x] + w[user, i] + item[i, 0]
    return sorted(predict.items(), key=lambda d: d[1], reverse=True)

In [8]:
def SplitData(data, M, key, seed):
    ''' 将数据分为训练集和测试集
        @param data   储存训练和测试数据的List
        @param M      将数据分为M份
        @param key    选取第key份数据做为测试数据
        @param seed   随机种子
        @return train 训练数据集Dict
        @return test  测试数据集Dict
    '''
    test = dict()
    train = dict()
    random.seed(seed)
    for user, item in data:
        if random.randint(0, M) == key:
            if user in test:
                test[user].append(item)
            else:
                test[user] = []
        else:
            if user in train:
                train[user].append(item)
            else:
                train[user] = []
    return train, test

In [9]:
def Recall(train, test, N):
    hit = 0
    all = 0
    for user in train.keys():
        tu = test[user]
        rank = Recommend(user, N)
        for item, pui in rank:
            if item in tu:
                hit += 1
        all += len(tu)
    return hit / (all * 1.0)

In [10]:
def Precision(train, test, N):
    hit = 0
    all = 0
    for user in train.keys():
        tu = test[user]
        rank = Recommend(user, N)
        for item, pui in rank:
            if item in tu:
                hit += 1
        all += N
    return hit / (all * 1.0)

In [11]:
def Coverage(train, test, N):
    recommend_items = set()
    all_items = set()
    for user in train.keys():
        for item in train[user].keys():
            all_items.add(item)
        rank = Recommend(user, N)
        for item, pui in rank:
            recommend_items.add(item)
    return len(recommend_items) / (len(all_items) * 1.0)

In [12]:
def Popularity(train, test, N):
    item_popularity = dict()
    for user, items in train.items():
        for item in items.keys():
            if item not in item_popularity:
                item_popularity[item] = 0
            item_popularity[item] += 1
    ret = 0
    n = 0
    for user in train.keys():
        rank = Recommend(user, N)
        for item, pui in rank:
            ret += math.log(1 + item_popularity[item])
            n += 1
    ret /= n * 1.0
    return ret

In [13]:
# def UserSimilarity(train):
#     # build inverse table for item_users
#     item_users = dict()
#     for u, items in train.items():
#         for i in items.keys():
#             if i not in item_users:
#                 item_users[i] = set()
#             item_users[i].add(u)
#     #calculate co-rated items between users
#         C = dict()
#         N = dict()
#         for i, users in item_users.items():
#             for u in users:
#                 N[u] += 1
#                 for v in users:
#                     if u == v:
#                         continue
#                     C[u][v] += 1
#     #calculate finial similarity matrix W
#     W = dict()
#     for u, related_users in C.items():
#         for v, cuv in related_users.items():
#             W[u][v] = cuv / math.sqrt(N[u] * N[v])
#     return W

def UserSimilarity(train):
    ''' 计算用户相似度
        @param train 训练数据集Dict
        @return W    记录用户相似度的二维矩阵
    '''
    #建立物品到用户之间的倒查表，降低计算用户相似度的时间复杂性
    item_users = dict()
    for u, items in train.items():
        for i in items:
            if (i not in item_users):
                item_users[i] = set()
            item_users[i].add(u)
        C = dict()
        N = dict()
        #计算用户之间共有的item的数目
        for i, users in item_users.items():
            for u in users:
                if (u not in N):
                    N[u] = 1
                N[u] += 1
                for v in users:
                    if u == v:
                        continue
                    if (u not in C):
                        C[u] = dict()
                    if (v not in C[u]):
                        C[u][v] = 0
                    #对热门物品进行了惩罚，采用这种方法被称做UserCF-IIF
                    C[u][v] += (1 / math.log(1 + len(users)))
    W = dict()
    for u, related_users in C.items():
        for v, cuv in related_users.items():
            if (u not in W):
                W[u] = dict()
            #利用余弦相似度计算用户之间的相似度
            W[u][v] = cuv / math.sqrt(N[u] * N[v])

    return W

In [14]:
def Recommend(user, train, W, N, K):
    rank = dict()
    interacted_items = train[user]
    for v, wuv in sorted(W[user].items(), key=itemgetter(1), reverse=True)[0:K]:
        for i, rvi in train[v].items:
            if i in interacted_items:
    #we should filter items user interacted before
                continue
            rank[i] += wuv * rvi
    return rank

In [17]:
if __name__ == '__main__':
    data = []
    M = 8
    key = 10
    seed = 1
    N = 10

    print("Process Running...")
    file = 'ml-100k/u.data'
    load_data(file, data)
    train, test = SplitData(data, M, key, seed)
    print(1)
    W = UserSimilarity(train)
    print(2)
    recall = Recall(train, test, N)
    print(3)
    precision = Precision(train, test, N)
    print(4)
    popularity = Popularity(train, test, N)
    print(5)
    coverage = Coverage(train, test,N)
    print(6)
    print('recall: ', recall, '\n')
    print('precision: ', precision, '\n')
    print('Popularity: ', popularity, '\n')
    print('coverage: ', coverage, '\n')
else:
    print("this is not the main function")

Process Running...
1
200


KeyError: '196'