## 基于用户的协同过滤算法：使用 NumPy 实现基于用户的协同过滤算法，在 MovieLens 数据集上进行测试，并计算准确率、召回率、覆盖率、流行度等指标。

##  

In [6]:
from math import sqrt  

def loadData(trainFile='ml-100k/u1.base', testFile='ml-100k/u1.test'):
    """Load the train/test rating files and build the user-user co-rating index.

    Each non-blank line must hold four tab-separated fields:
    ``userId, itemId, rating, timestamp``. IDs are kept as strings, ratings
    are converted to ``float`` and the timestamp is ignored.

    Args:
        trainFile: path of the training split (defaults to the path that was
            previously hard-coded).
        testFile: path of the test split (defaults to the path that was
            previously hard-coded).

    Returns:
        A tuple ``(trainSet, testSet, u2u)`` where

        * ``trainSet`` / ``testSet`` map ``userId -> {itemId: rating}``; if a
          (user, item) pair occurs more than once the first rating wins;
        * ``u2u`` maps ``userId -> {otherUserId: [itemId, ...]}`` listing the
          training items both users rated.

    Raises:
        OSError: if a file cannot be opened.
        ValueError: if a non-blank line does not have exactly four fields or
            its rating is not a number.
    """
    trainSet = {}
    testSet = {}
    movieUser = {}  # itemId -> list of users who rated it (training data only)
    u2u = {}

    def _rows(path):
        # Yield (userId, itemId, rating) per non-blank line; the `with`
        # block guarantees the file handle is closed.
        with open(path) as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                # Strip every field so the same user/item always maps to the
                # same dictionary key in all of the structures built below.
                userId, itemId, rating, _timestamp = (
                    field.strip() for field in line.split('\t'))
                yield userId, itemId, float(rating)

    # Load the training set and record who rated each movie.
    for userId, itemId, rating in _rows(trainFile):
        trainSet.setdefault(userId, {}).setdefault(itemId, rating)
        movieUser.setdefault(itemId, []).append(userId)

    # Load the test set.
    for userId, itemId, rating in _rows(testFile):
        testSet.setdefault(userId, {}).setdefault(itemId, rating)

    # Build the user-user matrix of co-rated movies.
    for itemId, raters in movieUser.items():
        for u in raters:
            coRated = u2u.setdefault(u, {})
            for n in raters:
                if u != n:
                    coRated.setdefault(n, []).append(itemId)
    return trainSet, testSet, u2u
        
# Compute a single user's average rating.
def getAverageRating(user, ratings=None):
    """Return the arithmetic mean of every rating stored for ``user``.

    Args:
        user: key of the user in the ratings mapping.
        ratings: mapping ``user -> {item: rating}`` to read from. When
            omitted, the module-level ``trainSet`` is used, which keeps
            existing one-argument calls working.

    Returns:
        The user's mean rating as a float.

    Raises:
        NameError: if ``ratings`` is omitted and no module-level ``trainSet``
            has been defined yet.
        KeyError: if ``user`` is not in the mapping.
        ZeroDivisionError: if the user has no ratings.
    """
    if ratings is None:
        # Backward-compatible fallback to the global populated by the script.
        ratings = trainSet
    userRatings = ratings[user]
    return float(sum(userRatings.values())) / len(userRatings)
  
# Compute user-user similarity.
def getUserSim(u2u, trainSet):
    """Compute a mean-centred correlation between every pair of co-rating users.

    For a pair ``(u, n)`` the sums run over the items listed in
    ``u2u[u][n]``; each user's ratings are centred on that user's mean over
    *all* of their ratings in ``trainSet`` (not only the shared items).

    Args:
        u2u: mapping ``user -> {otherUser: [sharedItem, ...]}``.
        trainSet: mapping ``user -> {item: rating}``. Averages are taken from
            this argument rather than from any module-level state.

    Returns:
        Mapping ``user -> {otherUser: similarity}``. The similarity is 0 when
        either user's centred ratings on the shared items are all zero.

    Raises:
        KeyError: if a user or shared item in ``u2u`` is missing from
            ``trainSet``.
    """
    # Mean rating per user, computed once instead of once per user pair.
    averages = {user: float(sum(ratings.values())) / len(ratings)
                for user, ratings in trainSet.items() if ratings}

    userSim = {}
    for u, neighbours in u2u.items():
        ratings_u = trainSet[u]
        average_u_rate = averages[u]
        simsForU = userSim.setdefault(u, {})
        for n, sharedMovies in neighbours.items():
            ratings_n = trainSet[n]
            average_n_rate = averages[n]

            part1 = 0.0  # sum of products of the two centred ratings
            part2 = 0.0  # sum of squared centred ratings of u
            part3 = 0.0  # sum of squared centred ratings of n
            for m in sharedMovies:
                dev_u = ratings_u[m] - average_u_rate
                dev_n = ratings_n[m] - average_n_rate
                part1 += dev_u * dev_n
                part2 += dev_u * dev_u
                part3 += dev_n * dev_n

            part2 = sqrt(part2)
            part3 = sqrt(part3)
            if part2 == 0 or part3 == 0:
                # Zero denominator: define the similarity as 0.
                simsForU[n] = 0.0
            else:
                simsForU[n] = part1 / (part2 * part3)
    return userSim
    
# Find each user's nearest neighbours and generate predicted ratings.
def getRecommendations(N, trainSet, userSim):
    """Predict ratings for items each user has not rated yet.

    For every user the ``N`` most similar users are selected. For each item
    rated by at least one of them (and not by the user) the prediction is::

        mean(user) + sum(sim * (neighbourRating - mean(neighbour))) / simSum

    where ``simSum`` is the plain (signed) sum of the similarities of all
    selected neighbours, whether or not each one rated the item.

    Args:
        N: number of nearest neighbours to use.
        trainSet: mapping ``user -> {item: rating}``. Averages are taken from
            this argument rather than from any module-level state.
        userSim: mapping ``user -> {otherUser: similarity}``.

    Returns:
        Mapping ``user -> {item: predictedRating}`` with an entry (possibly
        empty) for every user in ``trainSet``. If a user's neighbour
        similarities sum to zero, the neighbours carry no usable weight and
        the user's own mean rating is predicted instead of dividing by zero.

    Raises:
        KeyError: if a neighbour in ``userSim`` is missing from ``trainSet``.
        ZeroDivisionError: if a user in ``trainSet`` has no ratings.
    """
    # Mean rating per user, computed once up front.
    averages = {user: float(sum(ratings.values())) / len(ratings)
                for user, ratings in trainSet.items()}

    pred = {}
    for user, ownRatings in trainSet.items():
        userPred = pred.setdefault(user, {})
        average_u_rate = averages[user]

        # A user with no similarity row simply has no neighbours.
        simUser = sorted(userSim.get(user, {}).items(),
                         key=lambda x: x[1], reverse=True)[0:N]

        userSimSum = 0
        for n, sim in simUser:
            average_n_rate = averages[n]
            userSimSum += sim  # running sum of the neighbours' similarities
            for m, nrating in trainSet[n].items():
                if m in ownRatings:
                    continue  # the user already rated this item
                userPred[m] = userPred.get(m, 0) + sim * (nrating - average_n_rate)

        for m in userPred:
            if userSimSum:
                userPred[m] = average_u_rate + (userPred[m] * 1.0) / userSimSum
            else:
                # Guard against division by zero: fall back to the user's mean.
                userPred[m] = average_u_rate
    return pred
  
# Compute the accuracy of the predictions.
def getMAE(testSet, pred):
    """Return the mean absolute error of ``pred`` against ``testSet``.

    Only (user, movie) pairs present in both mappings contribute.

    Args:
        testSet: mapping ``user -> {movie: actualRating}``.
        pred: mapping ``user -> {movie: predictedRating}``.

    Returns:
        The mean of ``|actual - predicted|`` over the shared pairs, or NaN
        when there is no shared pair (the error is undefined in that case;
        previously this raised ``ZeroDivisionError``).
    """
    rSum = 0.0  # accumulated absolute error
    setSum = 0  # number of (user, movie) pairs compared

    for user, userPred in pred.items():
        actual = testSet.get(user)
        if not actual:
            continue  # no held-out ratings for this user
        for movie, rating in userPred.items():
            if movie in actual:
                setSum += 1
                rSum += abs(actual[movie] - rating)

    if setSum == 0:
        return float('nan')
    return rSum / setSum
  

In [14]:
if __name__ == '__main__':
    # Script entry point: load the data, compute the similarities once, then
    # report the MAE for neighbourhood sizes 10, 20, ..., 100.
    #
    # NOTE: the names below are deliberately left as module-level globals;
    # helper code in this file may look `trainSet` up globally.
    print('Loading Data..')
    trainSet, testSet, u2u = loadData()

    print('Computing Similarity between users')
    userSim = getUserSim(u2u, trainSet)

    for N in range(10, 101, 10):
        pred = getRecommendations(N, trainSet, userSim)
        MAE = getMAE(testSet, pred)  # mean absolute error for this N
        print('N= %d Prediction accuracy MAE=%f' % (N, MAE))

Loading Data..
Computing Similarity between users
对不同的近邻数获得推荐
N= 10 Prediction accuracy MAE=0.855821
N= 20 Prediction accuracy MAE=0.857916
N= 30 Prediction accuracy MAE=0.858719
N= 40 Prediction accuracy MAE=0.849656
N= 50 Prediction accuracy MAE=0.844740
N= 60 Prediction accuracy MAE=0.840211
N= 70 Prediction accuracy MAE=0.836142
N= 80 Prediction accuracy MAE=0.833538
N= 90 Prediction accuracy MAE=0.833280
N= 100 Prediction accuracy MAE=0.831454
