# 基于用户(user-based)的协同过滤算法简单实现

## 1. 构建 userId × movieId 评分矩阵
## 2. 计算用户-用户相关性矩阵(Pearson 相关系数)
## 3. 以 Pearson 相关系数作为权重乘以 rating,累加得到推荐 score
## 4. 展示每个用户 top-5 推荐的 movieId

In [1]:
import sys,os
import numpy as np
import pandas as pd
import math

In [2]:
# The data set is too large for a quick experiment, so only the first 10,000
# ratings are used. nrows stops parsing there instead of loading the whole
# file and truncating it afterwards.
df = pd.read_csv('/Users/yihaoli/Desktop/dataset/movielens-20m-dataset/rating.csv', nrows=10000)
# Drop the last column of the file; only userId, movieId and rating are kept.
df = df[list(df.columns)[:-1]]

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
userId     10000 non-null int64
movieId    10000 non-null int64
rating     10000 non-null float64
dtypes: float64(1), int64(2)
memory usage: 234.5 KB


In [4]:
# Check whether any (userId, movieId) pair was rated more than once.
# Result: no duplicates -- the largest count per pair is 1.
df.groupby(['userId','movieId']).count().sort_values('rating',ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
userId,movieId,Unnamed: 2_level_1
1,2,1
58,2139,1
58,2076,1
58,2097,1
58,2108,1
...,...,...
29,21,1
29,22,1
29,23,1
29,24,1


In [5]:
# Sorted list of the distinct user ids; used as row labels of the matrices below.
index = sorted(df['userId'].dropna().unique().tolist())

In [6]:
# Sorted list of the distinct movie ids; used as column labels of the matrices below.
column = sorted(df['movieId'].dropna().unique().tolist())

In [7]:
len(index)

91

In [8]:
len(column)

2889

In [23]:
# Empty user x movie rating matrix: one row per userId, one column per movieId.
# The cells are filled from df in the next cell.
user_item = pd.DataFrame(index = index, columns = column)

In [24]:
# Copy every rating into its (userId, movieId) cell of the matrix.
for user_id, movie_id, rating in df.itertuples(index=False, name=None):
    user_item.loc[user_id, movie_id] = rating

In [38]:
# Has user 3 rated movie 1? pd.notna tests the value itself; an identity test
# against np.nan only works while the cell holds that exact singleton object.
pd.notna(user_item.loc[3,1])

True

In [28]:
# Empty user x user matrix; receives the pairwise Pearson correlations.
user_user = pd.DataFrame(index = index, columns = index)

In [42]:
def get_pearson(column,user1,user2):
    """Return the Pearson correlation between two users' ratings.

    Only the items in ``column`` that both users have rated take part in the
    computation. Ratings are read from the module-level ``user_item`` frame,
    whose rows are users and whose columns are items; a missing rating is NaN.

    Args:
        column: Item labels (columns of ``user_item``) to compare on.
        user1: Row label of the first user.
        user2: Row label of the second user.

    Returns:
        The correlation coefficient as a float, or 0 when the users share no
        rated item or when the shared ratings of either user have zero
        variance (this includes the case of a single shared item).
    """
    items = list(column)
    first = user_item.loc[user1, items]
    second = user_item.loc[user2, items]

    # Detect missing ratings by value. An identity test against np.nan only
    # works while the frame stores that exact singleton; in a float-dtype
    # frame every cell would look "rated" and the result would become NaN.
    shared = first.notna().to_numpy() & second.notna().to_numpy()
    n = int(shared.sum())
    if n == 0:
        return 0

    x = first.to_numpy()[shared].astype(float)
    y = second.to_numpy()[shared].astype(float)
    sum_x = x.sum()
    sum_y = y.sum()

    # Pearson correlation coefficient, computational form. The two variance
    # terms are clamped at zero so that rounding error can never hand
    # math.sqrt a tiny negative number.
    spread_x = max((x * x).sum() - sum_x ** 2 / n, 0.0)
    spread_y = max((y * y).sum() - sum_y ** 2 / n, 0.0)
    denominator = math.sqrt(spread_x) * math.sqrt(spread_y)
    if denominator == 0:
        return 0
    return float(((x * y).sum() - (sum_x * sum_y) / n) / denominator)

In [43]:
# The correlation is symmetric and both mirrored cells are written at once,
# so only the pairs on or above the diagonal need to be computed.
for position, user1 in enumerate(index):
    for user2 in index[position:]:
        user_user.loc[user1,user2] = user_user.loc[user2,user1] = get_pearson(column,user1,user2)

In [44]:
user_user

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,82,83,84,85,86,87,88,89,90,91
1,1,-0.0693375,0.225312,0,0.467677,0,0.0802846,0,0,0.528761,...,-0.359618,0.560065,0.5,-0.507833,0.0714286,-0.600099,0.231803,0.5,0.126657,0.0342076
2,-0.0693375,1,-0.0130233,0,0.645497,0.866025,0.566164,-0.408248,0,0,...,0.166667,0,0,0,0,0,-0.0267261,0,-0.480493,0.0709038
3,0.225312,-0.0130233,1,0.243975,-0.184208,-0.275839,0.023351,0.129823,0.688247,0.0890411,...,-0.34641,0.0281343,0.0215917,0,0.16431,0.581318,0.378012,0.0316228,0.019492,0.283332
4,0,0,0.243975,1,0.164071,0,0.133631,0.585293,0,0,...,0,0,0,0,0,0,-0.797993,0,-0.3669,-0.240359
5,0.467677,0.645497,-0.184208,0.164071,1,0.145296,0.46486,-0.0807661,0,0,...,-0.342997,-0.560612,0.0516398,-0.5625,0,-0.349215,-0.316176,0,0.317486,0.210192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,-0.600099,0,0.581318,0,-0.349215,0,0.133631,0.301253,0,0,...,0.866025,-0.426401,0.422821,0,0,1,0,0,0.261617,0.286967
88,0.231803,-0.0267261,0.378012,-0.797993,-0.316176,-0.199826,0.165458,0.246321,0.75868,0.123961,...,0.212476,0.284499,0.612008,0,0.622543,0,1,0.462732,0.260169,0.332626
89,0.5,0,0.0316228,0,0,0,0.300669,0,0.516398,0,...,0.522708,0,0,0,0.789777,0,0.462732,1,0.239579,0.0325656
90,0.126657,-0.480493,0.019492,-0.3669,0.317486,0.522233,-0.0756565,0.202291,-0.337778,-0.762493,...,0.209459,0.0213201,-0.175964,0.866025,-0.0213395,0.261617,0.260169,0.239579,1,0.363568


In [45]:
# Empty user x movie matrix with the same labels as user_item; receives the
# recommendation score of every (user, movie) pair.
user_item_recom = pd.DataFrame(index = index, columns=column)

In [49]:
def get_score(user_item,user_user,user,item,index):
    """Return the recommendation score of ``item`` for ``user``.

    The score is the sum, over every user in ``index`` who rated ``item`` and
    whose similarity to ``user`` is positive, of rating * similarity. The sum
    is not divided by the total similarity weight.

    Args:
        user_item: Rating matrix (rows are users, columns are items); a
            missing rating is NaN.
        user_user: User-user similarity matrix.
        user: Row label of the user to score for.
        item: Column label of the item to score.
        index: Labels of the users whose ratings may contribute.

    Returns:
        0 when ``user`` has already rated ``item`` or nobody contributes,
        otherwise the weighted sum described above.
    """
    # Missing ratings are detected by value. An identity test against np.nan
    # treats every cell of a float-dtype matrix as "rated", which would make
    # this early return fire for every cell.
    if pd.notna(user_item.loc[user,item]):
        return 0

    neighbours = list(index)
    # Read the item's ratings and the user's similarities once each instead
    # of doing several .loc lookups per neighbour.
    ratings = user_item.loc[neighbours, item]
    weights = user_user.loc[neighbours, user]

    score = 0
    for rating, weight in zip(ratings, weights):
        if pd.notna(rating) and weight > 0:
            score = score + rating * weight
    return score

In [50]:
# Fill the recommendation matrix with one score per (user, item) pair.
for user in index:
    # Progress output: one line per user.
    print('user: %d'%user)
    for item in column:
        user_item_recom.loc[user,item] = get_score(user_item,user_user,user,item,index)

In [62]:
# Convert every score in the recommendation matrix to float.
user_item_recom = user_item_recom.astype(float)

In [68]:
# Transpose so that movies become rows and users become columns; the top-5
# lookup below sorts the rows by one user's column.
# NOTE: re-running this cell transposes the matrix back again.
user_item_recom = user_item_recom.T

In [77]:
# The five movieIds with the highest recommendation score for userId 1.
list(user_item_recom.sort_values(1,ascending = False)[1].head(5).index)

[480, 110, 356, 588, 527]

In [78]:
def get_top5(user,user_item_recom,n=5):
    """Return the labels of the ``n`` highest-scoring items for ``user``.

    Args:
        user: Column label of the user in ``user_item_recom``.
        user_item_recom: Score matrix with items as rows and users as columns.
        n: Number of items to return; defaults to 5.

    Returns:
        A list of row labels ordered from highest to lowest score. It holds
        fewer than ``n`` entries when the matrix has fewer than ``n`` rows.
    """
    ranked = user_item_recom.sort_values(user,ascending = False)
    return list(ranked[user].head(n).index)

In [79]:
# Empty result table: one row per user, columns 1-5 for the rank positions.
top5_recom = pd.DataFrame(index = index, columns = [1,2,3,4,5])

In [81]:
# Write each user's recommended movieIds into rank columns 1..5.
for user in index:
    five_top = get_top5(user,user_item_recom)
    for rank, movie_id in enumerate(five_top, start=1):
        top5_recom.loc[user, rank] = movie_id

In [82]:
top5_recom

Unnamed: 0,1,2,3,4,5
1,480,110,356,588,527
2,356,780,593,318,457
3,356,296,110,608,47
4,296,590,110,593,260
5,356,2571,2028,1197,296
...,...,...,...,...,...
87,480,356,589,110,2571
88,356,318,296,593,480
89,296,1197,593,608,356
90,1198,1136,1197,858,50
