In [1]:
import numpy as np
import pandas as pd

In [3]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('/home/ian/code/github/data/ml-100k/u.data', sep='\t', names=header)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
df.shape

(100000, 4)

In [5]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))  

Number of users = 943 | Number of movies = 1682


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
#数据集会被打乱
train_data, test_data = train_test_split(df, test_size=0.25)

In [21]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    #由于user_id和item_id都是从1开始编号的，所有要减一
    train_data_matrix[line[1]-1, line[2]-1] = line[3] 
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [22]:
from sklearn.metrics.pairwise import pairwise_distances

In [23]:
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [24]:
type(user_similarity)

numpy.ndarray

In [26]:
user_similarity[:5,:5]

array([[0.        , 0.87967139, 0.96200474, 0.95483703, 0.70580442],
       [0.87967139, 0.        , 0.92819722, 0.93103175, 0.9481617 ],
       [0.96200474, 0.92819722, 0.        , 0.75892217, 0.972037  ],
       [0.95483703, 0.93103175, 0.75892217, 0.        , 0.95715971],
       [0.70580442, 0.9481617 , 0.972037  , 0.95715971, 0.        ]])

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
user_similarity = cosine_similarity(train_data_matrix)

In [29]:
user_similarity

array([[1.        , 0.12032861, 0.03799526, ..., 0.08042927, 0.10006484,
        0.29795144],
       [0.12032861, 1.        , 0.07180278, ..., 0.09454163, 0.12720728,
        0.03808831],
       [0.03799526, 0.07180278, 1.        , ..., 0.11298097, 0.14293488,
        0.02132329],
       ...,
       [0.08042927, 0.09454163, 0.11298097, ..., 1.        , 0.06923753,
        0.10690497],
       [0.10006484, 0.12720728, 0.14293488, ..., 0.06923753, 1.        ,
        0.08600008],
       [0.29795144, 0.03808831, 0.02132329, ..., 0.10690497, 0.08600008,
        1.        ]])

In [60]:
def predict(ratings, similarity, type='user'):
    '''
    @ratings: 评分矩阵
    @similarity: 相似度矩阵
    '''
    if type == 'user':
        #用户对于所有电影的平均打分
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        #中心化（评分均值为0）
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.abs(similarity).sum(axis=1)[:, np.newaxis]
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [61]:
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [67]:
user_prediction.shape

(943, 1682)

In [72]:
test_data_matrix[test_data_matrix.nonzero()].flatten()

array([5., 5., 4., ..., 2., 2., 2.])

In [63]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [33]:
user_prediction[:5,:5]

array([[2.0408134 , 0.82410927, 0.53107103, 1.11311089, 0.59267634],
       [1.59774444, 0.19384446, 0.1441214 , 0.39874917, 0.1429882 ],
       [0.97462462, 0.16651001, 0.08289964, 0.30456599, 0.12235106],
       [1.19904954, 0.17730574, 0.06021516, 0.36997217, 0.10601698],
       [2.0270313 , 0.69758174, 0.30926548, 0.89595447, 0.40370688]])

In [64]:
rmse(user_prediction, test_data_matrix)

2.9521053024551134

In [65]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')

In [66]:
rmse(item_prediction, test_data_matrix)

3.4447258501822633