In [4]:
import json 
import numpy as np 

In [5]:
# Compute the Euclidean score between user1 and user2
def euclidean_score(dataset, user1, user2):
    """Return a Euclidean-distance-based similarity score for two users.

    Only items that appear in both users' rating mappings are compared.
    The score is ``1 / (1 + d)``, where ``d`` is the Euclidean distance
    between the two users' ratings over those shared items, so identical
    ratings give 1 and the score shrinks toward 0 as the distance grows.

    Args:
        dataset: Mapping of user -> mapping of item -> numeric rating.
        user1: Key of the first user in ``dataset``.
        user2: Key of the second user in ``dataset``.

    Returns:
        0 if the users share no rated items, otherwise the score above.

    Raises:
        TypeError: If either user is not a key of ``dataset``.
    """
    # Validate in argument order so the error always names the first missing user.
    for name in (user1, user2):
        if name not in dataset:
            raise TypeError('Cannot find ' + name + ' in the dataset')

    ratings_a, ratings_b = dataset[user1], dataset[user2]

    # Items rated by both users, in user1's iteration order.
    shared = [title for title in ratings_a if title in ratings_b]
    if not shared:
        return 0

    gaps = [np.square(ratings_a[title] - ratings_b[title]) for title in shared]
    return 1 / (1 + np.sqrt(np.sum(gaps)))

In [6]:
def pearson_score(dataset, user1, user2):
    """Return the Pearson correlation score between two users' ratings.

    Only items that appear in both users' rating mappings are compared.

    Args:
        dataset: Mapping of user -> mapping of item -> numeric rating.
        user1: Key of the first user in ``dataset``.
        user2: Key of the second user in ``dataset``.

    Returns:
        0 if the users share no rated items or if either user's ratings
        over the shared items have no variance; otherwise
        ``Sxy / sqrt(Sxx * Syy)``.

    Raises:
        TypeError: If either user is not a key of ``dataset``.
    """
    if user1 not in dataset:
        raise TypeError('Cannot find ' + user1 + ' in the dataset')
    if user2 not in dataset:
        raise TypeError('Cannot find ' + user2 + ' in the dataset')

    ratings1, ratings2 = dataset[user1], dataset[user2]

    # Items rated by both users, in user1's iteration order.
    common_movies = [item for item in ratings1 if item in ratings2]
    num_ratings = len(common_movies)
    if num_ratings == 0:
        return 0

    # Sums of the ratings over the shared items.
    user1_sum = np.sum([ratings1[item] for item in common_movies])
    user2_sum = np.sum([ratings2[item] for item in common_movies])

    # Sums of the squared ratings over the shared items.
    user1_squared_sum = np.sum([np.square(ratings1[item])
                                for item in common_movies])
    user2_squared_sum = np.sum([np.square(ratings2[item])
                                for item in common_movies])

    # Sum of the element-wise products of the two users' ratings.
    sum_of_products = np.sum([ratings1[item] * ratings2[item]
                              for item in common_movies])

    # Centered cross- and self-products (covariance / variance times n).
    Sxy = sum_of_products - (user1_sum * user2_sum / num_ratings)
    Sxx = user1_squared_sum - np.square(user1_sum) / num_ratings
    Syy = user2_squared_sum - np.square(user2_sum) / num_ratings

    # Mathematically Sxx and Syy are >= 0, but floating-point rounding can
    # push a zero variance slightly below zero. Treat any non-positive
    # product as "no variance" so np.sqrt never sees a negative value and
    # returns NaN.
    variance_product = Sxx * Syy
    if variance_product <= 0:
        return 0

    return Sxy / np.sqrt(variance_product)

In [7]:
# Load the ratings dataset. The cells below index `data` as
# data[user][item] -> rating, so the file is expected to hold a JSON object
# of per-user rating objects.
ratings_file = 'ratings.json'
# Pin the encoding: without it open() falls back to the platform's locale
# encoding, which can mis-decode non-ASCII text in the JSON file.
with open(ratings_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [8]:
# Compare both similarity metrics on one pair of users from the dataset.
user1, user2 = 'David Smith', 'Bill Duffy'

# Each label is printed before its score is computed, on its own line.
print("Euclidean score:")
print(euclidean_score(data, user1, user2))

print("Pearson score:")
print(pearson_score(data, user1, user2))

Euclidean score:
0.585786437626905
Pearson score:
0.9909924304103233


In [9]:
# Find the users in the dataset that are most similar to the input user
def find_similar_users(dataset, user, num_users):
    """Return the users whose ratings correlate most strongly with ``user``.

    Every other user in ``dataset`` is scored against ``user`` with
    ``pearson_score`` and the results are ranked from highest to lowest
    score. Users with equal scores keep their dataset iteration order.

    Args:
        dataset: Mapping of user -> mapping of item -> numeric rating.
        user: Key of the reference user in ``dataset``.
        num_users: Upper bound used to slice the ranked list.

    Returns:
        A list of ``(other_user, score)`` tuples, best match first,
        truncated with ``[:num_users]``.

    Raises:
        TypeError: If ``user`` is not a key of ``dataset``.
    """
    if user not in dataset:
        raise TypeError('Cannot find ' + user + ' in the dataset')

    # Score every other user lazily; sorted() materializes and ranks them.
    candidates = (
        (other, pearson_score(dataset, user, other))
        for other in dataset
        if other != user
    )
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    return ranked[:num_users]

In [10]:
# Report the three users whose ratings are most similar to the chosen user.
user = 'Bill Duffy'
print('Users similar to ' + user + ':\n')

similar_users = find_similar_users(data, user, 3)

# Column header followed by a 41-character rule.
print('User\t\t\tSimilarity score', '-'*41, sep='\n')

# Each entry is a (user name, score) pair; the score is shown to two decimals.
for item in similar_users:
    print(item[0], '\t\t', round(item[1], 2))

Users similar to Bill Duffy:

User			Similarity score
-----------------------------------------
David Smith 		 0.99
Samuel Miller 		 0.88
Adam Cohen 		 0.86
