In [1]:
# p.119 5.4 類似度の計算
import json
import numpy as np

# user1とuser2の間のユークリッドスコアを計算する
def euclidean_score(dataset, user1, user2):
    """Return a Euclidean-distance-based similarity between two users.

    Only items present in both users' rating dicts are compared. The
    result is ``1 / (1 + d)`` where ``d`` is the Euclidean distance between
    the two users' ratings of those shared items, so identical ratings
    give 1.0 and larger distances give values closer to 0.

    Args:
        dataset: Mapping of user name -> {item name -> numeric rating}.
        user1: Key of the first user in ``dataset``.
        user2: Key of the second user in ``dataset``.

    Returns:
        The similarity score, or 0 when the users share no rated items.

    Raises:
        TypeError: If either user is not a key of ``dataset``.
    """
    # Validate in argument order so a missing user1 is reported first.
    for name in (user1, user2):
        if name not in dataset:
            raise TypeError('Cannot find ' + name + ' in the dataset')

    ratings1 = dataset[user1]
    ratings2 = dataset[user2]

    # Items rated by both users, in user1's iteration order.
    shared_items = [title for title in ratings1 if title in ratings2]

    # Without any overlap there is nothing to compare.
    if not shared_items:
        return 0

    squared_diff = [np.square(ratings1[title] - ratings2[title])
                    for title in shared_items]

    return 1 / (1 + np.sqrt(np.sum(squared_diff)))

In [2]:
# user1とuser2の間のピアソンスコアを計算する
def pearson_score(dataset, user1, user2):
    """Return the Pearson correlation of two users' ratings on shared items.

    Only items present in both users' rating dicts are used.

    Args:
        dataset: Mapping of user name -> {item name -> numeric rating}.
        user1: Key of the first user in ``dataset``.
        user2: Key of the second user in ``dataset``.

    Returns:
        The correlation coefficient, or 0 when it is undefined: the users
        share no items, or at least one user's shared ratings have no
        variance (which includes the single-shared-item case).

    Raises:
        TypeError: If either user is not a key of ``dataset``.
    """
    if user1 not in dataset:
        raise TypeError('Cannot find ' + user1 + ' in the dataset')
    if user2 not in dataset:
        raise TypeError('Cannot find ' + user2 + ' in the dataset')

    # Items rated by both users, in user1's iteration order.
    common_movies = [item for item in dataset[user1] if item in dataset[user2]]

    num_ratings = len(common_movies)

    if num_ratings == 0:
        return 0

    # Sum of ratings
    user1_sum = np.sum([dataset[user1][item] for item in common_movies])
    user2_sum = np.sum([dataset[user2][item] for item in common_movies])

    # Sum of squared ratings
    user1_squared_sum = np.sum([np.square(dataset[user1][item])
                                for item in common_movies])
    user2_squared_sum = np.sum([np.square(dataset[user2][item])
                                for item in common_movies])

    # Sum of products of paired ratings
    sum_of_products = np.sum([dataset[user1][item] * dataset[user2][item]
                              for item in common_movies])

    Sxy = sum_of_products - (user1_sum * user2_sum / num_ratings)
    Sxx = user1_squared_sum - np.square(user1_sum) / num_ratings
    Syy = user2_squared_sum - np.square(user2_sum) / num_ratings

    # Sxx and Syy are sums of squared deviations, so they are never negative
    # mathematically. The shortcut formula above subtracts two nearly equal
    # numbers for (almost) constant ratings, and rounding can leave a tiny
    # negative value; taking its square root would produce NaN. Treat any
    # non-positive variance as "correlation undefined".
    if Sxx <= 0 or Syy <= 0:
        return 0

    denominator = np.sqrt(Sxx * Syy)

    # The product of two tiny positive values can underflow to zero.
    if denominator == 0:
        return 0

    return Sxy / denominator

In [3]:
ratings_file = 'ratings.json'

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, and let json parse straight from the file object.
with open(ratings_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# First pair of users to compare with both similarity measures.
user1 = 'David Smith'
user2 = 'Bill Duffy'

print("Euclidean score:")
print(euclidean_score(data, user1, user2))

print("Pearson score:")
print(pearson_score(data, user1, user2))

Euclidean score:
0.585786437626905
Pearson score:
0.9909924304103233


In [4]:
# Show the full ratings mapping loaded from the JSON file (user -> {movie -> rating}).
print(data)

{'David Smith': {'Vertigo': 4, 'Scarface': 4.5, 'Raging Bull': 3.0, 'Goodfellas': 4.5, 'The Apartment': 1.0}, 'Brenda Peterson': {'Vertigo': 3.0, 'Scarface': 1.5, 'Raging Bull': 1.0, 'Goodfellas': 2.0, 'The Apartment': 5.0, 'Roman Holiday': 4.5}, 'Bill Duffy': {'Vertigo': 4.5, 'Scarface': 5.0, 'Goodfellas': 4.5, 'The Apartment': 1.0}, 'Samuel Miller': {'Scarface': 3.5, 'Raging Bull': 5.0, 'The Apartment': 1.0, 'Goodfellas': 5.0, 'Roman Holiday': 1.0}, 'Julie Hammel': {'Scarface': 2.5, 'Roman Holiday': 4.5, 'Goodfellas': 3.0}, 'Clarissa Jackson': {'Vertigo': 5.0, 'Scarface': 4.5, 'Raging Bull': 4.0, 'Goodfellas': 2.5, 'The Apartment': 1.0, 'Roman Holiday': 1.5}, 'Adam Cohen': {'Vertigo': 3.5, 'Scarface': 3.0, 'The Apartment': 1.0, 'Goodfellas': 4.5, 'Roman Holiday': 3.0}, 'Chris Duncan': {'The Apartment': 1.5, 'Raging Bull': 4.5}}


In [5]:
# Second pair of users: run the same two similarity measures on them.
user1, user2 = 'David Smith', 'Brenda Peterson'

# Each label and its score go on separate lines, hence sep='\n'.
print("Euclidean score:", euclidean_score(data, user1, user2), sep='\n')

print("Pearson score:", pearson_score(data, user1, user2), sep='\n')

Euclidean score:
0.1424339656566283
Pearson score:
-0.7236759610155113


In [6]:
# p.124 5.5 協調フィルタを用いた類似ユーザーの検索

# 入力ユーザーに似たユーザーをデータセットから検索する
def find_similar_users(dataset, user, num_users):
    """Return the users most similar to ``user`` by Pearson score.

    Args:
        dataset: Mapping of user name -> {item name -> numeric rating}.
        user: Key of the reference user in ``dataset``.
        num_users: Maximum number of similar users to return.

    Returns:
        A list of ``(user_name, pearson_score)`` tuples for every other user
        in ``dataset``, sorted by score in descending order and truncated to
        the first ``num_users`` entries.

    Raises:
        TypeError: If ``user`` is not a key of ``dataset``.
    """
    if user not in dataset:
        # The space before "in" was missing, which glued the user name to
        # the rest of the message.
        raise TypeError('Cannot find ' + user + ' in the dataset')

    # Score every other user against the reference user.
    scores = [(other, pearson_score(dataset, user, other))
              for other in dataset if other != user]

    # Highest similarity first; the sort is stable, so ties keep dataset order.
    scores.sort(key=lambda pair: pair[1], reverse=True)

    return scores[:num_users]

In [7]:
# Reference user for the similarity search; any other key of `data`
# (for example 'Bill Duffy') can be substituted here.
user = 'Clarissa Jackson'

print('Users similar to ' + user + ':\n')
similar_users = find_similar_users(data, user, 3)

# Table header followed by one row per (name, score) pair.
print('User\t\t\tSimilarity score')
print('-' * 41)
for name, similarity in similar_users:
    print(name, '\t\t', round(similarity, 2))

Users similar to Clarissa Jackson:

User			Similarity score
-----------------------------------------
Chris Duncan 		 1.0
Bill Duffy 		 0.83
Samuel Miller 		 0.73


In [8]:
# p.126 5.6 映画推薦システム

# input_user の推薦をする
def get_recommendations(dataset, input_user, num_users=3):
    """Recommend items to ``input_user`` from the ratings of similar users.

    The ``num_users`` most similar users (as returned by
    ``find_similar_users``) contribute their ratings for every item that
    ``input_user`` has not rated (or has rated with a value <= 0). Each
    item's predicted score is the similarity-weighted average of those
    ratings.

    Args:
        dataset: Mapping of user name -> {item name -> numeric rating}.
        input_user: Key of the user to produce recommendations for.
        num_users: How many similar users to consult. Defaults to 3.

    Returns:
        A list of ``(item, predicted_score)`` tuples sorted by predicted
        score in descending order, or the single-element list
        ``['No recommendations possible']`` when no item can be scored.
    """
    similar_users = find_similar_users(dataset, input_user, num_users)

    overall_scores = {}
    similarity_scores = {}

    for user, pscore in similar_users:
        # Neighbours with zero or negative similarity carry no usable
        # signal. Including them lets the per-item weight sum reach zero
        # (division by zero below) or go negative (distorted averages).
        if pscore <= 0:
            continue

        for item, iscore in dataset[user].items():
            # Skip anything the input user has already rated positively.
            if item in dataset[input_user] and dataset[input_user][item] > 0:
                continue

            overall_scores[item] = overall_scores.get(item, 0) + iscore * pscore
            similarity_scores[item] = similarity_scores.get(item, 0) + pscore

    if len(overall_scores) == 0:
        return ['No recommendations possible']

    # Normalise each weighted sum by the total similarity that contributed.
    movie_scores = [(item, score / similarity_scores[item])
                    for item, score in overall_scores.items()]

    movie_scores.sort(key=lambda p: p[1], reverse=True)
    return movie_scores

In [9]:
user = 'Chris Duncan'

print("Movie recommendations for " + user + ":")
movies = get_recommendations(data, user)
for rank, movie in enumerate(movies, start=1):
    # get_recommendations reports "nothing to recommend" as a plain string
    # entry rather than a (title, score) pair. Print it as-is instead of
    # indexing into it and calling round() on a character.
    if isinstance(movie, str):
        print(movie)
        continue
    title, score = movie
    print(str(rank) + '.', title, ':', round(score, 2))

Movie recommendations for Chris Duncan:
1. Vertigo : 4.5
2. Scarface : 4.17
3. Goodfellas : 4.0
4. Roman Holiday : 1.25


In [10]:
user = 'Julie Hammel'

print("Movie recommendations for " + user + ":")
movies = get_recommendations(data, user)
for rank, movie in enumerate(movies, start=1):
    # get_recommendations reports "nothing to recommend" as a plain string
    # entry rather than a (title, score) pair. Print it as-is instead of
    # indexing into it and calling round() on a character.
    if isinstance(movie, str):
        print(movie)
        continue
    title, score = movie
    print(str(rank) + '.', title, ':', round(score, 2))

Movie recommendations for Julie Hammel:
1. The Apartment : 5.0
2. Vertigo : 3.0
3. Raging Bull : 1.0
