# ダミーデータを使ったサンプル

## ライブラリのインポート

In [1]:
%matplotlib inline
from __future__ import print_function

# Python 2/3 compatibility shim: if the name `xrange` is not defined
# (referencing it raises NameError, as on Python 3), alias it to `range`.
try:
    xrange
except NameError:
    xrange = range

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.spatial as sp
import scipy.sparse as sparse
from sklearn.decomposition import NMF, TruncatedSVD

## ダミーデータ作成

In [2]:
# Toy 8 x 8 rating matrix: one row per user, one column per item.
# NOTE(review): a 0 presumably means "not rated" -- confirm.
rating_matrix = np.array(
    [
        [2, 5, 1, 1, 0, 1, 2, 4],  # user 0
        [1, 5, 2, 1, 4, 0, 0, 3],  # user 1
        [0, 3, 3, 0, 1, 1, 1, 1],  # user 2
        [5, 2, 2, 3, 1, 0, 0, 4],  # user 3
        [5, 3, 3, 4, 1, 0, 0, 5],  # user 4
        [1, 4, 3, 2, 5, 1, 0, 1],  # user 5
        [0, 0, 0, 0, 0, 0, 0, 2],  # user 6
        [0, 4, 0, 0, 0, 0, 0, 0],  # user 7
    ]
)

## 協調フィルタリング

### 類似度計算のステップ

In [3]:
# Row index of the user we want neighbours for.
target_user = 0

# Cosine similarity = 1 - cosine distance, measured between the target
# user's row and every row of the rating matrix (the target's own row
# included, which yields 1.0).
user_similarity = np.array([
    1 - sp.distance.cosine(rating_matrix[target_user], user_ratings)
    for user_ratings in rating_matrix
])
user_similarity

array([ 1.        ,  0.77831178,  0.73914049,  0.7402121 ,  0.78215389,
        0.58777469,  0.5547002 ,  0.69337525])

In [4]:
# Number of most-similar users (neighbours) to keep.
topN = 4
# Not used in this cell; kept in case later cells rely on it.
user_idx = np.array(range(0, len(rating_matrix)))

# User indices ordered by similarity to the target user, most similar first.
arg_sort = np.argsort(user_similarity)[::-1]

# Top-N neighbours, excluding the target user. The target is removed by
# index rather than by assuming it sits at position 0 of the ranking:
# another user whose ratings point in the same direction also scores 1.0,
# and NaN similarities (argsort puts them last, so the reversal puts them
# first) would both push the target away from position 0.
selected_idx = arg_sort[arg_sort != target_user][:topN]

print([(idx, matrix) for (idx, matrix) in zip(selected_idx, rating_matrix[selected_idx])])
selected_user_similarity = user_similarity[selected_idx]
selected_rating_matrix = rating_matrix[selected_idx]

[(4, array([5, 3, 3, 4, 1, 0, 0, 5])), (1, array([1, 5, 2, 1, 4, 0, 0, 3])), (3, array([5, 2, 2, 3, 1, 0, 0, 4])), (2, array([0, 3, 3, 0, 1, 1, 1, 1]))]


### 予測のステップ

In [5]:
# Predicted score per item: similarity-weighted average of the selected
# neighbours' ratings.

# The normaliser does not depend on the item, so it is computed once here
# instead of once per column.
# NOTE(review): only positive similarities enter the normaliser while the
# numerator below uses every similarity. The two agree as long as no
# similarity is negative -- confirm that is the intended contract.
similarity_sum = sum(selected_user_similarity[selected_user_similarity > 0])

avg_score = []
for col_idx in range(selected_rating_matrix.shape[1]):
    # Each neighbour's rating of this item, weighted by that neighbour's similarity.
    weight_score = sum(selected_rating_matrix[:, col_idx] * selected_user_similarity)
    avg_score.append(weight_score / similarity_sum)

for (i, v) in enumerate(avg_score):
    print(str(i) + 'th item:' + str(v))

0th item:2.76008004028
1th item:3.26857245915
2th item:2.50045570209
3th item:2.01576643714
4th item:1.76811675706
5th item:0.243152856729
6th item:0.243152856729
7th item:3.27180535615


# ニュースアプリにおけるレコメンデーションの例

In [None]:
# Load the dummy user/topic follow log.
data = pd.read_csv(
    "./dataset/user_topic_follow_dummy.csv",
    encoding="utf-8",
)

# Quick overview: first rows, then (rows, columns).
display(data.head())
display(data.shape)

# Number of distinct values in each of the two key columns.
for column in ('user_id', 'topic_name'):
    display(len(data[column].unique()))

# Per-column flag: does the column contain any missing value?
display(data.isnull().any())

In [None]:
# Remove fully duplicated rows, keeping the last occurrence of each.
# The result is rebound to `data` instead of using inplace=True, so the
# frame object loaded earlier is not mutated behind the reader's back;
# the name `data` is kept because later cells read it.
data = data.drop_duplicates(keep='last')

In [None]:
# Expand the categorical columns of `data` into 0/1 indicator columns.
# NOTE(review): drop_first=True removes the first level of every encoded
# column, so one category (e.g. one topic) gets no indicator column --
# confirm that is intended for a recommendation matrix.
converted_dummies = pd.get_dummies(data=data, drop_first=True)
converted_dummies