### Matrix Factorizationを用いたレコメンド
今回はMovieLens100kを用いて実験。  


In [0]:
from time import time
import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

#### データセットの内容確認

In [0]:
# Load the complete MovieLens 100k rating log (tab-separated, no header row)
# and preview the first rows as the notebook cell output.
rating_columns = ["user_id", "item_id", "rating", "timestamp"]
u_data_org = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/u.data',
    sep="\t",
    names=rating_columns,
)
u_data_org.head()

#### 学習用・テスト用データの取り込み

In [0]:
# User x rating records: the predefined "ua" train / test split files.
split_read_options = dict(
    names=["user_id", "item_id", "rating", "timestamp"],
    sep="\t",
)
u_data_train = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/ua.base',
    **split_read_options
)
u_data_test = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/ua.test',
    **split_read_options
)

# Check the number of records (non-null count per column) in each split.
train_cnt = u_data_train.count()
test_cnt = u_data_test.count()
print('Train Set:', str(train_cnt), '\n')
print('Test Set:', str(test_cnt))

#### データをitem_id × user_idの行列へ整形

In [0]:
# Convert the training ratings into an item_id x user_id matrix.
# Row index is item_id - 1 and column index is user_id - 1, so the matrix
# is sized from the number of distinct IDs in the full data set.
# NOTE(review): this assumes IDs are contiguous and 1-based — confirm before
# reusing with another data set.
item_list = u_data_org.sort_values('item_id').item_id.unique()
user_list = u_data_org.user_id.unique()
rating_matrix_item = np.zeros([len(item_list), len(user_list)])

# Fill every observed (item, user) cell in one vectorized assignment.
# The previous nested loop iterated range(1, len(item_list)) and therefore
# never filled the last item's row; it also re-filtered the whole DataFrame
# for every (item, user) pair and assigned a one-element Series to a cell.
train_item_idx = u_data_train['item_id'].values - 1
train_user_idx = u_data_train['user_id'].values - 1
rating_matrix_item[train_item_idx, train_user_idx] = u_data_train['rating'].values


In [0]:
# Notebook cell output: display the item x user training rating matrix.
rating_matrix_item

In [0]:
# Item x user indicator matrix: 1 where a training rating exists, 0 otherwise.
rating_matrix_calc = (rating_matrix_item != 0).astype(rating_matrix_item.dtype)
# Complementary indicator: 1 where the item has NOT been rated. Used later to
# mask out already-rated items when ranking recommendations.
rating_matrix_train = 1 - rating_matrix_calc

### Matrix Factorization

In [0]:
class MatrixFactorization:
    """Factorize a rating matrix R (m x n) into U (m x k) and V (k x n) by SGD.

    Only entries of R that are strictly greater than 0 are treated as
    observed ratings; all other entries are ignored during training and when
    computing the training error.
    """

    def __init__(self, R, X, Y, k, steps=200, alpha=0.01, lamda=0.001, threshold=0.001):
        """Store hyper-parameters and randomly initialize the factors.

        Args:
            R: 2-D rating array; values > 0 are observed ratings.
            X: 1-based row IDs of R to train on (row index is ID - 1).
            Y: 1-based column IDs of R to train on (column index is ID - 1).
            k: Number of latent factors.
            steps: Maximum number of passes over the data.
            alpha: Learning rate.
            lamda: L2 regularization strength.
            threshold: Training stops early once the regularized squared
                error drops below this value.
        """
        self.R = R
        self.m = R.shape[0]
        self.n = R.shape[1]
        self.k = k
        # Initialize U and V with uniform random values in [0, 1).
        self.U = np.random.rand(self.m, self.k)
        self.V = np.random.rand(self.k, self.n)
        self.alpha = alpha
        self.lamda = lamda
        self.threshold = threshold
        self.steps = steps
        # Progress is printed every `info_step` steps.
        self.info_step = 1

        # Keep private copies of the ID lists: fit() shuffles them in place
        # and must not reorder the caller's arrays.
        self.X = np.array(X)
        self.Y = np.array(Y)

    def shuffle_in_unison_scary(self, a, b):
        """Shuffle `a` and `b` in place, replaying the same RNG state for both.

        The global NumPy RNG state is captured before shuffling `a` and
        restored before shuffling `b`, so sequences of equal length receive
        the same permutation.
        """
        rng_state = np.random.get_state()
        np.random.shuffle(a)
        np.random.set_state(rng_state)
        np.random.shuffle(b)

    def fit(self):
        """Run SGD over the observed entries and return the factors.

        Each observed rating r_ij applies one gradient step on
        0.5 * (r_ij - u_i . v_j)^2 + 0.5 * lamda * (|u_i|^2 + |v_j|^2):

            u_i += alpha * (err * v_j - lamda * u_i)
            v_j += alpha * (err * u_i - lamda * v_j)

        Both updates use the values from before the step.

        Returns:
            Tuple (U, V) such that np.dot(U, V) approximates R.
        """
        R, U, V = self.R, self.U, self.V
        alpha, lamda = self.alpha, self.lamda

        for step in range(self.steps):
            start_time = time()
            # Shuffle the visiting order of rows and columns for this pass.
            self.shuffle_in_unison_scary(self.X, self.Y)
            rows = np.asarray(self.X, dtype=np.intp) - 1
            cols = np.asarray(self.Y, dtype=np.intp) - 1

            # Update U and V, visiting only the observed cells of each row
            # (in the shuffled column order) instead of testing every cell.
            for row in rows:
                observed_cols = cols[R[row, cols] > 0]
                for col in observed_cols:
                    u_old = U[row, :].copy()
                    v_col = V[:, col]
                    err_ij = R[row, col] - np.dot(u_old, v_col)
                    # The regularization term is subtracted so that it
                    # shrinks the factors (weight decay).
                    U[row, :] += alpha * (err_ij * v_col - lamda * u_old)
                    V[:, col] += alpha * (err_ij * u_old - lamda * v_col)

            # Squared estimation error over the observed values.
            R_hat = np.dot(U, V)
            grid = np.ix_(rows, cols)
            observed = R[grid]
            mask = observed > 0
            error = np.sum((observed[mask] - R_hat[grid][mask]) ** 2)
            # Regularization penalty.
            error += (lamda * np.power(U, 2).sum()) / 2
            error += (lamda * np.power(V, 2).sum()) / 2

            if step % self.info_step == 0 and step != 0:
                print('Step: %d / Error: %3f [%.1f sec]' % (step, error, time() - start_time))

            if error < self.threshold:
                break
        return self.U, self.V


In [0]:
# Distinct item IDs (matrix rows) and user IDs (matrix columns) to train on.
X = u_data_org['item_id'].unique()
Y = u_data_org['user_id'].unique()
# Number of latent factors and maximum number of training passes.
k = 20
steps = 150

mf = MatrixFactorization(R=rating_matrix_item, X=X, Y=Y, k=k, steps=steps)

In [0]:
# Train the model, then rebuild the full item x user matrix of predicted
# ratings from the learned factors.
U, V = mf.fit()
pred_rating = U @ V

In [0]:
# Notebook cell output: display the predicted rating matrix.
pred_rating

### 予測評価値の計算・レコメンド

In [0]:
user_id = 2

# Take this user's PREDICTED scores and zero out the items the user already
# rated in the training data (rating_matrix_train is 1 only for unrated items).
# The scores must come from pred_rating: the training matrix is non-zero only
# on rated items, so masking it with the unrated indicator yields all zeros.
rating_matrix_user = pred_rating[:, user_id - 1]
pred_rating_user_item = rating_matrix_user * rating_matrix_train[:, user_id - 1]

# Evaluate how many of the recommended items appear in the user's test ratings.
# Top-10 row indices by score, converted back to 1-based item IDs.
recommend_list = np.argsort(pred_rating_user_item)[::-1][:10] + 1
purchase_list_user = u_data_test[u_data_test.user_id == user_id].loc[:, 'item_id'].unique()
hits = int(np.isin(recommend_list, purchase_list_user).sum())
# Precision@10.
pre = hits / 10.0

print(pred_rating_user_item)
print('Recommend list:', recommend_list)
print('Test Rated list:', purchase_list_user)
print('Precision:', str(pre))

#### 全体の精度評価

In [0]:
# Compute precision@10 and recall@10 for every user in the test split.
precision_list = []
recall_list = []
user_list_test = u_data_test.sort_values('user_id').user_id.unique()

for user_id in tqdm(user_list_test):
    # Use THIS user's predicted scores (previously a vector computed once for
    # a single user in an earlier cell was reused for everyone) and zero out
    # the items the user already rated in the training data.
    pred_rating_user_item = pred_rating[:, user_id - 1] * rating_matrix_train[:, user_id - 1]

    # Evaluate how many of the recommended items appear in the test ratings.
    # Top-10 row indices by score, converted back to 1-based item IDs.
    recommend_list = np.argsort(pred_rating_user_item)[::-1][:10] + 1
    purchase_list_user = u_data_test[u_data_test.user_id == user_id].loc[:, 'item_id'].unique()
    hits = int(np.isin(recommend_list, purchase_list_user).sum())
    pre = hits / 10.0
    precision_list.append(pre)
    # Every user here comes from the test split, so purchase_list_user is
    # never empty.
    recall_list.append(hits / len(purchase_list_user))


In [0]:
# Overall accuracy check: mean of the per-user precision@10 values.
n_evaluated_users = len(precision_list)
precision = sum(precision_list) / n_evaluated_users
print('Precision:', precision)