In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

module_path = '/home/minsoo/Workspace/RecSys-study/fastcampus_lecture/'
import sys
sys.path.append(module_path)
from fcRecSys.utils import cos_sim_matrix

In [2]:
path = '../data/movielens/'

ratings_df = pd.read_csv(path + 'ratings.csv')

In [3]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=0)

# Sparse Matrix 만들기

In [4]:
# unstack()을 이용해서 만들어도 된다.
# sparse_matrix = train_df.groupby('movieId').apply(lambda x: pd.Series(x['rating'].values, index=x['userId'])).unstack()
# sparse_matrix.index.name = 'movieId'

sparse_matrix = train_df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
)
sparse_matrix.index.name = 'movieId'

print(sparse_matrix.shape)
sparse_matrix.head()

(8975, 610)


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,,,4.5,,,,...,4.0,,4.0,3.0,,2.5,4.0,2.5,3.0,
2,,,,,,4.0,,4.0,,,...,,,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [5]:
# fillna

# 영화별 평균
sparse_matrix_fillna_movie = sparse_matrix.apply(lambda x: x.fillna(x.mean()), axis=1)
# 유저별 평균
sparse_matrix_fillna_user = sparse_matrix.apply(lambda x: x.fillna(x.mean()), axis=0)

In [6]:
sparse_matrix_fillna_user.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,3.903846,2.483333,3.558824,3.647059,3.501976,4.5,3.560976,3.147059,3.306723,...,4.0,3.275229,4.0,3.0,3.204918,2.5,4.0,2.5,3.0,3.679612
2,4.346369,3.903846,2.483333,3.558824,3.647059,4.0,3.25641,4.0,3.147059,3.306723,...,4.447059,3.275229,3.511873,5.0,3.5,3.650219,3.802548,2.0,3.285714,3.679612
3,4.0,3.903846,2.483333,3.558824,3.647059,5.0,3.25641,3.560976,3.147059,3.306723,...,4.447059,3.275229,3.511873,3.518987,3.204918,3.650219,3.802548,3.153214,3.285714,3.679612
4,4.346369,3.903846,2.483333,3.558824,3.647059,3.0,3.25641,3.560976,3.147059,3.306723,...,4.447059,3.275229,3.511873,3.518987,3.204918,3.650219,3.802548,3.153214,3.285714,3.679612
5,4.346369,3.903846,2.483333,3.558824,3.647059,5.0,3.25641,3.560976,3.147059,3.306723,...,4.447059,3.275229,3.511873,3.0,3.204918,3.650219,3.802548,3.153214,3.285714,3.679612


# MF with SVD

In [9]:
def MF_SVD(sparse_matrix, k=300):
    # svd 진행
    u, s, vh = np.linalg.svd(sparse_matrix.transpose())

    # k dim을 이용
    S = s[:k] * np.identity(k, float)
    T = u[:, :k]
    Dt = vh[:k, :]

    # latent vector 구하기
    item_factors = np.transpose(np.matmul(S, Dt))
    user_factors = np.transpose(T)

    return item_factors, user_factors

In [10]:
# 영화별 평균

item_factors_movie, user_factors_movie = MF_SVD(sparse_matrix_fillna_movie)

print(item_factors_movie.shape)
print(user_factors_movie.shape)

prediction_result_df_movie = pd.DataFrame(
    np.matmul(item_factors_movie, user_factors_movie),
    columns=sparse_matrix_fillna_movie.columns,
    index=sparse_matrix_fillna_movie.index
).transpose()

prediction_result_df_movie.head()

(8975, 300)
(300, 610)


movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.994276,3.397441,3.986413,2.247248,3.145099,3.977196,3.266362,2.851492,3.500398,3.519276,...,4.49996,3.499969,2.999973,3.999964,3.499969,3.999964,3.499969,3.499969,3.499969,3.999964
2,3.953098,3.444153,3.225777,2.236702,3.102533,4.000321,3.135706,2.842163,3.482881,3.50455,...,4.499584,3.499676,2.999722,3.99963,3.499676,3.99963,3.499676,3.499676,3.499676,3.99963
3,3.931089,3.408511,3.245064,2.254837,3.173535,3.999518,3.130999,2.844971,3.479073,3.473568,...,4.500282,3.500219,3.000188,4.00025,3.500219,4.00025,3.500219,3.500219,3.500219,4.00025
4,3.917234,3.42179,3.209758,2.245421,3.176535,4.022505,3.197609,2.848334,3.501182,3.507897,...,4.499849,3.499882,2.999899,3.999865,3.499882,3.999865,3.499882,3.499882,3.499882,3.999865
5,4.090706,3.359531,3.20626,2.183866,3.228734,4.121995,3.184014,2.871495,3.530002,3.297313,...,4.499557,3.499656,2.999705,3.999606,3.499656,3.999606,3.499656,3.499656,3.499656,3.999606


In [12]:
# 유저별 평균

item_factors_user, user_factors_user = MF_SVD(sparse_matrix_fillna_user)

print(item_factors_user.shape)
print(user_factors_user.shape)

prediction_result_df_user = pd.DataFrame(
    np.matmul(item_factors_user, user_factors_user),
    columns=sparse_matrix_fillna_user.columns,
    index=sparse_matrix_fillna_user.index
).transpose()

prediction_result_df_user.head()

(8975, 300)
(300, 610)


movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.014089,4.418084,3.894489,4.343368,4.387456,3.978776,4.344624,4.346252,4.369311,4.297153,...,4.345739,4.346569,4.346984,4.346154,4.346569,4.346154,4.346569,4.346569,4.346569,4.346118
2,3.951583,3.896656,3.878916,3.896506,3.886701,3.909773,3.857795,3.896745,3.905299,3.971618,...,3.914722,3.9004,3.893239,3.907561,3.9004,3.907561,3.9004,3.9004,3.9004,3.902304
3,2.485939,2.482545,2.482327,2.476857,2.490145,2.464956,2.47968,2.479189,2.464461,2.500369,...,2.482932,2.48346,2.483724,2.483196,2.48346,2.483196,2.48346,2.48346,2.48346,2.481393
4,3.537109,3.555453,3.546508,3.594289,3.576493,3.579439,3.559048,3.570563,3.5469,3.548544,...,3.560067,3.55843,3.557611,3.559248,3.55843,3.559248,3.55843,3.55843,3.55843,3.559037
5,3.836411,3.643894,3.703015,3.496509,3.700146,3.693009,3.640583,3.696027,3.621526,3.616211,...,3.650031,3.646118,3.644161,3.648074,3.646118,3.648074,3.646118,3.646118,3.646118,3.647456


In [17]:
def evaluate(test_df, prediction_result_df):
  groups_with_movie_ids = test_df.groupby(by='movieId')
  groups_with_user_ids = test_df.groupby(by='userId')
  intersection_movie_ids = sorted(list(set(list(prediction_result_df.columns)).intersection(set(list(groups_with_movie_ids.indices.keys())))))
  intersection_user_ids = sorted(list(set(list(prediction_result_df.index)).intersection(set(groups_with_user_ids.indices.keys()))))

  num_intersec_movie = len(intersection_movie_ids)
  num_intersec_user  = len(intersection_user_ids)
  print(f'겹치는 영화 수: {num_intersec_movie}')
  print(f'겹치는 유저 수: {num_intersec_user}')

  compressed_prediction_df = prediction_result_df.loc[intersection_user_ids][intersection_movie_ids]

  # test_df에 대해서 RMSE 계산
  # 겹치는 영화와 유저에 대해서만 RMSE 계산하는 것
  grouped = test_df.groupby(by='userId')
  rmse_df = pd.DataFrame(columns=['rmse'])
  for userId, group in tqdm(grouped):
      if userId in intersection_user_ids:
          pred_ratings = compressed_prediction_df.loc[userId][compressed_prediction_df.loc[userId].index.intersection(list(group['movieId'].values))]
          pred_ratings = pred_ratings.to_frame(name='rating').reset_index().rename(columns={'index':'movieId','rating':'pred_rating'})
          actual_ratings = group[['rating', 'movieId']].rename(columns={'rating':'actual_rating'})

          final_df = pd.merge(actual_ratings, pred_ratings, how='inner', on=['movieId'])
          final_df = final_df.round(4)
          
          if not final_df.empty:
            rmse = np.sqrt(mean_squared_error(final_df['actual_rating'], final_df['pred_rating']))
            rmse_df.loc[userId] = rmse

  return final_df, rmse_df

In [20]:
result_df_movie, _ = evaluate(test_df, prediction_result_df_movie)
print(result_df_movie.head())
print("For fillna_movie matrix")
print(f"    RMSE: {np.sqrt(mean_squared_error(result_df_movie['actual_rating'].values, result_df_movie['pred_rating'].values)):.2f}")

겹치는 영화 수: 4374
겹치는 유저 수: 608


100%|██████████| 608/608 [00:02<00:00, 208.50it/s]

   actual_rating  movieId  pred_rating
0            4.0    53953       3.0672
1            3.5     5025       2.5771
2            2.5    32011       2.1674
3            4.5   116668       3.4999
4            3.5     6281       3.0786
For fillna_movie matrix
    RMSE: 0.76





In [21]:
result_df_user, _ = evaluate(test_df, prediction_result_df_user)
print(result_df_user.head())
print("For fillna_user matrix")
print(f"    RMSE: {np.sqrt(mean_squared_error(result_df_user['actual_rating'].values, result_df_user['pred_rating'].values)):.2f}")

겹치는 영화 수: 4374
겹치는 유저 수: 608


100%|██████████| 608/608 [00:02<00:00, 209.55it/s]

   actual_rating  movieId  pred_rating
0            4.0    53953       3.6745
1            3.5     5025       3.6777
2            2.5    32011       3.6794
3            4.5   116668       3.6796
4            3.5     6281       3.6756
For fillna_user matrix
    RMSE: 0.84





# 파이썬 라이브러리
- `implicit` 써볼까