In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Directory holding the MovieLens CSV files (relative to the working directory).
path = '../data/movielens/'

# Load the ratings table; its columns are userId, movieId, rating and timestamp.
ratings_df = pd.read_csv(f'{path}ratings.csv')

In [3]:
# Hold out 20% of the ratings for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    ratings_df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Preview the first rows of the training split (the index is shuffled by the split).
train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
77701,483,8529,4.0,1215545278
94477,599,33437,2.5,1498518389
36246,247,5349,2.0,1467645405
17483,111,7361,3.5,1516140853
100300,610,57504,4.5,1493847901


In [5]:
# Shrink the training data so that training runs and converges quickly:
# keep only the first 1000 rows (by position) of the shuffled training split.
train_df_small = train_df.iloc[:1000]

$$\min \sum_{(u,i)} \left(r_{ui} - \mu - b_i - b_u - \vec{x}_u^T \vec{y}_i\right)^2 + \lambda \left(\|\vec{x}_u\|^2 + \|\vec{y}_i\|^2 + b_i^2 + b_u^2\right)$$

In [6]:
# The same table could also be built with unstack():
# sparse_matrix = train_df.groupby('movieId').apply(lambda x: pd.Series(x['rating'].values, index=x['userId'])).unstack()
# sparse_matrix.index.name = 'movieId'

# Movie x user rating table: one row per movieId, one column per userId.
# (movie, user) pairs that have no rating in the sample are filled with 0.
sparse_matrix = (
    train_df_small
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
    .rename_axis(index='movieId')
)

print(sparse_matrix.shape)
sparse_matrix.head()

(806, 341)


userId,1,4,6,8,10,14,15,18,19,20,...,599,600,601,602,603,605,606,607,608,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Convert the pivoted rating table into a plain NumPy array for the MF model.
# np.asarray yields the same array as DataFrame.to_numpy() on the first run, and
# is a no-op when `sparse_matrix` is already an ndarray, so re-running this cell
# no longer fails with "'numpy.ndarray' object has no attribute 'to_numpy'".
sparse_matrix = np.asarray(sparse_matrix)

# Train MF

In [8]:
import sys

# Root of the RecSys-study checkout that contains the `src` package.
# The original hard-coded location stays as the default so existing setups keep
# working; set the RECSYS_STUDY_ROOT environment variable to point the notebook
# at a checkout in a different location.
module_path = os.environ.get('RECSYS_STUDY_ROOT', '/home/minsoo/Workspace/RecSys-study/')

# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

from src.model.MF import MF

In [9]:
# Build the matrix-factorization model on the movie x user rating array.
# NOTE(review): alpha and beta are presumably the learning rate and the
# regularization weight -- confirm against src/model/MF.py.
mf = MF(
    sparse_matrix,
    latent_dim=50,
    alpha=0.1,
    beta=0.01,
    iters=30,
)

In [10]:
# Fit the model; the training RMSE is printed once per iteration (see the log below).
mf.train()

Iteration: 1; train_rmse = 0.80349
Iteration: 2; train_rmse = 0.68728
Iteration: 3; train_rmse = 0.60118
Iteration: 4; train_rmse = 0.53038
Iteration: 5; train_rmse = 0.46800
Iteration: 6; train_rmse = 0.41223
Iteration: 7; train_rmse = 0.36228
Iteration: 8; train_rmse = 0.31689
Iteration: 9; train_rmse = 0.27620
Iteration: 10; train_rmse = 0.24084
Iteration: 11; train_rmse = 0.21081
Iteration: 12; train_rmse = 0.18530
Iteration: 13; train_rmse = 0.16390
Iteration: 14; train_rmse = 0.14553
Iteration: 15; train_rmse = 0.12975
Iteration: 16; train_rmse = 0.11604
Iteration: 17; train_rmse = 0.10397
Iteration: 18; train_rmse = 0.09342
Iteration: 19; train_rmse = 0.08421
Iteration: 20; train_rmse = 0.07607
Iteration: 21; train_rmse = 0.06892
Iteration: 22; train_rmse = 0.06263
Iteration: 23; train_rmse = 0.05706
Iteration: 24; train_rmse = 0.05211
Iteration: 25; train_rmse = 0.04778
Iteration: 26; train_rmse = 0.04399
Iteration: 27; train_rmse = 0.04057
Iteration: 28; train_rmse = 0.03754
I

In [11]:
# Sanity check: the matrix returned by show_full_matrix() (presumably the model's
# reconstructed ratings -- confirm in src/model/MF.py) has the same shape as the input.
mf.show_full_matrix().shape

(806, 341)