In [1]:
import math
from datetime import datetime

import pandas as pd
from pandas import DataFrame
import numpy as np

### Load Data

In [2]:
rating_df = pd.read_csv("./ml-25m/ratings.csv")
movie_df = pd.read_csv("./ml-25m/movies.csv")
genre_df = pd.read_csv("./ml-25m/genome-scores.csv")

In [3]:
rating_df.isnull().sum(), movie_df.isnull().sum(), genre_df.isnull().sum()

(userId       0
 movieId      0
 rating       0
 timestamp    0
 dtype: int64,
 movieId    0
 title      0
 genres     0
 dtype: int64,
 movieId      0
 tagId        0
 relevance    0
 dtype: int64)

### 결측치 확인

In [4]:
rating_df = rating_df.sort_values(['timestamp', 'userId'])

In [5]:
train_df = rating_df[(rating_df['timestamp'] >= 1104505203) & (rating_df['timestamp'] <= 1230735592)]
test_df = rating_df[rating_df['timestamp'] >= 1230735600]

In [6]:
print("Train: ", len(train_df) / (len(train_df) + len(test_df)) * 100) # train data is too small...
print("Test: ", len(test_df) / (len(train_df) + len(test_df)) * 100)

Train:  28.629003411388805
Test:  71.37099658861119


### Pivot Table

In [7]:
# train_df = train_df.set_index("userId")

# Construct rating Matrix
train_df = train_df.groupby(["userId", "movieId"])['rating'].sum().unstack()

# fill 0 value in NaN
train_df = train_df.fillna(0)

train_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,64957,64969,64976,64983,64986,64990,64993,64997,65011,65025
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,4.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
train_data = train_df.values

### matrix decomposition

In [9]:
# U, s, VT = np.linalg.svd(new_train, compute_uv = True)

In [10]:
# r = s.shape[0] - sum(lx < 1.e-8 for lx in s)

In [11]:
# svd를 하고 다시 복원을 하면 빈 값들이 채워진다. 이것을 모델로 만들어서 사용?
# SVD 자체를 numpy로 구현하고 Train data 사용 모델링, 피팅 이후 Test data로 평가...?
# SVD 를 사용 
# 모든 형태의 Matrix에 적용 가능
# U: AAT 의 고유벡터
# V: ATA 의 고유벡터

### r = qTp구현

- Singular Value Decomposition U, s, Vt = svd(new_train)
- new_train data is too sparse
- U: AAT 의 고유벡터
- s: A의 특이값들을 대각항으로 가지는 대각행렬
- VT: ATA의 고유벡터
- s는 eigenvalue에 sqrt값을 취한 값으로 scaling으로 볼수 있음

In [12]:
lr = 0.01
lm = 0.001
epoch = 10

In [13]:
U, s, VT = np.linalg.svd(train_data, compute_uv=True)

In [14]:
r = s.shape[0] - sum(lx < 1.e-8 for lx in s) # s.shape[0] - sum(s < 1.e-8)

sqrt_s = np.array([math.sqrt(a) for a in s]) # np.sqrt(s)

In [15]:
pu = U[:, :r] @ np.diag(sqrt_s[:r])
qi = np.diag(sqrt_s[:r]) @ VT[:r, :]

In [16]:
for _ in range(1, epoch+1):
    R_hat = pu @ qi
    
    err = (train_data - R_hat)**2
    
    new_qi = qi + (lr * ((pu.T @ err) - (lm * qi)))
    new_pu = pu + (lr * ((err @ qi.T) - (lm * pu)))
    
    qi, pu = new_qi, new_pu
    print("epoch : {} / datetime : {} / loss : {}".format(_, datetime.now(), err.sum()))

epoch : 1 / datetime : 2020-09-07 14:29:17.286627 / loss : 2.494497868739665e-21
epoch : 2 / datetime : 2020-09-07 14:30:50.487606 / loss : 0.02413437835415505
epoch : 3 / datetime : 2020-09-07 14:32:23.943517 / loss : 0.09636490511060801
epoch : 4 / datetime : 2020-09-07 14:33:57.604724 / loss : 0.2159275512191261
epoch : 5 / datetime : 2020-09-07 14:35:31.049074 / loss : 0.38141903380797426
epoch : 6 / datetime : 2020-09-07 14:37:04.486486 / loss : 0.5908709071604229
epoch : 7 / datetime : 2020-09-07 14:38:38.137521 / loss : 0.8418544756455532
epoch : 8 / datetime : 2020-09-07 14:40:11.625870 / loss : 1.131606119492853
epoch : 9 / datetime : 2020-09-07 14:41:45.054392 / loss : 1.4571612698673322
epoch : 10 / datetime : 2020-09-07 14:43:18.526444 / loss : 1.8154857212625617


In [17]:
# pu * qi => rating