In [2]:
import math
from datetime import datetime

import pandas as pd
from pandas import DataFrame
import numpy as np

### Load Data

In [2]:
rating_df = pd.read_csv("./ml-25m/ratings.csv")
movie_df = pd.read_csv("./ml-25m/movies.csv")
genre_df = pd.read_csv("./ml-25m/genome-scores.csv")

In [3]:
rating_df.isnull().sum(), movie_df.isnull().sum(), genre_df.isnull().sum()

(userId       0
 movieId      0
 rating       0
 timestamp    0
 dtype: int64,
 movieId    0
 title      0
 genres     0
 dtype: int64,
 movieId      0
 tagId        0
 relevance    0
 dtype: int64)

### 결측치 확인

In [4]:
rating_df = rating_df.sort_values(['timestamp', 'userId'])

In [None]:
train_df = rating_df[(rating_df['timestamp'] >= 1104505203) & (rating_df['timestamp'] <= 1230735592)]
test_df = rating_df[rating_df['timestamp'] >= 1230735600]

In [None]:
print("Train: ", len(train_df) / (len(train_df) + len(test_df)) * 100) # train data is too small...
print("Test: ", len(test_df) / (len(train_df) + len(test_df)) * 100)

### Pivot Table

In [None]:
# train_df = train_df.set_index("userId")

# Construct rating Matrix
train_df = train_df.groupby(["userId", "movieId"])['rating'].sum().unstack()

# fill 0 value in NaN
train_df = train_df.fillna(0)

train_df.head()

In [71]:
train_data = train_df.values

### matrix decomposition

In [72]:
# U, s, VT = np.linalg.svd(new_train, compute_uv = True)

In [73]:
# r = s.shape[0] - sum(lx < 1.e-8 for lx in s)

In [74]:
# svd를 하고 다시 복원을 하면 빈 값들이 채워진다. 이것을 모델로 만들어서 사용?
# SVD 자체를 numpy로 구현하고 Train data 사용 모델링, 피팅 이후 Test data로 평가...?
# SVD 를 사용 
# 모든 형태의 Matrix에 적용 가능
# U: AAT 의 고유벡터
# V: ATA 의 고유벡터

### r = qTp구현

- Singular Value Decomposition U, s, Vt = svd(new_train)
- new_train data is too sparse
- U: AAT 의 고유벡터
- s: A의 특이값들을 대각항으로 가지는 대각행렬
- VT: ATA의 고유벡터
- s는 eigenvalue에 sqrt값을 취한 값으로 scaling으로 볼수 있음

In [75]:
lr = 0.01
lm = 0.001
epoch = 10

In [76]:
U, s, VT = np.linalg.svd(train_data, compute_uv=True)

In [77]:
r = s.shape[0] - sum(lx < 1.e-8 for lx in s) # s.shape[0] - sum(s < 1.e-8)

sqrt_s = np.array([math.sqrt(a) for a in s]) # np.sqrt(s)

In [78]:
pu = U[:, :r] @ np.diag(sqrt_s[:r])
qi = np.diag(sqrt_s[:r]) @ VT[:r, :]

In [79]:
for _ in range(1, epoch+1):
    R_hat = pu @ qi
    
    err = (train_data - R_hat)**2
    
    new_qi = qi + (lr * ((pu.T @ err) - (lm * qi)))
    new_pu = pu + (lr * ((err @ qi.T) - (lm * pu)))
    
    qi, pu = new_qi, new_pu
    print("epoch : {} / datetime : {} / loss : {}".format(_, datetime.now(), err.sum()))

epoch : 1 / datetime : 2020-09-08 17:20:52.748381 / loss : 2.4571335762598183e-21
epoch : 2 / datetime : 2020-09-08 17:22:33.722755 / loss : 0.024134378354751043
epoch : 3 / datetime : 2020-09-08 17:24:11.701134 / loss : 0.09636490511179578
epoch : 4 / datetime : 2020-09-08 17:25:51.035957 / loss : 0.21592755122088877
epoch : 5 / datetime : 2020-09-08 17:27:28.872396 / loss : 0.38141903381029885
epoch : 6 / datetime : 2020-09-08 17:29:09.964341 / loss : 0.5908709071632912
epoch : 7 / datetime : 2020-09-08 17:30:50.621690 / loss : 0.8418544756489272
epoch : 8 / datetime : 2020-09-08 17:32:33.408689 / loss : 1.131606119496684
epoch : 9 / datetime : 2020-09-08 17:34:09.233265 / loss : 1.4571612698716248
epoch : 10 / datetime : 2020-09-08 17:35:49.500872 / loss : 1.815485721267283


In [80]:
# pu * qi => rating

# numpy로 데이터 읽어오기

In [5]:
data = np.genfromtxt("./ml-25m/ratings.csv", delimiter=",", names=True)

In [6]:
len(data), data, data.shape

(25000095,
 array([(1.00000e+00,   296., 5. , 1.14788004e+09),
        (1.00000e+00,   306., 3.5, 1.14786882e+09),
        (1.00000e+00,   307., 5. , 1.14786883e+09), ...,
        (1.62541e+05, 56176., 2. , 1.24095070e+09),
        (1.62541e+05, 58559., 4. , 1.24095343e+09),
        (1.62541e+05, 63876., 5. , 1.24095252e+09)],
       dtype=[('userId', '<f8'), ('movieId', '<f8'), ('rating', '<f8'), ('timestamp', '<f8')]),
 (25000095,))

In [7]:
# timestamp 기준으로 나누기
train_data = np.array([list(i) for i in data if 1104505203 <= i[3] <= 1230735592])
test_data = np.array([list(i) for i in data if i[3] >= 1230735600])
print(train_data)

[[1.00000000e+00 2.96000000e+02 5.00000000e+00 1.14788004e+09]
 [1.00000000e+00 3.06000000e+02 3.50000000e+00 1.14786882e+09]
 [1.00000000e+00 3.07000000e+02 5.00000000e+00 1.14786883e+09]
 ...
 [1.62521000e+05 5.96150000e+04 3.50000000e+00 1.21933944e+09]
 [1.62521000e+05 5.97250000e+04 4.00000000e+00 1.21933939e+09]
 [1.62521000e+05 6.00740000e+04 4.00000000e+00 1.21933941e+09]]


# 이후로 LightFM or Surprise package 사용해서 구현해보기

In [8]:
user_id = np.unique(np.array([i[0] for i in train_data]))  # (27915,))
movie_id = np.unique(np.array([i[1] for i in train_data]))  # (10533,))
rating = np.array([i[2] for i in train_data])

In [9]:
from collections import defaultdict

rating_dict = defaultdict(dict)
for i in train_data:
    rating_dict[i[0]][i[1]] = i[2]

In [10]:
# movie_id의 id가 각 키의 values에 없다면, movie-id를 추가하고 값을 0으로 넣는다. 이 과정을 거치면 fillna(0)과 같은 효과
# 그리고 추가된 rating_dict에서 key를 row, values flatten하게 쌓으면된다.
# user_id (27915,))
# movie_id (10533,))
for i in range(len(user_id)):
    for j in range(len(movie_id)):
        if movie_id[j] not in rating_dict[user_id[i]].keys():
            rating_dict[user_id[i]][movie_id[j]] = 0.0
        else:
            pass

In [11]:
# # rating_dict의 키(user_id)마다 movie_id를 기준으로 내림차순 정렬, matrix로 재구성
# rating_matrix = []
# for i in range(len(rating_dict)):
#     sorted(rating_dict[user_id[i]])

In [12]:
# rating_dict에서 movie의 개수가 같은 지 확인

len(rating_dict[1])  == len(rating_dict[2]) # 10533

True

In [13]:
rating_dic = {
    'userId': [],
    'movieId': [],
    'rating': []
}

In [14]:
id_list = []
mov_set = set()

In [15]:
for user_key in rating_dict.keys():
    for mov_key in rating_dict[user_key]:
        rating_dic["userId"].append(user_key)
        rating_dic["movieId"].append(mov_key)
        rating_dic["rating"].append(rating_dict[user_key][mov_key])

In [16]:
print(len(rating_dic["userId"]))
print(len(rating_dic["movieId"]))
print(len(rating_dic["rating"]))

294028695
294028695
294028695


In [17]:
train_df = pd.DataFrame(rating_dic)
train_df

Unnamed: 0,userId,movieId,rating
0,1.0,296.0,5.0
1,1.0,306.0,3.5
2,1.0,307.0,5.0
3,1.0,665.0,5.0
4,1.0,899.0,3.5
...,...,...,...
294028690,162521.0,64990.0,0.0
294028691,162521.0,64993.0,0.0
294028692,162521.0,64997.0,0.0
294028693,162521.0,65011.0,0.0


In [None]:
# install surprise: conda install -c conda-forge scikit-surprise
import surprise

reader = surprise.Reader(rating_scale=(1,5))
reader

col_list = ["userId", "movieId", "rating"]

train_data = surprise.Dataset.load_from_df(train_df[col_list], reader)

trainset = train_data.build_full_trainset()

algo = surprise.SVD()
algo.fit(trainset)

# Testset

In [7]:
from collections import defaultdict

test_rating_dict = defaultdict(dict)
for i in test_data:
    test_rating_dict[i[0]][i[1]] = i[2]

In [11]:
for i in range(len(user_id)):
    for j in range(len(movie_id)):
        if movie_id[j] not in test_rating_dict[user_id[i]].keys():
            test_rating_dict[user_id[i]][movie_id[j]] = 0.0
        else:
            pass

In [12]:
test_rating_dic = {
    'userId': [],
    'movieId': [],
    'rating': []
}

id_list = []
mov_set = set()

In [13]:
for user_key in test_rating_dict.keys():
    for mov_key in test_rating_dict[user_key]:
        test_rating_dic["userId"].append(user_key)
        test_rating_dic["movieId"].append(mov_key)
        test_rating_dic["rating"].append(test_rating_dict[user_key][mov_key])

In [None]:
test_df = pd.DataFrame(test_rating_dic)
test_df

In [None]:
test_data = surprise.Dataset.load_from_df(test_df[col_list], reader)
testset = test_data.build_full_trainset()

testset = testset.build_testset()
predictions = algo.test(testset)

from surprise import accuracy
accuracy.rmse(predictions)

In [1]:
test_df

NameError: name 'test_df' is not defined