# Matrix Factorization 구현

## Import module

In [19]:
# install surprise: conda install -c conda-forge scikit-surprise
import os
import numpy as np
import pandas as pd
import surprise

## Load Data

In [20]:
# rating_df
rating_df = pd.read_csv("ml-25m/ratings.csv")

# movie_csv
movie_df = pd.read_csv("ml-25m/movies.csv")

## preprocessing

In [21]:
del movie_df["genres"]  # 필요 없는 genres를 제거
movie_df

merge_df = pd.merge(rating_df, movie_df, on="movieId", how="outer")

merge_df # 25003471

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1.0,296,5.0,1.147880e+09,Pulp Fiction (1994)
1,3.0,296,5.0,1.439474e+09,Pulp Fiction (1994)
2,4.0,296,4.0,1.573939e+09,Pulp Fiction (1994)
3,5.0,296,4.0,8.307862e+08,Pulp Fiction (1994)
4,7.0,296,4.0,8.354447e+08,Pulp Fiction (1994)
...,...,...,...,...,...
25003466,,208411,,,Eternal Blood (2002)
25003467,,208413,,,Big Business (1929)
25003468,,208415,,,The Student of Prague (1926)
25003469,,208655,,,The Coldest Game (2019)


In [22]:
del merge_df["title"]
merge_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1.0,296,5.0,1.147880e+09
1,3.0,296,5.0,1.439474e+09
2,4.0,296,4.0,1.573939e+09
3,5.0,296,4.0,8.307862e+08
4,7.0,296,4.0,8.354447e+08
...,...,...,...,...
25003466,,208411,,
25003467,,208413,,
25003468,,208415,,
25003469,,208655,,


## 결측치 확인

In [23]:
# 결측치의 수가 같다.
merge_df.isnull().sum()

merge_df = merge_df.dropna(0)
merge_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1.0,296,5.0,1.147880e+09
1,3.0,296,5.0,1.439474e+09
2,4.0,296,4.0,1.573939e+09
3,5.0,296,4.0,8.307862e+08
4,7.0,296,4.0,8.354447e+08
...,...,...,...,...
25000090,162358.0,200192,2.0,1.553453e+09
25000091,162358.0,200194,2.0,1.553454e+09
25000092,162386.0,139970,3.5,1.549216e+09
25000093,162386.0,200726,4.0,1.554651e+09


In [24]:
train_df = merge_df[(merge_df['timestamp'] >= 1104505203) & (merge_df['timestamp'] <= 1230735592)]
test_df = merge_df[merge_df['timestamp'] >= 1230735600]

reader = surprise.Reader(rating_scale=(1,5))
reader

<surprise.reader.Reader at 0x7f8bf1e94450>

In [39]:
# timestamp 열 제거
train_time =  train_df["timestamp"]
test_time = test_df["timestamp"]

## Train Model using Surprise package

In [26]:
col_list = ["userId", "movieId", "rating"]

train_data = surprise.Dataset.load_from_df(train_df[col_list], reader)

trainset = train_data.build_full_trainset()

algo = surprise.SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f8bf48e0810>

In [27]:
test_data = surprise.Dataset.load_from_df(test_df[col_list], reader)
testset = test_data.build_full_trainset()

testset = testset.build_testset()
predictions = algo.test(testset)
from surprise import accuracy
accuracy.rmse(predictions)

RMSE: 0.9985


0.998478950488254

In [50]:
result_df = pd.DataFrame(predictions)
test_df = pd.DataFrame(test_time)

result_df = pd.concat([result_df, test_df], axis=1).dropna()
del result_df["details"]
result_df.columns = ["userId", "movieId", "rating", "predicted rating", "timestamp"]

result_df = result_df.reset_index()
del result_df["index"]

## 최종 산출물

In [55]:
result_df

Unnamed: 0,userId,movieId,rating,predicted rating,timestamp
0,3.0,1217.0,5.0,3.982777,1.439474e+09
1,3.0,1653.0,5.0,3.738982,1.573939e+09
2,3.0,6711.0,4.0,3.646257,1.238030e+09
3,3.0,7361.0,4.5,3.922553,1.506209e+09
4,3.0,260.0,4.0,3.924778,1.466675e+09
...,...,...,...,...,...
5827742,81068.0,36850.0,4.5,3.425707,1.363294e+09
5827743,81068.0,31502.0,3.0,3.425707,1.315041e+09
5827744,148156.0,31042.0,5.0,3.399324,1.538285e+09
5827745,86661.0,67429.0,5.0,3.471926,1.446835e+09
