In [2]:
# -*- coding: utf-8 -*-

%matplotlib inline

import time
import operator

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Data Source : https://grouplens.org/datasets/movielens

rating_file_path = "../data/ml-1m/ratings.dat"
movie_file_path = "../data/ml-1m/movies.dat"
user_file_path = "../data/ml-1m/users.dat"

rating_data = pd.io.parsers.read_csv(rating_file_path, 
                                     names=['user_id', 'movie_id', 'rating', 'time'], delimiter='::')
movie_data = pd.io.parsers.read_csv(movie_file_path, 
                                    names=['movie_id', 'title', 'genre'], delimiter='::')
user_data = pd.io.parsers.read_csv(user_file_path, 
                                   names=['user_id', 'gender', 'age', 'occupation', 'zipcode'], delimiter='::')

In [4]:
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

np.set_printoptions(suppress=True)

reader = Reader(rating_scale=(1, 5))

### 4번 사용자가 아직 안 본 액션 영화들에 대해 평균 평점을 얼마 줄 것인지 출력하는 코드

In [14]:
is_4 = rating_data['user_id'] == 4

user_id4 = rating_data[is_4]
user_id4

Unnamed: 0,user_id,movie_id,rating,time
233,4,3468,5,978294008
234,4,1210,3,978293924
235,4,2951,4,978294282
236,4,1214,4,978294260
237,4,1036,4,978294282
238,4,260,5,978294199
239,4,2028,5,978294230
240,4,480,4,978294008
241,4,1196,2,978294199
242,4,1198,5,978294199


In [21]:
movie_data.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


#### 4번 사용자의 액션 영화 평점 데이터

In [18]:
actionFilter = movie_data['genre'].str.contains("Action")

movie_data_action= movie_data[actionFilter]

merged_movie = user_id4.merge(movie_data_action, on=['movie_id'], how='left')

In [22]:
merged_movie.head()

Unnamed: 0,user_id,movie_id,rating,time,title,genre
0,4,3468,5,978294008,,
1,4,1210,3,978293924,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
2,4,2951,4,978294282,"Fistful of Dollars, A (1964)",Action|Western
3,4,1214,4,978294260,Alien (1979),Action|Horror|Sci-Fi|Thriller
4,4,1036,4,978294282,Die Hard (1988),Action|Thriller


In [24]:
# SVD 라이브러리를 사용하기 위한 학습 데이터를 생성합니다.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(merged_movie[['user_id', 'movie_id', 'rating']], reader)
train_data = data.build_full_trainset()

# SVD 모델을 학습합니다.
train_start = time.time()
model = SVD(n_factors=8,
            lr_all=0.005,
            reg_all=0.02,
            n_epochs=100)
model.fit(train_data)
train_end = time.time()
print("training time of model: %.2f seconds" % (train_end - train_start))

training time of model: 0.00 seconds


In [25]:
# user_id가 4인 유저의 영화 평가 데이터입니다.
target_user_id = 4
target_user_data = rating_data[rating_data['user_id']==target_user_id]
target_user_data.head(5)

Unnamed: 0,user_id,movie_id,rating,time
233,4,3468,5,978294008
234,4,1210,3,978293924
235,4,2951,4,978294282
236,4,1214,4,978294260
237,4,1036,4,978294282


In [26]:
# user_id 4인 유저가 평가한 영화 히스토리 정보를 추출합니다.
target_user_movie_rating_dict = {}

for index, row in target_user_data.iterrows():
    movie_id = row['movie_id']
    target_user_movie_rating_dict[movie_id] = row['rating']
            
print(target_user_movie_rating_dict)

{3468: 5, 1210: 3, 2951: 4, 1214: 4, 1036: 4, 260: 5, 2028: 5, 480: 4, 1196: 2, 1198: 5, 1954: 5, 1097: 4, 3418: 4, 3702: 4, 2366: 4, 1387: 5, 3527: 1, 1201: 5, 2692: 5, 2947: 5, 1240: 5}


In [27]:
# 타겟 유저(user_id가 4인 유저)가 보지 않은 영화 정보를 테스트 데이터로 생성합니다.
test_data = []
for index, row in movie_data.iterrows():
    movie_id = row['movie_id']
    rating = 0
    if movie_id in target_user_movie_rating_dict:
        continue
    test_data.append((target_user_id, movie_id, rating))

In [28]:
# 타겟 유저(user_id가 4인 유저)가 보지 않은 영화 정보를 테스트 데이터로 생성합니다.
test_data = []
for index, row in movie_data.iterrows():
    movie_id = row['movie_id']
    rating = 0
    if movie_id in target_user_movie_rating_dict:
        continue
    test_data.append((target_user_id, movie_id, rating))

In [32]:
# 타겟 유저의 평점 점수를 예측합니다.
target_user_predictions = model.test(test_data)

In [35]:
target_user_predictions

[Prediction(uid=4, iid=1, r_ui=0, est=4.200076451147748, details={'was_impossible': False}),
 Prediction(uid=4, iid=2, r_ui=0, est=4.200076451147748, details={'was_impossible': False}),
 Prediction(uid=4, iid=3, r_ui=0, est=4.200076451147748, details={'was_impossible': False}),
 Prediction(uid=4, iid=4, r_ui=0, est=4.200076451147748, details={'was_impossible': False}),
 Prediction(uid=4, iid=5, r_ui=0, est=4.200076451147748, details={'was_impossible': False}),
 Prediction(uid=4, iid=6, r_ui=0, est=4.200076451147748, details={'was_impossible': False}),
 Prediction(uid=4, iid=7, r_ui=0, est=4.200076451147748, details={'was_impossible': False}),
 Prediction(uid=4, iid=8, r_ui=0, est=4.200076451147748, details={'was_impossible': False}),
 Prediction(uid=4, iid=9, r_ui=0, est=4.200076451147748, details={'was_impossible': False}),
 Prediction(uid=4, iid=10, r_ui=0, est=4.200076451147748, details={'was_impossible': False}),
 Prediction(uid=4, iid=11, r_ui=0, est=4.200076451147748, details={'w

### 4번 사용자는 액션 영화에 평균 4.2점을 줄 것이다. 