In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [54]:
# Data preparation
from surprise import Reader, Dataset # classes that configure how rating data is read and wrapped for Surprise


# Load the data and configure how it is read
# Relative paths to the ratings and movie-metadata CSV files.
path = 'data/hide_data/data-files/ml-latest-small/ratings.csv'
path2 = 'data/hide_data/data-files/ml-latest-small/movies.csv'

movies_small = pd.read_csv(path2)
ratings_small = pd.read_csv(path)
# Wrap the (userId, movieId, rating) columns as a Surprise Dataset;
# the Reader declares the rating scale as 0.5 to 5.
data = Dataset.load_from_df(ratings_small[['userId', 'movieId', 'rating']],
                             Reader(rating_scale=(0.5, 5)))

In [55]:
# Prepare the training / test data.
# NOTE: the trainset is built from every rating in `data`, and the testset is
# derived from that same trainset, so predictions on `testset` are in-sample
# (they do not measure generalization).
trainset = data.build_full_trainset()
testset = trainset.build_testset()
# Show the number of test tuples and the first (user, item, rating) entry.
len(testset), testset[0]

(100836, (1, 1, 4.0))

In [56]:
# Train (fit) the model

from surprise import SVD

# SVD configured with 100 factors and 20 epochs; random_state is fixed so
# repeated runs produce the same model.
svd = SVD(n_factors=100, n_epochs=20, random_state= 777)

# Fit on the full trainset built above.
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x18f6fe7acf0>

In [57]:
# Shape (rows, columns) of the ratings rows that belong to user 26.
print(ratings_small[ratings_small['userId'] == 26].shape)

(21, 4)


In [58]:
# Boolean mask selecting the rating rows that belong to user 26.
uid_mask = ratings_small['userId'].eq(26)
# Movies rated by user 26 (not the last expression of the cell, so it is not displayed).
ratings_small.loc[uid_mask, ["userId", "movieId"]]

# (userId, movieId) pairs of every other user, then the number of ratings per movie.
t = ratings_small.loc[~uid_mask, ["userId", "movieId"]]
t['movieId'].value_counts()


movieId
356       328
318       317
296       306
593       278
2571      278
         ... 
86279       1
86922       1
5962        1
87660       1
163981      1
Name: count, Length: 9724, dtype: int64

In [59]:
def select_unrated_movies(ratings, user_id):
    """Return the ids of the movies that ``user_id`` has not rated.

    Args:
        ratings: DataFrame with at least the ``userId`` and ``movieId`` columns.
        user_id: Id of the user whose unrated movies are wanted.

    Returns:
        A list of the distinct ``movieId`` values in ``ratings`` (in order of
        first appearance) that have no rating row for ``user_id``. When the
        user has no ratings at all, every distinct movie id is returned.
    """
    all_movie_ids = ratings['movieId'].unique()  # every distinct movie id
    # Build the mask from the arguments themselves; relying on a global frame
    # or a fixed user id would return the wrong user's movies.
    uid_mask = ratings['userId'] == user_id
    # A set makes each membership check O(1) instead of scanning an array.
    rated_movie_ids = set(ratings.loc[uid_mask, 'movieId'])
    return [mid for mid in all_movie_ids if mid not in rated_movie_ids]

In [60]:
# First 10 movie ids returned by select_unrated_movies for user 26.
print(select_unrated_movies(ratings_small, 26)[:10])

# Metadata row of the movie whose movieId is 1.
print(movies_small[movies_small['movieId'] == 1])

[1, 3, 6, 50, 70, 101, 110, 151, 157, 163]
   movieId             title                                       genres
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy


In [61]:
# Run the fitted model over the testset and show the first 10 predictions.
svd.test(testset)[:10]

[Prediction(uid=1, iid=1, r_ui=4.0, est=4.742410990471974, details={'was_impossible': False}),
 Prediction(uid=1, iid=3, r_ui=4.0, est=4.094687379991476, details={'was_impossible': False}),
 Prediction(uid=1, iid=6, r_ui=4.0, est=4.222221134819293, details={'was_impossible': False}),
 Prediction(uid=1, iid=47, r_ui=5.0, est=4.708502111345704, details={'was_impossible': False}),
 Prediction(uid=1, iid=50, r_ui=5.0, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=70, r_ui=3.0, est=4.070344584650523, details={'was_impossible': False}),
 Prediction(uid=1, iid=101, r_ui=5.0, est=4.873275319245894, details={'was_impossible': False}),
 Prediction(uid=1, iid=110, r_ui=4.0, est=4.525297792765697, details={'was_impossible': False}),
 Prediction(uid=1, iid=151, r_ui=5.0, est=4.6498677143378995, details={'was_impossible': False}),
 Prediction(uid=1, iid=157, r_ui=5.0, est=4.23626838735922, details={'was_impossible': False})]

In [62]:
def get_movie_title(movies, movie_id):
    """Look up the title of the movie whose ``movieId`` equals ``movie_id``.

    Args:
        movies: DataFrame with ``movieId`` and ``title`` columns.
        movie_id: Id of the movie to look up.

    Returns:
        The ``title`` value of the first matching row.

    Raises:
        IndexError: If no row of ``movies`` has the given ``movie_id``.
    """
    matching_titles = movies.loc[movies['movieId'] == movie_id, 'title']
    # Take the first match; an empty selection fails here with IndexError.
    return matching_titles.values[0]

def recommend_movies(ratings, movies, user_id, top_n=10):
    """Recommend the ``top_n`` unrated movies with the highest predicted rating.

    Uses the module-level ``svd`` model to estimate a rating for every movie
    returned by ``select_unrated_movies(ratings, user_id)``.

    Args:
        ratings: Ratings DataFrame passed on to ``select_unrated_movies``.
        movies: Movie-metadata DataFrame passed on to ``get_movie_title``.
        user_id: Id of the user to recommend for.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of ``(movie_id, title)`` tuples ordered by estimated rating,
        highest first.
    """
    candidate_ids = select_unrated_movies(ratings, user_id)

    # Rank all candidates by estimated rating, best first.
    ranked = sorted(
        (svd.predict(user_id, candidate) for candidate in candidate_ids),
        key=lambda pred: pred.est,
        reverse=True,
    )

    recommendations = []
    for pred in ranked[:top_n]:
        recommendations.append((pred.iid, get_movie_title(movies, pred.iid)))
    return recommendations


In [63]:
# Recommendations for user 26, using the default top_n of 10.
recommend_movies(ratings_small, movies_small, 26)

[(608, 'Fargo (1996)'),
 (858, 'Godfather, The (1972)'),
 (750,
  'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)'),
 (50, 'Usual Suspects, The (1995)'),
 (1089, 'Reservoir Dogs (1992)'),
 (112552, 'Whiplash (2014)'),
 (1204, 'Lawrence of Arabia (1962)'),
 (318, 'Shawshank Redemption, The (1994)'),
 (1252, 'Chinatown (1974)'),
 (923, 'Citizen Kane (1941)')]

In [67]:
# Titles of the movies that user 26 has rated: one title Series per rated movie.
uid_mask = ratings_small['userId'].eq(26)
rated_movie_ids = ratings_small.loc[uid_mask, "movieId"]
[movies_small.loc[movies_small["movieId"] == movie_id, "title"] for movie_id in rated_movie_ids]

[9    GoldenEye (1995)
 Name: title, dtype: object,
 32    Babe (1995)
 Name: title, dtype: object,
 43    Seven (a.k.a. Se7en) (1995)
 Name: title, dtype: object,
 123    Apollo 13 (1995)
 Name: title, dtype: object,
 126    Batman Forever (1995)
 Name: title, dtype: object,
 138    Die Hard: With a Vengeance (1995)
 Name: title, dtype: object,
 156    Net, The (1995)
 Name: title, dtype: object,
 176    Waterworld (1995)
 Name: title, dtype: object,
 192    Disclosure (1994)
 Name: title, dtype: object,
 249    Natural Born Killers (1994)
 Name: title, dtype: object,
 257    Pulp Fiction (1994)
 Name: title, dtype: object,
 260    Quiz Show (1994)
 Name: title, dtype: object,
 302    Ace Ventura: Pet Detective (1994)
 Name: title, dtype: object,
 307    Clear and Present Danger (1994)
 Name: title, dtype: object,
 314    Forrest Gump (1994)
 Name: title, dtype: object,
 337    True Lies (1994)
 Name: title, dtype: object,
 378    Cliffhanger (1993)
 Name: title, dtype: object,
 395  