In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from surprise import Reader, SVD, KNNBasic, KNNBaseline
from surprise.model_selection import train_test_split
from surprise import accuracy

In [5]:
movies= pd.read_csv("movies.csv")
links= pd.read_csv("links.csv")
ratings= pd.read_csv("ratings.csv")
tags= pd.read_csv("tags.csv")

In [6]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


사용자에 대한 평점 점수 기반 영화 추천 시스템을 구성하고자 하므로, timestamp는 제거하고 userid, movieid, rating 변수를 가진 df을 통해 시스템을 구성한다. surprise는 3개의 column을 가진 df에 대해서 분석을 진행한다.

In [33]:
df=ratings.drop(['timestamp'], axis=1)

In [34]:
df['rating'].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int64  
 1   movieId  100836 non-null  int64  
 2   rating   100836 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


- Reader(rating_scale=(min, max))

reader는 수치의 min, max 값에 맞게 점수를 불러오는 역할을 한다. rating의 min이 0.5, max가 5이므로 이에 맞게 read를 설정한다.

In [36]:
reader = Reader(rating_scale=(0.5,5))

In [37]:
df = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader=reader)

In [38]:
X_train, X_test = train_test_split(df, test_size=0.2)

### model-based collaborative filtering

- SVD

model-based CF에서는 SVD를 활용할 수 있다.

In [23]:
svd = SVD()
svd.fit(X_train)
predictions = svd.test(X_test)

accuracy.rmse(predictions)

RMSE: 0.8751


0.8750790836723363

### memory-based collaborative filtering

- KNNBasic
- KNNBaseline

memory-based CF에서는 KNN 기반 다양한 방식이 있으나 KNNBasic, KNNBaseline 두 가지 방식을 사용해보고자 한다.

In [24]:
sim_options = {'name': 'cosine', 'user_based': True}
knn1 = KNNBasic(sim_options=sim_options)
knn1.fit(X_train)
predictions = knn1.test(X_test)

accuracy.rmse(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9693


0.9692781335949554

In [28]:
sim_options = {'name': 'cosine', 'user_based': False}
knn1 = KNNBasic(sim_options=sim_options)
knn1.fit(X_train)
predictions = knn1.test(X_test)

accuracy.rmse(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9755


0.9755471849717099

In [29]:
sim_options = {'name': 'cosine', 'user_based': True}
knn2 = KNNBaseline(sim_options=sim_options)
knn2.fit(X_train)
predictions = knn2.test(X_test)

accuracy.rmse(predictions)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8748


0.8747621874465136

In [27]:
sim_options = {'name': 'cosine', 'user_based': False}
knn2 = KNNBaseline(sim_options=sim_options)
knn2.fit(X_train)
predictions = knn2.test(X_test)

accuracy.rmse(predictions)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8904


0.890380559243746

In [47]:
predictions

[Prediction(uid=111, iid=7285, r_ui=3.5, est=3.7176885784390747, details={'actual_k': 6, 'was_impossible': False}),
 Prediction(uid=47, iid=724, r_ui=2.0, est=2.608669313727391, details={'actual_k': 22, 'was_impossible': False}),
 Prediction(uid=599, iid=4367, r_ui=2.5, est=2.1407992052731757, details={'actual_k': 31, 'was_impossible': False}),
 Prediction(uid=373, iid=736, r_ui=3.0, est=3.030908505213092, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=105, iid=2858, r_ui=5.0, est=4.786937929570827, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=68, iid=89864, r_ui=3.5, est=3.423940921997052, details={'actual_k': 18, 'was_impossible': False}),
 Prediction(uid=534, iid=2028, r_ui=4.5, est=4.206136090659444, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=357, iid=594, r_ui=4.0, est=3.860446785593035, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=266, iid=2302, r_ui=1.0, est=3.2095943719940774, details={'

이와 같은 결과를 통해, 가장 성능이 좋게 나온 모델은 model based의 SVD 방식이다. 따라서 이를 이용하여 영화를 추천해보고자 한다.

In [39]:
svd.predict(1, 5)

Prediction(uid=1, iid=5, r_ui=None, est=3.5542432047566574, details={'was_impossible': False})

In [44]:
ratings1=ratings[ratings['userId']==1]

In [46]:
movies1=ratings1['movieId'].unique()

In [84]:
score={}
for movie in movies1:
    score[movie]=svd.predict(1, movie)[3]

In [85]:
score

{1: 4.377287329977785,
 3: 4.1656745522232885,
 6: 4.69616321216444,
 47: 4.953826857861943,
 50: 5,
 70: 4.060117898259399,
 101: 4.669395314686718,
 110: 4.821079245931256,
 151: 4.773155150092123,
 157: 4.176642208439544,
 163: 4.394993281837771,
 216: 4.38676859301358,
 223: 4.356854345702416,
 231: 3.9456026176901546,
 235: 4.268588943862005,
 260: 5,
 296: 5,
 316: 4.033609513306394,
 333: 4.688261272060636,
 349: 4.157423101940718,
 356: 4.861589155923328,
 362: 4.403645287424493,
 367: 3.8437649324516316,
 423: 3.6859919412540614,
 441: 4.658963232550143,
 457: 4.89121590477006,
 480: 4.438011540834153,
 500: 3.948970990253836,
 527: 5,
 543: 4.166394478196865,
 552: 3.837187432553861,
 553: 4.782500038735558,
 590: 4.597363133955246,
 592: 4.189731083110617,
 593: 4.937915083090573,
 596: 4.46010580880526,
 608: 4.944113332507,
 648: 3.919096580929617,
 661: 4.3660301772407095,
 673: 3.5108294941243963,
 733: 4.333398577417575,
 736: 3.748654188027664,
 780: 3.939132903330586,

In [86]:
score=sorted(score.items(), key=lambda x: x[1], reverse=True)

In [95]:
score

[(50, 5),
 (260, 5),
 (296, 5),
 (527, 5),
 (923, 5),
 (1136, 5),
 (1210, 5),
 (2329, 5),
 (2571, 5),
 (2959, 5),
 (1196, 4.9974332611934),
 (1617, 4.990983813015399),
 (1213, 4.986944595509914),
 (2580, 4.973796055082709),
 (1408, 4.963520131298954),
 (47, 4.953826857861943),
 (2692, 4.951061562853199),
 (2502, 4.94690682678887),
 (608, 4.944113332507),
 (593, 4.937915083090573),
 (1270, 4.937213214512617),
 (1291, 4.9224550371476985),
 (2529, 4.914065718803264),
 (3578, 4.911821533943429),
 (1089, 4.900469123241392),
 (1220, 4.898920204233712),
 (3703, 4.89715050762988),
 (457, 4.89121590477006),
 (2761, 4.878623137613172),
 (2716, 4.864632704188236),
 (356, 4.861589155923328),
 (1198, 4.849736035431394),
 (1222, 4.848985463445017),
 (1206, 4.842443239038379),
 (919, 4.837906333339878),
 (1954, 4.836167021634149),
 (1208, 4.831188898369543),
 (110, 4.821079245931256),
 (2858, 4.815050246883373),
 (2137, 4.804573727090916),
 (1240, 4.8027679188588595),
 (1080, 4.799682365377007),
 (32

In [97]:
movie_list_n=[]
for i in score[:15]:
    movie_list_n.append(i[0])

In [119]:
movie_list=pd.DataFrame()

for movie in movie_list_n:
    movie_list=pd.concat([movie_list,movies[movies['movieId']==movie]])

In [120]:
movie_list

Unnamed: 0,movieId,title,genres
46,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
461,527,Schindler's List (1993),Drama|War
705,923,Citizen Kane (1941),Drama|Mystery
863,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
911,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
1734,2329,American History X (1998),Crime|Drama
1939,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
2226,2959,Fight Club (1999),Action|Crime|Drama|Thriller
