# 잠재요인 협업 필터링
 - SVD(singular value decomposition)

In [1]:
import pandas as pd
from google.colab import files
up = files.upload()

Saving movies.csv to movies.csv
Saving ratings.csv to ratings.csv
Saving ratings_noh.csv to ratings_noh.csv
Saving README.txt to README.txt
Saving tags.csv to tags.csv


In [3]:
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
ratings.shape

(100836, 4)

In [5]:
# 사용자수, 영화수
ratings.userId.nunique(), ratings.movieId.nunique()

(610, 9724)

In [6]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 6.9 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1630154 sha256=d7e9669e92553ceb4a2b3fafe140eaf6caaa1a500559715ff19da6a206fd2f3e
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [7]:
from surprise import SVD, Reader
from surprise.dataset import DatasetAutoFolds

reader = Reader(line_format = 'user item rating timestamp', sep = ',', rating_scale = (0.5,5))
data_folds = DatasetAutoFolds('ratings_noh.csv',reader = reader)

In [8]:
# 전체 데이터를 학습 데이터로 사용
trainset = data_folds.build_full_trainset()

In [9]:
type(trainset)

surprise.trainset.Trainset

In [10]:
dir(trainset)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_global_mean',
 '_inner2raw_id_items',
 '_inner2raw_id_users',
 '_raw2inner_id_items',
 '_raw2inner_id_users',
 'all_items',
 'all_ratings',
 'all_users',
 'build_anti_testset',
 'build_testset',
 'global_mean',
 'ir',
 'knows_item',
 'knows_user',
 'n_items',
 'n_ratings',
 'n_users',
 'rating_scale',
 'to_inner_iid',
 'to_inner_uid',
 'to_raw_iid',
 'to_raw_uid',
 'ur']

In [15]:
# 모델 생성 및 학습
model = SVD(n_epochs=20,n_factors=50,random_state=2022)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f6f0bac4990>

- 사용자 ID: 9, 영화 ID: 42(dead presidents(1995))

In [16]:
# 영화 정보
mdf = pd.read_csv('movies.csv')
mdf.head(40).tail(5)

Unnamed: 0,movieId,title,genres
35,39,Clueless (1995),Comedy|Romance
36,40,"Cry, the Beloved Country (1995)",Drama
37,41,Richard III (1995),Drama|War
38,42,Dead Presidents (1995),Action|Crime|Drama
39,43,Restoration (1995),Drama


In [17]:

# 사용자 9번이 42번 영화를 보았는지 확인
movieIds = ratings[ratings.userId == 9]['movieId']
movieIds[movieIds == 42].count()

0

In [21]:
# 사용자 9번의 42번 영화에 대한 예상 평점
# model.predict(uid, mid) uid,mid는 스트링으로만
uid, mid = str(9), str(42)
pred = model.predict(uid,mid,verbose=True)

user: 9          item: 42         r_ui = None   est = 3.25   {'was_impossible': False}


In [22]:
pred

Prediction(uid='9', iid='42', r_ui=None, est=3.249924377339538, details={'was_impossible': False})

- 사용자 9번이 보지 않은 영화중에서 예상점수가 가장 높은 Top 10

In [23]:
seen_movies = ratings[ratings.userId == 9]['movieId'].tolist()
total_movies = mdf.movieId.tolist()
unseen_movies= [movie for movie in total_movies if movie not in seen_movies]
len(seen_movies), len(unseen_movies)

(46, 9696)

In [24]:
uid = str(9)
# predictions = [model.predict(uid, str(mid)) for mid in unseen_movies]
predictions = []
for mid in unseen_movies:
  pred = model.predict(uid, str(mid))
  predictions.append(pred)
predictions[:5]

[Prediction(uid='9', iid='1', r_ui=None, est=3.702922347424712, details={'was_impossible': False}),
 Prediction(uid='9', iid='2', r_ui=None, est=3.2274451421980412, details={'was_impossible': False}),
 Prediction(uid='9', iid='3', r_ui=None, est=3.0342513115122123, details={'was_impossible': False}),
 Prediction(uid='9', iid='4', r_ui=None, est=2.661778597408914, details={'was_impossible': False}),
 Prediction(uid='9', iid='5', r_ui=None, est=2.689490348191407, details={'was_impossible': False})]

In [25]:
def sortkey_est(pred):
  return pred.est

In [26]:
predictions.sort(key=sortkey_est, reverse = True)
predictions[:5]

[Prediction(uid='9', iid='318', r_ui=None, est=4.070330794979969, details={'was_impossible': False}),
 Prediction(uid='9', iid='1217', r_ui=None, est=4.063731956995097, details={'was_impossible': False}),
 Prediction(uid='9', iid='1261', r_ui=None, est=4.051908410348554, details={'was_impossible': False}),
 Prediction(uid='9', iid='1204', r_ui=None, est=4.0227662213503805, details={'was_impossible': False}),
 Prediction(uid='9', iid='3275', r_ui=None, est=4.011500870494226, details={'was_impossible': False})]

In [32]:
top_movie_ids = [int(pred.iid) for pred in predictions[:10]]
top_movie_ratings = [pred.est for pred in predictions[:10]]
top_movie_titles = [mdf[mdf.movieId == mid]['title'] for mid in top_movie_ids]

In [33]:
top_df = pd.DataFrame({
    '영화명': top_movie_titles,
    '에상평점': top_movie_ratings
})
top_df.index.name = 'mdf_index'
top_df

Unnamed: 0_level_0,영화명,에상평점
mdf_index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"277 Shawshank Redemption, The (1994) Name: ...",4.070331
1,"918 Ran (1985) Name: title, dtype: object",4.063732
2,960 Evil Dead II (Dead by Dawn) (1987) Name...,4.051908
3,"906 Lawrence of Arabia (1962) Name: title, ...",4.022766
4,"2462 Boondock Saints, The (2000) Name: titl...",4.011501
5,3622 Amelie (Fabuleux destin d'Amélie Poula...,3.999696
6,"9071 Spotlight (2015) Name: title, dtype: o...",3.985348
7,"1258 Boogie Nights (1997) Name: title, dtyp...",3.979885
8,"46 Usual Suspects, The (1995) Name: title, ...",3.978625
9,"680 Philadelphia Story, The (1940) Name: ti...",3.978415


In [35]:
mdf[mdf.movieId == 1217]

Unnamed: 0,movieId,title,genres
918,1217,Ran (1985),Drama|War
