[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/RecommenderSystems/blob/main/chapter5/colab/MF.ipynb)

# 행렬 분해(Matrix Factorization, MF)

In [1]:
# Colab notebook. This single notebook covers everything from downloading the data to producing recommendations (evaluation of the predictions is not included).
# If the MovieLens data has not been downloaded yet, run this cell to download it.
# See data_download.ipynb for an analysis of the MovieLens data.

# Download and extract the data into ../data
# NOTE(review): --no-check-certificate turns off TLS certificate verification for this download — confirm it is still needed.
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
!unzip -n ../data/ml-10m.zip -d ../data/

--2022-12-27 05:35:29--  https://files.grouplens.org/datasets/movielens/ml-10m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 65566137 (63M) [application/zip]
Saving to: ‘../data/ml-10m.zip’


2022-12-27 05:35:30 (63.1 MB/s) - ‘../data/ml-10m.zip’ saved [65566137/65566137]

Archive:  ../data/ml-10m.zip
   creating: ../data/ml-10M100K/
  inflating: ../data/ml-10M100K/allbut.pl  
  inflating: ../data/ml-10M100K/movies.dat  
  inflating: ../data/ml-10M100K/ratings.dat  
  inflating: ../data/ml-10M100K/README.html  
  inflating: ../data/ml-10M100K/split_ratings.sh  
  inflating: ../data/ml-10M100K/tags.dat  


In [2]:
# Load the MovieLens data (the files are large, so loading can take a while)
import pandas as pd

# movies.dat is read into the columns movie_id, title and genre.
# The multi-character separator '::' is why the python parsing engine is requested.
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    '../data/ml-10M100K/movies.dat',
    sep='::',
    names=m_cols,
    encoding='latin-1',
    engine='python',
)

# Turn the '|'-joined genre string into a list of genres
movies['genre'] = movies['genre'].apply(lambda genres: genres.split('|'))

# tags.dat holds the tags that users attached to movies
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv(
    '../data/ml-10M100K/tags.dat',
    sep='::',
    names=t_cols,
    engine='python',
)

# Lower-case every tag
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()

# Collect the tags of each movie into one list per movie_id
movie_tags = user_tagged_movies.groupby('movie_id').agg({'tag': list})

# Attach the tag lists to the movies; the left join keeps movies that have no tags
movies = movies.merge(movie_tags, how='left', on='movie_id')

# ratings.dat is read into the columns user_id, movie_id, rating and timestamp
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '../data/ml-10M100K/ratings.dat',
    sep='::',
    names=r_cols,
    engine='python',
)

# The data set is large, so restrict the experiment to the 1000 smallest user ids
valid_user_ids = sorted(ratings['user_id'].unique())[:1000]
ratings = ratings[ratings['user_id'].isin(valid_user_ids)]

# Join each remaining rating with the metadata of the rated movie
movielens = pd.merge(ratings, movies, on='movie_id')

print(f'unique_users={len(movielens.user_id.unique())}, unique_movies={len(movielens.movie_id.unique())}')

# Split the data into a training set and an evaluation (test) set.
# For every user the 5 most recently rated movies are held out for evaluation;
# all remaining ratings are used for training.
# Step 1: rank each user's ratings by timestamp, newest first, starting at 1.
movielens['timestamp_rank'] = (
    movielens
    .groupby('user_id')['timestamp']
    .rank(method='first', ascending=False)
)

# Step 2: ranks 1..5 form the test set, everything after that the training set.
movielens_test = movielens.loc[movielens['timestamp_rank'].le(5)]
movielens_train = movielens.loc[movielens['timestamp_rank'].gt(5)]

unique_users=1000, unique_movies=6736


In [3]:
# --- Data filtering ---
# Minimum number of ratings a movie needs in the training data to be kept
minimum_num_rating = 100

# --- Model settings (passed to Surprise's SVD) ---
n_epochs = 50      # number of training epochs
lr_all = 0.005     # learning rate
factors = 5        # number of latent factors
use_biase = False  # whether to use bias terms

In [4]:
# Install Surprise. The library is published as scikit-surprise; the `surprise` package on PyPI
# is only a tiny wrapper that pulls scikit-surprise in, so install the real distribution directly.
# %pip (instead of !pip) installs into the environment of the running kernel.
# The recorded run of this notebook used scikit-surprise 1.1.3.
%pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 4.8 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=2626477 sha256=a922027e3c7f6d8735b7192824bc8d1a46af450d003dde385ce514ec0ae33d2d
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [5]:
from surprise import SVD, Reader
from surprise import Dataset as SurpriseDataset

# Keep only the movies that have at least minimum_num_rating ratings in the training data
filtered_movielens_train = movielens_train.groupby("movie_id").filter(
    lambda movie_rows: len(movie_rows) >= minimum_num_rating
)

# Convert to Surprise's format: the Reader declares the rating scale (0.5 to 5) and the
# data frame supplies the user_id, movie_id and rating columns, in that order.
reader = Reader(rating_scale=(0.5, 5))
data_train = SurpriseDataset.load_from_df(
    filtered_movielens_train.loc[:, ["user_id", "movie_id", "rating"]],
    reader,
).build_full_trainset()

In [6]:
# Train matrix factorization with Surprise.
# The class is named SVD, but it performs matrix factorization, not a singular value decomposition.
# random_state seeds the random initialisation of the factors so that a fresh
# Restart & Run All reproduces the same model and the same recommendations.
matrix_factorization = SVD(
    n_factors=factors,
    n_epochs=n_epochs,
    lr_all=lr_all,
    biased=use_biase,
    random_state=0,
)
matrix_factorization.fit(data_train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9e52285ac0>

In [7]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return, for every user, the ids of the n items with the highest estimated rating.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``;
            only ``uid``, ``iid`` and ``est`` are used.
        n: maximum number of items to keep per user.

    Returns:
        A ``defaultdict(list)`` mapping each uid to a list of at most ``n``
        item ids, ordered by estimated rating from highest to lowest.
    """
    # Gather the (item, estimate) pairs of each user
    scored_by_user = defaultdict(list)
    for user, item, _true_rating, estimate, _details in predictions:
        scored_by_user[user].append((item, estimate))

    # Rank each user's items by estimate (highest first) and keep the first n item ids
    for user in scored_by_user:
        ranked = sorted(scored_by_user[user], key=lambda pair: pair[1], reverse=True)
        scored_by_user[user] = [item for item, _ in ranked[:n]]

    return scored_by_user

In [8]:
# Candidate pairs: the user/item combinations that do not appear in the training data
data_test = data_train.build_anti_testset(fill=None)

# Score every candidate pair, then keep the 10 best-scoring items per user
predictions = matrix_factorization.test(data_test)
pred_user2items = get_top_n(predictions, n=10)
pred_user2items

defaultdict(list,
            {1: [110, 260, 590, 733, 802, 858, 1073, 1210, 1148, 1246],
             22: [318, 260, 608, 1196, 912, 1234, 1213, 858, 904, 2959],
             26: [260, 1210, 318, 527, 1196, 50, 593, 1234, 3578, 1704],
             30: [1148, 912, 457, 3578, 1234, 2762, 1704, 3147, 1233, 2324],
             34: [2324, 1148, 1234, 1207, 4306, 1233, 904, 497, 1247, 17],
             38: [1380, 50, 920, 1036, 1234, 1210, 2324, 2081, 165, 589],
             50: [260, 1207, 904, 2858, 1148, 4993, 1704, 5952, 912, 1233],
             53: [318, 1233, 858, 912, 1193, 904, 2019, 4973, 4226, 1234],
             54: [1210, 1380, 3578, 260, 1196, 1704, 527, 2324, 1234, 1198],
             67: [2959, 1234, 1089, 858, 1213, 1036, 912, 4226, 1221, 1201],
             71: [318, 260, 50, 593, 912, 1234, 527, 1196, 858, 904],
             76: [1721, 597, 587, 3578, 356, 539, 780, 500, 527, 786],
             88: [318, 1036, 50, 589, 1234, 3147, 2000, 457, 593, 733],
             89: [59

In [9]:
# Movies that the user with user_id=2 rated in the training data
movielens_train.loc[movielens_train["user_id"] == 2]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
6150,2,648,2.0,868244699,Mission: Impossible (1996),"[Action, Adventure, Mystery, Thriller]","[confusing, confusing plot, memorable sequence...",12.0
6531,2,733,3.0,868244562,"Rock, The (1996)","[Action, Adventure, Thriller]","[gfei own it, alcatraz, nicolas cage, sean con...",18.0
6813,2,736,3.0,868244698,Twister (1996),"[Action, Adventure, Romance, Thriller]","[disaster, disaster, storm, bill paxton, helen...",13.0
7113,2,780,3.0,868244698,Independence Day (a.k.a. ID4) (1996),"[Action, Adventure, Sci-Fi, War]","[action, alien invasion, aliens, will smith, a...",14.0
7506,2,786,3.0,868244562,Eraser (1996),"[Action, Drama, Thriller]","[arnold schwarzenegger, action, arnold, arnold...",19.0
7661,2,802,2.0,868244603,Phenomenon (1996),"[Drama, Romance]","[interesting concept, own, john travolta, john...",15.0
7779,2,858,2.0,868245645,"Godfather, The (1972)","[Crime, Drama]","[oscar (best picture), marlon brando, classic,...",9.0


In [10]:
# Recommendations for user_id=2 (3578, 2571, 2959)
# NOTE(review): the ids are hard-coded; presumably they were taken from pred_user2items[2] of an earlier run — confirm.
movies.loc[movies["movie_id"].isin([3578, 2571, 2959])]

Unnamed: 0,movie_id,title,genre,tag
2487,2571,"Matrix, The (1999)","[Action, Sci-Fi, Thriller]","[artificial intelligence, hackers, heroine in ..."
2874,2959,Fight Club (1999),"[Action, Crime, Drama, Thriller]","[based on a book, chuck palahniuk, edward nort..."
3489,3578,Gladiator (2000),"[Action, Adventure, Drama]","[revenge, crowe's best, gfei own it, dvd, glad..."
