[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/RecommenderSystems/blob/main/chapter5/colab/IMF.ipynb)

# 암묵적 행렬 분해(Implicit Matrix Factorization, IMF)

In [1]:
# This notebook is meant for Colab. It is self-contained: it covers everything from downloading the data to producing recommendations (evaluation of the predictions is not included).
# If the MovieLens data has not been downloaded yet, run this cell to download it.
# For an analysis of the MovieLens data, see data_download.ipynb.

# Download the archive into ../data and unzip it there.
# `wget -nc` skips the download when the file already exists; `unzip -n` never overwrites existing files.
# NOTE(review): `--no-check-certificate` disables TLS certificate verification for this download — confirm it is still required.
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
!unzip -n ../data/ml-10m.zip -d ../data/

--2022-12-27 05:12:10--  https://files.grouplens.org/datasets/movielens/ml-10m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 65566137 (63M) [application/zip]
Saving to: ‘../data/ml-10m.zip’


2022-12-27 05:12:18 (10.3 MB/s) - ‘../data/ml-10m.zip’ saved [65566137/65566137]

Archive:  ../data/ml-10m.zip
   creating: ../data/ml-10M100K/
  inflating: ../data/ml-10M100K/allbut.pl  
  inflating: ../data/ml-10M100K/movies.dat  
  inflating: ../data/ml-10M100K/ratings.dat  
  inflating: ../data/ml-10M100K/README.html  
  inflating: ../data/ml-10M100K/split_ratings.sh  
  inflating: ../data/ml-10M100K/tags.dat  


In [2]:
# Load the MovieLens data (the dataset is large, so loading may take a while)
import pandas as pd

# movies.dat is read as '::'-separated movie ID, title and genre fields
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    '../data/ml-10M100K/movies.dat',
    sep='::',
    names=m_cols,
    encoding='latin-1',
    engine='python',
)

# Turn the '|'-separated genre string into a list of genres
movies['genre'] = movies['genre'].map(lambda genres: genres.split('|'))

# Load the tags that users attached to movies
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv(
    '../data/ml-10M100K/tags.dat',
    sep='::',
    names=t_cols,
    engine='python',
)

# Lowercase every tag
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()

# Collect the tags of each movie into a single list per movie
movie_tags = user_tagged_movies.groupby('movie_id').agg({'tag': list})

# Attach the tag lists to the movies; the left join keeps movies that have no tags
movies = pd.merge(movies, movie_tags, how='left', on='movie_id')

# Load the rating data
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '../data/ml-10M100K/ratings.dat',
    sep='::',
    names=r_cols,
    engine='python',
)

# The dataset is large, so experiment with only the 1000 smallest user IDs
valid_user_ids = sorted(ratings['user_id'].unique())[:1000]
ratings = ratings.loc[ratings['user_id'].isin(valid_user_ids)]


# Join the rating data with the movie data
movielens = pd.merge(ratings, movies, on='movie_id')

print(f'unique_users={len(movielens["user_id"].unique())}, unique_movies={len(movielens["movie_id"].unique())}')

# Split the data into a training set and an evaluation set.
# The 5 most recently rated movies of each user are used for evaluation,
# everything else is used for training.
# First rank each user's ratings by timestamp, newest first (ranks start at 1);
# method='first' breaks ties by order of appearance.
movielens['timestamp_rank'] = (
    movielens.groupby('user_id')['timestamp']
    .rank(method='first', ascending=False)
)
movielens_train = movielens.loc[movielens['timestamp_rank'].gt(5)]
movielens_test = movielens.loc[movielens['timestamp_rank'].le(5)]

unique_users=1000, unique_movies=6736


In [3]:
# Number of factors (passed as `factors` to the ALS model)
factors = 10
# Threshold on the number of training ratings a movie needs in order to be kept
minimum_num_rating = 0
# Number of epochs (passed as `iterations` to the ALS model)
n_epochs = 50
# alpha: scale applied to every non-zero cell of the interaction matrix
alpha = 1.0

In [4]:
# Prepare the inputs for matrix factorization.
# Keep only movies that have at least minimum_num_rating ratings in the training data.
filtered_movielens_train = movielens_train.groupby("movie_id").filter(
    lambda movie_ratings: len(movie_ratings) >= minimum_num_rating
)

# Only ratings of 4 or higher are kept as interactions
movielens_train_high_rating = filtered_movielens_train.loc[
    filtered_movielens_train["rating"] >= 4
]

# Build dictionaries that map user/movie IDs to their row/column index in the matrix
unique_user_ids = sorted(movielens_train_high_rating["user_id"].unique())
unique_movie_ids = sorted(movielens_train_high_rating["movie_id"].unique())
user_id2index = {user_id: index for index, user_id in enumerate(unique_user_ids)}
movie_id2index = {movie_id: index for index, movie_id in enumerate(unique_movie_ids)}

In [5]:
from scipy.sparse import lil_matrix

# Initialize the sparse matrix: one row per movie, one column per user.
movielens_matrix = lil_matrix((len(unique_movie_ids), len(unique_user_ids)))

# Every (movie, user) interaction gets the same value: the rating is binarized to
# 0/1 and scaled by alpha. For data such as click counts, a transform like
# log(click count) can also be effective. The value is loop-invariant, so it is
# computed once here.
cell_value = 1.0 * alpha

# Iterate over just the two columns that are needed. DataFrame.iterrows() would
# build a Series for every row (slow on large frames) and its index label was
# never used.
for user_id, movie_id in zip(
    movielens_train_high_rating["user_id"], movielens_train_high_rating["movie_id"]
):
    user_index = user_id2index[user_id]
    movie_index = movie_id2index[movie_id]
    movielens_matrix[movie_index, user_index] = cell_value

In [6]:
# Install the implicit library, pinned to version 0.4.4
!pip install implicit==0.4.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit==0.4.4
  Downloading implicit-0.4.4.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 30.9 MB/s 
Building wheels for collected packages: implicit
  Building wheel for implicit (setup.py) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.4.4-cp38-cp38-linux_x86_64.whl size=3825530 sha256=4f9058d1d351d60c74e7cfac5a34e01c831ecc5683e4cb2c3b0bc2816f422259
  Stored in directory: /root/.cache/pip/wheels/00/ac/67/6f4536c819ed560c2c7e17c0f7a920e3e50c26108616087d05
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.4.4


In [7]:
# colab 상에서 implicit 라이브러리 실행 후에 에러가 발생할 때는, '런타임 → 런타임 유형 변경'에서 하드웨어 액셀러레이터로 GPU를 선택합니다

In [8]:
import implicit

# Set up the ALS model: `factors` and `n_epochs` come from the hyper-parameter
# cell, the training loss is computed during fitting, and random_state is set
# to a fixed value.
model = implicit.als.AlternatingLeastSquares(
    factors=factors,
    iterations=n_epochs,
    random_state=1,
    calculate_training_loss=True,
)



In [9]:
# Train the model on the movie x user matrix
model.fit(movielens_matrix)

  0%|          | 0/50 [00:00<?, ?it/s]

In [10]:
from collections import defaultdict

# Recommend: pass the transposed (user x movie) matrix and get back, for every
# user index, an array of recommended movie indexes.
recommendations = model.recommend_all(movielens_matrix.T)

# Translate matrix indexes back into the original user and movie IDs
pred_user2items = defaultdict(list)
for user_id, user_index in user_id2index.items():
    for movie_index in recommendations[user_index, :]:
        pred_user2items[user_id].append(unique_movie_ids[movie_index])
pred_user2items

  0%|          | 0/997 [00:00<?, ?it/s]

defaultdict(list,
            {1: [380, 457, 595, 590, 150, 349, 165, 500, 597, 161],
             2: [1196, 1198, 527, 356, 2028, 593, 2571, 318, 589, 1240],
             3: [527, 5952, 4993, 356, 8961, 4306, 150, 1208, 4886, 1183],
             4: [380, 457, 349, 356, 318, 377, 292, 10, 593, 434],
             5: [318, 296, 1193, 750, 260, 1208, 36, 25, 50, 924],
             6: [1210, 110, 593, 541, 1291, 1240, 589, 1214, 2762, 608],
             7: [750, 1267, 919, 922, 1193, 858, 1198, 910, 541, 593],
             8: [4993, 1917, 296, 5952, 3753, 2502, 2716, 235, 1275, 356],
             9: [1136, 1206, 2997, 2959, 2918, 1265, 750, 1732, 1220, 2791],
             10: [150, 318, 356, 1247, 539, 2396, 2324, 1090, 1393, 11],
             11: [589, 1197, 1214, 1036, 1200, 457, 1136, 1356, 1307, 356],
             12: [2571, 2028, 1704, 2706, 2683, 3052, 1, 1580, 2700, 1617],
             13: [1196, 260, 541, 356, 2571, 1270, 110, 858, 1240, 2762],
             14: [1198, 5952, 4993, 1

In [11]:
# Movies that the user with user_id=2 rated 4 or higher in the training data
movielens_train_high_rating.loc[movielens_train_high_rating["user_id"] == 2]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
8381,2,1210,4.0,868245644,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Sci-Fi]","[desert, fantasy, sci-fi, space, lucas, gfei o...",10.0


In [12]:
# Look up the movies recommended for user_id=2 (1196, 1, 589)
movies.loc[movies["movie_id"].isin([1196, 1, 589])]

Unnamed: 0,movie_id,title,genre,tag
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[pixar, pixar, pixar, animation, pixar, animat..."
583,589,Terminator 2: Judgment Day (1991),"[Action, Sci-Fi]","[action, sci-fi, dvd, seen more than once, tim..."
1171,1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Sci-Fi]","[lucas, george lucas, george lucas, gfei own i..."


In [13]:
# IMF에서는 factors나 alpha 설정이 예측 정밀도에 중요하므로, 값을 바꾸어가며 시도해봅니다.