[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/RecommenderSystems/blob/main/chapter5/colab/FM.ipynb)

# Factorization Machines

In [1]:
# This notebook is meant for Colab. It is self-contained: it covers everything from
# downloading the data through producing recommendations (prediction evaluation is not included).
# If the MovieLens data has not been downloaded yet, run this cell to fetch it.
# See data_download.ipynb for an exploration of the MovieLens data.

# Download and unpack the data.
# `wget -nc` skips the download when the archive already exists and `unzip -n` never
# overwrites already-extracted files, so re-running this cell is harmless.
# NOTE(review): --no-check-certificate turns off TLS certificate verification — confirm it is still needed.
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
!unzip -n ../data/ml-10m.zip -d ../data/

--2022-12-27 04:58:56--  https://files.grouplens.org/datasets/movielens/ml-10m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 65566137 (63M) [application/zip]
Saving to: ‘../data/ml-10m.zip’


2022-12-27 04:58:57 (64.6 MB/s) - ‘../data/ml-10m.zip’ saved [65566137/65566137]

Archive:  ../data/ml-10m.zip
   creating: ../data/ml-10M100K/
  inflating: ../data/ml-10M100K/allbut.pl  
  inflating: ../data/ml-10M100K/movies.dat  
  inflating: ../data/ml-10M100K/ratings.dat  
  inflating: ../data/ml-10M100K/README.html  
  inflating: ../data/ml-10M100K/split_ratings.sh  
  inflating: ../data/ml-10M100K/tags.dat  


In [2]:
# Load the MovieLens data (the dataset is large, so loading can take a while)
import pandas as pd

# Columns of movies.dat: movie id, title and genre
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv('../data/ml-10M100K/movies.dat', names=m_cols, sep='::' , encoding='latin-1', engine='python')

# Store genre as a list (the raw field is '|'-separated)
movies['genre'] = movies.genre.apply(lambda x:x.split('|'))


# Load the tags that users assigned to movies
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv('../data/ml-10M100K/tags.dat', names=t_cols, sep='::', engine='python')

# Lower-case the tags
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()


# Collect the tags of each movie into one list per movie
movie_tags = user_tagged_movies.groupby('movie_id').agg({'tag':list})

# Attach the tag lists to the movies; the left join keeps movies without any tag (their tag is NaN)
movies = movies.merge(movie_tags, on='movie_id', how='left')

# Load the rating data
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('../data/ml-10M100K/ratings.dat', names=r_cols, sep='::', engine='python')


# The dataset is large, so keep only the 1000 smallest user ids for this experiment
valid_user_ids = sorted(ratings.user_id.unique())[:1000]
ratings = ratings[ratings["user_id"].isin(valid_user_ids)]


# Join the movie data onto the rating data
movielens = ratings.merge(movies, on='movie_id')

print(f'unique_users={len(movielens.user_id.unique())}, unique_movies={len(movielens.movie_id.unique())}')

# Split the data into a training set and a test set.
# The 5 most recent ratings of each user are held out for testing; the rest is used for training.
# First, rank each user's ratings by timestamp:
# the most recent rating gets rank 1, and method='first' breaks timestamp ties by row order.

movielens['timestamp_rank'] = movielens.groupby(
    'user_id')['timestamp'].rank(ascending=False, method='first')
movielens_train = movielens[movielens['timestamp_rank'] > 5]
movielens_test = movielens[movielens['timestamp_rank']<= 5]

unique_users=1000, unique_movies=6736


In [3]:
# Number of latent factors (passed to the FM model as k)
factors = 10
# Minimum number of training ratings a movie needs in order to be kept
minimum_num_rating = 200
# Number of training epochs
n_epochs = 50
# Learning rate
lr = 0.01
# Whether to add side information (movie tags, per-user rating average) to the FM features
use_side_information = False

In [4]:
# Keep only the movies that received at least minimum_num_rating ratings in the training split
def _has_enough_ratings(movie_ratings):
    """Return True when this movie's group holds at least minimum_num_rating rows."""
    return len(movie_ratings["movie_id"]) >= minimum_num_rating


filtered_movielens_train = movielens_train.groupby("movie_id").filter(_has_enough_ratings)

# Movies each user rated within the filtered training data: user_id -> list of movie_id
ratings_by_user = filtered_movielens_train.groupby("user_id")
user_evaluated_movies = ratings_by_user["movie_id"].agg(list).to_dict()



In [5]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer

# Shape the training data for the FM: one feature dict per rating row and the rating as target.
# IDs are stringified so that DictVectorizer one-hot encodes them instead of treating
# them as numeric values.
train_data_for_fm = []
y = []

if use_side_information:
    # Average of the ratings each user gave in the filtered training data, computed once
    # per user. (Averaging user_evaluated_movies would average movie IDs, not ratings.)
    user_rating_avg = filtered_movielens_train.groupby("user_id")["rating"].mean().to_dict()

# Iterate over the needed columns directly; this avoids building a Series per row.
for uid, mid, rating, movie_tag_list in zip(
    filtered_movielens_train["user_id"],
    filtered_movielens_train["movie_id"],
    filtered_movielens_train["rating"],
    filtered_movielens_train["tag"],
):
    x = {"user_id": str(uid), "movie_id": str(mid)}
    if use_side_information:
        # Movies without tags carry NaN after the left merge; passing that on would put
        # NaN into the feature matrix, so only real string tags are forwarded.
        if isinstance(movie_tag_list, list):
            clean_tags = [t for t in movie_tag_list if isinstance(t, str)]
            if clean_tags:
                x["tag"] = clean_tags
        x["user_rating_avg"] = user_rating_avg[uid]
    train_data_for_fm.append(x)
    y.append(rating)

y = np.array(y)

# Fit the feature vocabulary on the training rows; the same vectorizer must be reused
# for the prediction rows. xlearn is fed a dense array here.
vectorizer = DictVectorizer()
X = vectorizer.fit_transform(train_data_for_fm).toarray()

In [6]:
!pip install xlearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xlearn
  Downloading xlearn-0.40a1.tar.gz (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 4.7 MB/s 
[?25hBuilding wheels for collected packages: xlearn
  Building wheel for xlearn (setup.py) ... [?25l[?25hdone
  Created wheel for xlearn: filename=xlearn-0.40a1-py3-none-any.whl size=229985 sha256=1c9655f81f543749a98e73cf23691b792b0191fc6466ca61b978157e473aae4f
  Stored in directory: /root/.cache/pip/wheels/0c/0a/da/aaa813e25436bce2a282cf1b3c9162c60a6e966a714e4a0b36
Successfully built xlearn
Installing collected packages: xlearn
Successfully installed xlearn-0.40a1


In [7]:
# Workaround needed for xlearn to run on Colab: the USER environment variable must be set
# https://github.com/aksnzhy/xlearn/issues/74#issuecomment-580701773
import os

os.environ.update({"USER": "test"})

In [8]:
import xlearn as xl

# Initialise the FM model: regression task scored with RMSE and optimised with SGD;
# learning rate, factor count and epoch count come from the hyperparameter cell.
fm_model = xl.FMModel(
    task="reg",
    metric="rmse",
    opt="sgd",
    lr=lr,
    k=factors,
    epoch=n_epochs,
)

In [9]:
# Train the FM model on the dense feature matrix X and the rating targets y.
# NOTE(review): is_lock_free=False presumably disables xlearn's lock-free parallel updates
# so that results are reproducible — confirm against the xlearn documentation.
fm_model.fit(X, y, is_lock_free=False)

In [10]:
# Build lookup tables from user / movie id to its row / column position in the prediction matrix.
# Positions follow the ascending order of the ids found in the filtered training data.
unique_user_ids = sorted(filtered_movielens_train.user_id.unique())
unique_movie_ids = sorted(filtered_movielens_train.movie_id.unique())
user_id2index = {user_id: position for position, user_id in enumerate(unique_user_ids)}
movie_id2index = {movie_id: position for position, movie_id in enumerate(unique_movie_ids)}

In [11]:
# Prepare the prediction data: one feature dict for every (user, movie) pair, users in the
# outer loop and movies in the inner loop, so the flat prediction vector can later be
# reshaped into a users x movies matrix.

# Side features are looked up from the training rows themselves, keyed by the stringified
# ids used there. This guarantees the prediction rows are encoded exactly like the rows the
# model was trained on. It also avoids a DataFrame scan inside the double loop and the
# dependency on names (`dataset`, a leftover `row`) that this notebook does not define
# for this cell.
tag_by_movie_key = {}
rating_avg_by_user_key = {}
if use_side_information:
    for train_features in train_data_for_fm:
        if "tag" in train_features:
            tag_by_movie_key.setdefault(train_features["movie_id"], train_features["tag"])
        if "user_rating_avg" in train_features:
            rating_avg_by_user_key.setdefault(
                train_features["user_id"], train_features["user_rating_avg"]
            )

test_data_for_fm = []
for user_id in unique_user_ids:
    user_key = str(user_id)
    for movie_id in unique_movie_ids:
        movie_key = str(movie_id)
        x = {"user_id": user_key, "movie_id": movie_key}
        if use_side_information:
            # A feature missing from the dict is encoded as zero by DictVectorizer.
            if movie_key in tag_by_movie_key:
                x["tag"] = tag_by_movie_key[movie_key]
            if user_key in rating_avg_by_user_key:
                x["user_rating_avg"] = rating_avg_by_user_key[user_key]
        test_data_for_fm.append(x)

# Reuse the vectorizer fitted on the training rows so the columns line up with X.
X_test = vectorizer.transform(test_data_for_fm).toarray()

In [12]:
# Score every (user, movie) pair, then fold the flat prediction vector into a
# users x movies matrix (rows follow unique_user_ids, columns follow unique_movie_ids).
y_pred = fm_model.predict(X_test)
n_users, n_movies = len(unique_user_ids), len(unique_movie_ids)
pred_matrix = y_pred.reshape(n_users, n_movies)

In [13]:
# Users or movies that never appeared in the (filtered) training data cannot be looked up
# in the prediction matrix; they get the mean training rating as their predicted score.
average_score = movielens_train.rating.mean()
movie_rating_predict = movielens_test.copy()

pred_results = []
for test_user, test_movie in zip(movielens_test["user_id"], movielens_test["movie_id"]):
    is_known_pair = test_user in user_id2index and test_movie in movie_id2index
    if is_known_pair:
        row_pos = user_id2index[test_user]
        col_pos = movie_id2index[test_movie]
        pred_results.append(pred_matrix[row_pos, col_pos])
    else:
        pred_results.append(average_score)

movie_rating_predict["rating_pred"] = pred_results
movie_rating_predict

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank,rating_pred
0,1,122,5.0,838985046,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem...",1.0,3.574791
5,217,122,3.0,844429650,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem...",5.0,3.574791
33,892,122,4.0,850079961,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem...",5.0,3.574791
46,59,185,3.0,838984807,"Net, The (1995)","[Action, Crime, Thriller]","[computers, computers, internet, irwin winkler...",1.0,3.574791
47,62,185,5.0,834874598,"Net, The (1995)","[Action, Crime, Thriller]","[computers, computers, internet, irwin winkler...",5.0,3.574791
...,...,...,...,...,...,...,...,...,...
132657,849,5097,4.0,1013470449,Bright Eyes (1934),"[Comedy, Drama]",,2.0,3.574791
132660,951,638,4.0,849628182,Jack and Sarah (1995),[Romance],"[babies, ian mckellen, baby, british]",5.0,3.574791
132717,1005,1384,5.0,897254984,"Substance of Fire, The (1996)",[Drama],,1.0,3.574791
132731,934,56,1.0,945901052,Kids of the Round Table (1995),"[Adventure, Children, Fantasy]",,5.0,3.574791


In [14]:
from collections import defaultdict

# Build the recommendation list for every user: the 10 best-scored movies that the
# user has not already rated in the filtered training data.

pred_user2items = defaultdict(list)

for user_id in unique_user_ids:
    user_index = user_id2index[user_id]
    # Set copy of the user's rated movies: O(1) membership checks instead of scanning
    # the list once per candidate movie.
    already_rated = set(user_evaluated_movies[user_id])
    # Negate the scores so argsort yields movie positions from highest to lowest prediction.
    movie_indexes = np.argsort(-pred_matrix[user_index, :])
    for movie_index in movie_indexes:
        movie_id = unique_movie_ids[movie_index]
        if movie_id not in already_rated:
            pred_user2items[user_id].append(movie_id)
        # Stop as soon as the list holds 10 recommendations.
        if len(pred_user2items[user_id]) == 10:
            break

pred_user2items

defaultdict(list,
            {1: [318, 50, 527, 858, 1193, 541, 260, 593, 2959, 1198],
             2: [318, 50, 527, 1193, 541, 593, 2959, 1198, 2858, 2571],
             3: [318, 50, 527, 858, 1193, 541, 260, 593, 2959, 1198],
             4: [318, 50, 527, 858, 1193, 541, 260, 593, 2959, 1198],
             5: [318, 50, 1193, 260, 2959, 1198, 2858, 2571, 1196, 1617],
             6: [318, 50, 527, 541, 593, 2959, 2858, 1617, 1136, 608],
             7: [318, 527, 858, 1193, 2959, 1198, 2858, 2571, 1136, 2028],
             8: [318, 858, 1193, 296, 110, 356, 150, 1, 2396, 2716],
             9: [318, 50, 527, 858, 1193, 541, 260, 593, 2959, 1198],
             10: [318, 50, 858, 260, 2959, 1198, 2858, 2571, 1196, 1136],
             11: [318, 50, 527, 858, 1193, 541, 593, 2959, 2858, 1136],
             12: [318, 50, 527, 858, 1193, 541, 260, 593, 1198, 2571],
             13: [318, 527, 858, 541, 260, 593, 2959, 1198, 2858, 2571],
             14: [318, 50, 527, 858, 1193, 541, 593

In [15]:
# Movies that the user with user_id=2 rated in the training data
movielens_train.loc[movielens_train["user_id"] == 2]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
6150,2,648,2.0,868244699,Mission: Impossible (1996),"[Action, Adventure, Mystery, Thriller]","[confusing, confusing plot, memorable sequence...",12.0
6531,2,733,3.0,868244562,"Rock, The (1996)","[Action, Adventure, Thriller]","[gfei own it, alcatraz, nicolas cage, sean con...",18.0
6813,2,736,3.0,868244698,Twister (1996),"[Action, Adventure, Romance, Thriller]","[disaster, disaster, storm, bill paxton, helen...",13.0
7113,2,780,3.0,868244698,Independence Day (a.k.a. ID4) (1996),"[Action, Adventure, Sci-Fi, War]","[action, alien invasion, aliens, will smith, a...",14.0
7506,2,786,3.0,868244562,Eraser (1996),"[Action, Drama, Thriller]","[arnold schwarzenegger, action, arnold, arnold...",19.0
7661,2,802,2.0,868244603,Phenomenon (1996),"[Drama, Romance]","[interesting concept, own, john travolta, john...",15.0
7779,2,858,2.0,868245645,"Godfather, The (1972)","[Crime, Drama]","[oscar (best picture), marlon brando, classic,...",9.0


In [16]:
# Look up the first three movies recommended to user_id=2 (318, 50, 527)
movies.loc[movies["movie_id"].isin([318, 50, 527])]

Unnamed: 0,movie_id,title,genre,tag
49,50,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]","[kevin spacey, ensemble cast, complicated, mus..."
315,318,"Shawshank Redemption, The (1994)",[Drama],"[based on a short story, directorial debut, fr..."
523,527,Schindler's List (1993),"[Drama, War]","[speilberg, drama, holocaust, steven spielberg..."
