[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/RecommenderSystems/blob/main/chapter5/colab/FM.ipynb)

# Factorization Machines

In [None]:
# This notebook is meant for Colab. It is self-contained: it covers everything from downloading the data to producing recommendations. (Evaluation of the predictions is not included.)
# If the MovieLens data has not been downloaded yet, run this cell to download it.
# For an analysis of the MovieLens data itself, see data_download.ipynb.

# Download and unpack the data.
# wget -nc skips the download when the archive is already present; --no-check-certificate disables TLS certificate verification.
# unzip -n never overwrites files that were already extracted.
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
!unzip -n ../data/ml-10m.zip -d ../data/

In [2]:
# Load the MovieLens data (the files are large, so reading them can take a while).
import pandas as pd


def _load_dat(path, columns, **extra_options):
    """Read one '::'-separated MovieLens .dat file using the given column names."""
    return pd.read_csv(path, names=columns, sep='::', engine='python', **extra_options)


# --- Movies: id, title and genre ---
m_cols = ['movie_id', 'title', 'genre']
movies = _load_dat('../data/ml-10M100K/movies.dat', m_cols, encoding='latin-1')

# Store the '|'-separated genres as a list per movie.
movies['genre'] = movies['genre'].map(lambda genres: genres.split('|'))

# --- Tags that users attached to movies ---
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = _load_dat('../data/ml-10M100K/tags.dat', t_cols)

# Normalise tags to lower case.
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()

# Collect all tags of a movie into one list, then attach them to the movie table.
# The left join keeps movies that have no tags at all.
movie_tags = user_tagged_movies.groupby('movie_id').agg({'tag': list})
movies = movies.merge(movie_tags, on='movie_id', how='left')

# --- Ratings ---
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = _load_dat('../data/ml-10M100K/ratings.dat', r_cols)

# The full data set is large, so experiment with the 1000 smallest user ids only.
valid_user_ids = sorted(ratings.user_id.unique())[:1000]
keep_rating = ratings["user_id"].isin(valid_user_ids)
ratings = ratings[keep_rating]

# Join the ratings with the movie information.
movielens = ratings.merge(movies, on='movie_id')

print(f'unique_users={len(movielens.user_id.unique())}, unique_movies={len(movielens.movie_id.unique())}')

# Split into training and test data:
# each user's 5 most recent ratings are held out for evaluation, the rest is used for training.
# Rank every user's ratings by recency (1 = most recent); method='first' breaks timestamp ties by row order.
recency_rank = movielens.groupby('user_id')['timestamp'].rank(ascending=False, method='first')
movielens['timestamp_rank'] = recency_rank

movielens_test = movielens[movielens['timestamp_rank'] <= 5]
movielens_train = movielens[movielens['timestamp_rank'] > 5]

unique_users=1000, unique_movies=6736


In [3]:
# Settings for the Factorization Machines experiment.
use_side_information = False  # whether to use side information as extra features
minimum_num_rating = 200      # threshold on the number of ratings
factors = 10                  # number of factors
lr = 0.01                     # learning rate
n_epochs = 50                 # number of epochs

In [6]:
# Keep only the movies that have at least minimum_num_rating ratings in the training data.
filtered_movielens_train = movielens_train.groupby("movie_id").filter(
    lambda ratings_of_movie: ratings_of_movie.shape[0] >= minimum_num_rating
)

# Movies each user has rated: user_id -> list of movie_ids (built from the filtered training data).
movie_ids_by_user = filtered_movielens_train.groupby("user_id")["movie_id"].agg(list)
user_evaluated_movies = movie_ids_by_user.to_dict()


In [9]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer

# Shape the training data for FM: one feature dict per rating, with the rating as the target.

if use_side_information:
    # Mean rating given by each user, computed once up front.
    # The previous code took np.mean over user_evaluated_movies[user_id], which is a list of
    # movie IDs, so the "user_rating_avg" feature was the mean movie ID rather than a rating.
    user_rating_avg = filtered_movielens_train.groupby("user_id")["rating"].mean().to_dict()

train_data_for_fm = []
y = []
# Iterate over the needed columns directly; this avoids building a Series per row (iterrows).
for uid, mid, rating, tags in zip(
    filtered_movielens_train["user_id"],
    filtered_movielens_train["movie_id"],
    filtered_movielens_train["rating"],
    filtered_movielens_train["tag"],
):
    # The ids are stringified so that DictVectorizer one-hot encodes them as categorical features.
    features = {"user_id": str(uid), "movie_id": str(mid)}
    if use_side_information:
        # Movies without any tag carry NaN after the left join; represent that as "no tags".
        # Non-string entries are dropped because DictVectorizer only accepts strings inside an iterable value.
        features["tag"] = [t for t in tags if isinstance(t, str)] if isinstance(tags, list) else []
        features["user_rating_avg"] = float(user_rating_avg[uid])
    train_data_for_fm.append(features)
    y.append(rating)

y = np.array(y)

# Fit the vocabulary on the training features and materialise a dense matrix for the FM model.
vectorizer = DictVectorizer()
X = vectorizer.fit_transform(train_data_for_fm).toarray()

In [None]:
# Install xlearn, which provides the FMModel used below (no version is pinned here).
!pip install xlearn

In [None]:
# Workaround needed to run xlearn on Colab: the USER environment variable must be set.
# https://github.com/aksnzhy/xlearn/issues/74#issuecomment-580701773
import os
os.environ.update({'USER': 'test'})

In [11]:
import xlearn as xl

# Create the FM model: a regression task tracked with RMSE and optimised with SGD,
# using the factor count, learning rate and epoch count from the settings cell.
fm_model = xl.FMModel(
    task="reg",
    metric="rmse",
    opt="sgd",
    k=factors,
    lr=lr,
    epoch=n_epochs,
)

In [12]:
# Train the FM model on the dense feature matrix X and the rating targets y.
# NOTE(review): is_lock_free=False presumably turns off xLearn's lock-free multi-threaded
# updates so that training is reproducible — confirm against the xLearn documentation.
fm_model.fit(X, y, is_lock_free=False)

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 4 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (/var/folders/31/92gcylk93xvbjbyb75sdmpqm0000gn/T/tmp7f5xyv68.bin) NOT found. Convert text file to binary file.
[32m[------------] [0mNumber of Feature: 1061
[32m[------------] [0mTime cost for reading problem: 0.01 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 53.89 KB
[32m[--------

In [14]:
# Dictionaries mapping each user id / movie id to its position in the sorted id lists.
# These positions are used as the row / column indices of the prediction matrix.
unique_user_ids = sorted(filtered_movielens_train["user_id"].unique())
unique_movie_ids = sorted(filtered_movielens_train["movie_id"].unique())
user_id2index = {uid: position for position, uid in enumerate(unique_user_ids)}
movie_id2index = {mid: position for position, mid in enumerate(unique_movie_ids)}

In [16]:
# Prepare the data to predict on: one feature dict for every (user, movie) pair,
# users in the outer loop and movies in the inner loop.
test_data_for_fm = []

if use_side_information:
    # Reuse exactly the side-information values the model was trained on, read back from the
    # training feature dicts. Both features are constant per user / per movie there, so a
    # single pass is enough.
    # The previous code referenced an undefined `dataset` object and the stale loop variable
    # `row` left over from the training cell, so it either raised NameError or gave every
    # user the same average.
    user_id2rating_avg = {}
    movie_id2tag = {}
    for train_features in train_data_for_fm:
        user_id2rating_avg[train_features["user_id"]] = train_features["user_rating_avg"]
        movie_id2tag[train_features["movie_id"]] = train_features["tag"]

for user_id in unique_user_ids:
    for movie_id in unique_movie_ids:
        x = {"user_id": str(user_id), "movie_id": str(movie_id)}
        if use_side_information:
            # The training feature dicts are keyed by the stringified ids.
            x["tag"] = movie_id2tag[str(movie_id)]
            x["user_rating_avg"] = user_id2rating_avg[str(user_id)]
        test_data_for_fm.append(x)

# Encode with the vectorizer fitted on the training data so the columns line up with X.
X_test = vectorizer.transform(test_data_for_fm).toarray()

In [17]:
# Predict a score for every (user, movie) pair.
y_pred = fm_model.predict(X_test)

# X_test was built user by user (movies in the inner loop), so the flat predictions
# reshape into a (number of users) x (number of movies) matrix.
n_users, n_movies = len(unique_user_ids), len(unique_movie_ids)
pred_matrix = y_pred.reshape((n_users, n_movies))

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 8 threads for prediction task.
[32m[1m[ ACTION     ] Load model ...[0m
[32m[------------] [0mLoad model from /var/folders/31/92gcylk93xvbjbyb75sdmpqm0000gn/T/tmpvpvqm4eg
[32m[------------] [0mLoss function: squared
[32m[------------] [0mScore function: fm
[32m[------------] [0mNumber of Feature: 1061
[32m[------------] [0mNumber of K: 10
[32m[------------] [0mTime cost for loading model: 0.00 (sec)
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already c

In [19]:
# Users or movies that do not appear in the (filtered) training data get the
# average training rating as their predicted rating.
average_score = movielens_train.rating.mean()


def _predicted_rating(uid, mid):
    """Return the FM prediction for (uid, mid), or the average rating if either id is unknown."""
    u_idx = user_id2index.get(uid)
    m_idx = movie_id2index.get(mid)
    if u_idx is None or m_idx is None:
        return average_score
    return pred_matrix[u_idx, m_idx]


pred_results = [
    _predicted_rating(uid, mid)
    for uid, mid in zip(movielens_test["user_id"], movielens_test["movie_id"])
]

# Attach the predictions to a copy so that movielens_test itself stays untouched.
movie_rating_predict = movielens_test.copy()
movie_rating_predict["rating_pred"] = pred_results
movie_rating_predict

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank,rating_pred
0,1,122,5.0,838985046,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem...",1.0,3.574791
5,217,122,3.0,844429650,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem...",5.0,3.574791
33,892,122,4.0,850079961,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem...",5.0,3.574791
46,59,185,3.0,838984807,"Net, The (1995)","[Action, Crime, Thriller]","[computers, computers, internet, irwin winkler...",1.0,3.574791
47,62,185,5.0,834874598,"Net, The (1995)","[Action, Crime, Thriller]","[computers, computers, internet, irwin winkler...",5.0,3.574791
...,...,...,...,...,...,...,...,...,...
132657,849,5097,4.0,1013470449,Bright Eyes (1934),"[Comedy, Drama]",,2.0,3.574791
132660,951,638,4.0,849628182,Jack and Sarah (1995),[Romance],"[babies, ian mckellen, baby, british]",5.0,3.574791
132717,1005,1384,5.0,897254984,"Substance of Fire, The (1996)",[Drama],,1.0,3.574791
132731,934,56,1.0,945901052,Kids of the Round Table (1995),"[Adventure, Children, Fantasy]",,5.0,3.574791


In [21]:
from collections import defaultdict

# Build the recommendation list for each user: the 10 highest-scored movies
# that the user has not rated yet, best first.

pred_user2items = defaultdict(list)

for user_id in unique_user_ids:
    already_rated = set(user_evaluated_movies[user_id])
    # Negating the scores makes argsort return the movie positions in descending score order.
    ranked_positions = np.argsort(-pred_matrix[user_id2index[user_id], :])
    unseen_ranked = [
        unique_movie_ids[position]
        for position in ranked_positions
        if unique_movie_ids[position] not in already_rated
    ]
    pred_user2items[user_id] = unseen_ranked[:10]

pred_user2items

defaultdict(list,
            {1: [318, 50, 527, 858, 260, 1193, 2959, 593, 541, 2858],
             2: [318, 50, 527, 1193, 2959, 593, 541, 2858, 2571, 1136],
             3: [318, 50, 527, 858, 260, 1193, 2959, 593, 541, 2858],
             4: [318, 50, 527, 858, 260, 1193, 2959, 593, 541, 2858],
             5: [318, 50, 260, 1193, 2959, 2858, 2571, 1136, 1198, 1196],
             6: [318, 50, 527, 2959, 593, 541, 2858, 1136, 1617, 296],
             7: [318, 527, 858, 1193, 2959, 2858, 2571, 1136, 1198, 296],
             8: [318, 858, 1193, 296, 110, 356, 1, 150, 2396, 2716],
             9: [318, 50, 527, 858, 260, 1193, 2959, 593, 541, 2571],
             10: [318, 50, 858, 260, 2959, 2858, 2571, 1136, 1198, 1196],
             11: [318, 50, 527, 858, 1193, 2959, 593, 541, 2858, 1136],
             12: [318, 50, 527, 858, 260, 1193, 593, 541, 2571, 1136],
             13: [318, 527, 858, 260, 2959, 593, 541, 2858, 2571, 1136],
             14: [318, 50, 527, 858, 1193, 2959, 593

In [23]:
# Movies that the user with user_id=2 rated in the training data
movielens_train.loc[movielens_train["user_id"].eq(2)]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
6150,2,648,2.0,868244699,Mission: Impossible (1996),"[Action, Adventure, Mystery, Thriller]","[confusing, confusing plot, memorable sequence...",12.0
6531,2,733,3.0,868244562,"Rock, The (1996)","[Action, Adventure, Thriller]","[gfei own it, alcatraz, nicolas cage, sean con...",18.0
6813,2,736,3.0,868244698,Twister (1996),"[Action, Adventure, Romance, Thriller]","[disaster, disaster, storm, bill paxton, helen...",13.0
7113,2,780,3.0,868244698,Independence Day (a.k.a. ID4) (1996),"[Action, Adventure, Sci-Fi, War]","[action, alien invasion, aliens, will smith, a...",14.0
7506,2,786,3.0,868244562,Eraser (1996),"[Action, Drama, Thriller]","[arnold schwarzenegger, action, arnold, arnold...",19.0
7661,2,802,2.0,868244603,Phenomenon (1996),"[Drama, Romance]","[interesting concept, own, john travolta, john...",15.0
7779,2,858,2.0,868245645,"Godfather, The (1972)","[Crime, Drama]","[oscar (best picture), marlon brando, classic,...",9.0


In [25]:
# Recommendations for user_id=2 (318, 50, 527)
movies.loc[movies["movie_id"].isin([318, 50, 527])]

Unnamed: 0,movie_id,title,genre,tag
49,50,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]","[kevin spacey, ensemble cast, complicated, mus..."
315,318,"Shawshank Redemption, The (1994)",[Drama],"[based on a short story, directorial debut, fr..."
523,527,Schindler's List (1993),"[Drama, War]","[speilberg, drama, holocaust, steven spielberg..."
