[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/RecommenderSystems/blob/main/chapter5/colab/MF.ipynb)

# Matrix Factorization

In [1]:
# This notebook is meant for Colab. It is self-contained: it covers everything from
# downloading the data to producing recommendations. (Prediction evaluation is not included.)
# If the MovieLens data has not been downloaded yet, run this cell to download it.
# For an analysis of the MovieLens data itself, see data_download.ipynb.

# Download and unzip the data.
# -nc (wget) and -n (unzip) skip files that already exist, so re-running this cell is cheap.
# NOTE(review): --no-check-certificate disables TLS certificate verification for the
# download — confirm whether it is still required.
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
!unzip -n ../data/ml-10m.zip -d ../data/

In [1]:
# Movielensのデータの読み込み（データ量が多いため、読み込みに時間がかかる場合があります）
import pandas as pd

# Columns of movies.dat: movie id, title and a '|'-separated genre string
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    '../data/ml-10M100K/movies.dat',
    sep='::',
    names=m_cols,
    engine='python',
    encoding='latin-1',
)

# Store the genres of each movie as a list
movies['genre'] = movies['genre'].apply(lambda genre_str: genre_str.split('|'))


# Load the tags that users attached to movies
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv(
    '../data/ml-10M100K/tags.dat',
    sep='::',
    names=t_cols,
    engine='python',
)

# Lower-case every tag
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()


# Gather the tags of each movie into one list per movie_id
movie_tags = user_tagged_movies.groupby('movie_id').agg({'tag': list})

# Attach the tag lists to the movie table (left join keeps movies without tags)
movies = movies.merge(movie_tags, how='left', on='movie_id')

# Load the rating data
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '../data/ml-10M100K/ratings.dat',
    sep='::',
    names=r_cols,
    engine='python',
)


# The data set is large, so restrict it to 1000 users (the 1000 smallest user ids)
valid_user_ids = sorted(ratings['user_id'].unique())[:1000]
ratings = ratings.loc[ratings['user_id'].isin(valid_user_ids)]


# Join the rating data with the movie data
movielens = ratings.merge(movies, on='movie_id')

print(f'unique_users={len(movielens.user_id.unique())}, unique_movies={len(movielens.movie_id.unique())}')

# Split into training and test data:
# the 5 most recently rated movies of each user are held out for evaluation,
# everything else is used for training.
# First rank each user's ratings by timestamp, most recent first (rank starts at 1);
# method='first' breaks timestamp ties by order of appearance.

movielens['timestamp_rank'] = (
    movielens.groupby('user_id')['timestamp']
    .rank(method='first', ascending=False)
)
movielens_test = movielens.loc[movielens['timestamp_rank'] <= 5]
movielens_train = movielens.loc[movielens['timestamp_rank'] > 5]

unique_users=1000, unique_movies=6736


In [2]:
# Hyperparameters for the matrix factorization model.
# The spelling of `use_biase` is kept as-is because the training cell refers to it by this name.
n_epochs = 50             # number of training epochs
lr_all = 0.005            # learning rate
factors = 5               # number of latent factors
use_biase = False         # whether to use bias terms
minimum_num_rating = 100  # threshold on the number of ratings

In [None]:
!pip install surprise

In [5]:
from surprise import SVD, Reader
from surprise import Dataset as SurpriseDataset

# Keep only the movies that have at least minimum_num_rating ratings in the training data
filtered_movielens_train = movielens_train.groupby("movie_id").filter(
    lambda movie_rows: minimum_num_rating <= len(movie_rows["movie_id"])
)

# Convert the data into the format Surprise expects:
# a (user, item, rating) frame plus a Reader describing the rating scale
reader = Reader(rating_scale=(0.5, 5))
data_train = SurpriseDataset.load_from_df(
    filtered_movielens_train.loc[:, ["user_id", "movie_id", "rating"]],
    reader,
).build_full_trainset()

In [6]:
# Train matrix factorization with Surprise.
# Despite the class name "SVD", this runs Matrix Factorization, not a singular value decomposition.
# random_state fixes the seed so that re-running the notebook yields the same
# recommendations.
matrix_factorization = SVD(
    n_factors=factors,
    n_epochs=n_epochs,
    lr_all=lr_all,
    biased=use_biase,
    random_state=0,
)
matrix_factorization.fit(data_train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x112d37590>

In [9]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return the ids of the n best-scored items for every user.

    Args:
        predictions: iterable of 5-element records
            ``(user id, item id, true rating, estimated rating, extra)``.
            Only the user id, item id and estimated rating are used.
        n: maximum number of items to keep per user.

    Returns:
        A ``defaultdict(list)`` mapping each user id to its item ids, ordered by
        estimated rating from highest to lowest. Items with equal estimates keep
        the order in which they appeared in ``predictions``.
    """
    # Collect (item, estimate) pairs per user
    ranked_by_user = defaultdict(list)
    for user, item, _actual, estimate, _extra in predictions:
        ranked_by_user[user].append((item, estimate))

    # Replace each user's pairs with the ids of the n highest-estimated items
    for user in ranked_by_user:
        best_first = sorted(
            ranked_by_user[user], key=lambda pair: pair[1], reverse=True
        )
        ranked_by_user[user] = [item for item, _ in best_first[:n]]

    return ranked_by_user

In [11]:
# Prepare the user/item combinations that do not appear in the training data
# NOTE(review): the None argument is presumably the fill value used as the "true" rating
# of these unseen pairs — confirm against the Surprise documentation.
data_test = data_train.build_anti_testset(None)
# Predict a rating for every one of those combinations
predictions = matrix_factorization.test(data_test)
# Keep, for each user, the 10 items with the highest predicted rating
pred_user2items = get_top_n(predictions, n=10)
# Display the resulting user -> recommended item ids mapping
pred_user2items

defaultdict(list,
            {1: [110, 260, 590, 733, 780, 858, 1073, 1210, 1356, 1148],
             22: [2571, 318, 7153, 260, 5952, 4993, 2959, 1198, 1136, 1196],
             26: [318, 260, 2571, 1198, 7153, 5952, 1148, 527, 4993, 1036],
             30: [858, 912, 1584, 1234, 1704, 1233, 1250, 2324, 3147, 1148],
             34: [1036, 7153, 4993, 4306, 5952, 1291, 2115, 1234, 1101, 1527],
             38: [527, 1036, 1240, 589, 260, 1198, 780, 4306, 2028, 1148],
             50: [50, 2959, 1233, 2028, 3578, 2858, 2571, 1148, 1234, 912],
             53: [318, 260, 2571, 2959, 1233, 1234, 593, 4226, 296, 4973],
             54: [1240, 2571, 2959, 3578, 527, 2028, 6874, 7153, 1198, 4993],
             67: [260, 1210, 1196, 2571, 3578, 7153, 2959, 4993, 1234, 5952],
             71: [318, 2858, 912, 858, 593, 1233, 4973, 527, 50, 1193],
             76: [597, 1721, 587, 539, 500, 780, 356, 62, 802, 3578],
             88: [590, 318, 1036, 1240, 165, 589, 2115, 527, 1101, 733],
    

In [13]:
# Movies that the user with user_id=2 rated in the training data
movielens_train.loc[movielens_train['user_id'] == 2]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
6150,2,648,2.0,868244699,Mission: Impossible (1996),"[Action, Adventure, Mystery, Thriller]","[confusing, confusing plot, memorable sequence...",12.0
6531,2,733,3.0,868244562,"Rock, The (1996)","[Action, Adventure, Thriller]","[gfei own it, alcatraz, nicolas cage, sean con...",18.0
6813,2,736,3.0,868244698,Twister (1996),"[Action, Adventure, Romance, Thriller]","[disaster, disaster, storm, bill paxton, helen...",13.0
7113,2,780,3.0,868244698,Independence Day (a.k.a. ID4) (1996),"[Action, Adventure, Sci-Fi, War]","[action, alien invasion, aliens, will smith, a...",14.0
7506,2,786,3.0,868244562,Eraser (1996),"[Action, Drama, Thriller]","[arnold schwarzenegger, action, arnold, arnold...",19.0
7661,2,802,2.0,868244603,Phenomenon (1996),"[Drama, Romance]","[interesting concept, own, john travolta, john...",15.0
7779,2,858,2.0,868245645,"Godfather, The (1972)","[Crime, Drama]","[oscar (best picture), marlon brando, classic,...",9.0


In [15]:
# Movies recommended to user_id=2 (movie ids 3578, 2571, 2959)
movies.loc[movies['movie_id'].isin([3578, 2571, 2959])]

Unnamed: 0,movie_id,title,genre,tag
2487,2571,"Matrix, The (1999)","[Action, Sci-Fi, Thriller]","[artificial intelligence, hackers, heroine in ..."
2874,2959,Fight Club (1999),"[Action, Crime, Drama, Thriller]","[based on a book, chuck palahniuk, edward nort..."
3489,3578,Gladiator (2000),"[Action, Adventure, Drama]","[revenge, crowe's best, gfei own it, dvd, glad..."
