[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/RecommenderSystems/blob/main/chapter5/colab/LDA_collaboration.ipynb)

# Latent Dirichlet Allocation (LDA)の行動データへの適用

In [1]:
# Notebook intended for Google Colab: this single notebook covers everything from downloading the data to producing recommendations. (Prediction evaluation is not included.)
# If the MovieLens data has not been downloaded yet, run this cell to download it.
# For an analysis of the MovieLens data itself, see data_download.ipynb.

# Download and unzip the data
# (wget -nc skips the download when the file already exists; unzip -n never overwrites existing files)
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
!unzip -n ../data/ml-10m.zip -d ../data/

In [1]:
# Load the MovieLens data (the dataset is large, so reading can take a while)
import pandas as pd

# Movie metadata: id, title and a '|'-separated genre string per movie
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    '../data/ml-10M100K/movies.dat',
    sep='::',
    names=m_cols,
    encoding='latin-1',
    engine='python',
)

# Keep the genres of each movie as a list
movies['genre'] = movies['genre'].map(lambda genre_str: genre_str.split('|'))


# Tags that users attached to movies
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv(
    '../data/ml-10M100K/tags.dat',
    sep='::',
    names=t_cols,
    engine='python',
)

# Lower-case every tag
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()


# Collect the tags of each movie into one list per movie_id
movie_tags = user_tagged_movies.groupby('movie_id').agg({'tag': list})

# Attach the tag lists to the movie metadata (left join keeps movies without tags)
movies = pd.merge(movies, movie_tags, how='left', on='movie_id')

# Rating data
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '../data/ml-10M100K/ratings.dat',
    sep='::',
    names=r_cols,
    engine='python',
)


# The data set is large, so restrict it to the 1000 smallest user ids for this experiment
valid_user_ids = sorted(ratings['user_id'].unique())[:1000]
ratings = ratings.loc[ratings['user_id'].isin(valid_user_ids)]


# Join the ratings with the movie metadata
movielens = pd.merge(ratings, movies, on='movie_id')

print(f'unique_users={len(movielens.user_id.unique())}, unique_movies={len(movielens.movie_id.unique())}')

# Split into training and test data:
# the 5 most recently rated movies of each user are held out for evaluation,
# everything else is used for training.
# Rank each user's ratings by timestamp, most recent first (1-based);
# method='first' breaks timestamp ties by row order so ranks are unique per user.
movielens['timestamp_rank'] = (
    movielens
    .groupby('user_id')['timestamp']
    .rank(ascending=False, method='first')
)
movielens_train = movielens.loc[movielens['timestamp_rank'] > 5]
movielens_test = movielens.loc[movielens['timestamp_rank'] <= 5]

unique_users=1000, unique_movies=6736


In [2]:
# Number of latent factors (passed to the LDA model as the number of topics)
factors = 50
# Number of epochs (passed to the LDA model as the number of passes over the corpus)
n_epochs = 30

In [None]:
!pip install gensim==4.0.1

In [5]:
from gensim.corpora.dictionary import Dictionary

# Build the LDA input: one "document" per user, whose "words" are the ids (as strings)
# of the movies that user rated 4 or higher in the training data.
# Documents are ordered by the groupby over user_id.
movielens_train_high_rating = movielens_train[movielens_train.rating >= 4]
lda_data = [
    movie_ids.map(str).tolist()
    for _, movie_ids in movielens_train_high_rating.groupby("user_id")["movie_id"]
]

# Map every movie-id token to an integer id, then encode each user as a bag of words
common_dictionary = Dictionary(lda_data)
common_corpus = list(map(common_dictionary.doc2bow, lda_data))




In [7]:
import gensim

# Train LDA on the per-user bags of movie ids.
# LDA training is stochastic; a fixed random_state makes the learned topics
# (and every result derived from them below) reproducible across runs.
lda_model = gensim.models.LdaModel(
    common_corpus,
    id2word=common_dictionary,
    num_topics=factors,
    passes=n_epochs,
    random_state=0,
)

In [38]:
# Topic memberships of every user: indexing yields a list of (topic_id, probability) pairs per document of common_corpus
lda_topics = lda_model[common_corpus]

# Example: the topic memberships of one user (the first document in the corpus)
lda_topics[0]

[(0, 0.058512874), (2, 0.056664612), (20, 0.71739906), (33, 0.1163097)]

In [52]:
# Top 10 movies of topic 0, ordered by their probability within the topic
for token_id, score in lda_model.get_topic_terms(0, topn=10):
    # Resolve the token through the dictionary's public lookup: the id2token
    # attribute is built lazily by gensim and is not guaranteed to be populated.
    movie_id = int(common_dictionary[token_id])
    title = movies[movies.movie_id == movie_id].title.tolist()[0]
    print(f'movie_id={movie_id}, title={title}, score={score}')

movie_id=370, title=Naked Gun 33 1/3: The Final Insult (1994), score=0.05946573242545128
movie_id=256, title=Junior (1994), score=0.02701723948121071
movie_id=44, title=Mortal Kombat (1995), score=0.02674136497080326
movie_id=485, title=Last Action Hero (1993), score=0.022686509415507317
movie_id=442, title=Demolition Man (1993), score=0.013746446929872036
movie_id=466, title=Hot Shots! Part Deux (1993), score=0.01224201824516058
movie_id=7153, title=Lord of the Rings: The Return of the King, The (2003), score=0.01153385080397129
movie_id=315, title=Specialist, The (1994), score=0.01045612245798111
movie_id=480, title=Jurassic Park (1993), score=0.009095880202949047
movie_id=160, title=Congo (1995), score=0.008071123622357845


In [32]:
# Topic distribution (membership probability per topic) of Star Wars: Episode V (movie_id=1196)
lda_model[common_dictionary.doc2bow(["1196"])]

[(0, 0.010000048),
 (1, 0.010000048),
 (2, 0.010000048),
 (3, 0.010000048),
 (4, 0.010000048),
 (5, 0.010000048),
 (6, 0.010000048),
 (7, 0.010000048),
 (8, 0.010000048),
 (9, 0.010000048),
 (10, 0.010000048),
 (11, 0.010000048),
 (12, 0.010000048),
 (13, 0.010000048),
 (14, 0.010000048),
 (15, 0.010000048),
 (16, 0.010000048),
 (17, 0.010000048),
 (18, 0.010000048),
 (19, 0.010000048),
 (20, 0.010000048),
 (21, 0.010000048),
 (22, 0.010000048),
 (23, 0.010000048),
 (24, 0.010000048),
 (25, 0.010000048),
 (26, 0.010000048),
 (27, 0.010000048),
 (28, 0.010000048),
 (29, 0.010000048),
 (30, 0.010000048),
 (31, 0.010000048),
 (32, 0.010000048),
 (33, 0.010000048),
 (34, 0.010000048),
 (35, 0.010000048),
 (36, 0.010000048),
 (37, 0.010000048),
 (38, 0.010000048),
 (39, 0.010000048),
 (40, 0.010000048),
 (41, 0.010000048),
 (42, 0.010000048),
 (43, 0.010000048),
 (44, 0.010000048),
 (45, 0.010000048),
 (46, 0.010000048),
 (47, 0.010000048),
 (48, 0.010000048),
 (49, 0.5099976)]

In [46]:
from collections import defaultdict

# Build a recommendation list (up to 10 movies) for every user:
# take the topic the user most probably belongs to, then walk through that
# topic's movies from most to least probable, skipping movies the user has
# already rated in the training data.

user_evaluated_movies = movielens_train.groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()

# Ranked (token_id, probability) list per topic, computed once per topic
# instead of once per user (many users share the same dominant topic).
topic_terms_cache = {}

pred_user2items = defaultdict(list)
# NOTE: the i-th group here must line up with the i-th document of the corpus;
# both come from a groupby over user_id on movielens_train_high_rating.
for i, (user_id, data) in enumerate(movielens_train_high_rating.groupby("user_id")):
    # A set makes the "already rated" check O(1) instead of a list scan
    evaluated_movie_ids = set(user_evaluated_movies[user_id])
    # Topic with the highest membership probability for this user
    user_topic = max(lda_topics[i], key=lambda x: x[1])[0]
    # Movies of that topic, most probable first
    if user_topic not in topic_terms_cache:
        topic_terms_cache[user_topic] = lda_model.get_topic_terms(user_topic, topn=len(movies))
    topic_movies = topic_terms_cache[user_topic]

    for token_id, score in topic_movies:
        # Use the dictionary's public lookup; its id2token attribute is built
        # lazily by gensim and is not guaranteed to be populated.
        movie_id = int(common_dictionary[token_id])
        if movie_id not in evaluated_movie_ids:
            pred_user2items[user_id].append(movie_id)
        if len(pred_user2items[user_id]) == 10:
            break

pred_user2items

defaultdict(list,
            {1: [457, 150, 593, 318, 380, 110, 296, 590, 349, 161],
             2: [1198, 1196, 1240, 1291, 1200, 1197, 1270, 593, 1193, 1234],
             3: [7151, 337, 1704, 3181, 2771, 1641, 2336, 3157, 1242, 2762],
             4: [457, 593, 356, 318, 296, 10, 47, 50, 527, 454],
             5: [1193, 2858, 1617, 1208, 904, 296, 1252, 1213, 50, 1233],
             6: [1210, 1580, 1036, 480, 589, 1240, 1270, 3793, 780, 1610],
             7: [1193, 2858, 527, 1208, 858, 296, 1213, 111, 1233, 1247],
             8: [1610, 110, 356, 3753, 1, 2706, 2716, 1097, 2529, 1356],
             9: [2959, 296, 5952, 4226, 7153, 2571, 4993, 47, 318, 6874],
             10: [2908, 3735, 2160, 1358, 3267, 1794, 1274, 1407, 1249, 903],
             11: [296, 593, 1101, 2002, 1193, 8528, 2490, 858, 1304, 1673],
             12: [1527, 4799, 1079, 3265, 1298, 1356, 4816, 223, 1884, 3030],
             13: [1199, 1204, 1291, 919, 1084, 1265, 3897, 2716, 348, 2289],
             14:

In [48]:
# Movies that user_id=2 rated 4 or higher in the training data
movielens_train_high_rating.loc[movielens_train_high_rating["user_id"].eq(2)]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
8381,2,1210,4.0,868245644,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Sci-Fi]","[desert, fantasy, sci-fi, space, lucas, gfei o...",10.0


In [49]:
# Recommendations for user_id=2 (movie ids 1198, 1196, 1240)
movies.loc[movies["movie_id"].isin([1198, 1196, 1240])]

Unnamed: 0,movie_id,title,genre,tag
1171,1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Sci-Fi]","[lucas, george lucas, george lucas, gfei own i..."
1173,1198,Raiders of the Lost Ark (Indiana Jones and the...,"[Action, Adventure]","[egypt, lucas, seen more than once, dvd collec..."
1212,1240,"Terminator, The (1984)","[Action, Sci-Fi, Thriller]","[arnold schwarzenegger, sci-fi, time travel, d..."
