[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/RecommenderSystems/blob/main/chapter5/colab/LDA_collaboration.ipynb)

# Latent Dirichlet Allocation (LDA)の行動データへの適用

In [3]:
#①データのアップロードと解凍
#一時的にデータサーバがアクセスできなくなってしまったため、手動でデータをアップロードして処理しています
!mkdir ../data

In [4]:
!unzip -n ../data/ml-10m.zip -d ../data/

Archive:  ../data/ml-10m.zip
   creating: ../data/ml-10M100K/
  inflating: ../data/ml-10M100K/allbut.pl  
  inflating: ../data/ml-10M100K/movies.dat  
  inflating: ../data/ml-10M100K/ratings.dat  
  inflating: ../data/ml-10M100K/README.html  
  inflating: ../data/ml-10M100K/split_ratings.sh  
  inflating: ../data/ml-10M100K/tags.dat  


In [None]:
# Colab用のnotebookです。このnotebook1枚でデータのダウンロードから、レコメンドまで完結するようになっています。（予測評価は含めていません。）
# MovieLensデータがまだダウンロードされてなければこのセルを実行して、ダウンロードしてください
# MovieLensデータの分析は、data_download.ipynbをご参照ください

# データのダウンロードと解凍
#!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
#!unzip -n ../data/ml-10m.zip -d ../data/

--2026-02-02 19:18:05--  https://files.grouplens.org/datasets/movielens/ml-10m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.96.204
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:443... ^C
unzip:  cannot find or open ../data/ml-10m.zip, ../data/ml-10m.zip.zip or ../data/ml-10m.zip.ZIP.


In [20]:
# ②Movielensのデータの読み込み（データ量が多いため、読み込みに時間がかかる場合があります）
import pandas as pd

# movieIDとタイトル名のみ使用
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv('../data/ml-10M100K/movies.dat', names=m_cols, sep='::' , encoding='latin-1', engine='python')

# genreをlist形式で保持する
movies['genre'] = movies.genre.apply(lambda x:x.split('|'))


# ユーザが付与した映画のタグ情報の読み込み
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv('../data/ml-10M100K/tags.dat', names=t_cols, sep='::', engine='python')

# tagを小文字にする
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()


# tagを映画ごとにlist形式で保持する
movie_tags = user_tagged_movies.groupby('movie_id').agg({'tag':list})

# タグ情報を結合する
movies = movies.merge(movie_tags, on='movie_id', how='left')

# 評価値データの読み込み
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('../data/ml-10M100K/ratings.dat', names=r_cols, sep='::', engine='python')


# データ量が多いため、ユーザー数を1000に絞って、試していく
valid_user_ids = sorted(ratings.user_id.unique())[:1000]
ratings = ratings[ratings["user_id"].isin(valid_user_ids)]


# 映画のデータと評価のデータを結合する
movielens = ratings.merge(movies, on='movie_id')

print(f'unique_users={len(movielens.user_id.unique())}, unique_movies={len(movielens.movie_id.unique())}')

# 学習用とテスト用にデータを分割する
# 各ユーザの直近の５件の映画を評価用に使い、それ以外を学習用とする
# まずは、それぞれのユーザが評価した映画の順序を計算する
# 直近付与した映画から順番を付与していく(1始まり)

movielens['timestamp_rank'] = movielens.groupby(
    'user_id')['timestamp'].rank(ascending=False, method='first')
movielens_train = movielens[movielens['timestamp_rank'] > 5]
movielens_test = movielens[movielens['timestamp_rank']<= 5]

unique_users=1000, unique_movies=6736


Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
1,1,185,5.0,838983525,"Net, The (1995)","[Action, Crime, Thriller]","[computers, computers, internet, irwin winkler...",17.0
2,1,231,5.0,838983392,Dumb & Dumber (1994),[Comedy],"[jeff daniels, jim carrey, stupid, jim carrey,...",19.0
3,1,292,5.0,838983421,Outbreak (1995),"[Action, Drama, Sci-Fi, Thriller]","[biology, gross, disease, futuristmovies.com, ...",18.0
4,1,316,5.0,838983392,Stargate (1994),"[Action, Adventure, Sci-Fi]","[egypt, space, time travel, time travel, alien...",20.0
5,1,329,5.0,838983392,Star Trek: Generations (1994),"[Action, Adventure, Drama, Sci-Fi]","[far future, space, trekie, enterprise, futuri...",21.0
...,...,...,...,...,...,...,...,...
132825,1053,33794,5.0,1134008301,Batman Begins (2005),"[Action, Crime]","[super-hero, prequel, gfei own it, dvd, christ...",16.0
132826,1053,34162,5.0,1134007983,Wedding Crashers (2005),"[Comedy, Romance]","[bit overrated, false identity, funeral, gay c...",66.0
132827,1053,34319,3.5,1134007773,"Island, The (2005)","[Action, Drama, Sci-Fi, Thriller]","[biology, genetics, freedom, slavery, scarlett...",85.0
132828,1053,35836,5.0,1134008021,"40 Year Old Virgin, The (2005)","[Comedy, Romance]","[erection, funny, romantic comedy, steve carel...",56.0


In [29]:
import pandas as pd

user_ratings_df = pd.read_csv('/content/myselfdata.csv')

new_movielens_train = pd.concat([movielens_train, user_ratings_df], ignore_index=True)
display(new_movielens_train)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
0,1,185,5.0,8.389835e+08,"Net, The (1995)","[Action, Crime, Thriller]","[computers, computers, internet, irwin winkler...",17.0
1,1,231,5.0,8.389834e+08,Dumb & Dumber (1994),[Comedy],"[jeff daniels, jim carrey, stupid, jim carrey,...",19.0
2,1,292,5.0,8.389834e+08,Outbreak (1995),"[Action, Drama, Sci-Fi, Thriller]","[biology, gross, disease, futuristmovies.com, ...",18.0
3,1,316,5.0,8.389834e+08,Stargate (1994),"[Action, Adventure, Sci-Fi]","[egypt, space, time travel, time travel, alien...",20.0
4,1,329,5.0,8.389834e+08,Star Trek: Generations (1994),"[Action, Adventure, Drama, Sci-Fi]","[far future, space, trekie, enterprise, futuri...",21.0
...,...,...,...,...,...,...,...,...
127849,-1,7,4.0,1.770030e+09,Sabrina (1995),"['Comedy', 'Romance']","['based on a play', 'greg kinnear', 'remake', ...",20.0
127850,-1,902,5.0,1.770030e+09,Breakfast at Tiffany's (1961),"['Drama', 'Romance']","['classic', 'capote', '1960s', 'apartment', 'b...",21.0
127851,-1,3462,3.5,1.770030e+09,Modern Times (1936),"['Comedy', 'Romance']","['black and white', 'close to perfect', 'facto...",22.0
127852,-1,741,1.5,1.770030e+09,Ghost in the Shell (KÃ´kaku kidÃ´tai) (1995),"['Animation', 'Sci-Fi']","['anime', 'anime', 'complex', 'japan', 'techno...",23.0


In [30]:
#③LDAの学習に必要なパラメタを定義
# 因子数
factors = 50
# エポック数
n_epochs = 30

In [31]:
#④ColabのPythonにgensimをインストールする
!pip install gensim



In [33]:
from gensim.corpora.dictionary import Dictionary

# ⑤LDAのインプットに使うデータの作成
lda_data = []
movielens_train_high_rating = new_movielens_train[new_movielens_train.rating >= 4]
for user_id, data in movielens_train_high_rating.groupby("user_id"):
    lda_data.append(data["movie_id"].apply(str).tolist())

common_dictionary = Dictionary(lda_data)
common_corpus = [common_dictionary.doc2bow(text) for text in lda_data]


In [34]:
import gensim

# ⑥LDAの学習（潜在トピックを洗い出す）
lda_model = gensim.models.LdaModel(
    common_corpus, id2word=common_dictionary, num_topics=factors, passes=n_epochs
)

In [35]:
# ⑦各ユーザーの潜在トピックが格納される
lda_topics = lda_model[common_corpus]

# 例: とあるユーザーの潜在トピック
lda_topics[0]

[(17, np.float32(0.92999405))]

In [36]:
# topic0の映画一覧（LDAが見つけた潜在トピックを構成する映画を確認）
for token_id, score in lda_model.get_topic_terms(0, topn=10):
    movie_id = int(common_dictionary.id2token[token_id])
    title = movies[movies.movie_id == movie_id].title.tolist()[0]
    print(f'movie_id={movie_id}, title={title}, score={score}')

movie_id=3578, title=Gladiator (2000), score=0.009941140189766884
movie_id=2571, title=Matrix, The (1999), score=0.009685342200100422
movie_id=589, title=Terminator 2: Judgment Day (1991), score=0.00846150517463684
movie_id=2028, title=Saving Private Ryan (1998), score=0.008238798007369041
movie_id=2762, title=Sixth Sense, The (1999), score=0.008028590120375156
movie_id=110, title=Braveheart (1995), score=0.007239874918013811
movie_id=2959, title=Fight Club (1999), score=0.007199016399681568
movie_id=780, title=Independence Day (a.k.a. ID4) (1996), score=0.007188891526311636
movie_id=3147, title=Green Mile, The (1999), score=0.006431619171053171
movie_id=1704, title=Good Will Hunting (1997), score=0.006318614352494478


In [37]:
# スターウォーズ・エピーソード５(movie_id=1196）のトピック(各トピックの潜在確率)
lda_model[common_dictionary.doc2bow(["1196"])]

[(0, np.float32(0.010000075)),
 (1, np.float32(0.010000075)),
 (2, np.float32(0.010000075)),
 (3, np.float32(0.010000075)),
 (4, np.float32(0.010000075)),
 (5, np.float32(0.010000075)),
 (6, np.float32(0.010000075)),
 (7, np.float32(0.010000075)),
 (8, np.float32(0.5099967)),
 (9, np.float32(0.010000075)),
 (10, np.float32(0.010000075)),
 (11, np.float32(0.010000075)),
 (12, np.float32(0.010000075)),
 (13, np.float32(0.010000075)),
 (14, np.float32(0.010000075)),
 (15, np.float32(0.010000075)),
 (16, np.float32(0.010000075)),
 (17, np.float32(0.010000075)),
 (18, np.float32(0.010000075)),
 (19, np.float32(0.010000075)),
 (20, np.float32(0.010000075)),
 (21, np.float32(0.010000075)),
 (22, np.float32(0.010000075)),
 (23, np.float32(0.010000075)),
 (24, np.float32(0.010000075)),
 (25, np.float32(0.010000075)),
 (26, np.float32(0.010000075)),
 (27, np.float32(0.010000075)),
 (28, np.float32(0.010000075)),
 (29, np.float32(0.010000075)),
 (30, np.float32(0.010000075)),
 (31, np.float32(0.0

In [42]:
from collections import defaultdict

# ⑧各ユーザーにレコメンドリストの作成
# そのユーザーの最も確率の高い潜在トピックについて、そのトピックに属する映画を格納していく

user_evaluated_movies = new_movielens_train.groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()

pred_user2items = defaultdict(list)
for i, (user_id, data) in enumerate(movielens_train_high_rating.groupby("user_id")):
    evaluated_movie_ids = user_evaluated_movies[user_id]
    # ユーザーの潜在確率が一番高いトピックを取得
    user_topic = sorted(lda_topics[i], key=lambda x: -x[1])[0][0]
    # そのトピック内で、確率の高いアイテムを取得
    topic_movies = lda_model.get_topic_terms(user_topic, topn=len(movies))

    for token_id, score in topic_movies:
        movie_id = int(common_dictionary.id2token[token_id])
        if movie_id not in evaluated_movie_ids:
            pred_user2items[user_id].append(movie_id)
        if len(pred_user2items[user_id]) == 10:
            break

pred_user2items[-1]

[1196, 1373, 260, 1240, 480, 1127, 1907, 2735, 1702, 8405]

In [44]:
# user_id=-1に対するおすすめ
movies[movies.movie_id.isin([1196, 1373, 260, 1240, 480, 1127, 1907, 2735, 1702, 8405])]

Unnamed: 0,movie_id,title,genre,tag
257,260,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo..."
476,480,Jurassic Park (1993),"[Action, Adventure, Sci-Fi, Thriller]","[based on a book, biology, michael crichton, s..."
1104,1127,"Abyss, The (1989)","[Action, Adventure, Sci-Fi, Thriller]","[aliens, ocean, submarine, dvd collection, to ..."
1171,1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Sci-Fi]","[lucas, george lucas, george lucas, gfei own i..."
1212,1240,"Terminator, The (1984)","[Action, Sci-Fi, Thriller]","[arnold schwarzenegger, sci-fi, time travel, d..."
1342,1373,Star Trek V: The Final Frontier (1989),"[Action, Sci-Fi]","[golden raspberry (worst actor), owned, based ..."
1643,1702,Flubber (1997),"[Children, Comedy, Fantasy]","[robin williams, absent-minded professor, robi..."
1823,1907,Mulan (1998),"[Adventure, Animation, Children, Comedy, Drama]","[2, fabulous, walt disney, disney, disney was ..."
2650,2735,"Golden Child, The (1986)","[Action, Adventure, Comedy, Fantasy, Mystery]",[don't remember]
7787,8405,Hour of the Wolf (Vargtimmen) (1968),"[Drama, Horror]","[who cares dvds, erlend's dvds, ingmar bergman..."


In [40]:
#(参考) 上記user_id=-1の学習データで、4以上の評価を付けた映画一覧
movielens_train_high_rating[movielens_train_high_rating.user_id==-1]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
127830,-1,4896,5.0,1770030000.0,Harry Potter and the Sorcerer's Stone (a.k.a. ...,"['Adventure', 'Children', 'Fantasy']","['based on a book', 'franchise', 'adventure', ...",1.0
127831,-1,5816,5.0,1770030000.0,Harry Potter and the Chamber of Secrets (2002),"['Adventure', 'Children', 'Fantasy']","['based on a book', 'franchise', 'harry potter...",2.0
127832,-1,8368,4.0,1770030000.0,Harry Potter and the Prisoner of Azkaban (2004),"['Adventure', 'Children', 'Fantasy']","['based on a book', 'franchise', 'magic', 'har...",3.0
127833,-1,40815,4.5,1770030000.0,Harry Potter and the Goblet of Fire (2005),"['Adventure', 'Fantasy', 'Thriller']","['based on a book', 'big budget', 'franchise',...",4.0
127836,-1,5618,5.0,1770030000.0,Spirited Away (Sen to Chihiro no kamikakushi) ...,"['Adventure', 'Animation', 'Children', 'Fantasy']","['anime', ""holy christ it's fantastic in every...",7.0
127837,-1,1022,4.5,1770030000.0,Cinderella (1950),"['Animation', 'Children', 'Fantasy', 'Musical'...","['disney', 'disney', 'library vhs', 'fairy tal...",8.0
127839,-1,63239,5.0,1770030000.0,Cinderella (1997),"['Children', 'Fantasy', 'Musical', 'Romance']",,10.0
127840,-1,594,4.0,1770030000.0,Snow White and the Seven Dwarfs (1937),"['Animation', 'Children', 'Drama', 'Fantasy', ...","['disney', 'classic', 'disney', 'national film...",11.0
127844,-1,595,4.0,1770030000.0,Beauty and the Beast (1991),"['Animation', 'Children', 'Fantasy', 'Musical'...","['fairy tale', 'disney', 'disney', 'disney', '...",15.0
127848,-1,916,5.0,1770030000.0,Roman Holiday (1953),"['Comedy', 'Romance']","[""good in it's day"", 'classic', 'audrey hepbur...",19.0
