[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/RecommenderSystems/blob/main/chapter5/colab/LDA_collaboration.ipynb)

# LDA를 행동 데이터에 적용

In [1]:
# Colab용 notebook입니다. 이 notebook 한 장에서 여러 데이터의 다운로드부터, 추천까지 완결하도록 되어 있습니다(예측 평가는 미포함)
# MovieLens 데이터를 아직 다운로드 하지 않았다면, 이 셀을 실행해서 다운로드합니다.
# MovieLens 데이터 분석은 data_download.ipynb를 참조합니다.

# 데이터 다운로드와 압축 풀기
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
!unzip -n ../data/ml-10m.zip -d ../data/

--2022-12-27 05:15:58--  https://files.grouplens.org/datasets/movielens/ml-10m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 65566137 (63M) [application/zip]
Saving to: ‘../data/ml-10m.zip’


2022-12-27 05:15:59 (63.6 MB/s) - ‘../data/ml-10m.zip’ saved [65566137/65566137]

Archive:  ../data/ml-10m.zip
   creating: ../data/ml-10M100K/
  inflating: ../data/ml-10M100K/allbut.pl  
  inflating: ../data/ml-10M100K/movies.dat  
  inflating: ../data/ml-10M100K/ratings.dat  
  inflating: ../data/ml-10M100K/README.html  
  inflating: ../data/ml-10M100K/split_ratings.sh  
  inflating: ../data/ml-10M100K/tags.dat  


In [2]:
# Movielens 데이터 로딩(데이터량이 많으므로, 로딩에 시간이 걸릴 수 있습니다)
import pandas as pd

# movieID와 제목만 사용
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv('../data/ml-10M100K/movies.dat', names=m_cols, sep='::' , encoding='latin-1', engine='python')

# genre를 list 형식으로 저장한다
movies['genre'] = movies.genre.apply(lambda x:x.split('|'))


# 사용자가 부여한 영화의 태그 정보를 로딩한다
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv('../data/ml-10M100K/tags.dat', names=t_cols, sep='::', engine='python')

# tag를 소문자로 바꾼다
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()


# tag를 영화별로 list 형식으로 저장한다
movie_tags = user_tagged_movies.groupby('movie_id').agg({'tag':list})

# 태그 정보를 결합한다
movies = movies.merge(movie_tags, on='movie_id', how='left')

# 평갓값 데이터만 로딩한다
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('../data/ml-10M100K/ratings.dat', names=r_cols, sep='::', engine='python')


# 데이터량이 많으므로 사용자수를 1000으로 줄여서 시험해본다
valid_user_ids = sorted(ratings.user_id.unique())[:1000]
ratings = ratings[ratings["user_id"].isin(valid_user_ids)]


# 영화 데이터와 평가 데이터를 결합한다
movielens = ratings.merge(movies, on='movie_id')

print(f'unique_users={len(movielens.user_id.unique())}, unique_movies={len(movielens.movie_id.unique())}')

# 학습용과 데이터용으로 데이터를 나눈다
# 각 사용자의 최근 5건의 영화를 평가용으로 사용하고, 나머지는 학습용으로 사용한다
# 우선, 각 사용자가 평가한 영화의 순서를 계산한다
# 최근 부여한 영화부터 순서를 부여한다(1에서 시작)

movielens['timestamp_rank'] = movielens.groupby(
    'user_id')['timestamp'].rank(ascending=False, method='first')
movielens_train = movielens[movielens['timestamp_rank'] > 5]
movielens_test = movielens[movielens['timestamp_rank']<= 5]

unique_users=1000, unique_movies=6736


In [3]:
# 인자 수
factors = 50
# 에폭 수
n_epochs = 30

In [4]:
!pip install gensim==4.0.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==4.0.1
  Downloading gensim-4.0.1-cp38-cp38-manylinux1_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 1.6 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.1


In [5]:
from gensim.corpora.dictionary import Dictionary

# LDA 입력으로 사용할 데이터를 작성한다
lda_data = []
movielens_train_high_rating = movielens_train[movielens_train.rating >= 4]
for user_id, data in movielens_train_high_rating.groupby("user_id"):
    lda_data.append(data["movie_id"].apply(str).tolist())

common_dictionary = Dictionary(lda_data)
common_corpus = [common_dictionary.doc2bow(text) for text in lda_data]




In [6]:
import gensim
lda_model = gensim.models.LdaModel(
    common_corpus, id2word=common_dictionary, num_topics=factors, passes=n_epochs
)

In [7]:
# 각 사용자의 소속 토픽이 저장된다
lda_topics = lda_model[common_corpus]

# 예: 어떤 사용자의 소속 토픽
lda_topics[0]

[(2, 0.13143347), (42, 0.81523174)]

In [8]:
# topic0인 영화 목록
for token_id, score in lda_model.get_topic_terms(0, topn=10):
    movie_id = int(common_dictionary.id2token[token_id])
    title = movies[movies.movie_id == movie_id].title.tolist()[0]
    print(f'movie_id={movie_id}, title={title}, score={score}')

movie_id=2858, title=American Beauty (1999), score=0.014902905561029911
movie_id=4022, title=Cast Away (2000), score=0.013141673058271408
movie_id=356, title=Forrest Gump (1994), score=0.009344562888145447
movie_id=1625, title=Game, The (1997), score=0.009039949625730515
movie_id=3408, title=Erin Brockovich (2000), score=0.008023954927921295
movie_id=1610, title=Hunt for Red October, The (1990), score=0.007280151825398207
movie_id=2762, title=Sixth Sense, The (1999), score=0.007244058884680271
movie_id=1833, title=Mercury Rising (1998), score=0.007027590647339821
movie_id=50, title=Usual Suspects, The (1995), score=0.00636146729812026
movie_id=2706, title=American Pie (1999), score=0.006130032241344452


In [9]:
# 스타워즈/에피소드 5(movie_id=1196)의 토픽(각 토픽에 소속될 확률)
lda_model[common_dictionary.doc2bow(["1196"])]

[(0, 0.010000083),
 (1, 0.010000083),
 (2, 0.010000083),
 (3, 0.010000083),
 (4, 0.010000083),
 (5, 0.010000083),
 (6, 0.010000083),
 (7, 0.010000083),
 (8, 0.010000083),
 (9, 0.010000083),
 (10, 0.010000083),
 (11, 0.010000083),
 (12, 0.010000083),
 (13, 0.010000083),
 (14, 0.010000083),
 (15, 0.010000083),
 (16, 0.010000083),
 (17, 0.010000083),
 (18, 0.010000083),
 (19, 0.010000083),
 (20, 0.010000083),
 (21, 0.010000083),
 (22, 0.010000083),
 (23, 0.010000083),
 (24, 0.010000083),
 (25, 0.010000083),
 (26, 0.010000083),
 (27, 0.010000083),
 (28, 0.010000083),
 (29, 0.010000083),
 (30, 0.010000083),
 (31, 0.010000083),
 (32, 0.010000083),
 (33, 0.010000083),
 (34, 0.010000083),
 (35, 0.010000083),
 (36, 0.010000083),
 (37, 0.010000083),
 (38, 0.010000083),
 (39, 0.010000083),
 (40, 0.010000083),
 (41, 0.50999594),
 (42, 0.010000083),
 (43, 0.010000083),
 (44, 0.010000083),
 (45, 0.010000083),
 (46, 0.010000083),
 (47, 0.010000083),
 (48, 0.010000083),
 (49, 0.010000083)]

In [10]:
from collections import defaultdict

# 각 사용자에 대한 추천 리스트를 작성한다
# 각 사용자의 소속 확률이 가장 높은 토픽을 얻고, 해당 토픽 안에서 확률이 높은 아이템을 저장해 나간다

user_evaluated_movies = movielens_train.groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()

pred_user2items = defaultdict(list)
for i, (user_id, data) in enumerate(movielens_train_high_rating.groupby("user_id")):
    evaluated_movie_ids = user_evaluated_movies[user_id]
    # 사용자의 소속 확률이 가장 높은 토픽을 얻는다
    user_topic = sorted(lda_topics[i], key=lambda x: -x[1])[0][0]
    # 해당 토픽 안에서 확률이 높은 아이템을 얻는다
    topic_movies = lda_model.get_topic_terms(user_topic, topn=len(movies))

    for token_id, score in topic_movies:
        movie_id = int(common_dictionary.id2token[token_id])
        if movie_id not in evaluated_movie_ids:
            pred_user2items[user_id].append(movie_id)
        if len(pred_user2items[user_id]) == 10:
            break

pred_user2items

defaultdict(list,
            {1: [457, 380, 593, 150, 349, 590, 318, 165, 110, 296],
             2: [457, 380, 593, 150, 480, 349, 589, 318, 165, 588],
             3: [2571, 4993, 4963, 3147, 3578, 5349, 1580, 356, 2762, 2273],
             4: [457, 593, 318, 296, 356, 10, 47, 50, 1, 474],
             5: [1193, 904, 1252, 750, 1208, 908, 913, 1136, 260, 1304],
             6: [1240, 1210, 589, 1200, 1214, 541, 593, 1374, 1356, 1617],
             7: [1193, 858, 750, 1208, 527, 111, 1247, 1221, 1136, 1206],
             8: [3481, 4034, 4973, 1258, 2692, 7361, 2502, 5902, 296, 4262],
             9: [260, 1196, 1240, 1210, 589, 1200, 2571, 1198, 1214, 541],
             10: [1465, 3082, 1610, 474, 2804, 4709, 3469, 2197, 1792, 2989],
             11: [1307, 1197, 1079, 1234, 1278, 1220, 2797, 1968, 1259, 2791],
             12: [720, 1265, 852, 2891, 3254, 4448, 4002, 608, 4223, 4963],
             13: [2571, 4963, 3147, 5349, 6539, 356, 2762, 2273, 6377, 4701],
             14: [257

In [11]:
# user_id=2인 사용자가 학습 데이터에서 4 이상의 평가를 부여한 영화 목록
movielens_train_high_rating[movielens_train_high_rating.user_id==2]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
8381,2,1210,4.0,868245644,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Sci-Fi]","[desert, fantasy, sci-fi, space, lucas, gfei o...",10.0


In [12]:
# user_id=2에 대한 추천(1198, 1196, 1240)
movies[movies.movie_id.isin([1198, 1196, 1240])]

Unnamed: 0,movie_id,title,genre,tag
1171,1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Sci-Fi]","[lucas, george lucas, george lucas, gfei own i..."
1173,1198,Raiders of the Lost Ark (Indiana Jones and the...,"[Action, Adventure]","[egypt, lucas, seen more than once, dvd collec..."
1212,1240,"Terminator, The (1984)","[Action, Sci-Fi, Thriller]","[arnold schwarzenegger, sci-fi, time travel, d..."
