[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/RecommenderSystems/blob/main/chapter5/colab/LDA_content.ipynb)

# 잠재 디리클레 할당(Latent Dirichlet Allocation, LDA)

In [1]:
# Colab용 notebook입니다. 이 notebook 한 장에서 여러 데이터의 다운로드부터, 추천까지 완결하도록 되어 있습니다(예측 평가는 미포함)
# MovieLens 데이터를 아직 다운로드 하지 않았다면, 이 셀을 실행해서 다운로드합니다.
# MovieLens 데이터 분석은 data_download.ipynb를 참조합니다.

# 데이터 다운로드와 압축 풀기
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
!unzip -n ../data/ml-10m.zip -d ../data/

--2022-12-27 05:19:32--  https://files.grouplens.org/datasets/movielens/ml-10m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 65566137 (63M) [application/zip]
Saving to: ‘../data/ml-10m.zip’


2022-12-27 05:19:33 (106 MB/s) - ‘../data/ml-10m.zip’ saved [65566137/65566137]

Archive:  ../data/ml-10m.zip
   creating: ../data/ml-10M100K/
  inflating: ../data/ml-10M100K/allbut.pl  
  inflating: ../data/ml-10M100K/movies.dat  
  inflating: ../data/ml-10M100K/ratings.dat  
  inflating: ../data/ml-10M100K/README.html  
  inflating: ../data/ml-10M100K/split_ratings.sh  
  inflating: ../data/ml-10M100K/tags.dat  


In [2]:
# Movielens 데이터 로딩(데이터량이 많으므로, 로딩에 시간이 걸릴 수 있습니다)
import pandas as pd

# movieID와 제목만 사용
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv('../data/ml-10M100K/movies.dat', names=m_cols, sep='::' , encoding='latin-1', engine='python')

# genre를 list 형식으로 저장한다
movies['genre'] = movies.genre.apply(lambda x:x.split('|'))


# 사용자가 부여한 영화의 태그 정보를 로딩한다
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv('../data/ml-10M100K/tags.dat', names=t_cols, sep='::', engine='python')

# tag를 소문자로 바꾼다
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()


# tag를 영화별로 list 형식으로 저장한다
movie_tags = user_tagged_movies.groupby('movie_id').agg({'tag':list})

# 태그 정보를 결합한다
movies = movies.merge(movie_tags, on='movie_id', how='left')

# 평갓값 데이터만 로딩한다
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('../data/ml-10M100K/ratings.dat', names=r_cols, sep='::', engine='python')


# 데이터량이 많으므로 사용자수를 1000으로 줄여서 시험해본다
valid_user_ids = sorted(ratings.user_id.unique())[:1000]
ratings = ratings[ratings["user_id"].isin(valid_user_ids)]


# 영화 데이터와 평가 데이터를 결합한다
movielens = ratings.merge(movies, on='movie_id')

print(f'unique_users={len(movielens.user_id.unique())}, unique_movies={len(movielens.movie_id.unique())}')

# 학습용과 데이터용으로 데이터를 나눈다
# 각 사용자의 최근 5건의 영화를 평가용으로 사용하고, 나머지는 학습용으로 사용한다
# 우선, 각 사용자가 평가한 영화의 순서를 계산한다
# 최근 부여한 영화부터 순서를 부여한다(1에서 시작)

movielens['timestamp_rank'] = movielens.groupby(
    'user_id')['timestamp'].rank(ascending=False, method='first')
movielens_train = movielens[movielens['timestamp_rank'] > 5]
movielens_test = movielens[movielens['timestamp_rank']<= 5]

unique_users=1000, unique_movies=6736


In [3]:
# 인자 수
factors = 50
# 에폭 수
n_epochs = 30

In [4]:
movie_content = movies.copy()
# tag가 부여되어 있지 않은 영화가 있지만, genre는 모든 영화에 부여되어 있다
# tag와 genre를 결합한 것을 영화 콘텐츠 정보로 하여 비슷한 영화를 찾아서 추천한다
# tag가 없는 영화는 NaN으로 되어 있으므로, 빈 리스트로 변환한 뒤 처리한다
movie_content["tag_genre"] = movie_content["tag"].fillna("").apply(list) + movie_content["genre"].apply(list)
movie_content["tag_genre"] = movie_content["tag_genre"].apply(lambda x: list(map(str, x)))

# 태그와 장르 데이터를 사용해 lda를 학습한다
tag_genre_data = movie_content.tag_genre.tolist()

In [5]:
!pip install gensim==4.0.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==4.0.1
  Downloading gensim-4.0.1-cp38-cp38-manylinux1_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 1.4 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.1


In [6]:
from gensim.corpora.dictionary import Dictionary

# LDA의 입력으로 사용할 데이터를 작성한다
common_dictionary = Dictionary(tag_genre_data)
common_corpus = [common_dictionary.doc2bow(text) for text in tag_genre_data]




In [7]:
import gensim

# LDA 학습
lda_model = gensim.models.LdaModel(
    common_corpus, id2word=common_dictionary, num_topics=factors, passes=n_epochs
)

In [8]:
# topic0인 단어 목록
for token_id, score in lda_model.get_topic_terms(0, topn=10):
    word = common_dictionary.id2token[token_id]
    print(f'word={word}, score={score}')

word=coen brothers, score=0.06292896717786789
word=bechdel test:fail, score=0.04510311409831047
word=based on book, score=0.04402820020914078
word=philip k. dick, score=0.04182061925530434
word=kidnapping, score=0.03534778952598572
word=quirky, score=0.02876952663064003
word=david lynch, score=0.022975284606218338
word=fascism, score=0.01680714637041092
word=sure thing, score=0.015899477526545525
word=robert altman, score=0.015737345442175865


In [9]:
# samurai라는 단어의 토픽(각 토픽에 소속될 확률)
lda_model[common_dictionary.doc2bow(['samurai'])]

[(0, 0.01000004),
 (1, 0.01000004),
 (2, 0.01000004),
 (3, 0.01000004),
 (4, 0.01000004),
 (5, 0.01000004),
 (6, 0.01000004),
 (7, 0.01000004),
 (8, 0.01000004),
 (9, 0.01000004),
 (10, 0.01000004),
 (11, 0.01000004),
 (12, 0.01000004),
 (13, 0.01000004),
 (14, 0.01000004),
 (15, 0.01000004),
 (16, 0.01000004),
 (17, 0.01000004),
 (18, 0.01000004),
 (19, 0.01000004),
 (20, 0.01000004),
 (21, 0.01000004),
 (22, 0.01000004),
 (23, 0.01000004),
 (24, 0.01000004),
 (25, 0.01000004),
 (26, 0.01000004),
 (27, 0.01000004),
 (28, 0.01000004),
 (29, 0.01000004),
 (30, 0.01000004),
 (31, 0.01000004),
 (32, 0.509998),
 (33, 0.01000004),
 (34, 0.01000004),
 (35, 0.01000004),
 (36, 0.01000004),
 (37, 0.01000004),
 (38, 0.01000004),
 (39, 0.01000004),
 (40, 0.01000004),
 (41, 0.01000004),
 (42, 0.01000004),
 (43, 0.01000004),
 (44, 0.01000004),
 (45, 0.01000004),
 (46, 0.01000004),
 (47, 0.01000004),
 (48, 0.01000004),
 (49, 0.01000004)]

In [10]:
# 각 영화의 토픽을 저장한다
lda_topics = lda_model[common_corpus]

# 각 영화에 가장 확률이 높은 토픽을 1개 추출해서 저장한다
movie_topics = []
movie_topic_scores = []
for movie_index, lda_topic in enumerate(lda_topics):
    sorted_topic = sorted(lda_topics[movie_index], key=lambda x: -x[1])
    # 가장 확률이 높은 토픽
    movie_topic, topic_score = sorted_topic[0]
    movie_topics.append(movie_topic)
    movie_topic_scores.append(topic_score)
movie_content["topic"] = movie_topics
movie_content["topic_score"] = movie_topic_scores
movie_content

Unnamed: 0,movie_id,title,genre,tag,tag_genre,topic,topic_score
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[pixar, pixar, pixar, animation, pixar, animat...","[pixar, pixar, pixar, animation, pixar, animat...",34,0.777594
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[for children, game, animals, joe johnston, ro...","[for children, game, animals, joe johnston, ro...",40,0.275520
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[funniest movies, comedinha de velhinhos engra...","[funniest movies, comedinha de velhinhos engra...",7,0.501525
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",[girl movie],"[girl movie, Comedy, Drama, Romance]",49,0.755000
4,5,Father of the Bride Part II (1995),[Comedy],"[steve martin, pregnancy, remake, steve martin...","[steve martin, pregnancy, remake, steve martin...",34,0.692084
...,...,...,...,...,...,...,...
10676,65088,Bedtime Stories (2008),"[Adventure, Children, Comedy]",,"[Adventure, Children, Comedy]",34,0.755000
10677,65091,Manhattan Melodrama (1934),"[Crime, Drama, Romance]",,"[Crime, Drama, Romance]",49,0.456076
10678,65126,Choke (2008),"[Comedy, Drama]","[chuck palahniuk, based on book]","[chuck palahniuk, based on book, Comedy, Drama]",49,0.505002
10679,65130,Revolutionary Road (2008),"[Drama, Romance]",[toplist08],"[toplist08, Drama, Romance]",49,0.673333


In [11]:
from collections import defaultdict
from collections import Counter

# 각 사용자에 대한 추천 리스트를 작성한다
# 사용자가 높게 평가한 영화가, 어떤 토픽에 많이 소속되어 있는지 카운트한다
# 가장 많은 토픽을 사용자가 좋아하는 토픽으로 간주하고, 해당 토픽의 영화를 추천한다

movielens_train_high_rating = movielens_train[movielens_train.rating >= 4]
user_evaluated_movies = movielens_train.groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()

movie_id2index = dict(zip(movie_content.movie_id.tolist(), range(len(movie_content))))
pred_user2items = defaultdict(list)
for user_id, data in movielens_train_high_rating.groupby("user_id"):
    # 사용자가 높이 평가한 영화
    evaluated_movie_ids = user_evaluated_movies[user_id]
    # 최근 열람한 영화를 얻는다
    movie_ids = data.sort_values("timestamp")["movie_id"].tolist()[-10:]

    movie_indexes = [movie_id2index[id] for id in movie_ids]
    
    # 최근 열람한 영화의 토픽을 얻고, 풀현 횟수를 카운트한다
    topic_counter = Counter([movie_topics[i] for i in movie_indexes])
    # 가장 출현 횟수가 많았던 토픽을 얻는다
    frequent_topic = topic_counter.most_common(1)[0][0]
    # 해당 토픽의 영화 중에서도 점수가 높은 것을 추천한다
    topic_movies = (
        movie_content[movie_content.topic == frequent_topic]
        .sort_values("topic_score", ascending=False)
        .movie_id.tolist()
    )

    for movie_id in topic_movies:
        if movie_id not in evaluated_movie_ids:
            pred_user2items[user_id].append(movie_id)
        if len(pred_user2items[user_id]) == 10:
            break
pred_user2items

defaultdict(list,
            {1: [1210, 1196, 260, 5378, 33493, 780, 2628, 1200, 1580, 1584],
             2: [1196, 5378, 33493, 2628, 1200, 1580, 377, 1584, 1438, 788],
             3: [48161, 993, 930, 2178, 524, 2375, 904, 2186, 290, 43710],
             4: [1210, 1196, 260, 5378, 33493, 780, 2628, 1200, 1580, 1584],
             5: [8749, 6921, 3134, 7176, 6649, 8126, 4046, 7238, 26151, 6985],
             6: [6152, 7236, 3702, 2528, 4460, 6678, 4166, 2117, 1527, 1682],
             7: [25766, 910, 116, 5231, 2136, 5682, 2937, 464, 7080, 2494],
             8: [3683, 4881, 1394, 5756, 33823, 2583, 76, 663, 1464, 45028],
             9: [1513, 45221, 4452, 5802, 1285, 6807, 1602, 5439, 6816, 4102],
             10: [8749, 6921, 3134, 7176, 6649, 8126, 4046, 7238, 26151, 6985],
             11: [48161, 993, 930, 2178, 524, 2375, 904, 2186, 290, 43710],
             12: [1210, 1196, 260, 5378, 33493, 1200, 1580, 377, 1584, 1438],
             13: [260, 5378, 33493, 780, 2628, 1200, 

In [12]:
# user_id=2인 사용자가 학습 데이터에서 4 이상의 평가를 부여한 영화 목록
movielens_train_high_rating[movielens_train_high_rating.user_id==2]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
8381,2,1210,4.0,868245644,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Sci-Fi]","[desert, fantasy, sci-fi, space, lucas, gfei o...",10.0


In [13]:
# user_id=2에 대한 추천(2115, 76, 2006)
movies[movies.movie_id.isin([2115, 76, 2006])]

Unnamed: 0,movie_id,title,genre,tag
75,76,Screamers (1995),"[Action, Sci-Fi, Thriller]","[philip k. dick, artificial intelligence, post..."
1922,2006,"Mask of Zorro, The (1998)","[Action, Adventure, Romance]","[california, mexico, funny, banderas, anthony ..."
2031,2115,Indiana Jones and the Temple of Doom (1984),"[Action, Adventure]","[lucas, want it, dvd collection, harrison ford..."


In [14]:
# 여기에서는 각 사용자의 평가 이력으로부터 1개의 토픽을 픽업했지만, 토픽의 확률값을 사용해서 가중치를 주면서 아이템을 추출할 수도 있습니다.