[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/RecommenderSystems/blob/main/chapter5/colab/Item2vec.ipynb)

# Item2vec

In [1]:
# This notebook is meant for Colab and is self-contained: it goes from downloading the data all the way to producing recommendations (evaluation of the predictions is not included).
# If the MovieLens data has not been downloaded yet, run this cell to download it.
# See data_download.ipynb for an analysis of the MovieLens data.

# Download the archive and unpack it
# wget: -nc skips the download when the file already exists, -P sets the target directory.
# NOTE(review): --no-check-certificate turns off TLS certificate verification for this download - confirm it is still required.
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
# unzip: -n never overwrites files that already exist, -d sets the extraction directory.
!unzip -n ../data/ml-10m.zip -d ../data/

--2022-12-27 05:13:14--  https://files.grouplens.org/datasets/movielens/ml-10m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 65566137 (63M) [application/zip]
Saving to: ‘../data/ml-10m.zip’


2022-12-27 05:13:21 (11.1 MB/s) - ‘../data/ml-10m.zip’ saved [65566137/65566137]

Archive:  ../data/ml-10m.zip
   creating: ../data/ml-10M100K/
  inflating: ../data/ml-10M100K/allbut.pl  
  inflating: ../data/ml-10M100K/movies.dat  
  inflating: ../data/ml-10M100K/ratings.dat  
  inflating: ../data/ml-10M100K/README.html  
  inflating: ../data/ml-10M100K/split_ratings.sh  
  inflating: ../data/ml-10M100K/tags.dat  


In [2]:
# Load the MovieLens data (the files are large, so loading can take a while)
import pandas as pd

# Every movies.dat record is read as three '::'-separated fields: movie id, title and genre
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    '../data/ml-10M100K/movies.dat',
    sep='::',
    names=m_cols,
    engine='python',
    encoding='latin-1',
)

# Replace the '|'-joined genre string with a list of genre names
movies['genre'] = movies['genre'].map(lambda genres: genres.split('|'))


# Load the tags that users attached to movies
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv(
    '../data/ml-10M100K/tags.dat',
    sep='::',
    names=t_cols,
    engine='python',
)

# Lower-case every tag
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()

# Collect the tags of each movie into a single list per movie_id
movie_tags = user_tagged_movies.groupby('movie_id')['tag'].agg(list).to_frame()

# Left join, so every movie is kept whether or not it has tags
movies = pd.merge(movies, movie_tags, how='left', on='movie_id')

# Load the ratings
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '../data/ml-10M100K/ratings.dat',
    sep='::',
    names=r_cols,
    engine='python',
)

# The full data set is large, so keep only the 1000 smallest user ids for this experiment
valid_user_ids = sorted(ratings['user_id'].unique())[:1000]
ratings = ratings.loc[ratings['user_id'].isin(valid_user_ids)]

# Attach the movie columns to every rating
movielens = pd.merge(ratings, movies, on='movie_id')

print(f'unique_users={len(movielens.user_id.unique())}, unique_movies={len(movielens.movie_id.unique())}')

# Split the data into a training set and a test set.
# For every user, the 5 most recently rated movies form the test set and the rest the training set.
# First rank each user's ratings from newest to oldest (the rank starts at 1 for the most recent one);
# method='first' breaks timestamp ties by order of appearance.

movielens['timestamp_rank'] = (
    movielens
    .groupby('user_id')['timestamp']
    .rank(method='first', ascending=False)
)
movielens_test = movielens.loc[movielens['timestamp_rank'] <= 5]
movielens_train = movielens.loc[movielens['timestamp_rank'] > 5]

unique_users=1000, unique_movies=6736


In [3]:
# Number of factors (passed to Word2Vec as vector_size)
factors = 100
# Number of training epochs
n_epochs = 30
# Window size
window = 100
# Skip-gram switch (passed to Word2Vec as sg)
use_skip_gram = 1
# Hierarchical softmax switch (passed to Word2Vec as hs)
use_hierarchial_softmax = 0
# Threshold on the number of occurrences a word needs in order to be used
min_count = 5

In [4]:
# Build the input for item2vec: one list of movie ids per user,
# restricted to the training ratings of 4 or higher
movielens_train_high_rating = movielens_train[movielens_train.rating >= 4]
# Inside each list the movies are ordered by rating time;
# item2vec has a window parameter, so the order in which items were rated matters
item2vec_data = [
    user_ratings.sort_values("timestamp")["movie_id"].tolist()
    for _user_id, user_ratings in movielens_train_high_rating.groupby("user_id")
]


In [5]:
!pip install gensim==4.0.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==4.0.1
  Downloading gensim-4.0.1-cp38-cp38-manylinux1_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 1.4 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.1


In [6]:
import gensim

# Train item2vec: a Word2Vec model whose "sentences" are the per-user movie-id lists.
# All hyperparameters come from the settings cell earlier in the notebook.
# NOTE(review): no seed or workers argument is passed, so gensim's defaults apply.
model = gensim.models.Word2Vec(
    sentences=item2vec_data,
    vector_size=factors,
    epochs=n_epochs,
    window=window,
    min_count=min_count,
    sg=use_skip_gram,
    hs=use_hierarchial_softmax,
)



In [7]:
# Movies most similar to Star Wars: Episode V (movie_id=1196).
# Episodes IV and VI come out on top, which shows that the model has learned something sensible.
for movie_id, score in model.wv.most_similar(1196):
    # Look the title up in the movies table by id
    title = movies.loc[movies['movie_id'] == movie_id, 'title'].iloc[0]
    print(f'movie_id={movie_id}, title={title}, score={score}')

movie_id=260, title=Star Wars: Episode IV - A New Hope (a.k.a. Star Wars) (1977), score=0.8508290648460388
movie_id=1210, title=Star Wars: Episode VI - Return of the Jedi (1983), score=0.839510977268219
movie_id=1198, title=Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981), score=0.8310489058494568
movie_id=2571, title=Matrix, The (1999), score=0.7958545684814453
movie_id=1240, title=Terminator, The (1984), score=0.7907897233963013
movie_id=541, title=Blade Runner (1982), score=0.7818757891654968
movie_id=1197, title=Princess Bride, The (1987), score=0.7788391709327698
movie_id=1200, title=Aliens (1986), score=0.7773821949958801
movie_id=1214, title=Alien (1979), score=0.7692050933837891
movie_id=1291, title=Indiana Jones and the Last Crusade (1989), score=0.7649016976356506


In [8]:
# Build the recommendation list for every user

pred_user2items = dict()
for user_id, user_ratings in movielens_train_high_rating.groupby("user_id"):
    rated_in_order = user_ratings.sort_values("timestamp")["movie_id"].tolist()
    # Only movies that are in the model's vocabulary can be used as query items
    input_data = [
        item_id for item_id in rated_in_order if item_id in model.wv.key_to_index
    ]
    if input_data:
        # The 10 movies most similar to the user's highly rated movies
        recommended_items = model.wv.most_similar(input_data, topn=10)
        pred_user2items[user_id] = [item for item, _score in recommended_items]
    else:
        # No recommendation can be computed for this user: store an empty list
        pred_user2items[user_id] = []

pred_user2items

{1: [380, 457, 110, 150, 590, 592, 318, 595, 165, 500],
 2: [589, 1196, 593, 356, 480, 1198, 457, 318, 296, 1291],
 3: [2571, 1196, 318, 1198, 527, 2028, 593, 4993, 3578, 5952],
 4: [457, 356, 380, 377, 318, 593, 10, 349, 296, 367],
 5: [1208, 1193, 318, 36, 904, 296, 1213, 750, 908, 1617],
 6: [1240, 1291, 1617, 1210, 541, 1036, 589, 593, 2858, 1214],
 7: [750, 1208, 1267, 910, 858, 1193, 1247, 1221, 1204, 541],
 8: [1210, 4993, 110, 260, 356, 2028, 5952, 318, 296, 3147],
 9: [1136, 750, 1617, 2791, 1089, 296, 1265, 1206, 2571, 50],
 10: [1207, 608, 1247, 318, 1271, 593, 1230, 1185, 300, 858],
 11: [1197, 1036, 457, 589, 1214, 110, 1200, 356, 1265, 541],
 12: [2571, 1196, 1240, 2028, 1291, 1210, 589, 1580, 1198, 1617],
 13: [1196, 1198, 1270, 1240, 2571, 110, 260, 356, 593, 589],
 14: [1198, 7153, 4306, 4886, 4993, 5952, 2571, 8360, 356, 912],
 16: [541, 2571, 1198, 1270, 750, 589, 1291, 1374, 1036, 1197],
 17: [1196, 750, 541, 904, 1208, 1617, 1240, 1221, 912, 1198],
 18: [1210, 50, 

In [9]:
# Movies that the user with user_id=2 rated 4 or higher in the training data
movielens_train_high_rating.query("user_id == 2")

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
8381,2,1210,4.0,868245644,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Sci-Fi]","[desert, fantasy, sci-fi, space, lucas, gfei o...",10.0


In [10]:
# Recommendations for user_id=2: look up the movies with ids 480, 1196 and 589
movies.loc[movies['movie_id'].isin([480, 1196, 589])]

Unnamed: 0,movie_id,title,genre,tag
476,480,Jurassic Park (1993),"[Action, Adventure, Sci-Fi, Thriller]","[based on a book, biology, michael crichton, s..."
583,589,Terminator 2: Judgment Day (1991),"[Action, Sci-Fi]","[action, sci-fi, dvd, seen more than once, tim..."
1171,1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Sci-Fi]","[lucas, george lucas, george lucas, gfei own i..."


In [11]:
# item2vec에서는 인자 수, 에폭 수, window 크기, 사용하는 단어의 출현 횟수의 임곗값 모두가 중요하므로, 그리드 서치를 통해 최적의 값을 결정합니다.