## Word2Vec 알고리즘


In [2]:
!pip3 install gensim

Collecting gensim
  Downloading gensim-3.8.3-cp37-cp37m-macosx_10_9_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 8.0 MB/s eta 0:00:011
Collecting smart-open>=1.8.1
  Downloading smart_open-4.0.1.tar.gz (117 kB)
[K     |████████████████████████████████| 117 kB 11.4 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py) ... [?25ldone
[?25h  Created wheel for smart-open: filename=smart_open-4.0.1-py3-none-any.whl size=108248 sha256=2680afbac03bc19a5340ea4849df5bf84bf2a2ef1492668843fdf8a41f839b81
  Stored in directory: /Users/oms1226/Library/Caches/pip/wheels/34/3d/14/f19c01a19c9201cdb6a76b049904d5226912569be919ad1eae
Successfully built smart-open
Installing collected packages: smart-open, gensim
Successfully installed gensim-3.8.3 smart-open-4.0.1


In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim 

In [5]:
import warnings
warnings.filterwarnings(action='ignore')

In [6]:
# 경로의 경우 각자의 환경에 맞게 설정해주면 됩니다. 
path = './movies/'

In [7]:
movie = pd.read_csv(path + 'ratings.csv', low_memory=False)
movie.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


In [8]:
movie = movie.sort_values(by='timestamp', ascending=True).reset_index(drop=True)
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,383,21,3.0,789652009
1,383,47,5.0,789652009
2,383,1079,3.0,789652009
3,409,21,5.0,828212412
4,409,25,4.0,828212412


In [9]:
# 영화의 Metadata를 불러와서 movieID에 맞는 TITLE을 구해줍니다. 
meta = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
meta.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [12]:
meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'movieId', 'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [17]:
meta = meta.rename(columns={'id':'movieId'})
movie['movieId'] = movie['movieId'].astype(str)
meta['movieId'] = meta['movieId'].astype(str)

movie = pd.merge(movie, meta[['movieId', 'original_title']], how='left', on='movieId')
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp,original_title_x,original_title_y,original_title_x.1,original_title_y.1,original_title
0,383,21,3.0,789652009,The Endless Summer,The Endless Summer,The Endless Summer,The Endless Summer,The Endless Summer
1,383,47,5.0,789652009,,,,,
2,383,1079,3.0,789652009,,,,,
3,409,21,5.0,828212412,The Endless Summer,The Endless Summer,The Endless Summer,The Endless Summer,The Endless Summer
4,409,25,4.0,828212412,Jarhead,Jarhead,Jarhead,Jarhead,Jarhead


In [24]:
movie = movie[movie['original_title'].notnull()].reset_index(drop=True)

In [25]:
agg = movie.groupby(['userId'])['original_title'].agg({'unique'})
agg.head()

Unnamed: 0_level_0,unique
userId,Unnamed: 1_level_1
1,"[Jay and Silent Bob Strike Back, Vivement dima..."
2,"[Terminator 3: Rise of the Machines, The Conve..."
3,"[300, The Killing, Shortbus, Finding Neverland..."
4,"[David, The Wedding Planner, Casablanca, Sleep..."
5,"[Gleaming the Cube, Cool Hand Luke, Hidalgo, U..."


In [26]:
movie['original_title'].unique()

array(['The Endless Summer', 'Jarhead', '彼女の想いで', ...,
       'The Lonedale Operator', 'Violeta se fue a los cielos',
       'To Kill a Priest'], dtype=object)

Word2vec 적용

In [27]:
# int형식은 Word2vec에서 학습이 안되어서 String으로 변경해줍니다. 
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(map(str, user_sentence)))

In [28]:
sentence

[['Jay and Silent Bob Strike Back',
  'Vivement dimanche!',
  'Rocky III',
  'American Pie',
  'My Tutor',
  'Greed'],
 ['Terminator 3: Rise of the Machines',
  'The Conversation',
  'The Hours',
  '48 Hrs.',
  'Back to the Future Part II',
  'Silent Hill',
  'Crustacés et coquillages',
  'Lost in Translation',
  'Night on Earth',
  "Dave Chappelle's Block Party",
  "Ocean's Eleven",
  'Sissi',
  'Live and Let Die',
  'A Clockwork Orange',
  'Солярис',
  'Sommer vorm Balkon',
  'La science des rêves',
  'Trois couleurs : Rouge',
  'Grbavica',
  'Czlowiek z zelaza',
  'Le Mépris',
  'Batman Returns',
  'Romeo + Juliet',
  'Monsoon Wedding',
  'Stand by Me',
  'Lucky Number Slevin',
  'Cat on a Hot Tin Roof',
  'The Dark',
  'The Devil Wears Prada',
  'Lili Marleen',
  'Star Trek IV: The Voyage Home',
  'A Nightmare on Elm Street',
  'Notting Hill',
  'Once Were Warriors',
  'Reservoir Dogs',
  '2001: A Space Odyssey',
  'Rebecca',
  'Psycho',
  'The Poseidon Adventure',
  'Batman Begins

In [29]:
# Word2vec의 학습을 진행해줍니다. 
from gensim.models import Word2Vec
embedding_model = Word2Vec(sentence, size=20, window = 5, 
                           min_count=1, workers=4, iter=200, sg=1)#sq = 0//cbow, sg = 1//skip-gram

In [23]:
embedding_model.wv.most_similar(positive=['Spider-Man 2'], topn=10)

[('Snow Cake', 0.8847211599349976),
 ('Snow White and the Seven Dwarfs', 0.7787439823150635),
 ('Face/Off', 0.7612183094024658),
 ('Domicile Conjugal', 0.751819372177124),
 ('Rumor Has It...', 0.7406324148178101),
 ('Heavenly Creatures', 0.7241724729537964),
 ('Blow', 0.719482958316803),
 ('The Godfather', 0.7167437076568604),
 ('Nirgendwo in Afrika', 0.7141861319541931),
 ('Some Like It Hot', 0.7043530941009521)]

## Doc2Vec 적용

![](https://drive.google.com/uc?export=view&id=1g2ausKfoaAT0jMwSatRUG3fiGWfDuysV
)

In [15]:
from gensim.models import doc2vec

In [16]:
meta = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
meta = meta[meta['original_title'].notnull()].reset_index(drop=True)
meta = meta[meta['overview'].notnull()].reset_index(drop=True)

In [17]:
from nltk.corpus import stopwords 
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
import re 
stop_words = set(stopwords.words('english')) 

overview = []
for words in tqdm(meta['overview']):
    word_tokens = word_tokenize(words)
    sentence = re.sub('[^A-Za-z0-9]+', ' ', str(word_tokens))
    sentence = sentence.strip()
    
    sentence_tokens = word_tokenize(sentence)
    result = ''
    for token in sentence_tokens: 
        if token not in stop_words:
            result += ' ' + token 
    result = result.strip().lower()
    overview.append(result)

HBox(children=(FloatProgress(value=0.0, max=44512.0), HTML(value='')))




In [18]:
meta['pre_overview'] = overview

In [19]:
doc_vectorizer = doc2vec.Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # w2v simultaneous with DBOW d2v / default 0
    window=10,        # distance between the predicted word and context words
    size=100,        # vector size
    alpha=0.025,     # learning-rate
    seed=1234,
    min_count=5,    # ignore with freq lower
    min_alpha=0.025, # min learning-rate
    workers=4,   # multi cpu
    hs = 1,          # hierar chical softmax / default 0
    negative = 10   # negative sampling / default 5
)

In [20]:
from collections import namedtuple

agg = meta[['id', 'original_title', 'pre_overview']]
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument((c), [d]) for d, c in agg[['original_title', 'pre_overview']].values]

In [21]:
doc_vectorizer.build_vocab(tagged_train_docs)
print(str(doc_vectorizer))

Doc2Vec(dbow+w,d100,n10,hs,w10,mc5,s0.001,t4)


In [22]:
# 벡터 문서 학습
from time import time

start = time()

for epoch in tqdm(range(5)):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay

#doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
end = time()
print("During Time: {}".format(end-start))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


During Time: 868.6329123973846


In [23]:
doc_vectorizer.docvecs.most_similar('Toy Story', topn=20)

[('It Stains the Sands Red', 0.7269709706306458),
 ('Unstrung Heroes', 0.7174409031867981),
 ('Due Amici', 0.7136842012405396),
 ('La moutarde me monte au nez', 0.7097397446632385),
 ('Children in the Surf at Coney Island', 0.6954304575920105),
 ('Live Forever as You Are Now with Alan Resnick', 0.6951810717582703),
 ('Letzte Worte', 0.694780170917511),
 ('Skazka o Poteryannom Vremeni', 0.6917924880981445),
 ('Killing Zoe', 0.6796911954879761),
 ('Testről és lélekről', 0.6681605577468872),
 ("Independents' Day", 0.6628682017326355),
 ('Meet Me in Venice', 0.662304699420929),
 ('Trois vies et une seule mort', 0.6607695817947388),
 ('エクスマキナ', 0.6600291132926941),
 ('El vendedor de humo', 0.6584940552711487),
 ('The Aristocats', 0.6533187031745911),
 ('Wszyscy jesteśmy Chrystusami', 0.6474179029464722),
 ('Begegnung mit Fritz Lang', 0.6451306343078613),
 ('Return to the Batcave: The Misadventures of Adam and Burt',
  0.6444643139839172),
 ('Milk Money', 0.6425882577896118)]

In [24]:
doc_vectorizer.docvecs.most_similar('Harry Potter and the Deathly Hallows: Part 1', topn=20)

[('Hatchet II', 0.7217295169830322),
 ('The Mad Miss Manton', 0.6965864896774292),
 ('Soloalbum', 0.6962880492210388),
 ('Handsome Harry', 0.6917732954025269),
 ('Winnie the Pooh and the Honey Tree', 0.6841850876808167),
 ('The Princess and the Goblin', 0.6834225654602051),
 ('Winnie the Pooh and a Day for Eeyore', 0.6833527088165283),
 ("I Don't Know How She Does It", 0.6819157600402832),
 ('Just Go with It', 0.67984938621521),
 ('$ Dollars', 0.6774436235427856),
 ('Train', 0.674703061580658),
 ('Bulldog Drummond Escapes', 0.6741313934326172),
 ('1990: I guerrieri del Bronx', 0.6741003394126892),
 ('Zamilované Maso', 0.6739451289176941),
 ('Riot in Cell Block 11', 0.6702640056610107),
 ('La Neuvaine', 0.6693864464759827),
 ('The Green Years', 0.6691372990608215),
 ('The Overbrook Brothers', 0.668933093547821),
 ('Se sei vivo spara', 0.6686355471611023),
 ('Hævnen', 0.6685347557067871)]