## 단어 임베딩을 위한 Corpus 생성

In [1]:
import nltk
ltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\EZEN\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [2]:
from nltk.corpus import movie_reviews
sentences = [list(s) for s in movie_reviews.sents()]

In [3]:
movie_reviews.sents()

[['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.'], ['they', 'get', 'into', 'an', 'accident', '.'], ...]

In [4]:
type(movie_reviews.sents())

nltk.corpus.reader.util.ConcatenatedCorpusView

In [5]:
len(sentences)

71532

In [6]:
sentences[10000]

['means', 'so', 'f', '*', '*', '*', 'ing', 'earnest', '.']

In [7]:
movie_reviews.sents()[10000]

['means', 'so', 'f', '*', '*', '*', 'ing', 'earnest', '.']

## 코퍼스를 입력 인수로 하여 Word2Vec 클래스 객체를 생성

In [8]:
from gensim.models.word2vec import Word2Vec

In [9]:
# 트레이닝 과정
%time model = Word2Vec(sentences)

Wall time: 3.96 s


In [10]:
# 트레이닝 종료후 메모리 반환(unload)
model.init_sims(replace=True)

In [11]:
dir(model.wv)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_load_specials',
 '_log_evaluate_word_analogies',
 '_save_specials',
 '_smart_save',
 'accuracy',
 'add',
 'closer_than',
 'cosine_similarities',
 'distance',
 'distances',
 'doesnt_match',
 'evaluate_word_analogies',
 'evaluate_word_pairs',
 'get_keras_embedding',
 'get_vector',
 'index2entity',
 'index2word',
 'init_sims',
 'load',
 'load_word2vec_format',
 'log_accuracy',
 'log_evaluate_word_pairs',
 'most_similar',
 'most_similar_cosmul',
 'most_similar_to_given',
 'n_similarity',
 'rank',
 'relative_cosine_similarity',
 'save',
 'save_word2vec_for

In [12]:
model.wv.similarity('actor', 'actress')

0.87563944

In [13]:
model.wv.similarity('he', 'she')

0.8526267

In [14]:
model.wv.similarity('actor', 'she')

0.2627675

In [15]:
model.wv.most_similar("accident")

[('plane', 0.8866292238235474),
 ('abandoned', 0.8721041083335876),
 ('prison', 0.8637433648109436),
 ('dying', 0.861630380153656),
 ('record', 0.854856550693512),
 ('ball', 0.8479363322257996),
 ('secret', 0.8421413898468018),
 ('boat', 0.8404271602630615),
 ('church', 0.8388646841049194),
 ('jail', 0.8384977579116821)]

In [16]:
# she + (actor - actress)
model.wv.most_similar(positive=['she', 'actor'], negative='actress', topn=1)

[('he', 0.311848908662796)]

## 네이버 영화 감상 코퍼스

In [22]:
import codecs

def read_data(filename):
    with codecs.open(filename, encoding='utf-8', mode='r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:]   # header 제외
    return data

train_data = read_data('data/ratings_train.txt')

In [23]:
len(train_data)

150000

In [24]:
print(train_data[:5])

[['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0'], ['3819312', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '1'], ['10265843', '너무재밓었다그래서보는것을추천한다', '0'], ['9045019', '교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', '0'], ['6483659', '사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다', '1']]


In [25]:
from konlpy.tag import Okt
tagger = Okt()

def tokenize(doc):
    return ['/'.join(t) for t in tagger.pos(doc, norm=True, stem=True)]

train_docs = [row[1] for row in train_data]

-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


In [26]:
# 시간이 매우 오래 걸림
%time sentences = [tokenize(d) for d in train_docs]

Wall time: 5min 46s


In [27]:
%time model = Word2Vec(sentences)
model.init_sims(replace=True)

Wall time: 6.07 s


In [34]:
model.wv.vectors.shape

(15409, 100)

In [28]:
model.wv.similarity(*tokenize(u'배우 여배우'))

0.73382354

In [29]:
model.wv.similarity(*tokenize(u'배우 남자'))

0.29028413

In [30]:
# 남자 + (여배우 - 배우) = 여자
from konlpy.utils import pprint
pprint(model.wv.most_similar(positive=tokenize(u'남자 여배우'), 
                             negative=tokenize(u'배우'), topn=5))

[('여자/Noun', 0.8398576378822327),
 ('아빠/Noun', 0.7005077600479126),
 ('여자애/Noun', 0.6867963671684265),
 ('꼬마/Noun', 0.677230954170227),
 ('할아버지/Noun', 0.6763973832130432)]


In [31]:
# 아빠 + (남자 - 여자) = 엄마
pprint(model.wv.most_similar(positive=tokenize(u'아빠 남자'), 
                             negative=tokenize(u'여자'), topn=1))

[('엄마/Noun', 0.8623928427696228)]


In [35]:
model.save('model/movie_review.model')