In [1]:
# mac475의 ipython 표준 style을 적용함
from IPython.core.display import HTML
styles = open("../resources/styles/custom.css", "r").read()
HTML( styles )

In [2]:
#  Pretty Display of Variables를 적용하여 중간 결과를 확인하고자 함
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# matplotlib plot내 한글의 표현위한 설정
from matplotlib import font_manager, rc
font_fname = 'c:/windows/fonts/malgun.ttf'     # A font of your choice
font_name = font_manager.FontProperties(fname=font_fname).get_name()
rc('font', family=font_name)

## 1. 영화리뷰 data import

In [3]:
# data reading function
def read_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t')[1:] for line in f.read().splitlines()]
        data = data[1:]   # header 제외
    return data

In [4]:
train_data = read_data('./datasets/ratings_train.txt')
test_data = read_data('./datasets/ratings_test.txt')

## 2. tokenizing

In [5]:
# pos tagger 정의, parts of speech : 품사
from konlpy.tag import Twitter
pos_tagger = Twitter()

def tokenize(doc):
    # norm, stem은 optional
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

In [6]:
from tqdm import tqdm

train_docs = [(tokenize(row[0]), row[1]) for row in tqdm(train_data)]

100%|█████████████████████████████████████████████████████████████████████████| 150000/150000 [03:18<00:00, 757.16it/s]


In [7]:
test_docs = [(tokenize(row[0]), row[1]) for row in tqdm(test_data)]

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [00:58<00:00, 858.50it/s]


In [8]:
train_docs[0]
test_docs[0]

(['아/Exclamation',
  '더빙/Noun',
  '../Punctuation',
  '진짜/Noun',
  '짜증/Noun',
  '나다/Verb',
  '목소리/Noun'],
 '0')

(['굳다/Adjective', 'ㅋ/KoreanParticle'], '1')

In [9]:
# 모든 document내의 pos tagging된 word를 하나의 list에 tokens로 담는다
tokens = [t for d in train_docs for t in d[0]]
len(tokens)

2194536

## 3. doc2vec 기반 분석

### doc2vec concept

- 참고 : https://stackoverflow.com/questions/42827175/gensim-what-is-difference-between-word2vec-and-doc2vec

In word2vec, you train to find word vectors and then run similarity queries between words.

In doc2vec, you tag your text and you also get tag vectors.

For instance, you have different documents from different authors and use authors as tags on documents

Then, after doc2vec training you can use the same vector aritmetics to run similarity queries on author tags: i.e who are the most similar authors to AUTHOR_X? If two authors generally use the same words then their vector will be closer. AUTHOR_X is not a real word which is part of your corpus just something you determine. So you don't need to have it or manually insert it into your text. Gensim allows you to train doc2vec with or without word vectors (i.e. if you only care about tag similarities between each other).

Here is a good presentation (https://www.youtube.com/watch?v=vkfXBGnDplQ) on word2vec basics and how they use doc2vec in an innovative way for product recommendations (related blog post).

If you tell me about what problem you are trying to solve, may be I can suggest which method will be more appropriate.

In [15]:
from collections import namedtuple
TaggedDocument = namedtuple('TaggedDocument', 'words tags')

# 여기서는 15만개 training documents 전부 사용함
cnt = 1000000
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in tqdm(train_docs[:cnt])]
tagged_test_docs = [TaggedDocument(d, [c]) for d, c in tqdm(test_docs[:cnt])]

100%|██████████████████████████████████████████████████████████████████████| 150000/150000 [00:00<00:00, 371142.09it/s]
100%|████████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 177777.17it/s]


In [16]:
from gensim.models import doc2vec

# 사전 구축
doc_vectorizer = doc2vec.Doc2Vec(size=300, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs)



In [17]:
len(tagged_train_docs)

150000

In [18]:
?doc_vectorizer.train()

In [20]:
# Train document vectors!
for epoch in tqdm(range(10)):
    doc_vectorizer.train(tagged_train_docs, epochs=epoch, total_examples=len(tagged_train_docs))
    doc_vectorizer.alpha -= 0.002  # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha  # fix the learning rate, no decay

  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

1871790

 10%|████████▎                                                                          | 1/10 [00:06<00:58,  6.45s/it]

1872469

 20%|████████████████▌                                                                  | 2/10 [00:13<00:52,  6.53s/it]

3743748

 30%|████████████████████████▉                                                          | 3/10 [00:25<01:00,  8.65s/it]

5617265

 40%|█████████████████████████████████▏                                                 | 4/10 [00:45<01:08, 11.38s/it]

7487437

 50%|█████████████████████████████████████████▌                                         | 5/10 [01:12<01:12, 14.54s/it]

9360677

 60%|█████████████████████████████████████████████████▊                                 | 6/10 [01:45<01:10, 17.59s/it]

11232239

 70%|██████████████████████████████████████████████████████████                         | 7/10 [02:25<01:02, 20.84s/it]

13104418

 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [03:12<00:48, 24.12s/it]

14976882

 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [04:08<00:27, 27.64s/it]

16849972

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [05:09<00:00, 30.93s/it]


In [21]:
# To save
doc_vectorizer.save('./model/17.10.15.02.doc2vec.model')

In [25]:
doc_vectorizer.most_similar('공포/Noun')

doc_vectorizer.most_similar('ㅋㅋ/KoreanParticle')

[('공포영화/Noun', 0.4493910074234009),
 ('호러/Noun', 0.392019659280777),
 ('서스펜스/Noun', 0.38125309348106384),
 ('미스터리/Noun', 0.3314100503921509),
 ('스릴러/Noun', 0.3085022568702698),
 ('긴박/Noun', 0.3073579668998718),
 ('무섭다/Adjective', 0.3045444190502167),
 ('당혹/Noun', 0.2915957570075989),
 ('귀신/Noun', 0.29066091775894165),
 ('박진/Noun', 0.28867214918136597)]

[('ㅋㅋㄱ/KoreanParticle', 0.2807654142379761),
 ('빵/Noun', 0.26301223039627075),
 ('-_-;/Punctuation', 0.24678769707679749),
 ('~~~!!!/Punctuation', 0.24049502611160278),
 ('왤다/Verb', 0.24032855033874512),
 ('ㅉㅉ/KoreanParticle', 0.23816987872123718),
 ('ㅋ/KoreanParticle', 0.23777486383914948),
 ('개잼/Noun', 0.22892747819423676),
 ('빵빵/Noun', 0.2283269762992859),
 ('두준/Noun', 0.22832536697387695)]

In [26]:
tagged_train_docs[0]

TaggedDocument(words=['아/Exclamation', '더빙/Noun', '../Punctuation', '진짜/Noun', '짜증/Noun', '나다/Verb', '목소리/Noun'], tags=['0'])

In [29]:
doc_vectorizer.most_similar(positive=['여자/Noun', '왕/Noun'], negative=['남자/Noun'])

[('압/Noun', 0.2658236622810364),
 ('증인/Noun', 0.24955277144908905),
 ('구혜선/Noun', 0.24669775366783142),
 ('크리스틴/Noun', 0.24441316723823547),
 ('총집/Noun', 0.23989495635032654),
 ('조정석/Noun', 0.23231792449951172),
 ('심은경/Noun', 0.2314663529396057),
 ('김정일/Noun', 0.23072248697280884),
 ('연기대상/Noun', 0.229667529463768),
 ('더욱더/Noun', 0.22628194093704224)]

In [31]:
len(doc_vectorizer.docvecs)

2

In [35]:
tokens_test = '여자/Noun 왕/Noun'.split()

new_vector = doc_vectorizer.infer_vector(tokens_test)

In [37]:
?doc_vectorizer.docvecs.most_similar()

In [38]:
doc_vectorizer.docvecs.most_similar([new_vector]) #gives you top 10 document tags and their cosine similarity

[('0', 0.17736783623695374), ('1', 0.1082363873720169)]

In [None]:
# 다음은 이것을 확인할 것

# https://www.lucypark.kr/courses/2015-ba/text-mining.html#3-load-tokens-with-nltktext