In [1]:
# mac475의 ipython 표준 style을 적용함
from IPython.core.display import HTML
styles = open("../resources/styles/custom.css", "r", encoding='utf-8').read()
HTML( styles )

In [2]:
#  Pretty Display of Variables를 적용하여 중간 결과를 확인하고자 함
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# matplotlib plot내 한글의 표현위한 설정
from matplotlib import font_manager, rc
font_fname = 'c:/windows/fonts/malgun.ttf'     # A font of your choice
font_name = font_manager.FontProperties(fname=font_fname).get_name()
rc('font', family=font_name)

## 1. 영화리뷰 data import

In [3]:
# data reading function
def read_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t')[1:] for line in f.read().splitlines()]
        data = data[1:]   # header 제외
    return data

In [4]:
train_data = read_data('./datasets/ratings_train.txt')
test_data = read_data('./datasets/ratings_test.txt')

## 2. tokenizing

In [5]:
# pos tagger 정의, parts of speech : 품사
from konlpy.tag import Twitter
pos_tagger = Twitter()

def tokenize(doc):
    # norm, stem은 optional
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

In [6]:
# lambda function의 활용 참고

# list(filter(lambda x: x % 2, [0,1,2,3]))
# list(filter(lambda x: x == 'a', ['a','b','c','d']))

In [9]:
# Noun 추출만을 위한 pos tagger
def tokenize_only_noun(doc):
#     print(doc)
    return ['/'.join(t) for t in list(filter(lambda x: x[1]=='Noun', pos_tagger.pos(doc, norm=True, stem=True)))]

In [10]:
from tqdm import tqdm

# train docs 생성
train_docs = [(tokenize(row[0]), row[1]) for row in tqdm(train_data)]
train_docs_only_nouns = [(tokenize_only_noun(row[0]), row[1]) for row in tqdm(train_data)]

                                                                               

In [11]:
# test docs 생성
test_docs = [(tokenize(row[0]), row[1]) for row in tqdm(test_data)]
test_docs_only_nouns = [(tokenize_only_noun(row[0]), row[1]) for row in tqdm(test_data)]

                                                                               

In [12]:
# 확인
train_docs[0]
# test_docs[0]

train_docs_only_nouns[0]
# test_docs_only_nouns[0]

(['아/Exclamation',
  '더빙/Noun',
  '../Punctuation',
  '진짜/Noun',
  '짜증/Noun',
  '나다/Verb',
  '목소리/Noun'],
 '0')

(['더빙/Noun', '진짜/Noun', '짜증/Noun', '목소리/Noun'], '0')

In [13]:
# 모든 document내의 pos tagging된 word를 하나의 list에 tokens로 담는다
tokens = [t for d in train_docs for t in d[0]]
len(tokens)

# nouns만 한정하여, 모든 document내의 pos tagging된 word를 하나의 list에 tokens로 담는다
tokens_only_nouns = [t for d in train_docs_only_nouns for t in d[0]]
len(tokens)

2194536

2194536

## 3. doc2vec 기반 분석

### doc2vec concept

- 참고 : https://stackoverflow.com/questions/42827175/gensim-what-is-difference-between-word2vec-and-doc2vec

    >In word2vec, you train to find word vectors and then run similarity queries between words.
    
    >In doc2vec, you tag your text and you also get tag vectors. For instance, you have different documents from different authors and use authors as tags on documents. Then, after doc2vec training you can use the same vector aritmetics to run similarity queries on author tags: i.e who are the most similar authors to AUTHOR_X? If two authors generally use the same words then their vector will be closer. AUTHOR_X is not a real word which is part of your corpus just something you determine. So you don't need to have it or manually insert it into your text. Gensim allows you to train doc2vec with or without word vectors (i.e. if you only care about tag similarities between each other).

    >Here is a good presentation (https://www.youtube.com/watch?v=vkfXBGnDplQ) on word2vec basics and how they use doc2vec in an innovative way for product recommendations (related blog post).
    
참고 : https://stackoverflow.com/questions/42781292/doc2vec-get-most-similar-documents

In [14]:
train_docs[0]
train_docs_only_nouns[0]

(['아/Exclamation',
  '더빙/Noun',
  '../Punctuation',
  '진짜/Noun',
  '짜증/Noun',
  '나다/Verb',
  '목소리/Noun'],
 '0')

(['더빙/Noun', '진짜/Noun', '짜증/Noun', '목소리/Noun'], '0')

In [50]:
from collections import namedtuple
TaggedDocument = namedtuple('TaggedDocument', 'words tags')

# 여기서는 15만개 training documents 전부 사용함
cnt = 150000
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in tqdm(train_docs[:cnt])]
tagged_test_docs = [TaggedDocument(d, [c]) for d, c in tqdm(test_docs[:cnt])]

tagged_train_docs_only_nouns = [TaggedDocument(d, [c]) for d, c in tqdm(train_docs_only_nouns[:cnt])]
tagged_test_docs_only_nouns = [TaggedDocument(d, [c]) for d, c in tqdm(test_docs_only_nouns[:cnt])]

                                                                               

In [51]:
from gensim.models import doc2vec

# 사전 구축 : 모든 pos
doc_vectorizer = doc2vec.Doc2Vec(size=300, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs)

In [53]:
# tagged_train_docs[13]
# tagged_train_docs_only_nouns[13]

In [179]:
# 사전 구축 : noun만 포함하는 pos
doc_vectorizer_only_nouns = doc2vec.Doc2Vec(size=500, alpha=0.025, min_alpha=0.025, seed=1235)
doc_vectorizer_only_nouns.build_vocab(tagged_train_docs_only_nouns)

In [55]:
tagged_train_docs_only_nouns[13]
tagged_train_docs_only_nouns[13]

TaggedDocument(words=['담/Noun', '신문/Noun', '기사/Noun', '로만/Noun', '자꾸/Noun', '그/Noun', '사람/Noun', '것/Noun'], tags=['1'])

TaggedDocument(words=['담/Noun', '신문/Noun', '기사/Noun', '로만/Noun', '자꾸/Noun', '그/Noun', '사람/Noun', '것/Noun'], tags=['1'])

In [56]:
len(tagged_train_docs)

150000

In [57]:
# Train document vectors!
for epoch in tqdm(range(10)):
    doc_vectorizer.train(tagged_train_docs, epochs=epoch, total_examples=len(tagged_train_docs))
    doc_vectorizer.alpha -= 0.002  # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha  # fix the learning rate, no decay

  0%|                                                   | 0/10 [00:00<?, ?it/s]

1872079

 10%|████▎                                      | 1/10 [00:04<00:44,  4.90s/it]

1872739

 20%|████████▌                                  | 2/10 [00:09<00:39,  4.90s/it]

3743266

 30%|████████████▉                              | 3/10 [00:19<00:44,  6.34s/it]

5616065

 40%|█████████████████▏                         | 4/10 [00:34<00:53,  8.95s/it]

7488084

 50%|█████████████████████▌                     | 5/10 [00:54<01:00, 12.15s/it]

9362323

 60%|█████████████████████████▊                 | 6/10 [01:18<01:03, 15.78s/it]

11233019

 70%|██████████████████████████████             | 7/10 [01:47<00:59, 19.71s/it]

13103870

 80%|██████████████████████████████████▍        | 8/10 [02:21<00:47, 23.92s/it]

14979658

 90%|██████████████████████████████████████▋    | 9/10 [02:59<00:28, 28.27s/it]

16852400

                                                                               

In [180]:
# Train document vectors!
for epoch in tqdm(range(10)):
    doc_vectorizer_only_nouns.train(tagged_train_docs_only_nouns, epochs=epoch, total_examples=len(tagged_train_docs_only_nouns))
    doc_vectorizer_only_nouns.alpha -= 0.002  # decrease the learning rate
    doc_vectorizer_only_nouns.min_alpha = doc_vectorizer_only_nouns.alpha  # fix the learning rate, no decay

  0%|                                                   | 0/10 [00:00<?, ?it/s]

950500

 10%|████▎                                      | 1/10 [00:04<00:36,  4.07s/it]

950480

 20%|████████▌                                  | 2/10 [00:08<00:32,  4.08s/it]

1900972

 30%|████████████▉                              | 3/10 [00:16<00:37,  5.30s/it]

2850808

 40%|█████████████████▏                         | 4/10 [00:28<00:44,  7.38s/it]

3802124

 50%|█████████████████████▌                     | 5/10 [00:45<00:50, 10.15s/it]

4752568

 60%|█████████████████████████▊                 | 6/10 [01:05<00:53, 13.32s/it]

5701808

 70%|██████████████████████████████             | 7/10 [01:30<00:50, 16.72s/it]

6653345

 80%|██████████████████████████████████▍        | 8/10 [01:59<00:40, 20.30s/it]

7603383

 90%|██████████████████████████████████████▋    | 9/10 [02:32<00:24, 24.13s/it]

8553417

                                                                               

In [184]:
# To save
doc_vectorizer.save('./model/17.10.15.02.doc2vec.model')

# To save
doc_vectorizer_only_nouns.save('./model/17.10.15.02.doc2vec.only.nouns.model')

In [189]:
doc_vectorizer.most_similar(positive=['영화/Noun','로맨스/Noun','장르/Noun'], negative=['공포/Noun'])

[('로드무비/Noun', 0.39678576588630676),
 ('로맨틱/Noun', 0.37818723917007446),
 ('가족영화/Noun', 0.3733956217765808),
 ('정통/Noun', 0.35419774055480957),
 ('섹슈얼/Noun', 0.34228917956352234),
 ('러브스토리/Noun', 0.3355391323566437),
 ('액션영화/Noun', 0.3314771354198456),
 ('영화장르/Noun', 0.33072495460510254),
 ('코메디/Noun', 0.330555260181427),
 ('멜로/Noun', 0.3128451704978943)]

In [190]:
doc_vectorizer_only_nouns.most_similar(positive=['영화/Noun','로맨스/Noun','장르/Noun'], negative=['공포/Noun'])

[('정통/Noun', 0.3654744625091553),
 ('로맨틱/Noun', 0.354615718126297),
 ('로드무비/Noun', 0.32557517290115356),
 ('멜로/Noun', 0.3230037987232208),
 ('코메디/Noun', 0.3103031516075134),
 ('주변인/Noun', 0.31011444330215454),
 ('남여/Noun', 0.3047192096710205),
 ('러브스토리/Noun', 0.30434930324554443),
 ('영국영화/Noun', 0.3023414611816406),
 ('로코/Noun', 0.30176958441734314)]

In [183]:
len(doc_vectorizer.docvecs)

2

In [157]:
# # doc_vectorizer.infer_vector(doc_vectorizer.infer_vector('영화/Noun,로맨스/Noun,장르/Noun'.split()))

# # token_question = '더빙/Noun 진짜/Noun 짜증/Noun 목소리/Noun 짜증/Noun '.split()
# token_question = '짜증/Noun'.split()
# new_vector = doc_vectorizer_only_nouns.infer_vector(token_question)
# # doc_vectorizer.infer_vector(tokens_test)

# doc_vectorizer_only_nouns.docvecs.most_similar([new_vector], topn=1) #gives you top 10 document tags and their cosine similarity

[('0', 0.3540586531162262)]

In [159]:
train_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_train_docs]
train_y = [doc.tags[0] for doc in tagged_train_docs]

In [None]:
len(train_x)       # 사실 이 때문에 앞의 term existance와는 공평한 비교는 아닐 수 있다
# => 150000
len(train_x[0])
# => 300

In [191]:
train_only_nouns_x = [doc_vectorizer_only_nouns.infer_vector(doc.words) for doc in tagged_train_docs_only_nouns]
train_only_nouns_y = [doc.tags[0] for doc in tagged_train_docs_only_nouns]

In [192]:
len(train_only_nouns_x)       # 사실 이 때문에 앞의 term existance와는 공평한 비교는 아닐 수 있다
# => 150000
len(train_only_nouns_x[0])
# => 300

150000

500

In [170]:
test_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_test_docs]
test_y = [doc.tags[0] for doc in tagged_test_docs]

In [171]:
len(test_x)
# => 50000
len(test_x[0])
# => 300

50000

300

In [193]:
test_only_nouns_x = [doc_vectorizer_only_nouns.infer_vector(doc.words) for doc in tagged_test_docs]
test_only_nouns_y = [doc.tags[0] for doc in tagged_test_docs_only_nouns]

In [194]:
len(test_only_nouns_x)
# => 50000
len(test_only_nouns_x[0])
# => 300

50000

500

In [195]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=1234)

classifier.fit(train_x, train_y)
classifier.score(test_x, test_y)
# => 0.78246000000000004

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1234, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

0.63185999999999998

In [196]:
classifier.fit(train_only_nouns_x, train_only_nouns_y)
classifier.score(test_only_nouns_x, test_only_nouns_y)
# => 0.78246000000000004

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1234, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

0.6401

In [158]:
# ?doc_vectorizer.docvecs.most_similar()

In [None]:
# doc_vectorizer.docvecs.most_similar([new_vector]) #gives you top 10 document tags and their cosine similarity

In [None]:
# 다음은 이것을 확인할 것

# https://www.lucypark.kr/courses/2015-ba/text-mining.html#3-load-tokens-with-nltktext