In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
from gensim.models import doc2vec
# TaggedDocument는 해당 corpus의 id를 함께 넘겨주는 것을 말함
from gensim.models.doc2vec import TaggedDocument
import time
import tensorflow_datasets as tfds
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm


1. colab : !pip install konlpy 설치해야 함
1. https://cholol.tistory.com/466
1. https://github.com/tkdlek11112/faq_chatbot_learning/blob/master/DAY1/FAQ_CHATBOT_DAY1.ipynb


In [13]:
faqs = pd.read_csv('jokes.csv')
faqs[:3]

Unnamed: 0,ID,Question,Answer
0,1,Did you hear about the Native American man tha...,He nearly drown in his own tea pee.
1,2,What's the best anti diarrheal prescription?,Mycheexarphlexin
2,3,What do you call a person who is outside a doo...,Matt


In [14]:
# 영어 형태소분석
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

In [22]:
faqs['Question'][:3]

0    Did you hear about the Native American man tha...
1         What's the best anti diarrheal prescription?
2    What do you call a person who is outside a doo...
Name: Question, dtype: object

In [30]:
# word_tokenize 를 하려면, punkt를 다운로드 해야 함
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jmpkorea00\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [40]:
# 토근화 - 문장을 단어단위로 분리 
tokened_questions = [word_tokenize(question.lower()) for question in faqs['Question']]
tokened_questions[0]

['did',
 'you',
 'hear',
 'about',
 'the',
 'native',
 'american',
 'man',
 'that',
 'drank',
 '200',
 'cups',
 'of',
 'tea',
 '?']

In [44]:
# pos 명사 분리
lemmatizer = WordNetLemmatizer()


In [42]:
# lemmatize 사용하기 위해 다운로드
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jmpkorea00\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [46]:
# lemmatize 사용하기 위해 다운로드
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jmpkorea00\AppData\Roaming\nltk_data...


True

In [48]:
# lemmatization
lemmed_questions = [[lemmatizer.lemmatize(word) for word in doc] for doc in tokened_questions]
lemmed_questions[0]

['did',
 'you',
 'hear',
 'about',
 'the',
 'native',
 'american',
 'man',
 'that',
 'drank',
 '200',
 'cup',
 'of',
 'tea',
 '?']

In [49]:
# stopwords.words 불용어 제거를 위해 다운로드
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jmpkorea00\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [51]:
# stopword 제거 불용어 제거하기
stop_words = stopwords.words('english')
questions = [[w for w in doc if not w in stop_words] for doc in lemmed_questions]
questions[0]

['hear', 'native', 'american', 'man', 'drank', '200', 'cup', 'tea', '?']

In [52]:
# 리스트에서 각 문장부분 토큰화

index_questions = []
for i in range(len(faqs)):
    # 불용어 제거한 질문, index와 함께 list로 저장
    index_questions.append([questions[i], i ])

# Doc2Vec에서 사용하는 TaggedDocument 문서형으로 변경
# (영문형태소분석,index) 처리
tagged_questions = [TaggedDocument(d, [int(c)]) for d, c in index_questions]

In [54]:
tagged_questions[0]

TaggedDocument(words=['hear', 'native', 'american', 'man', 'drank', '200', 'cup', 'tea', '?'], tags=[0])

In [55]:
# doc2vec 모델(model) 실행
import multiprocessing
cores = multiprocessing.cpu_count()
d2v_faqs = doc2vec.Doc2Vec(
                                vector_size=200,
#                                 alpha=0.025,
#                                 min_alpha=0.025,
                                hs=1,
                                negative=0,
                                dm=0,
                                dbow_words = 1,
                                min_count = 5,
                                workers = cores,
                                seed=0,
                                epochs=20
                                )
d2v_faqs.build_vocab(tagged_questions)
d2v_faqs.train(tagged_questions,
               total_examples = d2v_faqs.corpus_count,
               epochs = d2v_faqs.epochs
                                )

In [56]:
# 1. 테스트 질문, 테스트하는 문장도 같은 전처리를 해준다.
test_string = "What's the best anti diarrheal prescription?"
# 2. 영문 형태소 분석
tokened_test_string = word_tokenize(test_string)
# 3. pos(명사형)형태 분리
lemmed_test_string = [lemmatizer.lemmatize(word) for word in tokened_test_string]
# 4. 불용어 제거
test_string = [w for w in lemmed_test_string if not w in stop_words]

test_string

['What', "'s", 'best', 'anti', 'diarrheal', 'prescription', '?']

In [57]:
# 벡터화에서 근접한 5개 찾기
topn = 5  # 5개설정
# 질문 - 벡터화
test_vector = d2v_faqs.infer_vector(test_string)
# 근접한 5개 찾기
result = d2v_faqs.docvecs.most_similar([test_vector], topn=topn)
# result[,0] : index, result[:1] : 정확도 
print(result)

for i in range(topn):
    print("{}위. {}, {} {}".format(i+1, result[i][1], result[i][0],faqs['Question'][result[i][0]] ))

[(1, 0.9330215454101562), (29760, 0.6743109822273254), (30815, 0.6502100825309753), (30143, 0.6352273225784302), (32653, 0.6149254441261292)]
1위. 0.9330215454101562, 1 What's the best anti diarrheal prescription?
2위. 0.6743109822273254, 29760 What's a Jackhammer's Best Friend?
3위. 0.6502100825309753, 30815 What's the best thing about owning a car in Liverpool?
4위. 0.6352273225784302, 30143 Why do hummingbirds hum?
5위. 0.6149254441261292, 32653 Which U.S. state abbreviation is the best?


  result = d2v_faqs.docvecs.most_similar([test_vector], topn=topn)


In [58]:
# 모든 질문을 해서 1위의 결과가 나온 것은 몇개인지 확인, 성능 측정
raten = 5
found = 0
for i in range(len(faqs)):
    tstr = faqs['Question'][i]
    tokened_test_string = word_tokenize(tstr)
    lemmed_test_string = [lemmatizer.lemmatize(word) for word in tokened_test_string]
    ttok = [w for w in lemmed_test_string if not w in stop_words]
    tvec = d2v_faqs.infer_vector(ttok)
    re = d2v_faqs.docvecs.most_similar([tvec], topn = raten)
    for j in range(raten):
        if i == re[j][0]: 
            found = found + 1
            break

print("정확도 = {} % ({}/{} )  ".format(found/len(faqs),found, len(faqs)))

  re = d2v_faqs.docvecs.most_similar([tvec], topn = raten)


정확도 = 0.8646162690428284 % (33088/38269 )  


In [59]:
# 모델 저장

# 모델 1
#d2v_faqs.save(os.path.join('data','/content/drive/My Drive/data/d2v_faqs_size100_min1_batch50_epoch100_nounonly_dm0.model'))

#모델 2
d2v_faqs.save('d2v_faqs_size200_min5_epoch20_jokes.model')