In [155]:
import os
import csv
import itertools

import pandas as pd
import numpy as np

import tensorflow as tf
import tensorlayer as tl

from sklearn.model_selection import train_test_split

from konlpy.tag import Twitter
# Single shared analyzer instance; the tokenizer cell below calls twitter.morphs() on it.
twitter = Twitter()

In [156]:
# Path of the raw chat corpus; passed to read_in_data() in the next cell.
data_path = './dataset/chat.in'

In [157]:
# Load the dataset

# Field separator of the chat file: "<user utterance><TAB><bot reply>" per line.
delimiter = "\t"


def read_in_data(data_path):
    """Read (query, answer) pairs from a delimiter-separated UTF-8 text file.

    Every non-empty line is split on the module-level ``delimiter``. The first
    field is taken as the user query and the second as the bot answer; any
    further fields are ignored. Double quotes and newline characters are
    removed from both fields. Completely empty lines (for example a trailing
    blank line at the end of the file) are skipped.

    Args:
        data_path: Path of the text file to read.

    Returns:
        A ``(user, bot)`` tuple of two equally long lists of strings, where
        ``user[i]`` is the query answered by ``bot[i]``.

    Raises:
        IndexError: If a non-empty line does not contain ``delimiter``. The
            exception type matches what the bare ``split_line[1]`` lookup used
            to raise, but the message now names the offending line.
    """
    user = []
    bot = []
    with open(data_path, mode="rt", encoding="utf-8") as fh:
        # Iterate the handle directly instead of materialising readlines().
        for line_no, line in enumerate(fh, start=1):
            if not line.strip("\r\n"):
                # Empty line: nothing to pair up, and indexing it would crash.
                continue

            fields = line.split(delimiter)
            if len(fields) < 2:
                raise IndexError(
                    "line %d of %s has no %r-separated answer field: %r"
                    % (line_no, data_path, delimiter, line)
                )

            user.append(fields[0].replace("\"", "").replace("\n", ""))
            bot.append(fields[1].replace("\"", "").replace("\n", ""))

    return user, bot
    
# Parallel lists: user_query[i] is the utterance answered by answer_query[i].
user_query, answer_query = read_in_data(data_path)

In [160]:
# Put the raw utterances into a DataFrame (one row per query/answer pair);
# the sentences are split into tokens in the following cell.
dict_data = dict(user=user_query, bot=answer_query)
df_data = pd.DataFrame(data=dict_data, columns=['user', 'bot'])

In [162]:
def twit_tokenizer(x):
    """Return the token list produced by ``twitter.morphs`` for sentence ``x``."""
    return twitter.morphs(x)


# Store the tokenized form of each utterance next to its raw text.
df_data['user_token'] = df_data['user'].apply(twit_tokenizer)
df_data['bot_token'] = df_data['bot'].apply(twit_tokenizer)

In [163]:
# Display the frame with the raw text and the token columns added above.
df_data

Unnamed: 0,user,bot,user_token,bot_token
0,메롱,저랑 놀고 싶으신거군요,[메롱],"[저, 랑, 놀고, 싶으, 신거, 군요]"
1,아 그러면 안되,네? 어떤 말씀이신지 궁금하네요,"[아, 그러면, 안되]","[네, ?, 어떤, 말씀, 이, 신지, 궁금하네, 요]"
2,오늘 하루 어때?,당신과 함께해서 좋은거 같아요,"[오늘, 하루, 어때, ?]","[당신, 과, 함께해서, 좋, 은, 거, 같아, 요]"
3,너 지금 얼마 있어?,저는 마음만큼은 부자에요.,"[너, 지금, 얼마, 있어, ?]","[저, 는, 마음, 만큼은, 부자, 에요, .]"


In [164]:
# Merge all tokenized words into one flat list that the vocabulary is built from.
#
# The previous expression was `list(user_series + list(bot_series))`: a misplaced
# parenthesis turned the intended list concatenation into Series + list
# arithmetic, which only works while pandas/NumPy accept element-wise addition
# of ragged object data. The flattening below is explicit and keeps the token
# order that arithmetic produced (row by row: user tokens, then bot tokens), so
# the ids assigned downstream stay the same.
token_dict = [
    token
    for user_tokens, bot_tokens in zip(df_data['user_token'], df_data['bot_token'])
    for token in itertools.chain(user_tokens, bot_tokens)
]

In [166]:
# http://tensorlayer.readthedocs.io/en/latest/
# Build the vocabulary dictionaries (word -> id and id -> word).

# Upper bound on the vocabulary size requested from TensorLayer.
MAX_VOCAB_SIZE = 38

# Never request more entries than there are distinct tokens, so the cell keeps
# working when the corpus changes.
# NOTE(review): build_words_dataset appears to reject a vocabulary_size larger
# than the number of distinct words and to reserve one slot for the unknown
# token (the recorded output maps "." to UNK) — confirm against the installed
# TensorLayer version.
vocab_size = min(MAX_VOCAB_SIZE, len(set(token_dict)))

data, count, dictionary, reverse_dictionary = \
        tl.nlp.build_words_dataset(token_dict, vocab_size, True)


def ids(x):
    """Convert a list of tokens into a list of vocabulary ids via ``dictionary``."""
    return tl.nlp.words_to_word_ids(x, dictionary)


def context(x):
    """Convert a list of vocabulary ids back into words via ``reverse_dictionary``."""
    return tl.nlp.word_ids_to_words(x, reverse_dictionary)

Real vocabulary size    38
Limited vocabulary size 38


In [171]:
# Encode every token list as a list of vocabulary ids.
df_data['user_ids'] = df_data['user_token'].apply(ids)
# Round-trip the ids back to words. The result is neither assigned nor the
# cell's last expression, so it is computed and then discarded.
df_data['user_ids'].apply(context)

df_data['bot_ids'] = df_data['bot_token'].apply(ids)
# Last expression of the cell: displays the bot ids decoded back to words.
df_data['bot_ids'].apply(context)

0            [저, 랑, 놀고, 싶으, 신거, 군요]
1    [네, ?, 어떤, 말씀, 이, 신지, 궁금하네, 요]
2     [당신, 과, 함께해서, 좋, 은, 거, 같아, 요]
3      [저, 는, 마음, 만큼은, 부자, 에요, UNK]
Name: bot_ids, dtype: object

In [174]:
# Create the vocabulary dictionary file (most frequent words, at most 10 entries).
# Both paths are hard-coded; the source './dataset/chat.in' is the same file that
# data_path points to, and the vocabulary is written to './dataset/test5.in'.
tl.nlp.create_vocabulary('./dataset/test5.in', './dataset/chat.in', max_vocabulary_size=10)

Creating vocabulary ./dataset/test5.in from data ./dataset/chat.in


In [175]:
# Inspect the id-encoded user utterances (one list of ids per row).
df_data["user_ids"]

0                    [4]
1           [10, 11, 12]
2        [19, 20, 21, 1]
3    [29, 30, 31, 32, 1]
Name: user_ids, dtype: object

In [177]:
# The end_id value is appended as given, so the string 'EOS' ends up inside
# otherwise integer sequences (see the output below). The result is only
# displayed, not stored.
tl.prepro.sequences_add_end_id(list(df_data['user_ids']), end_id='EOS') # simply appends end_id

[[4, 'EOS'],
 [10, 11, 12, 'EOS'],
 [19, 20, 21, 1, 'EOS'],
 [29, 30, 31, 32, 1, 'EOS']]

In [154]:
# Exploration of the tl.prepro sequence helpers. None of the results is
# assigned; only the value of the cell's last expression is displayed.
tl.prepro.sequences_add_end_id(list(df_data['user_ids']), end_id=999) # simply adds end_id
tl.prepro.sequences_add_start_id(list(df_data['user_ids']), start_id=123) # simply adds start_id
# Original note: adds end_id plus padding, but it does not work well.
# NOTE(review): presumably this helper expects sequences that were already padded to equal length — confirm.
tl.prepro.sequences_add_end_id_after_pad(list(df_data['user_ids']), end_id=99, pad_id=0)
# Original note: unclear why this one misbehaves as well.
# NOTE(review): the displayed mask is a 1-D object array, presumably because the input lists are ragged — confirm.
tl.prepro.sequences_get_mask(list(df_data['user_ids']), pad_val=0)

array([1, 1, 1, 1], dtype=object)

In [101]:
def gen_dataset(path="./dataset/test.in", data=None):
    """Write the id-encoded conversation pairs to a text file.

    For every row, two lines are written: ``str()`` of the row's ``user_ids``
    value followed by ``str()`` of its ``bot_ids`` value. An existing file at
    ``path`` is overwritten.

    Rows are visited in positional order. The earlier ``column[i]`` lookup
    with ``range(len(...))`` was label-based and raised ``KeyError`` for any
    frame whose index is not exactly ``0..n-1``.

    Args:
        path: Destination file; defaults to the location used originally.
        data: Frame (or mapping of column name to sequence) providing the
            ``'user_ids'`` and ``'bot_ids'`` columns. Defaults to the
            notebook-level ``df_data``.

    Returns:
        None.
    """
    frame = df_data if data is None else data
    with open(path, 'w', encoding='utf-8') as f:
        for user_ids, bot_ids in zip(frame['user_ids'], frame['bot_ids']):
            f.write(str(user_ids) + '\n')
            f.write(str(bot_ids) + '\n')

# Write ./dataset/test.in from the current df_data (an existing file is overwritten).
gen_dataset()

In [103]:
# Line-oriented dataset over the id file written by gen_dataset();
# it is created here but not consumed anywhere in this cell.
dataset = tf.data.TextLineDataset("./dataset/test.in")

# Pair every raw user utterance with its raw bot reply.
tr_data = tf.data.Dataset.from_tensor_slices((user_query, answer_query))

# Build the iterator from the dataset's structure only, then create the op
# that binds it to tr_data and the tensor that yields the next pair.
iterator = tf.data.Iterator.from_structure(
    output_types=tr_data.output_types,
    output_shapes=tr_data.output_shapes,
)
training_init_op = iterator.make_initializer(tr_data)
next_element = iterator.get_next()

In [99]:
# Reference: https://github.com/vahidk/EffectiveTensorflow
# NOTE(review): this cell repeats the statements of the preceding cell, and its
# execution count (99) is lower, so it looks like an earlier run of the same
# code — consider keeping only one of the two.

# Line-oriented dataset over ./dataset/test.in; not used further in this cell.
dataset = tf.data.TextLineDataset("./dataset/test.in")

# Pairs of (user utterance, bot reply) taken from the raw string lists.
tr_data = tf.data.Dataset.from_tensor_slices((user_query, answer_query))

# Iterator defined by structure only, so it can be initialized with tr_data below.
iterator = tf.data.Iterator.from_structure(tr_data.output_types,
                                   tr_data.output_shapes)
next_element = iterator.get_next()
training_init_op = iterator.make_initializer(tr_data)

In [104]:
with tf.Session() as sess:
    # Point the iterator at the training data before pulling from it.
    sess.run(training_init_op)

    # Pull and print one (user, bot) pair at a time until the dataset
    # signals that it has no more elements.
    exhausted = False
    while not exhausted:
        try:
            elem = sess.run(next_element)
        except tf.errors.OutOfRangeError:
            exhausted = True
            print("End of training dataset.")
        else:
            print(elem)

(b'\xeb\xa9\x94\xeb\xa1\xb1', b'\xec\xa0\x80\xeb\x9e\x91 \xeb\x86\x80\xea\xb3\xa0 \xec\x8b\xb6\xec\x9c\xbc\xec\x8b\xa0\xea\xb1\xb0\xea\xb5\xb0\xec\x9a\x94')
(b'\xec\x95\x84 \xea\xb7\xb8\xeb\x9f\xac\xeb\xa9\xb4 \xec\x95\x88\xeb\x90\x98', b'\xeb\x84\xa4? \xec\x96\xb4\xeb\x96\xa4 \xeb\xa7\x90\xec\x94\x80\xec\x9d\xb4\xec\x8b\xa0\xec\xa7\x80 \xea\xb6\x81\xea\xb8\x88\xed\x95\x98\xeb\x84\xa4\xec\x9a\x94')
(b'\xec\x98\xa4\xeb\x8a\x98 \xed\x95\x98\xeb\xa3\xa8 \xec\x96\xb4\xeb\x95\x8c?', b'\xeb\x8b\xb9\xec\x8b\xa0\xea\xb3\xbc \xed\x95\xa8\xea\xbb\x98\xed\x95\xb4\xec\x84\x9c \xec\xa2\x8b\xec\x9d\x80\xea\xb1\xb0 \xea\xb0\x99\xec\x95\x84\xec\x9a\x94')
(b'\xeb\x84\x88 \xec\xa7\x80\xea\xb8\x88 \xec\x96\xbc\xeb\xa7\x88 \xec\x9e\x88\xec\x96\xb4?', b'\xec\xa0\x80\xeb\x8a\x94 \xeb\xa7\x88\xec\x9d\x8c\xeb\xa7\x8c\xed\x81\xbc\xec\x9d\x80 \xeb\xb6\x80\xec\x9e\x90\xec\x97\x90\xec\x9a\x94.')
End of training dataset.


In [140]:
# Split the id-encoded pairs: 30% held out for testing, fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df_data['user_ids'],
    df_data['bot_ids'],
    test_size=0.3,
    random_state=1,
)

Real vocabulary size    8
Limited vocabulary size 8
