In [1]:
%%bash
git clone https://github.com/songys/Chatbot_data.git

fatal: destination path 'Chatbot_data' already exists and is not an empty directory.


In [45]:
import pandas as pd
import numpy as np

from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [3]:
data_df = pd.read_csv('Chatbot_data/ChatbotData .csv', encoding='utf-8')

In [4]:
data_df.head(5)

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [5]:
question, answer = list(data_df['Q']), list(data_df['A'])

In [30]:
class MorphTokenizer(object):
    def __init__(self, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'):
        self.word_index = {'<pad>': 0, '<unk>': 1, '<eos>': 2, '<sos>': 3}
        self.index_word = {0: '<pad>', 1: '<unk>', 2: '<eos>', 3: '<sos>'}
        self.indexer = len(self.word_index)
        
        self.filters = filters
        
    def fit_on_texts(self, text_dataset):
        from konlpy.tag import Twitter
        from tqdm import tqdm
        
        morph_analyzer = Twitter()

        for text in tqdm(text_dataset):
            for c in self.filters:
                text = text.replace(c, '')
            
            morph_tokens = morph_analyzer.morphs(text.replace(' ', ''))

            for t in morph_tokens:
                if not t in self.word_index:
                    self.word_index[t] = self.indexer
                    self.index_word[self.indexer] = t
                    self.indexer += 1

    def texts_to_sequences(self, text_dataset):
        from konlpy.tag import Twitter
        from tqdm import tqdm
        
        morph_analyzer = Twitter()
    
        indicied_dataset = list()        
        for text in tqdm(text_dataset):
            for c in self.filters:
                text = text.replace(c, '')
        
            morph_tokens = morph_analyzer.morphs(text.replace(' ', ''))
            indicied_seq = list()
            for t in morph_tokens:        
                if t in self.word_index:
                    indicied_seq.append(self.word_index[t])
                else:
                    indicied_seq.append(self.word_index['<unk>'])
            
            indicied_dataset.append(indicied_seq)    
            
        return indicied_dataset

In [31]:
tokenizer = MorphTokenizer()

In [32]:
tokenizer.fit_on_texts(question + answer)

100%|██████████| 23646/23646 [00:17<00:00, 1378.13it/s]


In [33]:
len(tokenizer.word_index)

9805

In [34]:
len(tokenizer.index_word)

9805

In [35]:
question_sequences = tokenizer.texts_to_sequences(question)

100%|██████████| 11823/11823 [00:07<00:00, 1532.07it/s]


In [36]:
answer_seqeunces = tokenizer.texts_to_sequences(answer)

100%|██████████| 11823/11823 [00:09<00:00, 1247.35it/s]


In [44]:
text_token_len_list = [len(s) for s in list(answer_seqeunces + question_sequences)]

print("max: ", max(text_token_len_list))
print("min: ", min(text_token_len_list))
print("mean: ", np.mean(text_token_len_list))
print("std: ", np.std(text_token_len_list))

max:  31
min:  1
mean:  6.194028588344752
std:  2.881177536849193


In [46]:
MAX_SEQUENCE_LENGTH = 15

prepro_questions = pad_sequences(question_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
prepro_answers = pad_sequences(answer_seqeunces, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [49]:
prepro_questions

array([[   4,    5,    0, ...,    0,    0,    0],
       [   6,    7,    8, ...,    0,    0,    0],
       [  11,   12,   13, ...,    0,    0,    0],
       ...,
       [7303, 5666, 3808, ...,    0,    0,    0],
       [ 219, 1523,   60, ...,    0,    0,    0],
       [ 440,  115,  351, ...,    0,    0,    0]], dtype=int32)

In [50]:
prepro_answers

array([[ 613,  134,  355, ...,    0,    0,    0],
       [ 952, 7304,   19, ...,    0,    0,    0],
       [  99,   57, 7243, ...,    0,    0,    0],
       ...,
       [6765,   90,  661, ...,    0,    0,    0],
       [ 453, 4858,  205, ...,    0,    0,    0],
       [9149, 5227,  351, ...,    0,    0,    0]], dtype=int32)

In [51]:
FILE_DIR_PATH = './data/'
QUESTION_DATA_FILE_NAME = 'question.npy'
ANSWER_DATA_FILE_NAME = 'answer.npy'
DATA_CONFIGS_FILE_NAME = 'data_configs.json'

In [52]:
import os

if not os.path.exists(FILE_DIR_PATH):
    os.makedirs(FILE_DIR_PATH)

np.save(open(FILE_DIR_PATH + QUESTION_DATA_FILE_NAME, 'wb'), prepro_questions)
np.save(open(FILE_DIR_PATH + ANSWER_DATA_FILE_NAME, 'wb'), prepro_answers)

In [53]:
data_prepro_configs = {'vocab': tokenizer.word_index, 
                       'vocab_index_to_word': tokenizer.index_word, 
                       'vocab_size': len(tokenizer.word_index)}

In [54]:
import json

with open(FILE_DIR_PATH + DATA_CONFIGS_FILE_NAME, 'w') as f:
    json.dump(data_prepro_configs, f)