** keras example **

Download the data from http://www.manythings.org/anki/

I use the fra-eng.zip file.

문자 단위로 처리하고 출력하는 모델을 만듭니다.

진행 과정은 아래와 같이 진행됩니다.

1. 문장들을 3차원 배열(encoder_input, decoder_input, decoder_target)으로 변환합니다.
    - encoder_input은 (num_pairs, max_english_sentence_length, num_english_character) 형태의 3차원 배열로 영어 문장의 one-hot 형식 벡터 데이터를 갖고 있습니다.
    
    - decoder_input은 (num_pairs, max_french_sentence_length, num_french_character) 형태의 3차원 배열로 불어 문장의 one-hot 형식입니다.
    - decoder_target은 decoder_input과 같지만 하나의 time step 만큼 offset 됩니다. decoder_target[:, t, :]는 decoder_input[:, t+1, :]과 같습니다.
    
2. 기본 LSTM 기반의 seq2seq model을 학습시켜, 주어진 encoder_input과 decoder_input으로 decoder_target을 예측하도록 합니다.
3. model이 작동하는지 확인하기 위해 일부 문장을 디코딩합니다.(encoder_input의 샘플을 decoder_target의 표본으로 변환합니다)


문장을 디코딩 하는 학습 단계와 추론 단계는 좀 다릅니다. 같은 내부 계층을 사용하지만 서로 다른 모델을 사용하죠.

- return_state : encoder의 출력과 내부 RNN 상태 반환
- initial_state : decoder의 초기 상태를 지정
- return_sequences : 전체 시퀀스를 반환

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [48]:
# Training-loop settings, passed to model.fit.
epochs = 60
batch_size = 64
# Size of the LSTM state vectors used by both the encoder and the decoder.
latent_dim = 256
# Upper bound on the number of sentence pairs read from the corpus.
num_samples = 10000

In [49]:
# Parallel lists of sentences: entry i of each list belongs to the same pair.
input_texts = []
target_texts = []
# Character vocabularies collected while reading the corpus.
input_characters, target_characters = set(), set()

In [50]:
# Read the whole corpus into memory, then cut it into lines once the file
# handle has been released.
with open('fra.txt', 'r', encoding = 'utf-8') as f:
    raw_corpus = f.read()
lines = raw_corpus.split('\n')

In [51]:
# Notebook display: show the first three raw lines of the corpus.
lines[:3]

['Go.\tVa !', 'Hi.\tSalut !', 'Run!\tCours\u202f!']

이렇게 \t을 기준으로 원문(영어)과 번역문(불어)이 나뉘어 들어가 있다.

In [52]:
# Parse up to num_samples "english<TAB>french" lines into the parallel text
# lists and collect the character vocabulary of each side.
# `len(lines) - 1` leaves out the last element of `lines`; presumably that is
# the empty string split('\n') produces when the file ends with a newline.
for cnt, line in enumerate(lines[: min(num_samples, len(lines) - 1)]):
    # Keep only the first two tab-separated fields so that lines carrying
    # extra columns (e.g. an attribution field in other dumps of the dataset;
    # TODO confirm against your copy) do not break the two-name unpacking.
    input_text, target_text = line.split('\t')[:2]
    if cnt < 10:
        print(input_text, ",   ", target_text)
    # '\t' is used as the start-of-sequence character and '\n' as the
    # end-of-sequence character of every target sentence.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update adds each character and silently ignores duplicates, so no
    # explicit membership test is needed.
    input_characters.update(input_text)
    target_characters.update(target_text)

Go. ,    Va !
Hi. ,    Salut !
Run! ,    Cours !
Run! ,    Courez !
Wow! ,    Ça alors !
Fire! ,    Au feu !
Help! ,    À l'aide !
Jump. ,    Saute.
Stop! ,    Ça suffit !
Stop! ,    Stop !


In [53]:
# Notebook display: first five source (English) sentences.
input_texts[:5]

['Go.', 'Hi.', 'Run!', 'Run!', 'Wow!']

In [54]:
# Notebook display: first five target sentences, already wrapped in the
# '\t' start and '\n' end characters.
target_texts[:5]

['\tVa !\n',
 '\tSalut !\n',
 '\tCours\u202f!\n',
 '\tCourez\u202f!\n',
 '\tÇa alors\u202f!\n']

In [55]:
# Show up to ten random characters from each vocabulary.
# random.sample() needs a sequence: passing a set was deprecated in Python 3.9
# and raises TypeError from 3.11, so each set is sorted into a list first.
# min() keeps the call valid when a vocabulary has fewer than 10 characters.
for vocabulary in (input_characters, target_characters):
    population = sorted(vocabulary)
    print(random.sample(population, min(10, len(population))))

['E', '3', 'T', '.', 'N', 's', 'V', 't', '$', 'G']
['è', '1', '»', 'P', 'e', 'd', ')', 'G', 'à', 'c']


In [56]:
# Freeze both vocabularies into sorted lists so each character gets a stable
# position, then derive the sizes used to shape the one-hot arrays.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens, num_decoder_tokens = len(input_characters), len(target_characters)
# Longest sentence on each side, in characters.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

In [57]:
# Peek at the start of each sorted vocabulary, then print the dataset summary.
print(input_characters[:5])
print(target_characters[:5])
for label, value in (
    ('Number of samples:', len(input_texts)),
    ('Number of unique input tokens:', num_encoder_tokens),
    ('Number of unique output tokens:', num_decoder_tokens),
    ('Max sequence length for inputs:', max_encoder_seq_length),
    ('Max sequence length for outputs:', max_decoder_seq_length),
):
    print(label, value)

[' ', '!', '$', '%', '&']
['\t', '\n', ' ', '!', '$']
Number of samples: 10000
Number of unique input tokens: 69
Number of unique output tokens: 93
Max sequence length for inputs: 16
Max sequence length for outputs: 59


In [58]:
# char -> integer index lookups, following the sorted vocabulary order.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

In [59]:
# Print only the entries whose index is below 10.
for key, value in input_token_index.items():
    if value >= 10:
        continue
    print( key , " , ", value)

   ,  0
!  ,  1
$  ,  2
%  ,  3
&  ,  4
'  ,  5
,  ,  6
-  ,  7
.  ,  8
0  ,  9


In [60]:
# All-zero one-hot containers shaped (pairs, time steps, vocabulary size).
num_pairs = len(input_texts)
encoder_shape = (num_pairs, max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (num_pairs, max_decoder_seq_length, num_decoder_tokens)
encoder_input_data = np.zeros(encoder_shape, dtype='float32')
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

In [61]:
# Scratch array with the same shape as the encoder input, kept separate so the
# one-hot demonstration does not touch the real training data.
test_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32',
)

In [62]:
# Print the shape of each of the three training arrays.
for array in (encoder_input_data, decoder_input_data, decoder_target_data):
    print(np.shape(array))

(10000, 16, 69)
(10000, 59, 93)
(10000, 59, 93)


이제 one-hot 형식의 데이터로 만들어준다

In [63]:
# Demonstration only: one-hot encode just the first sentence pair into the
# scratch array and print what was written.
for i, (input_text, target_text) in enumerate(zip(input_texts[:1], target_texts[:1])):
    for t, char in enumerate(input_text):
        char_index = input_token_index[char]
        print(i, t, "  //    char :    ", char, " //  char index : ", char_index)
        test_input_data[i, t, char_index] = 1.
    first_sentence = test_input_data[0]
    print(first_sentence)
    print(first_sentence[1])
    print(first_sentence[2])

0 0   //    char :     G  //  char index :  25
0 1   //    char :     o  //  char index :  57
0 2   //    char :     .  //  char index :  8
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


이런식으로 one-hot을 만들어준다.

In [64]:
# Fill the three training arrays with one-hot rows, one sentence pair at a time.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    target_indices = [target_token_index[char] for char in target_text]
    for t, index in enumerate(target_indices):
        decoder_input_data[i, t, index] = 1.
    # The target is the decoder input shifted one step to the left: it skips
    # the first character, so position t holds character t + 1.
    for t, index in enumerate(target_indices[1:]):
        decoder_target_data[i, t, index] = 1.

모델을 만들어준다.

In [65]:
# Encoder input: one-hot source sequences; None leaves the time dimension
# unspecified.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# return_state=True makes the LSTM return its internal states next to its output.
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# Keep only the state vectors; encoder_outputs is not used any further here.
encoder_states = [state_h, state_c]

In [66]:
# Decoder input: one-hot target sequences; None leaves the time dimension
# unspecified.
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# return_sequences=True returns the full output sequence; return_state=True
# also returns the internal states, which are discarded here (`_`).
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# The decoder is initialised with the states returned by the encoder.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax layer sized to the target character vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [67]:
# Training model: takes [encoder_inputs, decoder_inputs] and produces the
# softmax output of the decoder.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None, 69)     0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None, 93)     0                                            
__________________________________________________________________________________________________
lstm_5 (LSTM)                   [(None, 256), (None, 333824      input_5[0][0]                    
__________________________________________________________________________________________________
lstm_6 (LSTM)                   [(None, None, 256),  358400      input_6[0][0]                    
                                                                 lstm_5[0][1]                     
          

In [68]:
# Early-stopping callback watching the validation loss with a patience of 3.
early_stop = EarlyStopping(monitor='val_loss', patience=3)
# NOTE: a ModelCheckpoint callback (monitor='val_loss', verbose=1,
# save_best_only=True) was drafted here but is left disabled.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [69]:
# Train with the last 20% of the data argument reserved for validation
# (validation_split=0.2) and the early-stopping callback attached.
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[early_stop],
)

Train on 8000 samples, validate on 2000 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60


<keras.callbacks.History at 0x23616633208>

추론 모델 생성

- 입력문장을 encode하고 초기 상태의 decoder의 상태를 가지고 옵니다.
- 이 초기 상태와 "시퀀스 시작" 토큰을 입력으로 decoder를 한 단계 실행합니다. 출력은 다음 목표 문자입니다.
- 예측된 목표 문자를 붙이고 반복합니다.

In [77]:
# Inference encoder: maps an input sequence to the encoder state vectors
# collected in encoder_states.
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, None, 69)          0         
_________________________________________________________________
lstm_5 (LSTM)                [(None, 256), (None, 256) 333824    
Total params: 333,824
Trainable params: 333,824
Non-trainable params: 0
_________________________________________________________________


In [71]:
# Inference decoder: the states become explicit inputs so they can be fed in
# from outside the graph on every call.
decoder_state_input_h = Input(shape = (latent_dim, ))
decoder_state_input_c = Input(shape = (latent_dim, ))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Reuses the decoder_lstm layer object created for the training model, this
# time keeping the returned states.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state = decoder_states_inputs)
decoder_states = [state_h, state_c]
# Reuses the decoder_dense layer object as well.
decoder_outputs = decoder_dense(decoder_outputs)
# Inputs: [decoder input, h, c]; outputs: [softmax output, new h, new c].
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, None, 93)     0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
lstm_6 (LSTM)                   [(None, None, 256),  358400      input_6[0][0]                    
                                                                 input_7[0][0]                    
          

위에서와 다르게 이번에는 idx : char 구조로 변환

즉, 이것은 숫자 -> 문자 변환용

In [72]:
# Inverse lookups (integer index -> character) used to turn predictions back
# into text.
reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}

In [75]:
# Print only the entries whose index is below 10.
for key, value in reverse_input_char_index.items():
    if key >= 10:
        continue
    print( key , " , ", value)

0  ,   
1  ,  !
2  ,  $
3  ,  %
4  ,  &
5  ,  '
6  ,  ,
7  ,  -
8  ,  .
9  ,  0


추론할 때 사용하는 함수 구현

In [87]:
def decode_sequence(input_seq):
    """Greedily decode one encoded input sentence into target text.

    The encoder states seed the decoder. Starting from the tab start
    character, the decoder is run one character at a time; after each step
    the most probable character and the updated states are fed back in.

    Args:
        input_seq: one-hot encoder input handed straight to
            ``encoder_model.predict`` (the caller in this file passes a
            one-sentence slice of ``encoder_input_data``).

    Returns:
        The decoded string. It ends with the newline end character when the
        model produced one before the length limit was hit.
    """

    def one_hot_step(token_index):
        # Decoder input holding a single one-hot character: shape
        # (1, 1, num_decoder_tokens).
        step = np.zeros((1, 1, num_decoder_tokens))
        step[0, 0, token_index] = 1.
        return step

    # Encode the input sentence into the state vectors for the decoder.
    states_value = encoder_model.predict(input_seq)

    # The first decoder input is the '\t' start character.
    target_seq = one_hot_step(target_token_index['\t'])

    decoded_sentence = ''
    while True:
        # Feed the previous character and states; get the next output and
        # the updated states.
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: index of the highest score at the last time step,
        # mapped back to its character (e.g. 42 -> 'V').
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the end character, or once the output is longer than
        # max_decoder_seq_length.
        if sampled_char == '\n':
            break
        if len(decoded_sentence) > max_decoder_seq_length:
            break

        # The sampled character becomes the next decoder input, together
        # with the updated states.
        target_seq = one_hot_step(sampled_token_index)
        states_value = [h, c]
    return decoded_sentence

In [89]:
# Decode the first ten training sentences as a sanity check.
for seq_index in range(10):
    # Slicing (rather than indexing) keeps the leading axis, so the one-hot
    # sentence is passed as a batch of one.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    source_sentence = input_texts[seq_index]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', source_sentence)
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: Go.
Decoded sentence: Va te chercher !

-
Input sentence: Hi.
Decoded sentence: Attends une bien.

-
Input sentence: Run!
Decoded sentence: Prends le commences !

-
Input sentence: Run!
Decoded sentence: Prends le commences !

-
Input sentence: Wow!
Decoded sentence: Comme c'est chaut !

-
Input sentence: Fire!
Decoded sentence: Assieds-toi !

-
Input sentence: Help!
Decoded sentence: Aidez-vous !

-
Input sentence: Jump.
Decoded sentence: Pas un peux maintenant.

-
Input sentence: Stop!
Decoded sentence: Arrête de te plaîter !

-
Input sentence: Stop!
Decoded sentence: Arrête de te plaîter !

