- This script demonstrates how to implement a basic character-level sequence-to-sequence model. We apply it to translating

- It is character-by-character model. Note that it is fairly unusual to do character-level machine translation, as word-level models are more common in this domain.

In [55]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense,GRU
import numpy as np
from keras.callbacks import EarlyStopping,ModelCheckpoint

In [2]:
lines = open('tur.txt', encoding='utf-8').read().split('\n')

In [13]:
l=lines[1221]

In [28]:
eng_sent = []
tur_sent = []
eng_chars = set()
tur_chars = set()
nb_samples = 4000

# Process english and french sentences
for line in range(nb_samples):
    
    eng_line = str(lines[line]).split('\t')[0]
    
    # Append '\t' for start of the sentence and '\n' to signify end of the sentence
    tur_line = '\t' + str(lines[line]).split('\t')[1] + '\n'
    
    eng_sent.append(eng_line)
    tur_sent.append(tur_line)
    
    for ch in eng_line:
        if (ch not in eng_chars):
            eng_chars.add(ch)
            
    for ch in tur_line:
        if (ch not in tur_chars):
            tur_chars.add(ch)

In [31]:
tur_chars = sorted(list(tur_chars))
eng_chars = sorted(list(eng_chars))
print(len(tur_chars),len(eng_chars))

79 69


In [34]:
# dictionary to index each english character - key is index and value is english character
eng_index_to_char_dict = {}

# dictionary to get english character given its index - key is english character and value is index
eng_char_to_index_dict = {}

for k, v in enumerate(eng_chars):
    eng_index_to_char_dict[k] = v
    eng_char_to_index_dict[v] = k

In [35]:
# dictionary to index each turkish character - key is index and value is french character
tur_index_to_char_dict = {}

# dictionary to get turkish character given its index - key is turkish character and value is index
tur_char_to_index_dict = {}
for k, v in enumerate(tur_chars):
    tur_index_to_char_dict[k] = v
    tur_char_to_index_dict[v] = k

In [37]:
max_len_eng_sent = max([len(line) for line in eng_sent])
max_len_tur_sent = max([len(line) for line in tur_sent])

In [38]:
max_len_eng_sent,max_len_tur_sent

(13, 41)

In [39]:
tokenized_eng_sentences = np.zeros(shape = (nb_samples,max_len_eng_sent,len(eng_chars)), dtype='float32')
tokenized_tur_sentences = np.zeros(shape = (nb_samples,max_len_tur_sent,len(tur_chars)), dtype='float32')
target_data = np.zeros((nb_samples, max_len_tur_sent, len(tur_chars)),dtype='float32')

In [40]:
# Vectorize the english and french sentences

for i in range(nb_samples):
    #print(eng_sent[i])
    for k,ch in enumerate(eng_sent[i]):
        #print(i,k,eng_char_to_index_dict[ch],eng_index_to_char_dict[eng_char_to_index_dict[ch]])
        tokenized_eng_sentences[i,k,eng_char_to_index_dict[ch]] = 1
        
    for k,ch in enumerate(tur_sent[i]):
       # print(i,k,tur_char_to_index_dict[ch],tur_index_to_char_dict[tur_char_to_index_dict[ch]])
        tokenized_tur_sentences[i,k,tur_char_to_index_dict[ch]] = 1
        
        # decoder_target_data will be ahead by one timestep and will not include the start character.
        if k > 0:
            target_data[i,k-1,tur_char_to_index_dict[ch]] = 1

In [54]:
tokenized_tur_sentences[1]

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [50]:
tokenized_tur_sentences[1212][-1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [56]:
tokenized_eng_sentences.shape,tokenized_tur_sentences.shape,target_data.shape

((4000, 13, 69), (4000, 41, 79), (4000, 41, 79))

In [71]:
# Encoder model

encoder_input = Input(shape=(None,len(eng_chars)))
#encoder_LSTM = LSTM(256,return_state = True)
encoder_outputs=GRU(256,activation="relu",return_sequences=True)(encoder_input)
#encoder_outputs, encoder_h, encoder_c = encoder_LSTM (encoder_input)
#encoder_states = [encoder_h, encoder_c]

In [72]:
encoder_outputs

<tf.Tensor 'gru_4/transpose_1:0' shape=(?, ?, 256) dtype=float32>

In [76]:
# Decoder model

decoder_input = Input(shape=(None,len(tur_chars)))
decoder_LSTM = LSTM(256,return_sequences=True, return_state = True)
decoder_out, _ , _ = decoder_LSTM(decoder_input, initial_state=encoder_states)
decoder_dense = Dense(len(tur_chars),activation='softmax')
decoder_out = decoder_dense (decoder_out)


In [77]:
model = Model(inputs=[encoder_input, decoder_input],outputs=[decoder_out])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           (None, None, 69)     0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           (None, None, 79)     0                                            
__________________________________________________________________________________________________
lstm_9 (LSTM)                   [(None, 256), (None, 333824      input_19[0][0]                   
__________________________________________________________________________________________________
lstm_10 (LSTM)                  [(None, None, 256),  344064      input_20[0][0]                   
                                                                 lstm_9[0][1]                     
          

In [78]:
early_stopping = EarlyStopping(monitor='val_loss',min_delta=0,patience=3,verbose=0,mode='auto')

In [None]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit(x=[tokenized_eng_sentences,tokenized_tur_sentences], 
          y=target_data,
          batch_size=64,
          epochs=2000,
          validation_split=0.2,callbacks=[early_stopping])

Train on 3200 samples, validate on 800 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000

In [None]:
# Inference models for testing

# Encoder inference model
encoder_model_inf = Model(encoder_input, encoder_states)

# Decoder inference model
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input, 
                                                 initial_state=decoder_input_states)

decoder_states = [decoder_h , decoder_c]

decoder_out = decoder_dense(decoder_out)

decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                          outputs=[decoder_out] + decoder_states )

In [None]:
def decode_seq(inp_seq):
    
    # Initial states value is coming from the encoder 
    states_val = encoder_model_inf.predict(inp_seq)
    
    target_seq = np.zeros((1, 1, len(tur_chars)))
    target_seq[0, 0, tur_char_to_index_dict['\t']] = 1
    
    translated_sent = ''
    stop_condition = False
    
    while not stop_condition:
        
        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
        
        max_val_index = np.argmax(decoder_out[0,-1,:])
        sampled_tur_char = tur_index_to_char_dict[max_val_index]
        translated_sent += sampled_tur_char
        
        if ( (sampled_tur_char == '\n') or (len(translated_sent) > max_len_tur_sent)) :
            stop_condition = True
        
        target_seq = np.zeros((1, 1, len(tur_chars)))
        target_seq[0, 0, max_val_index] = 1
        
        states_val = [decoder_h, decoder_c]
        
    return translated_sent



In [None]:
for seq_index in range(10):
    #seq_index+=40
    inp_seq = tokenized_eng_sentences[seq_index:seq_index+1]
    translated_sent = decode_seq(inp_seq)
    print('-')
    print('Input sentence:', eng_sent[seq_index])
    print('Decoded sentence:', translated_sent)