In [2]:
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Embedding, Input, LSTM, Dense
from keras.models import Model
from keras.optimizers import Adam

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Upper bound on the vocabulary size handed to each Tokenizer (num_words).
MAX_VOCAB_SIZE = 3000
# Width of the word embeddings; also selects which GloVe file is read.
EMBEDDING_DIM = 50
# Fraction of the samples held out for validation in model.fit.
VALIDATION_SPLIT = 0.2
# Number of sentence pairs after which reading the corpus file stops.
NUM_SAMPLES = 1000

### Get input data

In [4]:
encoder_input = []   # source sentences (first tab-separated column)

decoder_output = []  # translations followed by ' <eos>' (training targets)
decoder_input = []   # translations preceded by '<sos> ' (decoder inputs)

counter = 0
# `with` guarantees the file handle is closed, and the explicit encoding makes
# the read independent of the platform's default text encoding.
with open('data/spa-eng/spa.txt', encoding='utf-8') as corpus_file:
    for line in corpus_file:
        temp_line = line.split('\t')
        # str.split always returns at least one element, so check the column
        # count instead: blank or tab-less lines have no translation column
        # and are skipped rather than raising IndexError below.
        if len(temp_line) < 2:
            continue

        input_ = temp_line[0].strip()
        translation = temp_line[1].strip()

        encoder_input.append(input_)

        decoder_input.append('<sos> ' + translation)
        decoder_output.append(translation + ' <eos>')

        # Stop as soon as NUM_SAMPLES pairs have been collected.
        counter += 1
        if counter >= NUM_SAMPLES:
            break

len(encoder_input), len(decoder_input), len(decoder_output)

(1000, 1000, 1000)

In [5]:
# decoder_output[:10]

### Tokenizing input data

In [6]:
## Fit a tokenizer on the source sentences and encode them as integer sequences
en_tokenizer_ = Tokenizer(num_words=MAX_VOCAB_SIZE)
en_tokenizer_.fit_on_texts(encoder_input)
en_sequences = en_tokenizer_.texts_to_sequences(encoder_input)

## word -> index mapping; its size is the number of unique words seen
en_word2ind = en_tokenizer_.word_index
print(len(en_word2ind))

## +1 accounts for index 0, which word_index never assigns; capped at MAX_VOCAB_SIZE
en_NUM_WORDS = min(MAX_VOCAB_SIZE, 1 + len(en_word2ind))
print(en_NUM_WORDS)

378
379


In [7]:
# Sanity check: number of encoded source sentences and the token ids of sample 100.
len(en_sequences), en_sequences[100]

(1000, [182])

In [8]:
## Tokenizing Decoder Data
## filters='' turns off character filtering, so tokens are kept verbatim
## (e.g. 'Ve.' and the '<sos>' / '<eos>' markers each stay a single token).
dec_tokenizer_ = Tokenizer(filters='', num_words=MAX_VOCAB_SIZE)
## Fit on inputs and outputs together so both markers enter the vocabulary
dec_tokenizer_.fit_on_texts(decoder_input + decoder_output)

dec_out_sequences = dec_tokenizer_.texts_to_sequences(decoder_output)
dec_inp_sequences = dec_tokenizer_.texts_to_sequences(decoder_input)

## word -> index mapping; its size is the number of unique tokens seen
dec_word2ind = dec_tokenizer_.word_index
print(len(dec_word2ind))

## +1 accounts for index 0, which word_index never assigns; capped at MAX_VOCAB_SIZE
dec_NUM_WORDS = min(MAX_VOCAB_SIZE, 1 + len(dec_word2ind))
print(dec_NUM_WORDS)

1014
1015


In [9]:
# Compare the first encoded target sequence with its raw text.
(dec_out_sequences[0], decoder_output[0])

([121, 2], 'Ve. <eos>')

### Padding data

In [10]:
## The longest source sequence fixes the padded width
MAX_ENC_SEQ_LEN = max(len(seq) for seq in en_sequences)
print(MAX_ENC_SEQ_LEN)

## pad_sequences defaults to padding='pre', i.e. zeros are added on the left
en_sequences = pad_sequences(en_sequences, maxlen=MAX_ENC_SEQ_LEN)

3


In [11]:
# Inspect one padded encoder sequence.
en_sequences[998]

array([  0,  41, 242], dtype=int32)

In [12]:
## The longest target sequence fixes the padded width for both decoder arrays
MAX_DEC_SEQ_LEN = max(map(len, dec_out_sequences))
print(MAX_DEC_SEQ_LEN)

## Decoder sequences are padded on the right (padding='post')
dec_out_sequences = pad_sequences(dec_out_sequences, padding='post', maxlen=MAX_DEC_SEQ_LEN)
dec_inp_sequences = pad_sequences(dec_inp_sequences, padding='post', maxlen=MAX_DEC_SEQ_LEN)

7


In [13]:
# Inspect the first padded decoder input sequence.
dec_inp_sequences[0]

array([  1, 121,   0,   0,   0,   0,   0], dtype=int32)

### One-hot encoding the target outputs

In [14]:
### Targets must be one-hot: sparse-categorical-crossentropy cannot be used for a list of outputs
one_hot_targets = np.zeros((len(dec_out_sequences), MAX_DEC_SEQ_LEN, dec_NUM_WORDS))

# For every sample i and time step j, set
# one_hot_targets[i, j, dec_out_sequences[i][j]] = 1 in a single indexed
# assignment (padding positions therefore mark index 0, as before).
sample_idx, step_idx = np.indices((len(dec_out_sequences), MAX_DEC_SEQ_LEN))
one_hot_targets[sample_idx, step_idx, np.asarray(dec_out_sequences)] = 1

In [15]:
# Expected layout: (number of samples, MAX_DEC_SEQ_LEN, dec_NUM_WORDS).
one_hot_targets.shape

(1000, 7, 1015)

### Prepare embeddings

In [16]:
# Map each word in the GloVe text file to its embedding vector.
word2vec = {}
# `with` guarantees the file handle is closed, and the explicit encoding makes
# the read independent of the platform's default text encoding.
with open('glove.6B/glove.6B.{0}d.txt'.format(EMBEDDING_DIM), encoding='utf-8') as glove_file:
    for line in glove_file:
        # Each line is: word followed by space-separated vector components.
        temp = line.strip().split(' ')
        word = temp[0]
        # float32 instead of float16: half precision keeps only about three
        # significant decimal digits and would round the stored values.
        arr = np.asarray(temp[1:], dtype='float32')
        word2vec[word] = arr

print(len(word2vec))

400000


In [17]:
# word2vec['is']

In [18]:
# Row r of the matrix holds the pretrained vector of the word whose tokenizer
# index is r; rows without a pretrained vector stay all-zero.
embedding_matrix = np.zeros((en_NUM_WORDS, EMBEDDING_DIM))
for token, row in en_word2ind.items():
    # Indices at or beyond the vocabulary cap are not written.
    if row >= MAX_VOCAB_SIZE:
        continue

    vector = word2vec.get(token)
    # Words absent from word2vec are left as zeros.
    if vector is None:
        continue

    embedding_matrix[row] = vector

print(embedding_matrix.shape)

(379, 50)


### Model

In [19]:
# Number of units in the encoder and decoder LSTMs (size of their h and c states).
LATENT_DIM = 15

In [20]:
### ENCODER ###
## Embedding layer initialised with the pretrained vectors in embedding_matrix
en_embedding_layer = Embedding(input_dim=en_NUM_WORDS, output_dim=EMBEDDING_DIM, weights=[embedding_matrix])

## One padded source sequence of MAX_ENC_SEQ_LEN token ids per sample
en_input = Input(shape=(MAX_ENC_SEQ_LEN, ))

x = en_embedding_layer(en_input)

## return_state=True makes the LSTM return its final hidden (h) and cell (c)
## states in addition to its output; no explicit initial state is supplied.
lstm1 = LSTM(LATENT_DIM, return_state=True)
en_o, en_h, en_c = lstm1(x)

## The final encoder states are what the decoder is initialised with
en_states = [en_h, en_c]








In [21]:
### DECODER ###
## Decoder embedding is created without pretrained weights
dec_embedding_layer = Embedding(input_dim=dec_NUM_WORDS, output_dim=EMBEDDING_DIM)

## Full '<sos> ...' target sequence fed in at training time
dec_input = Input(shape=(MAX_DEC_SEQ_LEN, ))
y = dec_embedding_layer(dec_input)

## return_sequences=True yields an output for every time step; the returned
## states are discarded here and the LSTM starts from the encoder's [h, c].
lstm2 = LSTM(LATENT_DIM, return_state=True, return_sequences=True)
dec_o, _, _ = lstm2(y, initial_state = en_states)

## Softmax over the decoder vocabulary, applied to the LSTM output
dense = Dense(dec_NUM_WORDS, activation='softmax')
output = dense(dec_o)

## Training model: (encoder input, decoder input) -> per-step word distribution
model = Model([en_input, dec_input], output)

## categorical_crossentropy matches the one-hot encoded targets
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.01),
              metrics=['accuracy'])




In [27]:
# Symbolic shape of the decoder LSTM output; the last dimension is LATENT_DIM.
dec_o.shape

TensorShape([Dimension(None), Dimension(None), Dimension(15)])

In [23]:
# Vocabulary sizes next to the target tensor shape; its last axis should equal dec_NUM_WORDS.
en_NUM_WORDS, dec_NUM_WORDS, one_hot_targets.shape

(379, 1015, (1000, 7, 1015))

In [24]:
# Padded row lengths should match the declared maximum sequence lengths.
MAX_ENC_SEQ_LEN, MAX_DEC_SEQ_LEN, len(en_sequences[0]), len(dec_inp_sequences[0])

(3, 7, 3, 7)

In [25]:
# Both model inputs must contain the same number of samples.
len(en_sequences), len(dec_inp_sequences)

(1000, 1000)

In [26]:
# Train on (encoder input, decoder input) -> one-hot targets, holding out the
# VALIDATION_SPLIT fraction of the samples for validation.
model.fit([en_sequences, dec_inp_sequences],
          one_hot_targets,
          epochs=2,
          batch_size=5,
          validation_split=VALIDATION_SPLIT)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 800 samples, validate on 200 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1343e8710>

### Making the predictions

In [25]:
# Inference-time encoder: maps a padded source sequence to its final [h, c] states.
encoder_model = Model(en_input, en_states)

# Inference-time decoder input: a single token id per sample.
dec_input_single = Input(shape=(1, ))
dec_input_single_x = dec_embedding_layer(dec_input_single)

# The LSTM states are explicit inputs so that the states returned by one
# decoding step can be passed back in for the next step.
initial_h = Input(shape=(LATENT_DIM, ))
initial_c = Input(shape=(LATENT_DIM, ))

# The same dec_embedding_layer, lstm2 and dense objects used by the training
# model are called again here, so this graph shares their weights.
dec_op, dec_h, dec_c = lstm2(dec_input_single_x, initial_state = [initial_h, initial_c])

dec_output = dense(dec_op)

# One decoding step: (token, h, c) -> (word distribution, new h, new c).
decoder_model = Model([dec_input_single, initial_h, initial_c], [dec_output, dec_h, dec_c])

In [26]:
# Invert the decoder vocabulary: token index -> word.
dec_ind2word = dict(zip(dec_word2ind.values(), dec_word2ind.keys()))
len(dec_ind2word)

1014

In [31]:
# Invert the encoder vocabulary: token index -> word.
en_ind2word = dict(zip(en_word2ind.values(), en_word2ind.keys()))
len(en_ind2word)

378

In [56]:
def translate(i=None):
    """Greedily decode a translation for one sentence of ``en_sequences``.

    The sentence is run through ``encoder_model`` to obtain the initial LSTM
    states, then ``decoder_model`` is called one token at a time, starting
    from ``<sos>`` and always feeding back the most probable token.

    Parameters
    ----------
    i : int, optional
        Row of ``en_sequences`` to translate. When omitted, a row is drawn
        with ``np.random.choice`` (the original behaviour); pass an explicit
        index to make a result reproducible.

    Returns
    -------
    str
        The generated tokens joined by single spaces. Generation stops when
        ``<eos>`` is predicted or after ``MAX_DEC_SEQ_LEN`` steps.

    Raises
    ------
    IndexError
        If ``i`` is given and is not in ``range(len(en_sequences))``.
    """
    if i is None:
        i = np.random.choice(len(en_sequences))
    elif not 0 <= i < len(en_sequences):
        # A slice would silently return an empty batch for a bad index.
        raise IndexError('sample index %s is out of range' % i)

    # Slice (rather than index) to keep the leading batch dimension.
    input_seq = en_sequences[i:i+1]
    print('Input seq:', input_seq)
    # Index 0 is padding and has no entry in en_ind2word.
    input_text = ' '.join([en_ind2word.get(w) for w in input_seq[0] if w>0])
    print('Input text:', input_text)

    # encoder_model outputs the final hidden and cell states, in that order.
    en_output_prediction = encoder_model.predict(input_seq)
    h = en_output_prediction[0]
    c = en_output_prediction[1]

    sos = dec_word2ind['<sos>']
    eos = dec_word2ind['<eos>']

    # Batch of one sequence holding one token; overwritten after each step.
    dec_start = np.array([[sos]])

    generated_output = []

    for _ in range(MAX_DEC_SEQ_LEN):
        # The returned states replace h and c for the next step.
        p, h, c = decoder_model.predict([dec_start, h, c])

        # Probabilities over the decoder vocabulary for this step.
        probs = p[0, 0]
        index = np.argmax(probs)

        # Predicting <sos> as the most likely token is unexpected; warn.
        if index == sos:
            print('wtf')

        if index == eos:
            break

        # Indices without a vocabulary entry are rendered with a marker.
        generated_output.append(dec_ind2word.get(index, '<WTF> %s'% index))
        dec_start[0, 0] = index

    return ' '.join(generated_output)

In [63]:
# Translate one randomly chosen training sentence and show the result.
print('Output text:', translate())

Input seq: [[  0   7 232]]
Input seq: it's cold
Output text: estoy bueno.
