In [1]:
from tensorflow import keras
import re
# Importing our translations
data_path = "untitled.txt"
# Defining lines as a list of each line
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# Building empty lists to hold sentences
input_docs = []
target_docs = []
# Building empty vocabulary sets
input_tokens = set()
target_tokens = set()

for line in lines:
  # Input and target sentences are separated by tabs
    input_doc, target_doc = line.split('\t')
  # Appending each input sentence to input_docs
    input_docs.append(input_doc)
  # Splitting words from punctuation
    target_doc = " ".join(re.findall(r"[\w']+|[^\s\w]", target_doc))
  # Redefine target_doc below and append it to target_docs:
    target_doc = '<START> ' + target_doc + ' <END>'
    target_docs.append(target_doc)
  
  # Now we split up each sentence into words and add each unique word to our vocabulary set
    for token in re.findall(r"[\w']+|[^\s\w]", input_doc):
        print(token)

        if token not in input_tokens:
            input_tokens.add(token)
    for token in target_doc.split():
        print(token)
    # And here:
        if token not in target_tokens:
            target_tokens.add(token)

input_tokens = sorted(list(input_tokens))
target_tokens = sorted(list(target_tokens))

# Create num_encoder_tokens and num_decoder_tokens:
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

max_encoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", input_doc)) for input_doc in input_docs])
max_decoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", target_doc)) for target_doc in target_docs])

We'll
see
.
<START>
Después
veremos
.
<END>
We'll
see
.
<START>
Ya
veremos
.
<END>
We'll
try
.
<START>
Lo
intentaremos
.
<END>
We've
won
!
<START>
¡
Hemos
ganado
!
<END>
Well
done
.
<START>
Bien
hecho
.
<END>
What's
up
?
<START>
¿
Qué
hay
?
<END>
Who
cares
?
<START>
¿
A
quién
le
importa
?
<END>
Who
drove
?
<START>
¿
Quién
condujo
?
<END>
Who
drove
?
<START>
¿
Quién
conducía
?
<END>
Who
is
he
?
<START>
¿
Quién
es
él
?
<END>
Who
is
it
?
<START>
¿
Quién
es
?
<END>


In [3]:
from tensorflow import keras
import numpy as np
import re
#need layers from keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model

In [4]:
print('Number of samples:', len(input_docs))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

input_features_dict = dict([(token, i) for i, token in enumerate(input_tokens)])
# Build out target_features_dict:
target_features_dict = dict([(token, i) for i, token in enumerate(target_tokens)])


Number of samples: 11
Number of unique input tokens: 18
Number of unique output tokens: 27
Max sequence length for inputs: 4
Max sequence length for outputs: 12


In [5]:
# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_features_dict = dict((i, token) for token, i in input_features_dict.items())
# Build out reverse_target_features_dict:
reverse_target_features_dict = dict((i,token) for token, i in target_features_dict.items())

encoder_input_data = np.zeros((len(input_docs), max_encoder_seq_length, num_encoder_tokens),dtype='float32')
print("\nHere's the first item in the encoder input matrix:\n", encoder_input_data[0], "\n\nThe number of columns should match the number of unique input tokens and the number of rows should match the maximum sequence length for input sentences.")


Here's the first item in the encoder input matrix:
 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]] 

The number of columns should match the number of unique input tokens and the number of rows should match the maximum sequence length for input sentences.


In [6]:
# Build out the decoder_input_data matrix:
decoder_input_data = np.zeros((len(input_docs), max_decoder_seq_length,num_decoder_tokens), dtype='float32')
# Build out the decoder_target_data matrix:
decoder_target_data = np.zeros((len(target_docs), max_decoder_seq_length,num_decoder_tokens), dtype='float32')

for line, (input_doc, target_doc) in enumerate(zip(input_docs, target_docs)):
    
    for timestep, token in enumerate(re.findall(r"[\w']+|[^\s\w]", input_doc)):

        print("Encoder input timestep & token:", timestep, token)
    # Assign 1. for the current line, timestep, & word in encoder_input_data
        encoder_input_data[line, timestep, input_features_dict[token]] = 1.

    for timestep, token in enumerate(target_doc.split()):

    # decoder_target_data is ahead of decoder_input_data by one timestep
        print("Decoder input timestep & token:", timestep, token)
    # Assign 1. for the current line, timestep, & word in decoder_input_data
        decoder_input_data[line, timestep, target_features_dict[token]] = 1.
        if timestep > 0:
      # decoder_target_data will be ahead by one timestep and will not include the start token.
          print("Decoder target timestep:", timestep)
      # Assign 1. for the current line, timestep, & word in decoder_target_data:
          decoder_target_data[line, timestep - 1, target_features_dict[token]] = 1.

Encoder input timestep & token: 0 We'll
Encoder input timestep & token: 1 see
Encoder input timestep & token: 2 .
Decoder input timestep & token: 0 <START>
Decoder input timestep & token: 1 Después
Decoder target timestep: 1
Decoder input timestep & token: 2 veremos
Decoder target timestep: 2
Decoder input timestep & token: 3 .
Decoder target timestep: 3
Decoder input timestep & token: 4 <END>
Decoder target timestep: 4
Encoder input timestep & token: 0 We'll
Encoder input timestep & token: 1 see
Encoder input timestep & token: 2 .
Decoder input timestep & token: 0 <START>
Decoder input timestep & token: 1 Ya
Decoder target timestep: 1
Decoder input timestep & token: 2 veremos
Decoder target timestep: 2
Decoder input timestep & token: 3 .
Decoder target timestep: 3
Decoder input timestep & token: 4 <END>
Decoder target timestep: 4
Encoder input timestep & token: 0 We'll
Encoder input timestep & token: 1 try
Encoder input timestep & token: 2 .
Decoder input timestep & token: 0 <START>
D

In [7]:
# Create the input layer to define a matrix to hold all the one-hot vectors that we’ll feed to the model
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# Create the LSTM layer with some output dimensionality
encoder_lstm = LSTM(256, return_state=True)
# Retrieve the outputs and states
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)
# Put the states together in a list
encoder_states = [state_hidden, state_cell] 

# The decoder input and LSTM layers:
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
# Retrieve the LSTM outputs and states:
decoder_outputs, decoder_state_hidden, decoder_state_cell = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Build a final Dense layer - a final activation layer, using the Softmax function, that will give us the probability distribution — where all probabilities sum to one — for each token
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
# Filter outputs through the Dense layer --> transforms our LSTM output from a dimensionality to the number of unique words within the hidden layer’s vocabulary 
decoder_outputs = decoder_dense(decoder_outputs)

In [8]:
# Building the training model:
training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
training_model.summary()

# Compile the model:
training_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
# Choose the batch size and number of epochs:
batch_size = 50
epochs = 50
# Train the model:
training_model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=batch_size, epochs=epochs, validation_split=0.2)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 18)]   0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None, 27)]   0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 256),        281600      ['input_1[0][0]']                
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                              

<keras.callbacks.History at 0x7fbd780e8c70>