In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
from src.data_manager import *

In [3]:
# Resolve the two version files this notebook works with. Later cells read
# 't_kjv' as the source text and 't_bbe' as the target text.
versions = get_bible_versions_by_file_name(
    ['t_kjv', 't_bbe'],
)

In [4]:
# Build the training / validation / test splits from the verses shared by the
# selected versions and store them to files (write_files = True).
# NOTE(review): the meaning of the positional 1.0 is defined in
# src.data_manager and is not visible here — confirm before changing it.
# The trailing ';' keeps the notebook from echoing the call's return value.
create_datasets(
    versions,
    1.0,
    write_files = True,
);

Finding shared verses between 2 versions...        done in 0.405 seconds
Separate test verses...                            done in 0.018 seconds
Separate validation verses...                      done in 0.028 seconds
Zip together verses (shuffle = True)...            done in 0.049 seconds
Store datasets to files...                         done in 0.021 seconds

# Training Verses:    19,282 (62%)
# Validation Verses:       0 (0%)
# Test Verses:        11,766 (38%)


In [5]:
# Fetch the datasets via the project helper (presumably the files written by
# create_datasets above — confirm in src.data_manager). Later cells index the
# result as datasets['training'][<version file name>].
datasets = load_datasets()

In [6]:
# Training configuration — every tunable knob of the notebook lives in this cell.
batch_size = 64       # samples per gradient update, forwarded to model.fit
epochs = 10           # number of passes over the fitted samples
latent_dim = 256      # unit count of both the encoder and the decoder LSTM
num_samples = 5_000   # only the first num_samples encoded texts are fitted

In [7]:
# Source side: the 't_kjv' training texts, used as-is.
training_texts = datasets['training']
input_texts = training_texts['t_kjv']

# Target side: the 't_bbe' training texts, each framed by a leading tab (the
# start character the decoder is fed first) and a trailing newline.
target_texts = [ f'\t{target_text}\n' for target_text in training_texts['t_bbe'] ]

# Character vocabularies: the union of all characters occurring in each corpus.
input_characters = set().union(*input_texts)
target_characters = set().union(*target_texts)

# Echo both corpus sizes as a sanity check that the pairs line up.
len(input_texts), len(target_texts)

(19282, 19282)

In [8]:
# Sort the vocabularies so each character receives a deterministic position.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

# Vocabulary sizes; these become the width of the one-hot vectors.
num_encoder_tokens, num_decoder_tokens = len(input_characters), len(target_characters)

# Length (in characters) of the longest source and target text; these become
# the timestep dimension of the encoded arrays.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

max_encoder_seq_length, max_decoder_seq_length

(528, 506)

In [9]:
# Lookup tables mapping each character to its position in the sorted vocabulary
# (i.e. the index of its slot in the one-hot vector).
input_token_index = dict(zip(input_characters, range(len(input_characters))))
target_token_index = dict(zip(target_characters, range(len(target_characters))))

In [10]:
# Allocate the zero-filled one-hot arrays, indexed as [text, timestep, character].
# The decoder input and decoder target arrays share one shape.
# NOTE(review): the arrays cover every training text although model.fit only
# consumes the first num_samples rows — a large allocation worth revisiting.
num_texts = len(input_texts)
encoder_shape = (num_texts, max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (num_texts, max_decoder_seq_length, num_decoder_tokens)

encoder_input_data = np.zeros(encoder_shape, dtype="float32")
decoder_input_data = np.zeros(decoder_shape, dtype="float32")
decoder_target_data = np.zeros(decoder_shape, dtype="float32")

In [11]:
# One-hot encode the source texts: row i, timestep t gets a 1.0 in the column
# of the character found at position t of text i.
for i, input_text in enumerate(input_texts):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    # Pad every timestep after the text with the space character. The slice
    # starts at len(input_text) rather than at the loop variable `t`: after an
    # empty text `t` would still hold the previous row's value (or be undefined
    # on the very first row), which would start the padding at the wrong place.
    encoder_input_data[i, len(input_text) :, input_token_index[' ']] = 1.0

# One-hot encode the target texts twice: once as decoder input and once, shifted
# left by one timestep, as the sequence the decoder has to predict.
for i, target_text in enumerate(target_texts):
    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one
            # timestep and therefore does not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    # Space-pad the remainder of both arrays. The target holds one character
    # fewer than the input (no start character), so its padding starts one
    # timestep earlier; max(..., 0) keeps the slice start non-negative.
    decoder_input_data[i, len(target_text) :, target_token_index[' ']] = 1.0
    decoder_target_data[i, max(len(target_text) - 1, 0) :, target_token_index[' ']] = 1.0

In [12]:
# Define the training model: an LSTM encoder whose final states seed an LSTM
# decoder, followed by a per-timestep softmax over the target characters.

# Encoder: consumes one-hot source sequences of any length (None timesteps).
encoder_inputs = keras.Input(shape = (None, num_encoder_tokens))
encoder = keras.layers.LSTM(latent_dim, return_state = True)
# With return_state = True the layer returns its output followed by its states;
# [1:] drops the first element (the output) and keeps only the states.
encoder_states = encoder(encoder_inputs)[1:]

# Decoder: consumes one-hot target sequences and starts from the encoder states.
decoder_inputs = keras.Input(shape = (None, num_decoder_tokens))
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences = True, return_state = True)
# The two returned states are not needed for training and are discarded.
# NOTE(review): binding `_` at cell level shadows IPython's last-output variable.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state = encoder_states)
# Project every decoder timestep onto a probability distribution over the
# target vocabulary.
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Full model: [source one-hots, decoder-input one-hots] -> decoder predictions.
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Compile for per-character classification: the decoder emits a softmax over
# the target vocabulary and the targets are one-hot vectors.
model.compile(
    optimizer = 'rmsprop', loss = 'categorical_crossentropy', metrics = ['accuracy']
)
# Fit on the first num_samples encoded pairs. validation_split is the fraction
# of those samples that is HELD OUT for validation, so 0.2 trains on 80% and
# validates on 20%. (The previous value of 0.8 trained on only 20% of the
# selected samples.)
model.fit(
    [encoder_input_data[:num_samples], decoder_input_data[:num_samples]],
    decoder_target_data[:num_samples],
    batch_size = batch_size,
    epochs = epochs,
    validation_split = 0.2
)
# Save model
model.save('models/kjv_bbe_lstm_c2c')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10