In [8]:
from sklearn.externals import joblib
import tensorflow as tf
# from tensorflow.python.keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import Tokenizer
# from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
# from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
# from tensorflow.python.keras.models import Sequential
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, RepeatVector, TimeDistributed
# from tensorflow.python.keras.layers import LSTM, Dense, Embedding, RepeatVector, TimeDistributed
from keras.callbacks import ModelCheckpoint
import numpy as np

In [None]:
# Display the installed TensorFlow version string (shown as the cell output).
tf.__version__

In [None]:
# Display the version string reported by the tf.keras module.
tf.keras.__version__

In [None]:
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the three pickled datasets from the working directory
# (presumably the full corpus plus its train/test splits -- verify upstream).
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
both, train, test = (
    joblib.load(pickle_path)
    for pickle_path in ('dataset.pkl', 'train.pkl', 'test.pkl')
)

In [3]:
# Rebuild each loaded object as a NumPy array via a plain Python list,
# so that later cells can slice columns with [:, 0] and [:, 1].
both_list, train_list, test_list = map(
    lambda loaded: np.array(loaded.tolist()),
    (both, train, test),
)

In [4]:
# Fit a word-level tokenizer on column 0 (the English side) of the full corpus.
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(both_list[:, 0])

# Keras never assigns index 0 to a word, so the vocabulary needs one extra slot.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)

# Longest English sentence, counted in whitespace-separated words.
# NOTE(review): the Tokenizer also filters punctuation, so its token counts
# may differ from a plain split() -- confirm this bound never truncates.
eng_length = max(map(lambda sentence: len(sentence.split()), both_list[:, 0]))

print(
    f'english vocabulary size - {eng_vocab_size}',
    f'english max sentence Length - {eng_length}',
    sep='\n',
)

english vocabulary size - 9381
english max sentence Length - 163


In [5]:
# Fit a word-level tokenizer on column 1 (the Spanish side) of the full corpus.
esp_tokenizer = Tokenizer()
esp_tokenizer.fit_on_texts(both_list[:, 1])

# Keras never assigns index 0 to a word, so the vocabulary needs one extra slot.
esp_vocab_size = 1 + len(esp_tokenizer.word_index)

# Longest Spanish sentence, counted in whitespace-separated words.
# NOTE(review): the Tokenizer also filters punctuation, so its token counts
# may differ from a plain split() -- confirm this bound never truncates.
esp_length = max(map(lambda sentence: len(sentence.split()), both_list[:, 1]))

print(
    f'spanish vocabulary size - {esp_vocab_size}',
    f'spanish max sentence Length - {esp_length}',
    sep='\n',
)

spanish vocabulary size - 12472
spanish max sentence Length - 175


In [6]:
# Encode the English training sentences (column 0) as integer id sequences.
X = eng_tokenizer.texts_to_sequences(texts=train_list[:, 0])
# Pad at the end ('post') so every row has exactly eng_length entries.
trainX = pad_sequences(sequences=X, padding='post', maxlen=eng_length)

In [7]:
# Encode the Spanish training sentences (column 1) as integer id sequences.
Y = esp_tokenizer.texts_to_sequences(train_list[:, 1])
# Pad the Spanish sequences (Y) -- not the English ones (X) -- so the targets
# really are Spanish token ids. Padding X here would go unnoticed, because the
# English ids also fit inside esp_vocab_size, and the model would be trained
# to reproduce English ids.
trainY = pad_sequences(Y, maxlen=esp_length, padding='post')
# One-hot encode the targets for the categorical_crossentropy loss.
# NOTE(review): this dense tensor has shape
# (n_samples, esp_length, esp_vocab_size) and can be very large in memory.
y_train_categorical = to_categorical(trainY, num_classes=esp_vocab_size)

In [8]:
# Encode the English test sentences (column 0) as integer id sequences.
tX = eng_tokenizer.texts_to_sequences(texts=test_list[:, 0])
# Pad at the end ('post') so every row has exactly eng_length entries.
testX = pad_sequences(sequences=tX, padding='post', maxlen=eng_length)

In [9]:
# Encode the Spanish test sentences (column 1) as integer id sequences.
tY = esp_tokenizer.texts_to_sequences(texts=test_list[:, 1])
# Pad at the end ('post') so every row has exactly esp_length entries.
testY = pad_sequences(sequences=tY, padding='post', maxlen=esp_length)
# One-hot encode the validation targets over the Spanish vocabulary.
y_test_categorical = to_categorical(y=testY, num_classes=esp_vocab_size)

In [10]:
# Sanity check: the second dimension should equal eng_length after padding.
trainX.shape

(4000, 163)

In [11]:
# Sanity check: expect (n_samples, esp_length, esp_vocab_size) for the one-hot targets.
y_train_categorical.shape

(4000, 175, 12472)

In [14]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build an uncompiled encoder-decoder LSTM model for sequence translation.

    Args:
        src_vocab: Size of the source vocabulary (input dimension of the embedding).
        tar_vocab: Size of the target vocabulary (width of the softmax output).
        src_timesteps: Length of each padded source sequence.
        tar_timesteps: Number of output timesteps to generate.
        n_units: Embedding size and hidden size of both LSTM layers.

    Returns:
        A Keras ``Sequential`` model; the caller is responsible for compiling it.
    """
    # Encoder: embed the source ids (0 is treated as padding and masked)
    # and compress the whole sequence into one fixed-size vector.
    encoder_layers = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
    ]
    # Decoder: repeat that vector once per target timestep, unroll it with a
    # second LSTM and predict a distribution over the target vocabulary per step.
    decoder_layers = [
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(encoder_layers + decoder_layers)
 

In [15]:
# Build the translation model: English vocabulary/length in,
# Spanish vocabulary/length out, 256 units per layer.
model = define_model(eng_vocab_size, esp_vocab_size, eng_length, esp_length, 256)
# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()
# Also write an architecture diagram (with tensor shapes) to model.png.
plot_model(model, to_file='model.png', show_shapes=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 163, 256)          2401536   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 175, 256)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 175, 256)          525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 175, 12472)        3205304   
Total params: 6,657,464
Trainable params: 6,657,464
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
# Save the model to disk only when the validation loss reaches a new minimum.
filename = 'model.h5'
checkpoint = ModelCheckpoint(
    filepath=filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)


In [17]:
# Train for 30 epochs in mini-batches of 16, validating on the test split after
# each epoch; the checkpoint callback keeps the best model seen so far.
model.fit(
    x=trainX,
    y=y_train_categorical,
    batch_size=16,
    epochs=30,
    verbose=2,
    callbacks=[checkpoint],
    validation_data=(testX, y_test_categorical),
)


Train on 4000 samples, validate on 1000 samples
Epoch 1/30


KeyboardInterrupt: 