In [18]:
# Imports
import numpy as np
import random as rand
import tensorflow as tf
import pandas as pd
import keras
import matplotlib
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Add as keras_add_layers
from keras.callbacks import History
np.set_printoptions(suppress=True)

In [58]:
# Static letter variable
letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
sequence_length = 10

# Generates a random sequence of letters
def generate_random_sequence_of_letters(length=sequence_length):
    sequence = ''
    for _ in range(length):
        sequence += letters[rand.randint(0,len(letters) - 1)]
    return sequence

# Shifts the letters in a sequence by the specified amount
def shift_sequence(sequence,shift_size=rand.randint(0,len(letters) - 2)):
    ret_sequence = ''
    for letter in sequence:
        index = letters.index(letter)
        index += shift_size
        while index > len(letters) - 1:
            index -= len(letters)
        ret_sequence += letters[index]
    return ret_sequence
            
# Converts a sequence of letters into an index vector
def convert_to_index_vector(sequence):
    ret_vector = []
    for letter in sequence:
        ret_vector.append(float(letters.index(letter)))
    return ret_vector

# Generates the scrambled inputs for a given sequence
def generate_scrambled_inputs(sequence):
    input_vector = convert_to_index_vector(shift_sequence(sequence,shift_size=12))
    return input_vector

# Converts a sequence to a hot vector sequence
def convert_sequence_to_hot_vectors(sequence):
    vectors = []
    for num in sequence:
        vector = np.zeros(len(letters))
        vector[int(num)] = 1
        vectors.append(vector)
    return np.array(vectors)

# Converts a hot vector sequence to a letter sequence
def convert_hot_vectors_to_sequence(vectors):
    indices = [np.argmax(vector) for vector in vectors]
    return convert_index_vector(indices)

# Generates training data
def generate_training_data(size=10000):
    inputs = []
    outputs = []
    for _ in range(size):
        sequence = generate_random_sequence_of_letters()
        outputs.append(convert_sequence_to_hot_vectors(convert_to_index_vector(sequence)))
        inputs.append(convert_sequence_to_hot_vectors(generate_scrambled_inputs(sequence)))
    return [np.array(inputs),np.array(outputs)]

# Converts an index vector to a string
def convert_index_vector(index_vector):
    sequence = ''
    for index in index_vector:
        if round(index) < len(letters):
            sequence += letters[int(round(index))]
        else:
            sequence += '*'
    return sequence

In [67]:
# Create model
model = Sequential()
model.add(LSTM(sequence_length,input_shape=(sequence_length,len(letters)),return_sequences=True))
model.add(Dense(len(letters),activation='softmax'))
model.compile(optimizer="adam",loss="binary_crossentropy",metrics=['acc'])

In [59]:
# Actual generation of training_data
training_data = generate_training_data(10000)
training_inputs = training_data[0]
training_outputs = training_data[1]

In [66]:
# Training of the model
history = model.fit(training_inputs,training_outputs,batch_size=256,epochs=100,validation_split=0.2)

Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
1280/8000 [===>..........................] - ETA: 1s - loss: 0.0031 - acc: 1.0000

KeyboardInterrupt: 

In [61]:
# Generate test data
test_data = generate_training_data(100)
test_inputs = test_data[0]
test_outputs = test_data[1]

In [62]:
# Print predictions vs actual
outputs = model.predict(test_inputs)
for i in range(len(test_outputs)):
    print(convert_hot_vectors_to_sequence(outputs[i]),convert_hot_vectors_to_sequence(test_outputs[i]))
    #print(convert_index_vector(outputs[i]),convert_index_vector(test_outputs[i]))

IWXMZLWRXI IWXMZLWRXI
CYDHSNMNKE CYDHSNMNKE
NZSNBZFGVY NZSNBZFGVY
OGMGXIMIVJ OGMGXIMIVJ
ZLCLNFMFTZ ZLCLNFMFTZ
WJJVZDGBUV WJJVZDGBUV
VPLYAHHDEM VPLYAHHDEM
LJBJJBCVPS LJBJJBCVPS
MUDNODCHKL MUDNODCHKL
CCCYYASFVB CCCYYASFVB
JBZMFLRMDP JBZMFLRMDP
JXCHOKEUVX JXCHOKEUVX
QKPJXDHQOG QKPJXDHQOG
UOCFOIKPAE UOCFOIKPAE
BYKQZYHTSX BYKQZYHTSX
SPGDRYDSVD SPGDRYDSVD
JALTMULKFL JALTMULKFL
QYQUHZQMUI QYQUHZQMUI
PYCMNKJTZU PYCMNKJTZU
BZVAZTOSXF BZVAZTOSXF
EKTEVVVSSD EKTEVVVSSD
BRWPXTVDZY BRWPXTVDZY
VHPAYZYIWG VHPAYZYIWG
DMNENSWQWI DMNENSWQWI
ITBEQUYLEP ITBEQUYLEP
LDXDUJWLMT LDXDUJWLMT
RNBLTHEKDH RNBLTHEKDH
DACZSJUZSK DACZSJUZSK
IZAIFJYYEF IZAIFJYYEF
PCASKXFXBP PCASKXFXBP
QWXDSEQFCY QWXDSEQFCY
QNVKCUOKQA QNVKCUOKQA
NNEXATPOXL NNEXATPOXL
DWVKSWXWEA DWVKSWXWEA
WILOPAORRK WILOPAORRK
DJHOPWGVRK DJHOPWGVRK
RZRRSCSEIA RZRRSCSEIA
BQJICCZLHD BQJICCZLHD
MBNJNBRUQK MBNJNBRUQK
ZKKJSLIIBG ZKKJSLIIBG
KDCGVCOMLC KDCGVCOMLC
MVVWADIZQL MVVWADIZQL
ASNVZSYOXG ASNVZSYOXG
QPRSWNAPRZ QPRSWNAPRZ
UQZGWUAWWK UQZGWUAWWK
ECNAFJFJBO