In [4]:
%load_ext autoreload
%autoreload 2

import os
import sys


from keras.models import Model
from keras.layers import Input, LSTM, Dense, GRU
import numpy as np
import pandas as pd


sys.path.append('../src')

import enigma_challenge as ec

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Generate dataset

In [5]:
power = 15
n_samples = 1<<power

save_file = f"../data/raw/enigma_data_{power}.csv"
# ENIGMA_DATA = ec.generate_data(n_samples=n_samples, save_file=save_file)

ENIGMA_OBJ = ec.EnigmaDataset(n_samples=n_samples, seq_len=42, save_file=save_file)
ENIGMA_OBJ.dataset.head()

Unnamed: 0,PLAIN,CIPHER
0,GOLEARNABLEONE,HMSKLWYPLPTEVO
1,CARRYOTHERSASSUMEMEET,VEXBNYVOCGFGFXVBUCCIO
2,SELLNONEWITHACCEPT,UASHYYYVMHEMDWOGKH
3,GREENAIRBLACKLINEPUSHPAY,HGVKYTXDLPKUWVFYUVLRAHJW
4,PASSSEASONRESPONDMAYABLELESS,OELWWZOWSSUOFMCYHCDNHADNKXYY


# Test/train split (with caching)

In [6]:
n_test_samples = 16384
sent_partition_size = 7

ENIGMA_OBJ.test_train_split(n_test_samples=n_test_samples, 
                            sent_partition_size=sent_partition_size)

In [7]:
ENIGMA_OBJ.train_data_partitioned.head()

Unnamed: 0,ID,PLAIN,CIPHER
0,1,CARRYOT,VEXBNYV
1,1,HERSASS,OCGFGFX
2,1,UMEMEET,VBUCCIO
3,2,SELLNON,UASHYYY
4,2,EWITHAC,VMHEMDW


In [8]:
ENIGMA_OBJ.test_data_partitioned.head()

Unnamed: 0,ID,PLAIN,CIPHER
0,1588,FINALLY,JFPIAUN
1,1588,SUMMERA,WXYCOMH
2,1588,PPROACH,BZJKDVA
3,1588,TENHOTE,XOENYOH
4,1588,LSHAKE,KIDCWL


In [9]:
print(f"# of training examples: {ENIGMA_OBJ.train_data.shape[0]}")
print(f"# of test examples: {ENIGMA_OBJ.test_data.shape[0]}")

# of training examples: 16384
# of test examples: 16384


# Encode the dataset

In [62]:
ENIGMA_ENCODED = ec.EncodedDataset(unencoded_dataset=ENIGMA_OBJ)

In [63]:
print(ENIGMA_ENCODED.plain_train.sentences_processed[0:5])
print(ENIGMA_ENCODED.cipher_train.sentences_processed[0:5])
print(ENIGMA_ENCODED.plain_test.sentences_processed[0:5])
print(ENIGMA_ENCODED.cipher_test.sentences_processed[0:5])

['\tCARRYOT\n', '\tHERSASS\n', '\tUMEMEET\n', '\tSELLNON\n', '\tEWITHAC\n']
['VEXBNYV', 'OCGFGFX', 'VBUCCIO', 'UASHYYY', 'VMHEMDW']
['\tFINALLY\n', '\tSUMMERA\n', '\tPPROACH\n', '\tTENHOTE\n', '\tLSHAKE\n']
['JFPIAUN', 'WXYCOMH', 'BZJKDVA', 'XOENYOH', 'KIDCWL']


In [64]:
print(ENIGMA_ENCODED.plain_train.alphabet)
print(ENIGMA_ENCODED.cipher_train.alphabet)
print(ENIGMA_ENCODED.plain_test.alphabet)
print(ENIGMA_ENCODED.cipher_test.alphabet)

['\t', '\n', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
['\t', '\n', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [67]:
print(ENIGMA_ENCODED.plain_train.target_vector)

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]


# Create the Ultra Code Breaker model

In [128]:
ultra = ec.UltraCodeBreaker(plain_text=ENIGMA_ENCODED.plain_train, 
                            cipher_text=ENIGMA_ENCODED.cipher_train)

# Train the model

In [129]:
ultra.train(epochs=1, n_nodes=256)
ultra.model.summary()

Train on 46674 samples, validate on 20004 samples
Epoch 1/1
Model: "model_26"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_57 (InputLayer)           (None, None, 26)     0                                            
__________________________________________________________________________________________________
input_58 (InputLayer)           (None, None, 28)     0                                            
__________________________________________________________________________________________________
lstm_57 (LSTM)                  [(None, 256), (None, 289792      input_57[0][0]                   
__________________________________________________________________________________________________
lstm_58 (LSTM)                  [(None, None, 256),  291840      input_58[0][0]                   
                               

# Create the test model

In [130]:
ultra.create_test_model(n_nodes=256)

AttributeError: 'UltraCodeBreaker' object has no attribute '_encoder_input'

# Create the test model 

In [138]:
a

# Encoder inference model
encoder_model_inf = Model(encoder_input, encoder_states)

# Decoder inference model
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input, 
                                                 initial_state=decoder_input_states)

decoder_states = [decoder_h , decoder_c]

decoder_out = decoder_dense(decoder_out)

decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                          outputs=[decoder_out] + decoder_states )

# Function to decode the sequence

In [139]:
def decode_seq(inp_seq):
    
    # Initial states value is coming from the encoder 
    states_val = encoder_model_inf.predict(inp_seq)
    
    target_seq = np.zeros((1, 1, len(fra_chars)))
    target_seq[0, 0, fra_char_to_index_dict['\t']] = 1
    
    translated_sent = ''
    stop_condition = False
    
    while not stop_condition:
        
        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
        
        max_val_index = np.argmax(decoder_out[0,-1,:])
        sampled_fra_char = fra_index_to_char_dict[max_val_index]
        translated_sent += sampled_fra_char
        
        if ( (sampled_fra_char == '\n') or (len(translated_sent) > max_len_fra_sent)) :
            stop_condition = True
        
        target_seq = np.zeros((1, 1, len(fra_chars)))
        target_seq[0, 0, max_val_index] = 1
        
        states_val = [decoder_h, decoder_c]
        
    return translated_sent



# Divide and conquer of the test data (DUPLICATED CODE!!!!!)

In [140]:
def divide_chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

df = test.copy()
df

enigma_test_partitions = {
    'ID': [], 
    'PLAIN': [], 
    'CIPHER': []
}

n = 10

for index, row in df.iterrows():    
    plain = list(str(row['PLAIN']))
    cipher = list(str(row['CIPHER']))
    
    plain = list(divide_chunks(lst=plain, n=n))
    cipher = list(divide_chunks(lst=cipher, n=n))
    
    for i in range(len(plain)):
        plain_now = "".join(plain[i])
        cipher_now = "".join(cipher[i])
        
        enigma_test_partitions['ID'].append(index)
        enigma_test_partitions['PLAIN'].append(plain_now)
        enigma_test_partitions['CIPHER'].append(cipher_now)
    
enigma_test_partitions = pd.DataFrame(enigma_test_partitions)
enigma_test_partitions.head(10)
len(enigma_test_partitions)

49645

# Create one-hot vectors for the test data (DUPLICATED CODE!!!!!)

In [141]:
nb_samples_test = 49645
eng_sent_test = []
fra_sent_test = []
eng_chars_test = set()
fra_chars_test = set()
# nb_samples = enigma_data.shape[0]

# Process english and french sentences
for index, row in enigma_test_partitions.iterrows():
#     eng_line = str(lines[line]).split('\t')[0]
    eng_line = str(row['CIPHER'])
    
    # Append '\t' for start of the sentence and '\n' to signify end of the sentence
#     fra_line = '\t' + str(lines[line]).split('\t')[1] + '\n'
    fra_line = f"\t{str(row['PLAIN'])}\n"
    
    eng_sent_test.append(eng_line)
    fra_sent_test.append(fra_line)
    
    for ch in eng_line:
        if (ch not in eng_chars_test):
            eng_chars_test.add(ch)
            
    for ch in fra_line:
        if (ch not in fra_chars_test):
            fra_chars_test.add(ch)
fra_chars_test = sorted(list(fra_chars_test))
eng_chars_test = sorted(list(eng_chars_test))

# dictionary to index each english character - key is index and value is english character
eng_index_to_char_dict_test = {}

# dictionary to get english character given its index - key is english character and value is index
eng_char_to_index_dict_test = {}

for k, v in enumerate(eng_chars_test):
    eng_index_to_char_dict_test[k] = v
    eng_char_to_index_dict_test[v] = k

max_len_eng_sent_test = max([len(line) for line in eng_sent_test])
max_len_fra_sent_test = max([len(line) for line in fra_sent_test])



tokenized_eng_sentences_test = np.zeros(shape = (nb_samples_test,max_len_eng_sent_test,len(eng_chars_test)), dtype='float32')
tokenized_fra_sentences_test = np.zeros(shape = (nb_samples_test,max_len_fra_sent_test,len(fra_chars_test)), dtype='float32')
target_data_test = np.zeros((nb_samples_test, max_len_fra_sent_test, len(fra_chars_test)),dtype='float32')


# Vectorize the english and french sentences

for i in range(nb_samples_test):
    for k,ch in enumerate(eng_sent_test[i]):
        tokenized_eng_sentences_test[i,k,eng_char_to_index_dict_test[ch]] = 1
        
#     for k,ch in enumerate(fra_sent_test[i]):
#         tokenized_fra_sentences_test[i,k,fra_char_to_index_dict_test[ch]] = 1

#         # decoder_target_data will be ahead by one timestep and will not include the start character.
#         if k > 0:
#             target_data_test[i,k-1,fra_char_to_index_dict_test[ch]] = 1

In [22]:


for seq_index in range(20):
    inp_seq = tokenized_eng_sentences_test[seq_index:seq_index+1]
    translated_sent = decode_seq(inp_seq)
    print('-' * 100)
    print('Input sentence:', eng_sent_test[seq_index])
    print('Decoded sentence:', translated_sent.strip())
    print('decode Org sentence:', fra_sent_test[seq_index])



----------------------------------------------------------------------------------------------------
Input sentence: JEJLCWSG
Decoded sentence: FATHERDI
decode Org sentence: 	FATHERDI

----------------------------------------------------------------------------------------------------
Input sentence: OFBNBZBD
Decoded sentence: SCOVERPR
decode Org sentence: 	SCOVERPR

----------------------------------------------------------------------------------------------------
Input sentence: FMCONUCB
Decoded sentence: ICEFIRHT
decode Org sentence: 	ICEFIRST

----------------------------------------------------------------------------------------------------
Input sentence: PJDKYWTT
Decoded sentence: FACTREAD
decode Org sentence: 	FACTREAD

----------------------------------------------------------------------------------------------------
Input sentence: IASHKLVJ
Decoded sentence: WELLFUTU
decode Org sentence: 	WELLFUTU

---------------------------------------------------------------------------

# Predict for the test set

In [142]:
# test.head()
nb_samples_test = 49645


predicted_cipher = []
# actual_cipher = []

# cipher = list(test['CIPHER'])

for seq_index in range(nb_samples_test):
    inp_seq = tokenized_eng_sentences_test[seq_index:seq_index+1]
    translated_sent = decode_seq(inp_seq)
    predicted_cipher.append(translated_sent)
    

print(enigma_test_partitions.head())

predicted_cipher[0:10]


# Save the divide-and-conquery predictions

In [148]:
enigma_predicted = enigma_test_partitions.copy()
enigma_predicted['DECRYPTED'] = [x.strip() for x in predicted_cipher]
enigma_predicted.head(20)


enigma_predicted.to_csv('./enigma_predicted_n10.csv')

ValueError: Length of values does not match length of index

# Concatenate the predictions (and save)

In [145]:
# foo = enigma_predicted.groupby(['ID'], as_index=False, sort=False) \
#     .agg(
#     {
#         'PLAIN': ''.join,
#         'CIPHER': ''.join,
#         'DECPRYPTED': ''.join
#     })

# foo = enigma_predicted[['ID', 'DECRYPTED']] \
#     .groupby(['ID'], as_index=False, sort=False) \
#     .agg(''.join)

foo = enigma_predicted.groupby(['ID'], as_index=False, sort=False) \
    .agg(''.join)

foo.head()

foo.to_csv('./joined_predicted_cipher_n10.csv')

# Evaluate the test predictions

In [146]:
 
def str_score(str_a, str_b) :
    if len(str_a) != len(str_b):
        return 0

    n_correct = 0

    for a, b in zip(str_a, str_b):
        n_correct += int(a == b)
    # print(f" n_correct {n_correct}")
    # print(f" len  {n_correct}")

    return n_correct / len(str_a)

    
def score(predicted_plain, correct_plain):
    correct = 0

    for p, c in zip(predicted_plain, correct_plain):
#         print(p,c)
#         exit()
        if str_score(p, c) > 0.8:
            correct += 1
    print(f" correct {correct}")
    print(f" len correct_plain {len(correct_plain)}")

    return correct / len(correct_plain)


# print(predicted_cipher)
# print(actual_cipher)

predicted_cipher = list(foo['DECRYPTED'])
actual_cipher = list(foo['PLAIN'])
print(score(predicted_cipher, actual_cipher ))




 correct 12145
 len correct_plain 16384
0.74127197265625
