In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.layers import TimeDistributed, Dense, Embedding, LSTM, Reshape, GRU
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, LearningRateScheduler
from sklearn.model_selection import train_test_split
import json
import math
import errno
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [61]:
LEARNING_RATE = 0.001
# Scheduling a learning-rate to produce different effects for gradient of loss wrt to the weights
# Different Learning Rates will affect the model differently by updating different % of weights of the model.
START_CHAR = "\t"
END_CHAR = "\n"
GRU_UNITS = 300
DENSE_UNITS = 100
VOCAB_SIZE = 70
PROB_THRESHOLD = 1e-9
VERBOSITY = 1
EPOCHS = 15
MAX_LENGTH = 32
EMBEDDING_DIMENSION = 16
BATCH_SIZE = 32
SHUFFLE_BUFFER_SIZE = 5000
EMBEDDING_METADATA = 'metadata.tsv'
PREFETCH_BATCHES = 10
METRICS = [tf.keras.metrics.SparseCategoricalCrossentropy()]

In [3]:
# Preprocessing the inputs and the outputs for the model --- 
# For Example password 'passwd'
# Adding a \t at the Start for the input - \tpasswd 
# Adding a \n at the End for the model output - passwd\n
# Here \t predicts p, p predicts a and so on..
def loadPreprocessInputOutput(FILE_NAME, START_CHAR, END_CHAR):
    try:
        assert os.path.isfile(FILE_NAME)
        assert isinstance(START_CHAR, str)
        assert isinstance(END_CHAR, str)
    except:
        raise Exception("Incorrect Inputs. Try again.")
        return
    inputPasswords = []
    outputPasswords = []
    listPasswords = []
    with open(FILE_NAME, "r") as pass_file:
        while(True):
            single_pass = pass_file.readline().rstrip("\n")
            if(single_pass == ""):
                break
            else:
                inputPasswords.append(START_CHAR + single_pass)
                outputPasswords.append(single_pass + END_CHAR)
                listPasswords.append(START_CHAR + single_pass + END_CHAR)
        return(inputPasswords, outputPasswords, listPasswords)

In [11]:
inputPasswords, outPasswords, listPasswords = loadPreprocessInputOutput("..\\Embedding\\Data\\ascii_rockyou_less_than_thirty_two_cleaned.txt", START_CHAR, END_CHAR)

In [5]:
# Sanity Check
print(inputPasswords[5:10])
print(len(inputPasswords))
print(outPasswords[5:10])
print(len(outPasswords))
print(listPasswords[5:10])
print(len(listPasswords))

['\tprincess', '\t1234567', '\trockyou', '\t12345678', '\tabc123']
14105697
['princess\n', '1234567\n', 'rockyou\n', '12345678\n', 'abc123\n']
14105697
['\tprincess\n', '\t1234567\n', '\trockyou\n', '\t12345678\n', '\tabc123\n']
14105697


In [4]:
# Save the passwords:
# Do not open file - will not be shown properly
def writePreprocessedPasswordFromList(passwordList, typeList):
    passwordFileName = typeList + "_preprocessed.txt"
    with open(passwordFileName, "w") as pass_file:
        for password in passwordList:
            if(typeList.lower() == "input"):
                pass_file.write(password+"\n")
            else:
                pass_file.write(password)
    print(f"{typeList} passwords written completely to : {passwordFileName}")

In [None]:
# Write the passwords
writePreprocessedPasswordFromList(inputPasswords, "input")
writePreprocessedPasswordFromList(outPasswords, "output")
writePreprocessedPasswordFromList(listPasswords, "list")

In [5]:
# Sanity Check
# Output will not be proper.
!head -n 5 input_preprocessed.txt
!head -n 5 output_preprocessed.txt
!head -n 5 list_preprocessed.txt
!wc -l input_preprocessed.txt
!wc -l output_preprocessed.txt
!wc -l list_preprocessed.txt

	123456
	12345
	123456789
	password
	iloveyou
123456
12345
123456789
password
iloveyou
	123456
	12345
	123456789
	password
	iloveyou
14105697 input_preprocessed.txt
14105697 output_preprocessed.txt
14105697 list_preprocessed.txt


In [5]:
# Specifying the Tokenizer -- leaving num_words as blank to include as many 
# unique characters as possible.
# Fit the tokenizer on the text and then save the tokenizer.
passwordTokenizer = Tokenizer(filters = "", lower = True, char_level = True)

In [6]:
# Utility Function to Save the Tokenizer Configuration
def saveTokenizer(TOKENIZER, OUTPUT_PATH):
    tokenizerConfigString = TOKENIZER.to_json()
    with open(OUTPUT_PATH+".json", "w") as op_file:
        op_file.write(tokenizerConfigString)

In [7]:
# Utility Function to Load the Tokenizer Configuration
def loadTokenizer(TOKENIZER_FILE_PATH):
    _, file_extension = os.path.splitext(TOKENIZER_FILE_PATH)
    if(file_extension != ".json"):
        raise Exception("Incorrect File.")
        return
    else:
        with open(TOKENIZER_FILE_PATH, "r") as tokenizer_cfg_file:
            tokenizer_config = tokenizer_cfg_file.read()
            tokenizer_cfg = json.loads(tokenizer_config)
            passwordTokenizer = tf.keras.preprocessing.text.tokenizer_from_json(json.dumps(tokenizer_cfg))
            return passwordTokenizer

In [15]:
# Fitting the Tokenizer on data:
passwordTokenizer.fit_on_texts(listPasswords)
passwordTokenizer.get_config()

{'num_words': None,
 'filters': '',
 'lower': True,
 'split': ' ',
 'char_level': True,
 'oov_token': None,
 'document_count': 14105697,
 'word_counts': '{"\\t": 14105697, "1": 6729506, "2": 5234401, "3": 3765169, "4": 3389487, "5": 3352338, "6": 3116090, "\\n": 14105697, "7": 3098762, "8": 3565308, "9": 3853241, "p": 1619704, "a": 8828625, "s": 4154066, "w": 799568, "o": 5173138, "r": 4576620, "d": 2484237, "i": 5553157, "l": 4460473, "v": 1050793, "e": 7203479, "y": 2373398, "u": 2307207, "n": 4827931, "c": 2608350, "k": 2012166, "b": 2110821, "g": 1717216, "m": 3205286, "j": 1237461, "h": 2335137, "q": 178503, "t": 3425154, "0": 5735322, "f": 981496, "z": 763424, "x": 479316, "!": 142923, ";": 12258, "-": 133010, "*": 123842, ".": 248717, "?": 18301, ",": 29750, "/": 48190, "#": 48873, "@": 107880, "$": 36029, "%": 10254, "^": 6394, "&": 26359, "+": 26989, "\'": 15335, "[": 7682, "]": 10802, "<": 9561, "_": 193006, ">": 2458, "=": 18365, "\\\\": 25832, "\\"": 3637, ":": 6858, "(": 1

In [16]:
# Save Tokenizer -- for later use and load it to avoid re-fitting
saveTokenizer(passwordTokenizer, "prototypeTokenizer")

In [8]:
passwordTokenizer = loadTokenizer("prototypeTokenizer.json")
# Sanity Check
passwordTokenizer.get_config()

{'num_words': None,
 'filters': '',
 'lower': True,
 'split': ' ',
 'char_level': True,
 'oov_token': None,
 'document_count': 14105697,
 'word_counts': '{"\\t": 14105697, "1": 6729506, "2": 5234401, "3": 3765169, "4": 3389487, "5": 3352338, "6": 3116090, "\\n": 14105697, "7": 3098762, "8": 3565308, "9": 3853241, "p": 1619704, "a": 8828625, "s": 4154066, "w": 799568, "o": 5173138, "r": 4576620, "d": 2484237, "i": 5553157, "l": 4460473, "v": 1050793, "e": 7203479, "y": 2373398, "u": 2307207, "n": 4827931, "c": 2608350, "k": 2012166, "b": 2110821, "g": 1717216, "m": 3205286, "j": 1237461, "h": 2335137, "q": 178503, "t": 3425154, "0": 5735322, "f": 981496, "z": 763424, "x": 479316, "!": 142923, ";": 12258, "-": 133010, "*": 123842, ".": 248717, "?": 18301, ",": 29750, "/": 48190, "#": 48873, "@": 107880, "$": 36029, "%": 10254, "^": 6394, "&": 26359, "+": 26989, "\'": 15335, "[": 7682, "]": 10802, "<": 9561, "_": 193006, ">": 2458, "=": 18365, "\\\\": 25832, "\\"": 3637, ":": 6858, "(": 1

In [9]:
def getLength(FILE_PATH, break_num = 0):
    count = 0
    with open(FILE_PATH, "r") as f:
        for count, _ in enumerate(f):
            pass
    count = count + 1
    if(break_num == 0):
        return count
    else:
        return break_num

In [10]:
# Function to preprocess and get validation data
def getValidationData(VALID_X_FILE_PATH, VALID_Y_FILE_PATH, VOCAB_SIZE, TOKENIZER, MAX_LENGTH, VALIDATION_BATCH_SIZE, TRAIN_ALL = None):
    if(TRAIN_ALL is not None):
        total_passwords = getLength(VALID_X_FILE_PATH, TRAIN_ALL)
    else:
        total_passwords = getLength(VALID_X_FILE_PATH)
    total_batches = math.floor(total_passwords / VALIDATION_BATCH_SIZE)
    total_passwords_to_read = total_batches * VALIDATION_BATCH_SIZE
    valid_passwords = []
    count = 0
    valid_y = []
    flag = True
    with open(VALID_X_FILE_PATH, "r") as valid_file:
        for _, password in enumerate(valid_file):
            if(_ < total_passwords_to_read):
                valid_passwords.append(password.rstrip("\n"))
    with open(VALID_Y_FILE_PATH, "r") as valid_true_file:
        for _, true_password in enumerate(valid_true_file):
            if(_ < total_passwords_to_read):
                valid_y.append(true_password)
    valid_encoded_passwords = TOKENIZER.texts_to_sequences(valid_passwords)
    valid_y_encoded_passwords = TOKENIZER.texts_to_sequences(valid_y)
    valid_padded_passwords = pad_sequences(valid_encoded_passwords, padding = "post", maxlen = (MAX_LENGTH + 1))
    valid_y_padded_passwords = pad_sequences(valid_y_encoded_passwords, padding = "post", maxlen = (MAX_LENGTH + 1))
    print(len(valid_padded_passwords))
    print(len(valid_y_padded_passwords))
    for x_valid_password, y_valid_password in zip(valid_padded_passwords, valid_y_padded_passwords):
        count += 1
        if(count % 500 == 0):
            # Give progress feedback
            print(f"Total passwords processed {count}")
        if(flag):
            final_x_valid = np.array(x_valid_password).reshape(1, (MAX_LENGTH + 1))
            temp_y_valid = np.array(y_valid_password).reshape(1, (MAX_LENGTH + 1))
            final_y_valid = to_categorical(temp_y_valid, num_classes = (VOCAB_SIZE + 1))
            flag = False
        else:
            final_x_valid = np.concatenate((final_x_valid, np.array(x_valid_password).reshape(1, (MAX_LENGTH + 1))), axis = 0)
            temp_y_valid = np.array(y_valid_password).reshape(1, (MAX_LENGTH + 1))                                 
            final_y_valid = np.concatenate((final_y_valid, to_categorical(temp_y_valid, num_classes = (VOCAB_SIZE + 1))), axis = 0)
    return(final_x_valid, final_y_valid) 

In [11]:
# Create an input pipeline for feeding encoded passwords -- 
# Pass train_passwords as None if you want to train on entire set of passwords mentioned in password_file
class gruNetworkInputSequence(Sequence):
    def __init__(self, train_passwords, password_file_x, password_file_y, batch_size, tokenizer, max_length, vocab_size):
        self.batch_size = batch_size
        self.train_passwords = train_passwords
        self.password_file = password_file_x
        self.password_file_y = password_file_y
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.vocab_size = vocab_size
    
    def __len__(self):
        with open(self.password_file, "r") as pass_file:
            for count, _ in enumerate(pass_file):
                pass
        total_passwords = count + 1
        if(self.train_passwords is not None):
            if(self.train_passwords < total_passwords):
                return math.floor(self.train_passwords / self.batch_size)
            else:
                return math.floor(total_passwords / self.batch_size)
        else:
            return math.floor(total_passwords / self.batch_size)
        
    def __getitem__(self, index):
        # This could be slow, will try to improve speed later by saving encoded passwords.
        # batch_enc_padded_passwords = None
        # batch_y_true = None
        flag = True
        batch_enc_padded_passwords = []
        batch_y_true = []
        # temp_x = []
        # temp_y = []
        batch_passwords = []
        batch_y_passwords = []
        batch_password_index = list(range((index) * self.batch_size, ((index + 1) * self.batch_size)))
        with open(self.password_file, "r") as pass_file:
            for count, password in enumerate(pass_file):
                if(count in batch_password_index):
                    batch_passwords.append(password.rstrip("\n"))
                else:
                    continue
        with open(self.password_file_y, "r") as pass_file_y:
            for count_y, password_y in enumerate(pass_file_y):
                if(count_y in batch_password_index):
                    batch_y_passwords.append(password_y)
                else:
                    continue
        #print(f"{batch_passwords}\n\n\n{batch_y_passwords}")
        encoded_passwords = self.tokenizer.texts_to_sequences(batch_passwords)
        padded_encoded_passwords = pad_sequences(encoded_passwords, padding = "post", maxlen = (self.max_length + 1))
        encoded_y_passwords = self.tokenizer.texts_to_sequences(batch_y_passwords)
        padded_encoded_y_passwords = pad_sequences(encoded_y_passwords, padding = "post", maxlen = (self.max_length + 1))
        for encoded_password, encoded_y_password in zip(padded_encoded_passwords, padded_encoded_y_passwords):
            reshaped_example = np.array(encoded_password).reshape(1, (self.max_length + 1))
            reshaped_y_example = np.array(encoded_y_password).reshape(1, (self.max_length + 1))
            if(flag):
                batch_enc_padded_passwords = reshaped_example
                batch_y_true = to_categorical(y = reshaped_y_example, num_classes = (self.vocab_size + 1))
                flag = False
            else:
                batch_enc_padded_passwords = np.concatenate((batch_enc_padded_passwords, reshaped_example), axis = 0)
                batch_y_true = np.concatenate((batch_y_true, to_categorical(y = reshaped_y_example,  num_classes = (self.vocab_size + 1))), axis = 0)
        return (np.array(batch_enc_padded_passwords).astype(np.float32), np.array(batch_y_true).astype(np.float32))

In [12]:
def createTFDataInputPipeline(X_INPUT_FILE_PATH, Y_INPUT_FILE_PATH, TOKENIZER, MAX_LENGTH, BUFFER_SIZE, PREFETCH_BATCHES, BATCH_SIZE):
    x_inputs = []
    y_inputs = []
    with open(X_INPUT_FILE_PATH, "r") as x_pass_file:
        with open(Y_INPUT_FILE_PATH, "r") as y_pass_file:
            for count, x_y_pass in enumerate(zip(x_pass_file, y_pass_file)):
                x_pass = x_y_pass[0].rstrip("\n")
                y_pass = x_y_pass[1]
                x_inputs.append(x_pass)
                y_inputs.append(y_pass)
    
    # All passwords read in memory:
    x_tokenized = TOKENIZER.texts_to_sequences(x_inputs)
    y_tokenized = TOKENIZER.texts_to_sequences(y_inputs)
    
    x_padded = pad_sequences(x_tokenized, padding = "post", maxlen = (MAX_LENGTH + 1))
    y_padded = pad_sequences(y_tokenized, padding = "post", maxlen = (MAX_LENGTH + 1))
    
    x_np_tokenized = np.array(x_padded)
    y_np_tokenized = np.array(y_padded)
    
    x_dataset = tf.data.Dataset.from_tensor_slices(x_np_tokenized)
    y_dataset = tf.data.Dataset.from_tensor_slices(y_np_tokenized)
    
    fit_dataset = tf.data.Dataset.zip((x_dataset, y_dataset))
    fit_dataset = fit_dataset.shuffle(buffer_size = BUFFER_SIZE)
    fit_dataset = fit_dataset.cache()
    fit_dataset = fit_dataset.batch(BATCH_SIZE)
    fit_dataset = fit_dataset.prefetch(PREFETCH_BATCHES)
    
    return fit_dataset

In [None]:
# 1. Preparing the Data to splitting into train and validation
# 2. Then prepare the train Data to split again into train and test
# 3. Don't train on test 
# 4. Modify hyperparameters on validate
# 5. Plot using TensorBoard for both train and validation for each epoch 
# 6. Necessary to train if for atleast one epoch, to plot the graphs
# Or we can choose to plot for each batch - but will be very
# Resource intensive and will slow us down considerably.
# 7. train_test_split is given the input and output arrays as X & Y
X_train_test_train, X_validation, Y_train_test_train, Y_validation = train_test_split(inputPasswords, outPasswords, test_size = 0.05, shuffle = True)
X_train, X_test, Y_train, Y_test = train_test_split(X_train_test_train, Y_train_test_train, test_size = 0.1, shuffle = True)

# Sanity Check
print(f"{repr(X_train_test_train[100])}\t{repr(Y_train_test_train[100])}")
print(f"{repr(X_validation[100])}\t{repr(Y_validation[100])}")
print(f"{repr(X_train[100])}\t{repr(Y_train[100])}")
print(f"{repr(X_test[100])}\t{repr(Y_test[100])}")

In [13]:
# Preparing to save models - dedicated methods for it
# We also save them after each epoch using checkpoint callback.
# So saving manually is optional.
def saveKerasModel(MODEL, OUTPUT_MODEL_PATH):
    # Saves the model to the disk, saves both the architecture and 
    # the configuration.
    try:
        assert isinstance(MODEL, Model)
        MODEL.save(OUTPUT_MODEL_PATH)
        print(f"[+] Model has been successfully saved to {OUTPUT_MODEL_PATH}")
    except:
        raise Exception("Model instance is incorrect. Failed!")
        return   

In [14]:
# Utility Function to load the model manually.
# Is Extremely Important!
def loadKerasModel(INPUT_MODEL_PATH):
    loaded_model = tf.keras.models.load_model(INPUT_MODEL_PATH)
    return loaded_model

In [16]:
!mkdir train
!mkdir test
!mkdir validation

mkdir: cannot create directory ‘train’: File exists
mkdir: cannot create directory ‘test’: File exists
mkdir: cannot create directory ‘validation’: File exists


In [15]:
# Utility Function to write the output to file -- 
def writeOutput(FILE_PATH, FILE_TYPE_TRAIN, PASS_LIST_X, PASS_LIST_Y):
    with open(os.path.join(FILE_PATH, FILE_TYPE_TRAIN + "_X" + ".txt"), "w") as x_file:
        for input_password in PASS_LIST_X:
            x_file.write(input_password + "\n")
    with open(os.path.join(FILE_PATH, FILE_TYPE_TRAIN + "_Y" + ".txt"), "w") as y_file:
        for out_password in PASS_LIST_Y:
            y_file.write(out_password)
    print("[+] Done!")

In [None]:
# Save the split data into their respective directories --
writeOutput(".\\train", "train", X_train, Y_train)
writeOutput(".\\test", "test", X_test, Y_test)
writeOutput(".\\validation", "validation", X_validation, Y_validation)

In [18]:
# Sanity Check 
!head -n 2 ./train/train_X.txt 
!head -n 2 ./train/train_Y.txt
!head -n 2 ./test/test_X.txt 
!head -n 2 ./test/test_Y.txt 
!head -n 2 ./validation/validation_X.txt 
!head -n 2 ./validation/validation_Y.txt 
!wc -l ./train/train_X.txt 
!wc -l ./train/train_Y.txt 
!wc -l ./test/test_X.txt
!wc -l ./test/test_Y.txt
!wc -l ./validation/validation_X.txt 
!wc -l ./validation/validation_Y.txt 

	ellatubar
	tupapiruloybofo
ellatubar
tupapiruloybofo
	c.u2033
	goldsun414
c.u2033
goldsun414
	242518
	1001338
242518
1001338
12060370 ./train/train_X.txt
12060370 ./train/train_Y.txt
1340042 ./test/test_X.txt
1340042 ./test/test_Y.txt
705285 ./validation/validation_X.txt
705285 ./validation/validation_Y.txt


In [28]:
# Preparing the file directory structure for callbacks:
!mkdir checkpoints
!mkdir tensorboard_log_dir

mkdir: cannot create directory ‘checkpoints’: File exists
mkdir: cannot create directory ‘tensorboard_log_dir’: File exists


In [16]:
# Creating Optimizer Instance for Training Model
adam_optimizer = Adam(learning_rate = LEARNING_RATE, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-07)

In [79]:
# Building teacher-forcing training model
# Must include shared architecture for following layers --
# 1. Embedding Layer
# 2. LSTM_1 
# 3. LSTM_2
# 4. LSTM_3
# 5. Dense_1
# 6. Dense_2 (Softmax - Outputs Probability Distribution over |VOCAB_SIZE + <PAD_TOKEN>|)

# The shared layers are defined below -- 
shared_embedding_layer = Embedding(input_dim = (VOCAB_SIZE + 1), output_dim = EMBEDDING_DIMENSION, mask_zero = True, name = "Embedding_Layer")
shared_lstm_1 = LSTM(LSTM_UNITS, return_sequences = True, return_state = True)
shared_lstm_2 = LSTM(LSTM_UNITS, return_sequences = True, return_state = True)
shared_lstm_3 = LSTM(LSTM_UNITS, return_sequences = True, return_state = True)
shared_dense_1 = TimeDistributed(Dense(DENSE_UNITS, activation = "relu"))
shared_dense_op = TimeDistributed(Dense((VOCAB_SIZE + 1), activation = "softmax"))

In [80]:
# 1. Defining the Teacher-Force Training Model:
# 2. Do not use initial_state call argument for the LSTM layers
# 3. Use the initial_state call arguemnt for 'all' the LSTM layers 
# in the Inference Model

train_input = Input(shape = ((MAX_LENGTH + 1),))
train_emb_op = shared_embedding_layer(train_input)
train_lstm_1_op, train_lstm_1_hidden, train_lstm_1_cell = shared_lstm_1(train_emb_op)
train_lstm_2_op, train_lstm_2_hidden, train_lstm_2_cell = shared_lstm_2(train_lstm_1_op)
train_lstm_3_op, train_lstm_3_hidden, train_lstm_3_cell = shared_lstm_3(train_lstm_2_op)
train_dense_1_op = shared_dense_1(train_lstm_3_op)
train_model_op = shared_dense_op(train_dense_1_op)

train_model = Model(inputs = train_input, outputs = train_model_op)

# Check the model summary 
print(train_model.summary())

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        [(None, 33)]              0         
_________________________________________________________________
Embedding_Layer (Embedding)  (None, 33, 16)            1136      
_________________________________________________________________
lstm_9 (LSTM)                [(None, 33, 300), (None,  380400    
_________________________________________________________________
lstm_10 (LSTM)               [(None, 33, 300), (None,  721200    
_________________________________________________________________
lstm_11 (LSTM)               [(None, 33, 300), (None,  721200    
_________________________________________________________________
time_distributed_6 (TimeDist (None, 33, 100)           30100     
_________________________________________________________________
time_distributed_7 (TimeDist (None, 33, 71)            7171

In [81]:
# Compiling the Model with specified callbacks and metrics:
train_model.compile(optimizer = adam_optimizer, loss = tf.keras.losses.SparseCategoricalCrossentropy(), metrics = METRICS)

In [82]:
# Defining the Inference Model
# Needs the initial_state call argument for all the LSTM layers

inference_input = Input(shape = (1,))
lstm_1_hidden = Input(shape = (LSTM_UNITS,))
lstm_1_cell = Input(shape = (LSTM_UNITS,))
lstm_2_hidden = Input(shape = (LSTM_UNITS,))
lstm_2_cell = Input(shape = (LSTM_UNITS,))
lstm_3_hidden = Input(shape = (LSTM_UNITS,))
lstm_3_cell = Input(shape = (LSTM_UNITS,))
inference_emb_op = shared_embedding_layer(inference_input)
inference_lstm_1_op, inference_lstm_1_hidden, inference_lstm_1_cell = shared_lstm_1(inference_emb_op, initial_state = [lstm_1_hidden, lstm_1_cell])
inference_lstm_2_op, inference_lstm_2_hidden, inference_lstm_2_cell = shared_lstm_2(inference_lstm_1_op, initial_state = [lstm_2_hidden, lstm_2_cell])
inference_lstm_3_op, inference_lstm_3_hidden, inference_lstm_3_cell = shared_lstm_3(inference_lstm_2_op, initial_state = [lstm_3_hidden, lstm_3_cell])
inference_dense_1_op = shared_dense_1(inference_lstm_3_op)
inference_model_op = shared_dense_op(inference_dense_1_op)

inputs_list = [inference_input, lstm_1_hidden, lstm_1_cell, lstm_2_hidden, lstm_2_cell, lstm_3_hidden, lstm_3_cell]
outputs_list = [inference_lstm_1_hidden, inference_lstm_1_cell, inference_lstm_2_hidden, inference_lstm_2_cell, inference_lstm_3_hidden, inference_lstm_3_cell]

inference_model = Model(inputs = inputs_list, outputs = outputs_list)

print(inference_model.summary())

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
Embedding_Layer (Embedding)     multiple             1136        input_17[0][0]                   
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_19 (InputLayer)           [(None, 300)]        0                                            
____________________________________________________________________________________________

In [83]:
inference_model.compile(optimizer = "adam", loss = tf.keras.losses.SparseCategoricalCrossentropy(), metrics = METRICS)

In [92]:
# Creating a metadata file for tensorboard embedding visualization
def createEmbeddingMetadata(TOKENIZER):
    tokenized_chars = [token for token, idx in TOKENIZER.word_index.items()]
    with open(EMBEDDING_METADATA, 'w') as emb_file:
        emb_file.write("<PAD>\n")
        for token in tokenized_chars:
            if (token == '\n'):
                emb_file.write("\\n\n")
                continue
            if(token == '\t'):
                emb_file.write("\\t\n")
                continue
            emb_file.write(token+"\n")

createEmbeddingMetadata(passwordTokenizer)

In [90]:
# Configuring all model callbacks
# 1. LearningRate Callback
# 2. ModelCheckpoint Callback
# 3. TensorBoard Callback
# 4. GradientClipping Callback
# Gradient Clipping is required because LSTMs/GRUs greatly suffer from both
# Exploding and Vanishing Gradients which presents as big problem and sets
# all values of delta-wt, delta-b to NaN causing numerical instability.
# This occurs because of Gradient-Descent through Time or as we call it
# Back-Propogation through time algorithm causing the gradients to be multiplied 
# with <1 or >1 weight values across all timesteps leaving the network extremely
# vulnerable to numerical instability.

# LearningRateScheduler - Using Exponential Learning Rate Decay
# First defining a 'schedule' for learning rate decay
def expLearningRateDecay(epoch):
   initial_lrate = 0.003
   k = 0.1
   lrate = initial_lrate * math.exp(-k * epoch)
   return lrate

# Scheduler Callback:
train_learning_rate_callback = LearningRateScheduler(schedule = expLearningRateDecay, verbose = 1)

# Checkpoint Callback:
train_checkpoint_callback = ModelCheckpoint(".\\checkpoints\\Checkpoint-{epoch:03d}", verbose = 1, save_weights_only = False, save_freq = "epoch")

# TensorBoard Callback:
train_tensorboard_callback = TensorBoard(log_dir = ".\\tensorboard_log_dir", histogram_freq = 1, write_graph = True, write_images = True, update_freq = 'batch', embeddings_freq = 1)



# Custom Callback
# Tf stable version 2.x currently has a bug
# with saved_model format of saving models.
# Loading of saved_model formatted serialized
# model requires tf-nightly-gpu version which 
# contains latest bug fixes.
# We require a custom callback that saves the 
# entire model in h5 format which can be loaded
# without any problem.

# We will also implement to save inference_model 
# at another directory everytime. 

# This custom callback will work with the newer
# saved_model format, thus allowing flexibility.

class saveH5CustomCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epochs, logs = None):
        # Saving the train model in h5 format. This should always be defined.
        self.model.save(filepath = ".\\h5_train_checkpoints\\H5-Checkpoint-{epoch:03d}.h5".format(epoch = (epochs + 1)), include_optimizer = True, save_format = "h5")
        try:
            # Check if the inference_model is defined.
            # If defined save the model in h5 format
            global inference_model
            inference_model.save(filepath = ".\\h5_inference_checkpoints\\H5-Inference-Checkpoint-{epoch:03d}.h5".format(epoch = (epochs + 1)), include_optimizer = True, save_format = "h5")
        except:
            # If Inference Model not defined, log the message.
            with open("inference_log.txt", "a") as inf_err:
                inf_err.write("Inference Model not defined for epoch {epoch:03d}.\nSaving train model only. Build Inference Model Later.\n\n".format(epoch = (epochs + 1)))

        
# Creating an instance of custom callback.    
saveH5Callback = saveH5CustomCallback()

# Compiling all callback together..
callbacks = [train_learning_rate_callback, train_checkpoint_callback, train_tensorboard_callback, saveH5Callback]

# GradientClipping Callback

# GradientClipping has to be done manually by computing the gradient wrt 
# the loss of each trainable parameter and then clip it.
# After which the gradients have to be applied to the weights / biases / gates
# thus preventing exploding/vanishing gradient.

# This will only be done if the model experiences any sort of numerical 
# instability and throws NaN Exceptions since this process has capacity 
# to cripple model's speed.


In [23]:
# Load the validation data into volatile memory so we 
# don't have to re-perform all the calculation during each 
# validation test run.
# Fitting the model might be very resource intensive
# Due to data fetch from physical disk and prior pre-processing.


###################################################################

# WARNING:
# Sequence class objects as input to the models suffer from
# extensive deadlocks. Coupling this with model.fit arg 
# use_multiprocessing = True can failed memory allocation
# process during fitting. 
# For flexibility, the sequence objects have been implemented.
# However, their use is not recommended as they are unstable 
# and slow down the pre-fetching capabilities.
# tf.data.Dataset implementations are preferred and are defined 
# above. Please use those to implement input and output pipelines
# for both the training and validation sets. 

####################################################################

# DO NOT EXECUTE.
input_pipeline = gruNetworkInputSequence(10000, ".\\train\\train_X.txt", ".\\train\\train_Y.txt", BATCH_SIZE, passwordTokenizer, MAX_LENGTH, VOCAB_SIZE)
print("[+] Sequence Object Created!")
validation_data = getValidationData(".\\validation\\validation_X.txt", ".\\validation\\validation_Y.txt", VOCAB_SIZE, passwordTokenizer, MAX_LENGTH, BATCH_SIZE, 2000)
print("[+] Validation Data is read into volatile memory!")

[+] Sequence Object Created!
1984
1984
Total passwords processed 500
Total passwords processed 1000
Total passwords processed 1500
[+] Validation Data is read into volatile memory!


In [24]:
train_ip_pipeline = createTFDataInputPipeline(".\\train\\train_X.txt", ".\\train\\train_Y.txt", passwordTokenizer, MAX_LENGTH, SHUFFLE_BUFFER_SIZE, PREFETCH_BATCHES, BATCH_SIZE)
validation_ip_pipeline = createTFDataInputPipeline(".\\validation\\validation_X.txt", ".\\validation\\validation_Y.txt", passwordTokenizer, MAX_LENGTH, SHUFFLE_BUFFER_SIZE, PREFETCH_BATCHES, BATCH_SIZE)

In [32]:
train_model_history = train_model.fit(x = train_ip_pipeline, epochs = EPOCHS, callbacks = callbacks, verbose = VERBOSITY, validation_data = validation_ip_pipeline, validation_freq = 1)
# Callbacks and validation will be done every epoch.
# To change this, 
# change definitions for callbacks &,
# change the validation_freq 
# arg in model.fit

Train for 376887 steps, validate for 22041 steps

Epoch 00001: LearningRateScheduler reducing learning rate to 0.001.
Epoch 1/15
Epoch 00001: saving model to .\checkpoints\Checkpoint-001
INFO:tensorflow:Assets written to: .\checkpoints\Checkpoint-001\assets

Epoch 00002: LearningRateScheduler reducing learning rate to 0.0009048374180359595.
Epoch 2/15
Epoch 00002: saving model to .\checkpoints\Checkpoint-002
INFO:tensorflow:Assets written to: .\checkpoints\Checkpoint-002\assets

Epoch 00003: LearningRateScheduler reducing learning rate to 0.0008187307530779819.
Epoch 3/15
Epoch 00003: saving model to .\checkpoints\Checkpoint-003
INFO:tensorflow:Assets written to: .\checkpoints\Checkpoint-003\assets

Epoch 00004: LearningRateScheduler reducing learning rate to 0.0007408182206817179.
Epoch 4/15
Epoch 00004: saving model to .\checkpoints\Checkpoint-004
INFO:tensorflow:Assets written to: .\checkpoints\Checkpoint-004\assets

Epoch 00005: LearningRateScheduler reducing learning rate to 0.000

Epoch 00011: saving model to .\checkpoints\Checkpoint-011
INFO:tensorflow:Assets written to: .\checkpoints\Checkpoint-011\assets

Epoch 00012: LearningRateScheduler reducing learning rate to 0.00033287108369807955.
Epoch 12/15
Epoch 00012: saving model to .\checkpoints\Checkpoint-012
INFO:tensorflow:Assets written to: .\checkpoints\Checkpoint-012\assets

Epoch 00013: LearningRateScheduler reducing learning rate to 0.00030119421191220205.
Epoch 13/15
Epoch 00013: saving model to .\checkpoints\Checkpoint-013
INFO:tensorflow:Assets written to: .\checkpoints\Checkpoint-013\assets

Epoch 00014: LearningRateScheduler reducing learning rate to 0.0002725317930340126.
Epoch 14/15
Epoch 00014: saving model to .\checkpoints\Checkpoint-014
INFO:tensorflow:Assets written to: .\checkpoints\Checkpoint-014\assets

Epoch 00015: LearningRateScheduler reducing learning rate to 0.00024659696394160646.
Epoch 15/15
Epoch 00015: saving model to .\checkpoints\Checkpoint-015
INFO:tensorflow:Assets written to: 