# Imports 
This project is implemented using TensorFlow version 2.7.0 and NumPy version 1.19.5.

In [4]:
import sys 
import random 
import numpy as np 
import tensorflow as tf
from tensorflow.keras import layers, models  

# Report the library versions this run is using (TensorFlow first, then NumPy).
for _library in (tf, np):
    print(_library.__version__)

# Data Preprocessing 
Once we load our data we will create a vocabulary composed of the unique characters in the data. Note that these exploits include a number of special characters, and sometimes upper-case letters, and we have to keep them because they are part of the more diversified exploit content. We simply load the data and vectorize its content.
### Vectorization
In order to feed our data to the model we have to convert its string form into numeric tensors (numeric vectors). For this we can use the StringLookup layer from TensorFlow or do it manually using simple dictionary operations. To convert generated numeric IDs back into human-readable form we can use the StringLookup layer of TensorFlow with the invert parameter set to True.

```
# with TensorFlow
chars_to_indices = tf.keras.layers.StringLookup(
    vocabulary=list(vocabulary), mask_token=None)

indices_to_chars = tf.keras.layers.StringLookup(
    vocabulary=chars_to_indices.get_vocabulary(), invert=True, mask_token=None
)
```

# Character Mappings 
Mapping characters simply means giving a unique ID to each unique character in the data we just loaded. This is important later when we feed the data into our model: the model uses the IDs to identify each character that comes through the sequence.

In [1]:
# Load the whole payload corpus into a single string.
with open("../input/payloads/payloads.txt", 'r', encoding='utf-8') as payload_file:
    data = payload_file.read()

# The vocabulary is every distinct character of the corpus, in sorted order.
vocabulary = sorted(set(data))
print(f"Length our vocabulary {len(vocabulary)}")

# Lookup tables: character -> id, and the reverse id -> character.
chars_to_indices = {char: index for index, char in enumerate(vocabulary)}
indices_to_chars = dict(enumerate(vocabulary))

# Split the data into training examples and targets 

The data is split into smaller sequences of a fixed length, which makes all input sequences the same length. Each target is the input shifted one character to the right; the shifted character is the target value we want to predict.

```
# with Tensorflow
# get the mapping of all characters from our input file 
vector_data = chars_to_indices(tf.strings.unicode_split(data, 'UTF-8')) 

vector_dataset = tf.data.Dataset.from_tensor_slices(vector_data)
```



# Create a batch of characters 
The batch method of TensorFlow creates sequences of characters of the desired length; in this case the sequence length is set to 200.



```
# Tensorflow implementation 

seq_length = 200
sequences = vector_dataset.batch(seq_length+1, drop_remainder=True)

# utility function to split input sequence into sample and target 
def split_sequence(sequence):
  in_seq = sequence[:-1]
  target_seq = sequence[1:]
  return in_seq, target_seq

# then create dataset of input sample and target over the entire sequence 
new_dataset = sequences.map(split_sequence)

```



In [2]:
seq_length = 200
step = 3

# Slide a window of `seq_length` characters over the corpus, advancing `step`
# characters each time; the character right after each window is its target.
window_starts = range(0, len(data) - seq_length, step)
sequences = [data[start: start + seq_length] for start in window_starts]
next_chars = [data[start + seq_length] for start in window_starts]

# Create One-Hot-Encoding matrix for X and Y

After the sequences are created we want to make sure that they are in the right form for our model by encoding them. A one-hot encoding scheme is used, where each character i from our vocabulary is represented as a binary vector of 0s and 1s. To denote the i-th character of the vocabulary, the i-th element of the vector is set to 1 and all other elements are set to 0.

One thing to note here: X stores the encoded form of a sequence, and y stores the character shifted by one, i.e. the next character that follows the input sequence X. As an example:



```
+----------------------------------------------+-----+
| [', ;, a, =, p, r, o, m, p, t, a, (, ), / ]  | [/] |
+----------------------------------------------+-----+

# or as a hello
+--------------+-------+
|      X       |   Y   |
+--------------+-------+
| [h, e, l, l] | [o]   |
| [e, l, l, o] | [ ]   |
| [l, l, o,  ] | [i]   |
| [l, o,  , i] | [n]   |
| ...          | ...   |
+--------------+-------+
```



In [5]:
# One-hot encode the windows (x) and their next characters (y).
# The arrays are allocated as float32 up front: the `np.bool` alias used
# previously was deprecated in NumPy 1.20 and removed in 1.24, and a separate
# bool -> float32 conversion would briefly hold two copies of the data.
x = np.zeros((len(sequences), seq_length, len(vocabulary)), dtype=np.float32)
y = np.zeros((len(sequences), len(vocabulary)), dtype=np.float32)
for i, sequence in enumerate(sequences):
    for t, char in enumerate(sequence):
        # Mark the vocabulary slot of the character at position t of window i.
        x[i, t, chars_to_indices[char]] = 1
    # Mark the vocabulary slot of the character that follows window i.
    y[i, chars_to_indices[next_chars[i]]] = 1

# Validation set 
The validation set is used to check that training went as expected, mainly to test whether our model overfits or not.

In [6]:
# Hold out the first 10000 encoded samples for validation; train on the rest.
num_validation = 10000
x_val, x_train = x[:num_validation], x[num_validation:]
y_val, y_train = y[:num_validation], y[num_validation:]

# Building the model 
The two functions build and compile two different models with the same architecture. build_rnn_model builds a SimpleRNN model with two Dropout layers and one Dense layer as the output. The activation function used is softmax because the output is a multinomial probability distribution. The loss function is categorical_crossentropy and the optimizer is rmsprop. Both functions define the same architecture, except that the first uses SimpleRNN layers and the second uses LSTM layers. This is done on purpose, to study the performance of each model and compare them.

In [7]:
# Hyperparameters read by the model-building functions.
units = 128                     # size of each recurrent layer
embedding_dimension = 256       # not referenced by any of the visible cells
vocab_length = len(vocabulary)  # width of the one-hot vectors (116 per the author's note)

def build_rnn_model():
  """Build and compile the two-layer SimpleRNN next-character model.

  The input shape is (seq_length, vocab_length). Each SimpleRNN layer is
  followed by Dropout(0.2), and a softmax Dense layer of width vocab_length
  produces the output. Compiled with categorical cross-entropy and rmsprop.

  Returns:
    The compiled Sequential model.
  """
  stack = [
      layers.SimpleRNN(units, input_shape=(seq_length, vocab_length), return_sequences=True),
      layers.Dropout(0.2),
      layers.SimpleRNN(units),
      layers.Dropout(0.2),
      layers.Dense(vocab_length, activation='softmax'),
  ]
  network = models.Sequential(stack)
  network.compile(loss='categorical_crossentropy', optimizer='rmsprop')
  return network


def build_lstm_model():
  """Build and compile the two-layer LSTM next-character model.

  The input shape is (seq_length, vocab_length). Each LSTM layer is followed
  by Dropout(0.2), and a softmax Dense layer of width vocab_length produces
  the output. Compiled with categorical cross-entropy and rmsprop.

  Returns:
    The compiled Sequential model.
  """
  stack = [
      layers.LSTM(units, input_shape=(seq_length, vocab_length), return_sequences=True),
      layers.Dropout(0.2),
      layers.LSTM(units),
      layers.Dropout(0.2),
      layers.Dense(vocab_length, activation='softmax'),
  ]
  network = models.Sequential(stack)
  network.compile(loss='categorical_crossentropy', optimizer='rmsprop')
  return network

In [9]:
# Instantiate the SimpleRNN model and print its layer-by-layer summary.
rnn_model = build_rnn_model()
rnn_model.summary()

In [10]:
# Instantiate the LSTM model and print its layer-by-layer summary.
lstm_model = build_lstm_model()
lstm_model.summary()

# Utility Functions 
The sample function draws an index from a probability array, and the SampleExploit class prints text generated by our model after each epoch. This also gives us a good idea of whether our model is overfitting, since we can see what the code snippet generated at each epoch looks like.

In [11]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn from the result.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Scaling factor. Values below 1 sharpen the distribution,
            values above 1 flatten it, and 0 selects the most likely index
            deterministically.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of the scaling as temperature -> 0: greedy choice.
        return np.argmax(preds)
    # Zero probabilities legitimately map to -inf; silence the log(0) warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Subtracting the maximum leaves the normalised result unchanged but keeps
    # exp() from underflowing to all zeros (0/0 -> NaN) at small temperatures.
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


class SampleExploit(tf.keras.callbacks.Callback):
    """Keras callback that prints text generated by the model after each epoch.

    A random window of ``seq_length`` characters is taken from ``data`` as the
    seed. For every diversity (sampling temperature) the seed is printed and
    then extended one character at a time using the model's predictions.

    Args:
        diversities: Temperatures passed to ``sample``; one text per value.
        num_chars: Number of characters to generate per diversity.
    """

    def __init__(self, diversities=(0.5, 1.2), num_chars=200):
        super().__init__()
        self.diversities = tuple(diversities)
        self.num_chars = num_chars

    def on_epoch_end(self, batch, logs=None):
        # NOTE: Keras passes the epoch index as the first argument; the
        # parameter keeps its original name `batch` for compatibility.
        start_index = random.randint(0, len(data) - seq_length - 1)
        seed = data[start_index: start_index + seq_length]

        for diversity in self.diversities:
            sentence = seed
            sys.stdout.write(sentence)
            for _ in range(self.num_chars):
                # One-hot encode the current window as a batch of one.
                x_pred = np.zeros((1, seq_length, len(vocabulary)))
                for t, char in enumerate(sentence):
                    x_pred[0, t, chars_to_indices[char]] = 1.
                # Use the model this callback is attached to (set by Keras),
                # not a fixed global, so LSTM/GRU runs sample from themselves.
                preds = self.model.predict(x_pred, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = indices_to_chars[next_index]
                # Slide the window: drop the oldest character, append the new one.
                sentence = sentence[1:] + next_char
                sys.stdout.write(next_char)
                sys.stdout.flush()
            print()



# Training Model 
Both the Simple RNN and LSTM models are trained with the same parameters and validation data: 20 epochs and a batch size of 80, validated on x_val and y_val. The result of each epoch is stored in a dictionary called history, which we will use later for plotting graphs.

In [12]:
import os 

def _weights_checkpoint(path):
    """Return a verbose ModelCheckpoint callback that saves only the weights to `path`."""
    return tf.keras.callbacks.ModelCheckpoint(
        filepath=path,
        save_weights_only=True,
        verbose=1,
    )


# Checkpoint file for each of the three models.
rnn_training_checkpoint_path = "rnn_training_1/cp.ckpt"
lstm_training_checkpoint_path = "lstm_training_1/cp.ckpt"
gru_training_checkpoint_path = "gru_training_1/cp.ckpt"

# Directory part of each checkpoint path.
rnn_checkpoint_dir, lstm_checkpoint_dir, gru_checkpoint_dir = (
    os.path.dirname(checkpoint_path)
    for checkpoint_path in (
        rnn_training_checkpoint_path,
        lstm_training_checkpoint_path,
        gru_training_checkpoint_path,
    )
)

# One weight-saving callback per model.
rnn_cp_callback = _weights_checkpoint(rnn_training_checkpoint_path)
lstm_cp_callback = _weights_checkpoint(lstm_training_checkpoint_path)
gru_cp_callback = _weights_checkpoint(gru_training_checkpoint_path)

In [13]:
# Train the SimpleRNN model: 20 epochs, mini-batches of 80 samples,
# printing a generated sample and saving the weights after every epoch.
EPOCHS = 20
rnn_callbacks = [SampleExploit(), rnn_cp_callback]
rnn_history = rnn_model.fit(
    x_train,
    y_train,
    batch_size=80,
    epochs=EPOCHS,
    validation_data=(x_val, y_val),
    callbacks=rnn_callbacks,
)

In [None]:
# Train the LSTM model: 20 epochs, mini-batches of 80 samples,
# printing a generated sample and saving the weights after every epoch.
EPOCHS = 20
lstm_callbacks = [SampleExploit(), lstm_cp_callback]
lstm_history = lstm_model.fit(
    x_train,
    y_train,
    batch_size=80,
    epochs=EPOCHS,
    validation_data=(x_val, y_val),
    callbacks=lstm_callbacks,
)

# Training and Validation loss plot 
For the Simple RNN, the model starts to overfit after the 9th epoch; from this information we can use EarlyStopping to minimize overfitting.
For the LSTM, overfitting starts from the 12th epoch, so we apply early stopping and retrain the model, meaning we retrain it using fewer epochs.

In [14]:
 
# Loss curves for the SimpleRNN run: training loss as dots, validation loss as a line.
import matplotlib.pyplot as plt

loss_values = rnn_history.history['loss']
validation_loss = rnn_history.history['val_loss']
epochs = range(1, len(loss_values)+1)

fig, ax = plt.subplots()
ax.plot(epochs, loss_values, 'bo', label='Training Loss')
ax.plot(epochs, validation_loss, 'b', label='Validation Loss')
ax.set_title("Training and Validation Loss for Simple RNN")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()


In [None]:
# Loss curves for the LSTM run: training loss as dots, validation loss as a line.
import matplotlib.pyplot as plt

lstm_loss = lstm_history.history['loss']
lstm_validation_loss = lstm_history.history['val_loss']
epochs = range(1, len(lstm_loss)+1)

fig, ax = plt.subplots()
ax.plot(epochs, lstm_loss, 'bo', label="Training Loss")
ax.plot(epochs, lstm_validation_loss, 'b', label='Validation Loss')
ax.set_title("Training and Validation Loss for LSTM")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

# Retrain the models with early stopping
Here both models are retrained with fewer epochs to avoid overfitting.

In [None]:
# Create a fresh SimpleRNN model and retrain it with fewer epochs.
# The previous checkpoint is deliberately NOT loaded: it holds the weights
# from the end of the 20-epoch run, so loading it and training 15 more epochs
# would extend the overfit run instead of shortening it.
rnn_model = build_rnn_model()

# Stop as soon as validation loss stops improving and keep the best weights.
rnn_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

rnn_model.fit(x_train, y_train,
              epochs=15,
              batch_size=80,
              callbacks=[SampleExploit(), rnn_cp_callback, rnn_early_stopping],
              validation_data=(x_val, y_val))

In [None]:
# Create a fresh LSTM model and retrain it with fewer epochs.
# The previous checkpoint is deliberately NOT loaded: it holds the weights
# from the end of the 20-epoch run, so loading it and training 15 more epochs
# would extend the overfit run instead of shortening it.
lstm_model = build_lstm_model()

# Stop as soon as validation loss stops improving and keep the best weights.
lstm_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

lstm_model.fit(x_train, y_train,
               epochs=15,
               batch_size=80,
               callbacks=[SampleExploit(), lstm_cp_callback, lstm_early_stopping],
               validation_data=(x_val, y_val))

# GRU model
Let's build a GRU model and train it with our training data. GRUs are simpler than LSTMs (fewer gates and parameters) and are typically faster to train. Later they can be compared with the two models implemented above. The model has the same architecture, except that GRU layers are used instead of SimpleRNN or LSTM layers.

In [None]:
def build_gru_model():
  """Build and compile the two-layer GRU next-character model.

  The input shape is (seq_length, vocab_length). Each GRU layer is followed
  by Dropout(0.2), and a softmax Dense layer of width vocab_length produces
  the output. Compiled with categorical cross-entropy and rmsprop.

  Returns:
    The compiled Sequential model.
  """
  stack = [
      layers.GRU(units, input_shape=(seq_length, vocab_length), return_sequences=True),
      layers.Dropout(0.2),
      layers.GRU(units),
      layers.Dropout(0.2),
      layers.Dense(vocab_length, activation='softmax'),
  ]
  network = models.Sequential(stack)
  network.compile(loss='categorical_crossentropy', optimizer='rmsprop')
  return network

In [None]:
# Instantiate the GRU model and print its layer-by-layer summary.
gru_model = build_gru_model()
gru_model.summary()

In [None]:
# Train the GRU model: 20 epochs, mini-batches of 80 samples,
# printing a generated sample and saving the weights after every epoch.
gru_callbacks = [SampleExploit(), gru_cp_callback]
gru_history = gru_model.fit(
    x_train,
    y_train,
    batch_size=80,
    epochs=20,
    validation_data=(x_val, y_val),
    callbacks=gru_callbacks,
)

In [None]:
import matplotlib.pyplot as plt

# Loss curves for the GRU run: training loss as dots, validation loss as a line.
gru_loss = gru_history.history['loss']
gru_validation_loss = gru_history.history['val_loss']
epochs = range(1, len(gru_loss)+1)

fig, ax = plt.subplots()
ax.plot(epochs, gru_loss, 'bo', label="Training Loss")
ax.plot(epochs, gru_validation_loss, 'b', label='Validation Loss')
ax.set_title("Training and Validation Loss for GRU")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

In [None]:
# Create a fresh GRU model and retrain it with fewer epochs.
# The previous checkpoint is deliberately NOT loaded: it holds the weights
# from the end of the 20-epoch run, so loading it and training 15 more epochs
# would extend the overfit run instead of shortening it.
gru_model = build_gru_model()

# Stop as soon as validation loss stops improving and keep the best weights.
gru_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# gru_cp_callback is included so this retraining is checkpointed the same way
# as the SimpleRNN and LSTM retraining runs.
gru_model.fit(x_train, y_train,
              epochs=15,
              batch_size=80,
              callbacks=[SampleExploit(), gru_cp_callback, gru_early_stopping],
              validation_data=(x_val, y_val))

In [None]:
seq_length = 200
step = 3

# Load the held-out payloads into a single string.
with open("../input/test-data/test_payloads.txt", 'r', encoding='utf-8') as f:
  test_data = f.read()


def process_test_data(data):
  """One-hot encode `data` into model inputs and next-character targets.

  A window of `seq_length` characters is slid over `data` in increments of
  `step`; each window is an input and the character after it is the target.
  The windows are kept in local lists, so repeated calls are independent and
  the module-level `sequences` / `next_chars` are left untouched.

  Args:
    data: Text to encode.

  Returns:
    A tuple (x_test, y_test) of float32 arrays with shapes
    (num_windows, seq_length, len(vocabulary)) and
    (num_windows, len(vocabulary)).

  Raises:
    KeyError: If an encoded character is missing from `chars_to_indices`.
  """
  window_starts = range(0, len(data) - seq_length, step)
  windows = [data[i: i + seq_length] for i in window_starts]
  targets = [data[i + seq_length] for i in window_starts]

  # Allocate float32 directly; `np.bool` was removed in NumPy 1.24.
  x_test = np.zeros((len(windows), seq_length, len(vocabulary)), dtype=np.float32)
  y_test = np.zeros((len(windows), len(vocabulary)), dtype=np.float32)
  for i, window in enumerate(windows):
    for t, char in enumerate(window):
      x_test[i, t, chars_to_indices[char]] = 1
    y_test[i, chars_to_indices[targets[i]]] = 1
  return x_test, y_test


x_test, y_test = process_test_data(test_data)
x_test.shape

In [None]:
# Evaluate the checkpointed GRU model on the held-out test data.
# The test set must not be passed to fit(): training on it would make the
# reported loss meaningless as a measure of generalisation.
gru_model = build_gru_model()
gru_model.load_weights(gru_training_checkpoint_path)
gru_test_loss = gru_model.evaluate(x_test, y_test, batch_size=80)
gru_test_loss