In [1]:
import pandas as pd
import numpy as np
import requests
import os

from keras.engine.topology import Layer, InputSpec
from keras.layers import Dense, Input, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Model
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K

from utils.vocab import Vocabulary
from sklearn.cluster import KMeans
from joblib import load

from utils.plot_utils import plot_and_save_history
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


## Utils

In [2]:
class AutoEncoder(object):
    """ 4-layer LSTM Autoencoder
    Parameters
        vocab: Vocabulary object that stores char-to-int mappings 
            and text processing functions.
        hidden_size: Dimension of latent space between encoder and decoder.
            Default: 64.
        dropout: Fraction of units to be randomly ignored during training.
            Both linear and recurrent dropout are used.
            Default: 0.2.
    """
    DEFAULT_HIDDEN_UNITS = 64
    DEFAULT_DROPOUT = 0.2
    DEFAULT_BATCH_SIZE = 64
    DEFAULT_EPOCHS = 100
    DEFAULT_MODEL_WEIGHT_PATH = './models/autoencoder-weights.h5'
    model_name = 'autoencoder'

    def __init__(self, vocab, hidden_size=None, dropout=None):
        if not hidden_size:
            self.hidden_size = AutoEncoder.DEFAULT_HIDDEN_UNITS
        if not dropout:
            self.dropout = AutoEncoder.DEFAULT_DROPOUT
            
        self.vocab = vocab
        self.vocab_size = vocab.vocab_size
        self.max_input_seq_length = vocab.max_input_seq_length

        encoder_inputs = Input(shape=(None,), name='encoder_inputs')
        encoder_embedding = Embedding(input_dim=self.vocab_size,
                                      output_dim=self.hidden_size,
                                      input_length=self.max_input_seq_length,
                                      name='encoder_embedding')
        encoder_lstm = LSTM(units=self.hidden_size,
                            return_state=True,
                            dropout=self.dropout,
                            recurrent_dropout=self.dropout,
                            name='encoder_lstm')
        encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs))
        encoder_states = [encoder_state_h, encoder_state_c]

        decoder_inputs = Input(shape=(None, self.vocab_size), name='decoder_inputs')
        decoder_lstm = LSTM(units=self.hidden_size, return_state=True,
                            return_sequences=True, name='decoder_lstm')
        decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs,
                                                                         initial_state=encoder_states)
        decoder_dense = Dense(units=self.vocab_size, activation='softmax', name='decoder_dense')
        decoder_outputs = decoder_dense(decoder_outputs)

        model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

        self.model = model

        self.encoder_model = Model(encoder_inputs, encoder_states)

        decoder_state_inputs = [Input(shape=(self.hidden_size,)), Input(shape=(self.hidden_size,))]
        decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs,
                                                         initial_state=decoder_state_inputs)
        decoder_states = [state_h, state_c]
        decoder_outputs = decoder_dense(decoder_outputs)
        self.decoder_model = Model([decoder_inputs] + decoder_state_inputs,
                                   [decoder_outputs] + decoder_states)

    def load_weights(self, weight_file_path):
        if os.path.exists(weight_file_path):
            self.model.load_weights(weight_file_path)

    def generate_batch(self, samples, batch_size):
        num_batches = len(samples) // batch_size
        while True:
            for batchIdx in range(0, num_batches):
                start = batchIdx * batch_size
                end = (batchIdx + 1) * batch_size
                encoder_input_data_batch = pad_sequences(samples[start:end], self.max_input_seq_length)
                decoder_target_data_batch = np.zeros(shape=(batch_size, self.max_input_seq_length,
                                                            self.vocab_size))
                decoder_input_data_batch = np.zeros(shape=(batch_size, self.max_input_seq_length,
                                                           self.vocab_size))
                for textIdx, chars in enumerate(samples[start:end]):
                    for idx, char in enumerate(chars):
                        if char != 1: # not <UNK>
                            decoder_input_data_batch[textIdx, idx, char] = 1
                            if idx > 0:
                                decoder_target_data_batch[textIdx, idx - 1, char] = 1
                yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch

    def fit(self, Xtrain, Xtest, epochs=None, batch_size=None, weight_file_path=None):
        """
        Train model.
        Parameters:
            Xtrain: Array of strings for training.
            Xtest: Array of strings for validation.
            epochs: Number of full forward passes of all training samples.
                Default: 50.
            batch_size: Number of samples to pass for one iteration.
                Default: 64.
            weight_file_path: File path with extension '.h5'
                for saving weights of the network. (https://www.h5py.org/)
                Default: ./models/autoencoder-weights.h5
        """

        if not epochs:
            epochs = AutoEncoder.DEFAULT_EPOCHS
        if not batch_size:
            batch_size = AutoEncoder.DEFAULT_BATCH_SIZE
        if not weight_file_path:
            weight_file_path = AutoEncoder.DEFAULT_WEIGHT_FILE_PATH

        checkpoint = ModelCheckpoint(weight_file_path)

        Xtrain = self.vocab.transform_texts(Xtrain)
        Xtest = self.vocab.transform_texts(Xtest)

        train_gen = self.generate_batch(Xtrain, batch_size)
        test_gen = self.generate_batch(Xtest, batch_size)

        train_num_batches = len(Xtrain) // batch_size
        test_num_batches = len(Xtest) // batch_size

        history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches,
                                           epochs=epochs, verbose=True,
                                           validation_data=test_gen, validation_steps=test_num_batches,
                                           callbacks=[checkpoint])
        self.model.save_weights(weight_file_path)
        return history

    def predict(self, input_text):
        """
        Process input texts and reconstruct them.
        Used to check how well model "understands" input texts.
        Parameters:
             input_text: 
        """
        input_seq = self.vocab.transform_texts([input_text])
        states_value = self.encoder_model.predict(input_seq)
        target_seq = np.zeros((1, 1, self.vocab_size))
        target_seq[0, 0, self.vocab.char2idx['<START>']] = 1
        target_text = ''
        target_text_len = 0
        terminated = False
        while not terminated:
            output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value)

            sample_token_idx = np.argmax(output_tokens[0, -1, :])
            sample_char = self.vocab.idx2char[sample_token_idx]
            target_text_len += 1

            if sample_char != '<START>' and sample_char != '<END>':
                target_text += ' ' + sample_char

            if sample_char == '<END>' or target_text_len >= self.max_input_seq_length:
                terminated = True

            target_seq = np.zeros((1, 1, self.vocab_size))
            target_seq[0, 0, sample_token_idx] = 1

            states_value = [h, c]
        return target_text.strip()

In [3]:
def get_features(vectors):
    # vector: string -> vector_embedding: [char_indices]
    # [char_indices] are padded with 0 to the right
    vector_embedding = vocab.transform_texts([vector])
    
    # vector_embedding: [char_indices] -> encoders_states: [state_h, state_c]
    encoder_states = encoder.predict([vector_embedding])
    
    # use state_h as feature
    vector_features = encoder_states[0]
    
    return vector_features

In [4]:
def predict_class(vector_features):
    vector_class = kmeans.predict(vector_features)[0]
    # "0"-class is invalid
    if vector_class == 0:
        vector_class = 25
    return vector_class

## Prediction

### Load data

In [5]:
pd.set_option('display.max_colwidth', 100)

In [9]:
data = pd.read_csv('../data/old-wallarm.csv', encoding='utf-8', lineterminator='\n')
data = data.fillna('<BLANK>')

### Load autoencoder

In [7]:
from keras import backend as K

K.set_learning_phase(0) #set testing phase

In [8]:
os.environ["CUDA_VISIBLE_DEVICES"] = ''

In [11]:
vocab = Vocabulary()
vocab.vocab_file = './vocab-wallarm.json'
vocab.make_vocab(data['text'])
autoencoder = AutoEncoder(vocab)
autoencoder.load_weights('./models/train-wallarm.h5')
encoder = autoencoder.encoder_model

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


### Train K-Means

In [12]:
vector_embeddings = vocab.transform_texts(data['text'])
encoder_states = encoder.predict(vector_embeddings)
vector_features = encoder_states[0]

### Predict for eps=0.5

In [14]:
from sklearn.cluster import DBSCAN

In [16]:
dbscan = DBSCAN(n_jobs=-1)
labels = dbscan.fit_predict(vector_features)
result = pd.DataFrame(data)
result['class'] = labels

In [71]:
results = result['class'].value_counts()

In [72]:
results

 0      27703
 2      26820
-1       8519
 41       152
 23       120
 7        102
 36        79
 30        79
 1         71
 4         66
 28        64
 14        57
 16        53
 47        43
 83        42
 51        35
 12        34
 20        34
 59        34
 26        34
 39        33
 101       33
 24        33
 111       32
 5         31
 96        29
 66        28
 68        26
 11        26
 18        25
        ...  
 86         5
 87         5
 145        5
 151        5
 144        5
 152        5
 154        5
 155        5
 94         5
 158        5
 34         5
 35         5
 169        5
 168        5
 162        5
 139        5
 99         5
 75         5
 140        5
 192        4
 189        4
 174        4
 185        4
 153        4
 195        4
 186        3
 124        3
 181        3
 72         3
 170        2
Name: class, Length: 200, dtype: int64

In [92]:
result[result['class'] == 2]

Unnamed: 0,id,text,injection,class
3,28821,D8j+oNbylTIGw=,False,2
5,74446,""" union/*M!719 */",True,2
8,76096,""" /*M!union*/",True,2
11,33516,kEC+oNOwUkgJOk4bn7Jm9wOWOT6V ... f9g=,False,2
14,44176,https://example.example.ru/krasota/massage/?seamless=7&tabName=ORDER#,False,2
17,76039,' union/*M!*/,True,2
20,53171,%F3%3F%74%BF,False,2
22,94425,%5Ca%65HrEf=VbScRiPt:MsgBox(14527)%7E,True,2
27,81271,view' aNd '8'='6,True,2
28,60015,755 union/*mgjrer qqypxc */distinct\tselectdpohuefrom\texample.example,True,2


In [96]:
result[(result['class'] == 2) & (result['injection'] == True)]

Unnamed: 0,id,text,injection,class
5,74446,""" union/*M!719 */",True,2
8,76096,""" /*M!union*/",True,2
17,76039,' union/*M!*/,True,2
22,94425,%5Ca%65HrEf=VbScRiPt:MsgBox(14527)%7E,True,2
27,81271,view' aNd '8'='6,True,2
28,60015,755 union/*mgjrer qqypxc */distinct\tselectdpohuefrom\texample.example,True,2
32,56144,941 or/**/tvguaf()=768,True,2
33,57671,664 and 'esprrx'>exelbw#gahbcp 71 mvnfng lxdmep bovfon rwabxc \r(),True,2
37,93692,%4Ca%19HrEf=VbScRiPt:MsgBox(75523)%2E,True,2
41,74994,' union/*M!*/,True,2


In [101]:
homogeneity

{0: 0.574,
 1: 0.507,
 2: 1.0,
 3: 0.518,
 4: 1.0,
 5: 1.0,
 6: 1.0,
 7: 0.5,
 8: 1.0,
 9: 0.857,
 10: 1.0,
 11: 1.0,
 12: 1.0,
 13: 0.971,
 14: 1.0,
 15: 1.0,
 16: 1.0,
 17: 0.887,
 18: 1.0,
 19: 1.0,
 20: 1.0,
 21: 1.0,
 22: 1.0,
 23: 1.0,
 24: 1.0,
 25: 1.0,
 26: 1.0,
 27: 1.0,
 28: 1.0,
 29: 0.688,
 30: 1.0,
 31: 0.987,
 32: 1.0,
 33: 1.0,
 34: 1.0,
 35: 1.0,
 36: 1.0,
 37: 0.785,
 38: 1.0,
 39: 1.0,
 40: 1.0,
 41: 1.0,
 42: 1.0,
 43: 1.0,
 44: 1.0,
 45: 1.0,
 46: 1.0,
 47: 1.0,
 48: 1.0,
 49: 1.0,
 50: 1.0,
 51: 1.0,
 52: 0.971,
 53: 1.0,
 54: 1.0,
 55: 1.0,
 56: 1.0,
 57: 1.0,
 58: 1.0,
 59: 1.0,
 60: 1.0,
 61: 1.0,
 62: 1.0,
 63: 0.833,
 64: 1.0,
 65: 1.0,
 66: 1.0,
 67: 0.964,
 68: 1.0,
 69: 0.962,
 70: 1.0,
 71: 1.0,
 72: 1.0,
 73: 1.0,
 74: 1.0,
 75: 1.0,
 76: 1.0,
 77: 1.0,
 78: 0.909,
 79: 1.0,
 80: 1.0,
 81: 1.0,
 82: 1.0,
 83: 1.0,
 84: 1.0,
 85: 1.0,
 86: 1.0,
 87: 1.0,
 88: 1.0,
 89: 1.0,
 90: 1.0,
 91: 1.0,
 92: 1.0,
 93: 1.0,
 94: 1.0,
 95: 1.0,
 96: 1.0,
 97: 1.0,
 9

In [113]:
homogeneity[2]

0.54

In [111]:
result[result['class'] == 2]

Unnamed: 0,id,text,injection,class
3,28821,D8j+oNbylTIGw=,False,2
5,74446,""" union/*M!719 */",True,2
8,76096,""" /*M!union*/",True,2
11,33516,kEC+oNOwUkgJOk4bn7Jm9wOWOT6V ... f9g=,False,2
14,44176,https://example.example.ru/krasota/massage/?seamless=7&tabName=ORDER#,False,2
17,76039,' union/*M!*/,True,2
20,53171,%F3%3F%74%BF,False,2
22,94425,%5Ca%65HrEf=VbScRiPt:MsgBox(14527)%7E,True,2
27,81271,view' aNd '8'='6,True,2
28,60015,755 union/*mgjrer qqypxc */distinct\tselectdpohuefrom\texample.example,True,2


In [82]:
clusters['counts'] = pd.concat()

In [19]:
n_clusters = result['class'].unique().size

In [18]:
result[result['class'] == 0]

Unnamed: 0,id,text,injection,class
0,59290,' and/**/38>( select\t(622)/**/from/*362 emhgpv gpnqdn odpxkn qgnoyb */htipaw),True,0
1,54992,"shqpkt"" union /*!426 all\t*/(select kwicwt(\t(\tyaaoda\t()), (236), (konjlq()), 619, nyknas ())...",True,0
2,64287,"nhnqag"" uniondistinct--154 298 plhlre exaloq unyote \r(select (rpfeir#15 256 15 rttlat 66 gsjxjq...",True,0
6,71873,"194 union#93 yguxsk 669 393 \r\tselect\t\texlgjh(), cirwsi((\t616), (#\r'qbtidf'), (#\r'xhqujm')...",True,0
7,87153,/example.example?%7Cmeta%82http-equiv=Set-Cookie%59content=%11testizfx=5232%28%6E,True,0
9,42354,/handler_sync_example.example?i=j4sR/mhZpekzcpKcC5cUoFHo5e42TQeA6YzVZuR3mhhuaivOgsA/oncCkpo=,False,0
12,101157,%24%3E%7Cscript%3Ealert(%12sap_its_urlmime_example.example%45)%4C/script%5E%1Cimg%78src=%33,True,0
13,74110,"794 and bjwdqp((318), \t/*M!354 932, (625),*/ (275), (91), 365, (22))=672",True,0
15,16094,/handler_sync_example.example?i=Ypg//ONuVSKGXMnxmNJk8d6NQDTWwBs9OUBxw7FuiARcp4PPJBes4cIR26Q=,False,0
16,83110,"ru1527924556435"" UNION SELECT CHAR(78,655,28,88,91,84),CHAR(68,568,67,78,68,22),CHAR(35,976,31,5...",True,0


In [95]:
cluster_sizes = [len(result[result['class'] == x]) for x in range(-1, n_clusters-1)]
injection_part_sizes = [len(result[(result['class'] == x) & (result['injection'] == True)]) for x in range(-1, n_clusters-1)]
percentage = [injection_part_size / cluster_size for cluster_size, injection_part_size in zip(cluster_sizes, injection_part_sizes)]
homogeneity = [x if x > 0.5 else 1 - x for x in percentage]

homogeneity = {i: round(x, 3) for i, x in enumerate(homogeneity)}
mean = np.mean(list(homogeneity.values()))
homogeneity

{0: 0.574,
 1: 0.507,
 2: 1.0,
 3: 0.518,
 4: 1.0,
 5: 1.0,
 6: 1.0,
 7: 0.5,
 8: 1.0,
 9: 0.857,
 10: 1.0,
 11: 1.0,
 12: 1.0,
 13: 0.971,
 14: 1.0,
 15: 1.0,
 16: 1.0,
 17: 0.887,
 18: 1.0,
 19: 1.0,
 20: 1.0,
 21: 1.0,
 22: 1.0,
 23: 1.0,
 24: 1.0,
 25: 1.0,
 26: 1.0,
 27: 1.0,
 28: 1.0,
 29: 0.688,
 30: 1.0,
 31: 0.987,
 32: 1.0,
 33: 1.0,
 34: 1.0,
 35: 1.0,
 36: 1.0,
 37: 0.785,
 38: 1.0,
 39: 1.0,
 40: 1.0,
 41: 1.0,
 42: 1.0,
 43: 1.0,
 44: 1.0,
 45: 1.0,
 46: 1.0,
 47: 1.0,
 48: 1.0,
 49: 1.0,
 50: 1.0,
 51: 1.0,
 52: 0.971,
 53: 1.0,
 54: 1.0,
 55: 1.0,
 56: 1.0,
 57: 1.0,
 58: 1.0,
 59: 1.0,
 60: 1.0,
 61: 1.0,
 62: 1.0,
 63: 0.833,
 64: 1.0,
 65: 1.0,
 66: 1.0,
 67: 0.964,
 68: 1.0,
 69: 0.962,
 70: 1.0,
 71: 1.0,
 72: 1.0,
 73: 1.0,
 74: 1.0,
 75: 1.0,
 76: 1.0,
 77: 1.0,
 78: 0.909,
 79: 1.0,
 80: 1.0,
 81: 1.0,
 82: 1.0,
 83: 1.0,
 84: 1.0,
 85: 1.0,
 86: 1.0,
 87: 1.0,
 88: 1.0,
 89: 1.0,
 90: 1.0,
 91: 1.0,
 92: 1.0,
 93: 1.0,
 94: 1.0,
 95: 1.0,
 96: 1.0,
 97: 1.0,
 9

In [65]:
np.set_printoptions(threshold=10000)

In [68]:
np.round(vector_features[:8], 3)

array([[-0.   , -0.057, -0.081,  0.501,  0.422, -0.014, -0.394,  0.   ,
        -0.629,  0.049,  0.073,  0.316,  0.676,  0.517, -0.966, -0.573,
         0.249, -0.596,  0.189, -0.541,  0.384, -0.321, -1.   ,  0.424,
         0.275,  0.   ,  0.261, -0.08 , -0.   ,  0.007,  0.012, -0.088,
         0.229,  0.   ,  0.104, -0.026, -0.128,  0.   ,  0.315,  0.587,
         0.37 ,  0.891, -0.394, -0.216, -0.177, -0.033, -0.661,  0.97 ,
        -0.496, -0.331, -0.386, -0.392,  0.311, -0.389,  0.722, -0.611,
         0.   , -0.518,  0.281, -0.387, -0.292, -0.042, -0.608, -0.367],
       [-0.   ,  0.057,  0.106,  0.555,  0.325, -0.006, -0.334,  0.   ,
        -0.666,  0.011,  0.034,  0.343,  0.629,  0.524, -1.   , -0.604,
         0.464, -0.699,  0.107, -0.5  ,  0.458, -0.28 , -1.   ,  0.653,
         0.225,  0.059,  0.39 , -0.165, -0.   , -0.146,  0.174,  0.093,
         0.223,  0.01 ,  0.043, -0.163, -0.104,  0.   ,  0.309,  0.517,
         0.494,  0.827, -0.285, -0.484,  0.063, -0.071, -0.601,