In [1]:
import pandas as pd
import numpy as np
import requests
import os

from keras.engine.topology import Layer, InputSpec
from keras.layers import Dense, Input, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Model
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K

from utils.vocab import Vocabulary
from sklearn.cluster import KMeans
from joblib import load

from utils.plot_utils import plot_and_save_history
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


### Utils for working with API

In [0]:
def download_sample():
    download_url = ""

    try:
        resp = requests.get(download_url)
        sample = resp.json()

        # Sort out invalid samples like {'error': 'You already downloaded this vector'}
        if set(sample.keys()) != set(valid_keys):
            failed["num"] += 1
            failed["responses"].append(resp.text)
            print("Invalid sample")
            print(failed)
            return None

    # Sometimes decoded samples are not valid json objects
    except ValueError:
        failed["num"] += 1
        try:
            failed["responses"].append(resp)
            print("Invalid response")
        # If download failed, resp is undound
        except UnboundLocalError:
            print("Bad URL")
            
        return None

    return sample

In [0]:
def upload_sample(classified_sample):
    upload_url = ""

    try:
        resp = requests.post(upload_url, classified_sample)
        return resp.text
    except:
        print("Download failed")
        return None

### Autoencoder model

In [2]:
class AutoEncoder(object):
    """ 4-layer LSTM Autoencoder
    Parameters
        vocab: Vocabulary object that stores char-to-int mappings 
            and text processing functions.
        hidden_size: Dimension of latent space between encoder and decoder.
            Default: 64.
        dropout: Fraction of units to be randomly ignored during training.
            Both linear and recurrent dropout are used.
            Default: 0.2
    """
    DEFAULT_HIDDEN_UNITS = 64
    DEFAULT_DROPOUT = 0.2
    DEFAULT_BATCH_SIZE = 64
    DEFAULT_EPOCHS = 100
    DEFAULT_MODEL_WEIGHT_PATH = './models/autoencoder-weights.h5'
    model_name = 'autoencoder'

    def __init__(self, vocab, hidden_size=None, dropout=None):
        if not hidden_size:
            self.hidden_size = AutoEncoder.DEFAULT_HIDDEN_UNITS
        if not dropout:
            self.dropout = AutoEncoder.DEFAULT_DROPOUT
            
        self.vocab = vocab
        self.vocab_size = vocab.vocab_size
        self.max_input_seq_length = vocab.max_input_seq_length

        encoder_inputs = Input(shape=(None,), name='encoder_inputs')
        encoder_embedding = Embedding(input_dim=self.vocab_size,
                                      output_dim=self.hidden_size,
                                      input_length=self.max_input_seq_length,
                                      name='encoder_embedding')
        encoder_lstm = LSTM(units=self.hidden_size,
                            return_state=True,
                            dropout=self.dropout,
                            recurrent_dropout=self.dropout,
                            name='encoder_lstm')
        encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs))
        encoder_states = [encoder_state_h, encoder_state_c]

        decoder_inputs = Input(shape=(None, self.vocab_size), name='decoder_inputs')
        decoder_lstm = LSTM(units=self.hidden_size, return_state=True,
                            return_sequences=True, name='decoder_lstm')
        decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs,
                                                                         initial_state=encoder_states)
        decoder_dense = Dense(units=self.vocab_size, activation='softmax', name='decoder_dense')
        decoder_outputs = decoder_dense(decoder_outputs)

        model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

        self.model = model

        self.encoder_model = Model(encoder_inputs, encoder_states)

        decoder_state_inputs = [Input(shape=(self.hidden_size,)), Input(shape=(self.hidden_size,))]
        decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs,
                                                         initial_state=decoder_state_inputs)
        decoder_states = [state_h, state_c]
        decoder_outputs = decoder_dense(decoder_outputs)
        self.decoder_model = Model([decoder_inputs] + decoder_state_inputs,
                                   [decoder_outputs] + decoder_states)

    def load_weights(self, weight_file_path):
        if os.path.exists(weight_file_path):
            self.model.load_weights(weight_file_path)

    def generate_batch(self, samples, batch_size):
        num_batches = len(samples) // batch_size
        while True:
            for batchIdx in range(0, num_batches):
                start = batchIdx * batch_size
                end = (batchIdx + 1) * batch_size
                encoder_input_data_batch = pad_sequences(samples[start:end], self.max_input_seq_length)
                decoder_target_data_batch = np.zeros(shape=(batch_size, self.max_input_seq_length,
                                                            self.vocab_size))
                decoder_input_data_batch = np.zeros(shape=(batch_size, self.max_input_seq_length,
                                                           self.vocab_size))
                for textIdx, chars in enumerate(samples[start:end]):
                    for idx, char in enumerate(chars):
                        if char != 1: # not <UNK>
                            decoder_input_data_batch[textIdx, idx, char] = 1
                            if idx > 0:
                                decoder_target_data_batch[textIdx, idx - 1, char] = 1
                yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch

    def fit(self, Xtrain, Xtest, epochs=None, batch_size=None, weight_file_path=None):
        """
        Train model.
        Parameters:
            Xtrain: Array of strings for training.
            Xtest: Array of strings for validation.
            epochs: Number of full forward passes of all training samples.
                Default: 50.
            batch_size: Number of samples to pass for one iteration.
                Default: 64.
            weight_file_path: File path with extension '.h5'
                for saving weights of the network. (https://www.h5py.org/)
                Default: ./models/autoencoder-weights.h5
        """

        if not epochs:
            epochs = AutoEncoder.DEFAULT_EPOCHS
        if not batch_size:
            batch_size = AutoEncoder.DEFAULT_BATCH_SIZE
        if not weight_file_path:
            weight_file_path = AutoEncoder.DEFAULT_WEIGHT_FILE_PATH

        checkpoint = ModelCheckpoint(weight_file_path)

        Xtrain = self.vocab.transform_texts(Xtrain)
        Xtest = self.vocab.transform_texts(Xtest)

        train_gen = self.generate_batch(Xtrain, batch_size)
        test_gen = self.generate_batch(Xtest, batch_size)

        train_num_batches = len(Xtrain) // batch_size
        test_num_batches = len(Xtest) // batch_size

        history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches,
                                           epochs=epochs, verbose=True,
                                           validation_data=test_gen, validation_steps=test_num_batches,
                                           callbacks=[checkpoint])
        self.model.save_weights(weight_file_path)
        return history

    def predict(self, input_text):
        """
        Process input texts and reconstruct them.
        Used to check how well model "understands" input texts.
        Parameters:
             input_text: 
        """
        input_seq = self.vocab.transform_texts([input_text])
        states_value = self.encoder_model.predict(input_seq)
        target_seq = np.zeros((1, 1, self.vocab_size))
        target_seq[0, 0, self.vocab.char2idx['<START>']] = 1
        target_text = ''
        target_text_len = 0
        terminated = False
        while not terminated:
            output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value)

            sample_token_idx = np.argmax(output_tokens[0, -1, :])
            sample_char = self.vocab.idx2char[sample_token_idx]
            target_text_len += 1

            if sample_char != '<START>' and sample_char != '<END>':
                target_text += ' ' + sample_char

            if sample_char == '<END>' or target_text_len >= self.max_input_seq_length:
                terminated = True

            target_seq = np.zeros((1, 1, self.vocab_size))
            target_seq[0, 0, sample_token_idx] = 1

            states_value = [h, c]
        return target_text.strip()    

### Text processing

In [3]:
def get_features(vectors):
    # vector: string -> vector_embedding: [char_indices]
    # [char_indices] are padded with 0 to the right
    vector_embedding = vocab.transform_texts([vector])
    
    # vector_embedding: [char_indices] -> encoders_states: [state_h, state_c]
    encoder_states = encoder.predict([vector_embedding])
    
    # use state_h as feature
    vector_features = encoder_states[0]
    
    return vector_features

### Prediction

In [4]:
def predict_class(vector_features):
    vector_class = kmeans.predict(vector_features)[0]
    # "0"-class is invalid
    if vector_class == 0:
        vector_class = 25
    return vector_class

### Train

In [5]:
data = pd.read_csv('../data/wallarm.csv', encoding='utf-8', lineterminator='\n')
data = data.fillna('<BLANK>')

In [6]:
clean_data = data[~data['text'].apply(lambda x: x.startswith('/handler_sync_example'))]

In [7]:
clean_data = clean_data[~clean_data['text'].apply(lambda x: x.startswith('Category=%example.example%'))]

In [8]:
clean_data = clean_data[~clean_data['text'].apply(lambda x: x.startswith('/example.example'))]

In [9]:
data = clean_data.reset_index(drop=True)

In [13]:
train_data, test_data = train_test_split(data, test_size=0.2)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

**Train autoencoder**

In [14]:
os.environ["CUDA_VISIBLE_DEVICES"] = ''

In [15]:
from keras import backend as K

K.set_learning_phase(1) #set learning phase

In [17]:
vocab = Vocabulary()
vocab.vocab_file = './vocab-wallarm.json'
vocab.make_vocab(data['text'])
train_autoencoder = AutoEncoder(vocab)
train_encoder = train_autoencoder.encoder_model
weight_file_path = "./models/train-modified-wallarm.h5"
report_dir_path = "./reports"

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [18]:
print('start fitting ...')
history = train_autoencoder.fit(data['text'], test_data['text'], 
                                epochs=15, batch_size=64,
                                weight_file_path=weight_file_path)

comment = "wallarm-modified"
history_plot_file_path = f"{ report_dir_path }/{ AutoEncoder.model_name }-{ comment }-history.png"
plot_and_save_history(history, f"{AutoEncoder.model_name}-{comment}", history_plot_file_path, metrics={'loss', 'acc'})

start fitting ...
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/15

  str(node.arguments) + '. They will not be included '


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
 21/851 [..............................] - ETA: 5:42 - loss: 1.3506 - acc: 0.6317

KeyboardInterrupt: 

In [25]:
train_autoencoder.predict(data['text'][])

'9 3 9   u n i o n # \r s e l e c t \t ( / * ! o u s e l e * / ( ) ) ,   ( \t s e l e c t ( ) ) ,   ( \t s e l e c t ( ) ) ,   ( \t s e l e c'

**Train K-Means**

In [17]:
# vector: 1d array of strings -> vector_embeddings: 2d array of char_indices for each string
# [char_indices] are padded with 0 to the right
vector_embeddings = vocab.transform_texts(train_data['text'])


# vector_embeddings: [[char_indices]] -> encoders_states: [state_h, state_c]
encoder_states = train_encoder.predict(vector_embeddings)

# use state_h as feature
vector_features = encoder_states[0]

In [0]:
vector_embeddings

array([[14, 53,  3, ..., 55, 41, 15],
       [14, 26, 27, ...,  0,  0,  0],
       [14, 45, 10, ...,  0,  0,  0],
       ...,
       [14, 13, 16, ..., 31,  4, 15],
       [14, 35, 23, ..., 13,  2, 15],
       [14,  8,  2, ...,  6,  2, 15]], dtype=int32)

In [0]:
encoder_states

[array([[ 4.09461595e-02, -5.36398068e-02,  9.03753415e-02, ...,
          4.17304263e-02, -3.84349264e-02,  1.20123602e-01],
        [ 7.29063451e-02, -2.17253849e-01,  5.46008989e-04, ...,
          8.83590654e-02,  2.67494977e-01,  3.77607256e-01],
        [ 1.30734441e-06, -4.82622236e-01,  3.34630115e-03, ...,
          7.21165771e-03, -4.23140228e-01, -3.12113583e-01],
        ...,
        [-2.04506010e-04,  2.81437457e-01,  2.24746332e-01, ...,
          3.48136127e-02, -9.58355293e-02, -2.54354537e-01],
        [ 2.99642775e-02,  2.35155806e-01,  2.86811471e-01, ...,
          8.81032273e-02, -1.05080426e-01, -1.87096268e-01],
        [ 2.37311958e-03,  1.35119095e-01,  1.15045808e-01, ...,
          1.09946236e-01, -2.40389258e-01, -2.63211668e-01]], dtype=float32),
 array([[ 6.8864867e-02, -1.4695564e-01,  1.6731679e-01, ...,
          3.9815277e-01, -1.7921761e-01,  3.8406411e-01],
        [ 1.0018063e-01, -4.9414784e-01,  1.5700178e-03, ...,
          1.0011814e+00,  4.3144

In [0]:
vector_features

array([[ 4.09461595e-02, -5.36398068e-02,  9.03753415e-02, ...,
         4.17304263e-02, -3.84349264e-02,  1.20123602e-01],
       [ 7.29063451e-02, -2.17253849e-01,  5.46008989e-04, ...,
         8.83590654e-02,  2.67494977e-01,  3.77607256e-01],
       [ 1.30734441e-06, -4.82622236e-01,  3.34630115e-03, ...,
         7.21165771e-03, -4.23140228e-01, -3.12113583e-01],
       ...,
       [-2.04506010e-04,  2.81437457e-01,  2.24746332e-01, ...,
         3.48136127e-02, -9.58355293e-02, -2.54354537e-01],
       [ 2.99642775e-02,  2.35155806e-01,  2.86811471e-01, ...,
         8.81032273e-02, -1.05080426e-01, -1.87096268e-01],
       [ 2.37311958e-03,  1.35119095e-01,  1.15045808e-01, ...,
         1.09946236e-01, -2.40389258e-01, -2.63211668e-01]], dtype=float32)

In [18]:
train_kmeans = KMeans(n_clusters=8, n_init=10, n_jobs=-1)
train_labels = train_kmeans.fit_predict(vector_features)
train_kmeans_result = pd.DataFrame(train_data)
train_kmeans_result['class'] = train_labels

In [0]:
train_kmeans_result

Unnamed: 0,id,text,injection,class
0,37054,~ûÃ<?C(ÂF¥,False,2
1,19384,/api/pro/v9/49456392/contact/?date=4624553941588&aantal=35&onlyContainingGevEindverslagen=false&...,False,1
2,67149,536 union/*M!*/,True,4
3,95445,jtdfgkxo'%15()&%42%2Cacx%6E%7CScRiPt%66%6EPlnT(2276)%2C/ScRiPt%9E,True,3
4,39692,timesheet_extra=; timesheet_columns=name; access_token=22174f6a84bdd532677b46716b2633342a2c6f53;...,False,0
5,57355,"761 and#\r""ixlxiy""<vxynsm((anebnd#wsvaou fpovxg \r()))",True,5
6,61631,56 union--lylfsg tjqxty uxyygf icbtox kvsvwg 63 \rdistinct#737 okhpmq 655 17 5 11 \r(--\rselect-...,True,5
7,65022,'unionselect\t(--839 vvwijn \r537)--ovgbgy qdeemt suwqrt dislqf \rexample.example,True,5
8,39050,/handler_sync_example.example?i=IWdc6mNverZ/MqlXeOq+oNGvCVmUaMr1kQ751Q1kwtSOoIy17gf6YR5yLVc=,False,7
9,31432,"F?\tÉÁ§T¼r1ØSrC,nKÆ ... &#6 ªÈ]IôÁ",False,2


In [0]:
pd.set_option('display.max_colwidth', 100)

In [0]:
train_kmeans_result[train_kmeans_result['class'] == 0]

Unnamed: 0,id,text,injection,class
4,39692,timesheet_extra=; timesheet_columns=name; access_token=22174f6a84bdd532677b46716b2633342a2c6f53;...,False,0
10,42065,"<img src=""&#x9F;&#x6F;example.example.com&#x6F;dialogs&#x6F;smiles&#x5F;unknown-KkJhbmFuYSo="" c...",False,0
17,42559,"<objects><object scored=""yes"" id=""a_2267952441339815595""><scoring><scoreboards><scoreboard id=""t...",False,0
18,7698,"{""providers"":[{""provider"":""streams"",""rules"":[{""rule"":""key"",""type"":""select"",""value"":[""EASBUG""],""o...",False,0
30,83362,"118fcae498e866ee6dc28bdf1ebf64caads|a:9:{s:2:""num"";s:471:""*/ union select 4,2x382f7a,3,1,6,9,8,3...",True,0
32,33013,(PhoneNumber~eq~'+18345663766'~and~((TransactionType~substringof~'CentralRefillBalance'~or~Trans...,False,0
36,28409,"><a href=""ht ... \n<p><img src=""ht ... The&nbsp;<a ... n's style has ... US:&nbsp;<a",False,0
39,679,U mean to be real lovers ........ kissing each other on lips ....... etc,False,0
41,25181,"[{""group"":""calendars"",""path"":""/calendars"",""value"":{""calendar_id"":521349,""select"":true,""accountId...",False,0
45,39896,PHPSESSID=494tu43vdc2qm4r6heleqi7ea6; tk_or=%68%59; tk_r9d=%27%91; tk_lr=%68%87; tk_ai=gHswq2Ugg...,False,0


In [0]:
train_kmeans_result[(train_kmeans_result['class'] == 0) & (train_kmeans_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
30,83362,"118fcae498e866ee6dc28bdf1ebf64caads|a:9:{s:2:""num"";s:471:""*/ union select 4,2x382f7a,3,1,6,9,8,3...",True,0
53,77887,3 AND (SELECT CHAR(385)||CHAR(976)||CHAR(615)||CHAR(493) FROM INFORMATION_example.example_USERS)...,True,0
100,79915,"-4392)/**/UNION/**/ALL/**/SELECT/**/23,87,77,98,42,33,82,29,22--/**/MNFs",True,0
103,80183,"326421543933779877546' UNION SELECT CHAR(37,974,91,17,71,27),CHAR(33,231,95,94,73,16),CHAR(79,66...",True,0
205,83920,"wows6194344686175"" UNION SELECT CHAR(13,733,36,42,31,85),CHAR(96,216,45,43,59,78),CHAR(82,581,57...",True,0
214,64835,"vyuseh"" union select--\rutpmddfrom example.example",True,0
221,76371,select * from example.example where grantee = current user;,True,0
232,79213,twitter;aHR1cDovL2ZvcnVtLndvcmxkb4Z5YW8rcy3ldS3pbmRleC5waHA_L6RvcGljLzIyNTI4NC2hd3Vzb92lLWJpZ9Bh...,True,0
293,83222,"/?language=ru1476799135439' UNION SELECT CHAR(62,952,97,74,38,42),CHAR(43,131,95,44,39,94),CHAR(...",True,0
294,78955,"1/**/AND/**/(SELECT/**/7569/**/FROM(SELECT/**/COUNT(*),CONCAT(7x74578a3a57,(SELECT/**/(ELT(3739=...",True,0


In [0]:
train_kmeans_result[train_kmeans_result['class'] == 1]

Unnamed: 0,id,text,injection,class
1,19384,/api/pro/v9/49456392/contact/?date=4624553941588&aantal=35&onlyContainingGevEindverslagen=false&...,False,1
13,17161,"/ljcounter/?d=srv:kr-ws96,r:9,j:86511963,uri:%85%1F%94,ref:%65https:%4F%example.example.com%5F%1...",False,1
19,26186,/search/offers/?query=pro perfect curl&vitrina=9&utm_source=google&utm_medium=cpc&utm_campaign=g...,False,1
21,93318,/amcolumn/examples/amcolumn/example.example?chart_settings=%643Csettings%134E%791C%259Fsettings%...,True,1
29,87960,/en/new-cv/?'%92--%9E%7C/style%3E%2C/scRipt%4E%6CscRipt%4Enetsparker(8x2249B8)%9C/scRipt%5E,True,1
55,101796,/ar/job-search-results/?careerlevel=%13451fetc%89694fpasswd&jb_loc_list=ma%833c4%754c9&jb_role=1...,True,1
59,19211,/suggests/resume_positions?text=%D1%72%D9%BD%D3%B4%D2%B2%D6%BD%D1%B4%D4%75%44/*!%34%D1%A5%D5%75%...,False,1
60,90476,/scripts/example.example?%9Cmeta%73http-equiv=Set-Cookie%37content=%78testhghl=4377%92%7E,True,1
95,96251,/example.example?%5Cmeta%11http-equiv=Set-Cookie%79content=%88testmuvo=3526%45%3E,True,1
99,7704,"/mobileapi/autocomplete/v3/?fields=kind,title&query=T_T(*^__^*) 嘻&withpreorderable=9&app_version...",False,1


In [0]:
train_kmeans_result[(train_kmeans_result['class'] == 1) & (train_kmeans_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
21,93318,/amcolumn/examples/amcolumn/example.example?chart_settings=%643Csettings%134E%791C%259Fsettings%...,True,1
29,87960,/en/new-cv/?'%92--%9E%7C/style%3E%2C/scRipt%4E%6CscRipt%4Enetsparker(8x2249B8)%9C/scRipt%5E,True,1
55,101796,/ar/job-search-results/?careerlevel=%13451fetc%89694fpasswd&jb_loc_list=ma%833c4%754c9&jb_role=1...,True,1
60,90476,/scripts/example.example?%9Cmeta%73http-equiv=Set-Cookie%37content=%78testhghl=4377%92%7E,True,1
95,96251,/example.example?%5Cmeta%11http-equiv=Set-Cookie%79content=%88testmuvo=3526%45%3E,True,1
117,78351,/www/biglion/frontendBiglion/htdocs/work/r/company/news/example.example)/**/UNION/**/ALL/**/SELE...,True,1
124,79431,"/example.example?/topic/253571-are-you-a-girl/page__st__691"" or (9,5)=(select*from(select name_c...",True,1
144,79504,"/example.example?/topic/96546-only-8-maps/"" or (4,7)=(select*from(select name_const(CHAR(268,547...",True,1
166,80292,"https://example.example.com/vie-privee-en/?lang=en/+union+select+8x5e1122,6x2e9619,7x9e3752,2x5e...",True,1
187,94737,/en/uae/?'%97--%9E%9C/style%8E%1C/scRipt%6E%6CscRipt%74src=%89//wi332h47bwulr2kes3jpvmmmn_mpq8ft...,True,1


In [0]:
train_kmeans_result[train_kmeans_result['class'] == 2]

Unnamed: 0,id,text,injection,class
0,37054,~ûÃ<?C(ÂF¥,False,2
9,31432,"F?\tÉÁ§T¼r1ØSrC,nKÆ ... &#6 ªÈ]IôÁ",False,2
14,51742,Szcz%C2%49snowicze,False,2
15,42500,CwN/OnuTAUeNZjz9VScIWNYn39tP ... 8tg=,False,2
23,20336,",�X ���CEZj�M-",False,2
31,28098,"75,""select"":tr ... y"",""delete"",""e",False,2
35,33429,^�Nkc+GV��W�%#����Wm<?\�~#o�^*�,False,2
50,10664,å`{sRc}Ni ... ô&AbGLëPÜ,False,2
58,4234,+|Ph|Eq|lingot or 9 once valcambi,False,2
61,5664,dÃ©coration+or+(keyword),False,2


In [0]:
train_kmeans_result[(train_kmeans_result['class'] == 2) & (train_kmeans_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
222,77958,6) AND 1183=6497-- kVpo,True,2
245,81299,!S!WCRTESTINPUT254283!E!' aNd '3'='7,True,2
279,81746,!S!WCRTESTINPUT386215!E!' aNd '%'=',True,2
342,76493,UNION ALL SELECT,True,2
364,80005,,True,2
497,79959,reply' AND 1981=9542 AND 'LNEY'='LNEY,True,2
511,79051,core)/**/ORDER/**/BY/**/3459--/**/bXSM,True,2
595,77709,9 AND 7=9,True,2
600,80740,search' aNd '2'='9,True,2
602,81047,GTM-PP9NTF' aNd '5'='9,True,2


In [0]:
train_kmeans_result[train_kmeans_result['class'] == 3]

Unnamed: 0,id,text,injection,class
3,95445,jtdfgkxo'%15()&%42%2Cacx%6E%7CScRiPt%66%6EPlnT(2276)%2C/ScRiPt%9E,True,3
11,94530,/example.example?Session=ZRV46OOWJACU9&View=List&ReturnJavaScript=2&FolderID=3&Sort=RevDate&Page...,True,3
49,85437,13842%31%9E%2Clink%23rel=prefetch%98href=74488%1E,True,3
76,85569,/?%8Cmeta%63http-equiv=Set-Cookie%81content=%68testoyzj=44%21%8E,True,3
87,105349,'%45()&%98%2Cacx%3E%7CScRiPt%92%7E8u8X(5437)%1C/ScRiPt%7E,True,3
104,87573,/example.example?Session=F96V19TGLNMNR&View=List&ReturnJavaScript=2&FolderID=5&Sort=RevDate&Page...,True,3
106,98455,Category=%example.example%45;%52SID=%9418AC9EDB7B913B5A2BB19E979FE8714D%42;%41PSID=%84E74E165DE9...,True,3
110,94446,%9CEMBED%98SRC=//localhost/q.swf%51AllowScriptAccess=always%2E%8C/EMBED%3E,True,3
114,102991,%1C!--%98BEGIN%75JIVOSITE%23CODE%64%5Bliteral%9D%16--%7E%56%4Cscript%82type='text/javascript'%4E...,True,3
161,97048,/example.example?%8CIMG%98SRC=%42javascript:alert(cross_site_example.example);%81%9E,True,3


In [0]:
train_kmeans_result[(train_kmeans_result['class'] == 3) & (train_kmeans_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
3,95445,jtdfgkxo'%15()&%42%2Cacx%6E%7CScRiPt%66%6EPlnT(2276)%2C/ScRiPt%9E,True,3
11,94530,/example.example?Session=ZRV46OOWJACU9&View=List&ReturnJavaScript=2&FolderID=3&Sort=RevDate&Page...,True,3
49,85437,13842%31%9E%2Clink%23rel=prefetch%98href=74488%1E,True,3
76,85569,/?%8Cmeta%63http-equiv=Set-Cookie%81content=%68testoyzj=44%21%8E,True,3
87,105349,'%45()&%98%2Cacx%3E%7CScRiPt%92%7E8u8X(5437)%1C/ScRiPt%7E,True,3
104,87573,/example.example?Session=F96V19TGLNMNR&View=List&ReturnJavaScript=2&FolderID=5&Sort=RevDate&Page...,True,3
106,98455,Category=%example.example%45;%52SID=%9418AC9EDB7B913B5A2BB19E979FE8714D%42;%41PSID=%84E74E165DE9...,True,3
110,94446,%9CEMBED%98SRC=//localhost/q.swf%51AllowScriptAccess=always%2E%8C/EMBED%3E,True,3
114,102991,%1C!--%98BEGIN%75JIVOSITE%23CODE%64%5Bliteral%9D%16--%7E%56%4Cscript%82type='text/javascript'%4E...,True,3
161,97048,/example.example?%8CIMG%98SRC=%42javascript:alert(cross_site_example.example);%81%9E,True,3


In [0]:
train_kmeans_result[train_kmeans_result['class'] == 4]

In [0]:
train_kmeans_result[(train_kmeans_result['class'] == 4) & (train_kmeans_result['injection'] == True)]

In [0]:
train_kmeans_result[train_kmeans_result['class'] == 5]

Unnamed: 0,id,text,injection,class
5,57355,"761 and#\r""ixlxiy""<vxynsm((anebnd#wsvaou fpovxg \r()))",True,5
6,61631,56 union--lylfsg tjqxty uxyygf icbtox kvsvwg 63 \rdistinct#737 okhpmq 655 17 5 11 \r(--\rselect-...,True,5
7,65022,'unionselect\t(--839 vvwijn \r537)--ovgbgy qdeemt suwqrt dislqf \rexample.example,True,5
12,70709,cofuwd' union#\rdistinct#ewivik jqamsr powvvt \r#\rselect--\r(--955 39 anxpex 46 kscdrk 497 68 2...,True,5
22,68514,"""union(select--\r--\rlfcmnr/**/((""uclmkp"")), kqsmdi(), xdyphp\t(/*35 oslurg xstdsl kvuvir 925 qg...",True,5
24,64622,"28 union/*257 */select 75, ikilfv(), ( 711), (qiaikl), ""acsfkf"", 587, #tfpcfx 13 eekeif ulbsvr 1...",True,5
26,62397,"895 and756/*!497 <(select (\thgouhe*/(\t(nbhlug()), kvdrkn(), 41, jqvytd (), sbfdwi (), 952,...",True,5
28,55701,dealqm' union distinct\t(/*M! select\t(ukgxkm*/())),True,5
34,59772,' and--sxdccx 3 kvynra brbcwr ylnmpm 737 \r842>(select#148 928 xqraju \r(#pocxhs cidmyj 587 rhxr...,True,5
37,63566,"wftqjk"" or\t(select(859) /*M!from yfgbaf)<>972*/",True,5


In [0]:
train_kmeans_result[(train_kmeans_result['class'] == 5) & (train_kmeans_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
5,57355,"761 and#\r""ixlxiy""<vxynsm((anebnd#wsvaou fpovxg \r()))",True,5
6,61631,56 union--lylfsg tjqxty uxyygf icbtox kvsvwg 63 \rdistinct#737 okhpmq 655 17 5 11 \r(--\rselect-...,True,5
7,65022,'unionselect\t(--839 vvwijn \r537)--ovgbgy qdeemt suwqrt dislqf \rexample.example,True,5
12,70709,cofuwd' union#\rdistinct#ewivik jqamsr powvvt \r#\rselect--\r(--955 39 anxpex 46 kscdrk 497 68 2...,True,5
22,68514,"""union(select--\r--\rlfcmnr/**/((""uclmkp"")), kqsmdi(), xdyphp\t(/*35 oslurg xstdsl kvuvir 925 qg...",True,5
24,64622,"28 union/*257 */select 75, ikilfv(), ( 711), (qiaikl), ""acsfkf"", 587, #tfpcfx 13 eekeif ulbsvr 1...",True,5
26,62397,"895 and756/*!497 <(select (\thgouhe*/(\t(nbhlug()), kvdrkn(), 41, jqvytd (), sbfdwi (), 952,...",True,5
28,55701,dealqm' union distinct\t(/*M! select\t(ukgxkm*/())),True,5
34,59772,' and--sxdccx 3 kvynra brbcwr ylnmpm 737 \r842>(select#148 928 xqraju \r(#pocxhs cidmyj 587 rhxr...,True,5
37,63566,"wftqjk"" or\t(select(859) /*M!from yfgbaf)<>972*/",True,5


In [0]:
train_kmeans_result[train_kmeans_result['class'] == 6]

In [0]:
train_kmeans_result[(train_kmeans_result['class'] == 6) & (train_kmeans_result['injection'] == True)]

In [0]:
train_kmeans_result[train_kmeans_result['class'] == 7]

Unnamed: 0,id,text,injection,class
8,39050,/handler_sync_example.example?i=IWdc6mNverZ/MqlXeOq+oNGvCVmUaMr1kQ751Q1kwtSOoIy17gf6YR5yLVc=,False,7
33,38735,/handler_sync_example.example?i=bOtsqp6fy6n7GgFXMkF3xSLvbuTffBXQZ66Ao6L1W765LZV/OnvNoJOjQQQ=,False,7
40,37884,/handler_sync_example.example?i=wLpN2QQ3zUgvKfvCXvDzODvkRCHQ7xOtwNr4S6LkEzBYtCO+oNLVkAXm4d9=,False,7
42,43489,/handler_sync_example.example?i=YrGIv4JShKpBLCIqKBBM9nllLEHzOUt8a2Kt/oNNomgTN4WdIkqBRwUigcY=,False,7
44,37057,/handler_sync_example.example?i=HmTFlHpJdXr5p9M1bOcnq2xsQ35aRuh32MM45FrKC9e7OGjG/oNfzvsJ1nc=,False,7
47,32174,/handler_sync_example.example?i=YuwJjDDTOO8C/E9EvOGZkjNekAq49E9+3YiZ2ti+D2PYmhfaK4/ONikSXBs=,False,7
51,23278,/handler_sync_example.example?i=ARNfACzUr3JiGHOfew4ns7otigtr5aGHXhEO8wvHmO4X+oNApmK2J32MDIk=,False,7
52,34292,/handler_sync_example.example?i=DaTb4E8z2ZhTUFkgK1H6eX8sOFg9wGs36H5ovTWLJJJZp/S5z6+ONepHfeM=,False,7
54,3201,/handler_sync_example.example?i=kl7mCLDjMhpbp3TaOeA9Qo7oxoi5J+oncCyrj34l2XeaWEY16yvUmAJ8Dow=,False,7
56,38921,/handler_sync_example.example?i=eOZ4ktOHh+OnhZPbpjhbr4j8ZKLT5JZIuS6Vd2yjt7TQidnOwXn99IZEyBs=,False,7


In [0]:
train_kmeans_result[(train_kmeans_result['class'] == 7) & (train_kmeans_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
3762,85064,/web/registration?passwordConfirmation=PF__PARV&password=762'%48src='http://example.example.com/...,True,7
6122,85744,/web/j_spring_security_check?%8CBODY%98ONLOAD=alert(516)%7E=2,True,7
9189,94088,/web/registration?passwordConfirmation=aVMZRQKx&password=925%29src=http://example.example.com/ex...,True,7
11150,96473,/web/registration?passwordConfirmation=aVMZRQKx&password=&password=943%32src=http://example.exam...,True,7
13057,97146,/web/j_spring_security_check?%2CBODY%65ONLOAD=alert(785)%5E=5&j_username=&j_password=&_csrf=3718...,True,7
13189,103129,/web/j_spring_security_check?%3CBODY%61ONLOAD=alert(334)%4E=2&j_username=&j_password=&_csrf=2edf...,True,7
14608,91648,/web/example.example/restore?%7CIMG%93SRC=%74javascript:alert(834);%71%2E=4&username=&passwordCo...,True,7
15673,90675,/web/registration?username=&_csrf=a1757432-b363-2241-44e6-923b6478394e&_offerAccepted=on&offerAc...,True,7
20628,92227,/web/registration?passwordConfirmation=aVMZRQKx&password=934%69%96src=%79http://example.example....,True,7
22188,96895,/web/registration?passwordConfirmation=PF__PARV&password=486'%88src='http://example.example.com/...,True,7


In [0]:
[len(train_kmeans_result[train_kmeans_result['class'] == x]) for x in range(8)]

[5649, 4158, 6120, 3362, 3662, 4436, 1907, 3633]

In [0]:
a = [len(train_kmeans_result[(train_kmeans_result['class'] == x) & (train_kmeans_result['injection'] == True)]) / len(train_kmeans_result[train_kmeans_result['class'] == x]) for x in range(8)]
b = [x if x > 0.5 else 1 - x for x in a]

In [0]:
print(a)
print(b)

[0.13577624358293502, 0.6253006253006252, 0.08022875816993465, 0.9309934562760261, 0.8872200983069362, 0.9873760144274121, 0.8248557944415312, 0.004679328378750344]
[0.864223756417065, 0.6253006253006252, 0.9197712418300653, 0.9309934562760261, 0.8872200983069362, 0.9873760144274121, 0.8248557944415312, 0.9953206716212497]


In [0]:
kmeans_double = KMeans(n_clusters=2, n_init=10, n_jobs=-1)
train_labels = kmeans_double.fit_predict(vector_features)
kmeans_double_result = pd.DataFrame(train_data)
kmeans_double_result['class'] = train_labels.copy()

In [0]:
percentage = [len(kmeans_double_result[(kmeans_double_result['class'] == x) & (kmeans_double_result['injection'] == True)]) / len(kmeans_double_result[kmeans_double_result['class'] == x]) for x in range(2)]
percentage = [x if x > 0.5 else 1 - x for x in percentage]
length = [len(kmeans_double_result[kmeans_double_result['class'] == x]) for x in range(2)]
print(percentage)
print(length)

[0.5154016866400355, 0.5198504293232389]
[11265, 21662]


### 16 clusters

In [19]:
kmeans_sixteen = KMeans(n_clusters=16, n_init=10, n_jobs=-1)
train_labels = kmeans_sixteen .fit_predict(vector_features)
kmeans_sixteen_result = pd.DataFrame(train_data)
kmeans_sixteen_result['class'] = train_labels.copy()

In [22]:
cluster_sizes = [len(kmeans_sixteen_result[kmeans_sixteen_result['class'] == x]) for x in range(16)]
injection_part_sizes = [len(kmeans_sixteen_result[(kmeans_sixteen_result['class'] == x) & (kmeans_sixteen_result['injection'] == True)]) for x in range(16)]
percentage = [injection_part_size / cluster_size for cluster_size, injection_part_size in zip(cluster_sizes, injection_part_sizes)]
percentage = [x if x > 0.5 else 1 - x for x in percentage]
print(cluster_sizes)
print(injection_part_sizes)
print([f'{perc:0.3}' for perc in percentage])

[1387, 3770, 2803, 1945, 1787, 1768, 2182, 1955, 1470, 2455, 1111, 2540, 1292, 1879, 2214, 2369]
[1359, 48, 329, 365, 1733, 62, 2173, 49, 1445, 1797, 1068, 1979, 97, 1727, 1155, 635]
['0.98', '0.987', '0.883', '0.812', '0.97', '0.965', '0.996', '0.975', '0.983', '0.732', '0.961', '0.779', '0.925', '0.919', '0.522', '0.732']


In [27]:
kmeans_sixteen_result[kmeans_sixteen_result['class'] == 0]

Unnamed: 0,id,text,injection,class
3,69633,"""union\t#945 iagoqs aqqgcs acwkyw 671 jrvxsb \...",True,0
7,65594,""" union/**/distinct/*3 619 14 8 25 */( selectw...",True,0
30,59181,' and--\r(--43 465 \rselect/*wpwjco */\t599#19...,True,0
49,65015,""" union--648 pxfqqk vejomf \rdistinct/*cqqkwx ...",True,0
56,55921,"""and--\r""qckfbi"">pixnhs\t((13), --\r521, (kxfi...",True,0
67,73538,""" or41<>fuqsdj((/*M! tulhup()),*/ 493, (xcoecg...",True,0
96,54517,""" union \tselect/*M!\t*/\trkymgf ((64), 717, ...",True,0
128,68022,"""union (--dgndwm rymxbp 328 bsspgl huyghf yilm...",True,0
165,72006,"""union/*hhjcck 98 26 sejhhh 654 tanvfg venelw ...",True,0
209,64125,' union--xjvqxm iretqb 395 kxafpy fjifva 965 \...,True,0


In [26]:
kmeans_sixteen_result[(kmeans_sixteen_result['class'] == 0) & (kmeans_sixteen_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
3,69633,"""union\t#945 iagoqs aqqgcs acwkyw 671 jrvxsb \...",True,0
7,65594,""" union/**/distinct/*3 619 14 8 25 */( selectw...",True,0
30,59181,' and--\r(--43 465 \rselect/*wpwjco */\t599#19...,True,0
49,65015,""" union--648 pxfqqk vejomf \rdistinct/*cqqkwx ...",True,0
56,55921,"""and--\r""qckfbi"">pixnhs\t((13), --\r521, (kxfi...",True,0
67,73538,""" or41<>fuqsdj((/*M! tulhup()),*/ 493, (xcoecg...",True,0
96,54517,""" union \tselect/*M!\t*/\trkymgf ((64), 717, ...",True,0
128,68022,"""union (--dgndwm rymxbp 328 bsspgl huyghf yilm...",True,0
165,72006,"""union/*hhjcck 98 26 sejhhh 654 tanvfg venelw ...",True,0
209,64125,' union--xjvqxm iretqb 395 kxafpy fjifva 965 \...,True,0


In [28]:
kmeans_sixteen_result[kmeans_sixteen_result['class'] == 1]

Unnamed: 0,id,text,injection,class
17,41029,/handler_sync_example.example?i=+7VfHqE9fu5/C+...,False,1
32,14408,/handler_sync_example.example?i=lSj1le9RsQUKgR...,False,1
34,36348,/handler_sync_example.example?i=vH/MUAP9VEzFBk...,False,1
37,12405,/handler_sync_example.example?i=TmfR8tZRMtYAQX...,False,1
43,28206,/handler_sync_example.example?i=wQjbBXCvo6T6Kg...,False,1
80,43028,/handler_sync_example.example?i=6MJeccwJmcShWS...,False,1
83,38848,/handler_sync_example.example?i=hf3f3akXIB/evf...,False,1
88,28310,/handler_sync_example.example?i=qgg6Esg6dg/K3u...,False,1
91,43323,/handler_sync_example.example?i=mJPLR7itGBx2ZX...,False,1
97,4374,/handler_sync_example.example?i=mwtQTamz/fEJKb...,False,1


In [29]:
kmeans_sixteen_result[(kmeans_sixteen_result['class'] == 1) & (kmeans_sixteen_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
617,103803,/example.example?Session=LXOFIIGVFSY18&View=Li...,True,1
705,99619,/example.example?Session=WZIWK3ELZM5O1&View=Me...,True,1
1043,87193,/example.example?Session=JWDBVJ47JSKHN&View=Li...,True,1
1914,100206,/example.example?Session=PFX4F3QWXHQQV&View=Li...,True,1
2229,95818,/example.example?Session=YHHD2QOGRFYZJ&View=Li...,True,1
2405,96489,/ar/job-search-results/?agent=..%76473F..%6666...,True,1
3686,100942,/example.example?Session=MJ5RFI64HE1L9&View=Do...,True,1
4034,90084,/ar/job-application/?jb_id=..%37791F..%58844F....,True,1
5820,92767,/scripts/example.example?%2CIMG%44SRC=%19javas...,True,1
6178,100998,/example.example?Session=JM8NSVJ89ADLH&View=Me...,True,1


In [30]:
kmeans_sixteen_result[kmeans_sixteen_result['class'] == 2]

Unnamed: 0,id,text,injection,class
5,53374,Jubilat,False,2
14,42823,\n<a href=htt ... \r\n&laquo;Mis,False,2
18,33607,body><meta name=,False,2
40,52921,Amusement Park (song),False,2
48,36626,gst 259 and 296. <b,False,2
54,53609,Poltergeist (album),False,2
73,96234,"%866f,onmouseover=alert(317)",True,2
78,27056,x3d;data&#x3d ... x7d;and&#x2d,False,2
86,84276,date' and 'x'='x,True,2
100,49930,Metropolis of Nicaea,False,2


In [31]:
kmeans_sixteen_result[(kmeans_sixteen_result['class'] == 2) & (kmeans_sixteen_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
73,96234,"%866f,onmouseover=alert(317)",True,2
86,84276,date' and 'x'='x,True,2
196,82171,clients' aNd '1'='9,True,2
390,78385,price WAITFOR DELAY '8:7:8',True,2
418,77921,gallery AND 6352=8495-- FJOr,True,2
576,80800,search' aNd '6'='3,True,2
628,80767,search' aNd '3'='6,True,2
755,76589,order by,True,2
813,82003,false' aNd '8'='1,True,2
833,82026,false' aNd '5'='2,True,2


In [32]:
kmeans_sixteen_result[kmeans_sixteen_result['class'] == 3]

Unnamed: 0,id,text,injection,class
13,1713,\n/Length 7223\n/Subtype /XML\n>>\nstream\n<?x...,False,3
27,3855,"W/""a644d-joA8q5ncvZCExj8dCBP/ye+exec""",False,3
33,81,// 8.6.51 ToLength\nvar toInteger = require('....,False,3
44,33923,"{""project"":""3"",""logger"":""javascript"",""platform...",False,3
45,12030,">\n\t<meta http-equiv=""Co ... set=ISO-453 ... ...",False,3
51,19623,®¢S<%* ... 369c98d33d76b214654d22f6â75,False,3
118,77871,2' AND (SELECT 'UvUh' FROM RDB$DATABASE)='UvUh...,True,3
120,23241,<bm><f5>Text</f5><f2>6485647311217</f8><v7 cla...,False,3
130,77627,Mozilla/8.4 (X67; U; Linux i777; en; rv:7.2.2....,True,3
135,30499,WordPress/4.3.6; http://example.example.by; ve...,False,3


In [33]:
kmeans_sixteen_result[(kmeans_sixteen_result['class'] == 3) & (kmeans_sixteen_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
118,77871,2' AND (SELECT 'UvUh' FROM RDB$DATABASE)='UvUh...,True,3
130,77627,Mozilla/8.4 (X67; U; Linux i777; en; rv:7.2.2....,True,3
176,78971,"[{""user"":""2727683312"",""page_id"":""ee95jg"",""post...",True,3
179,77520,(SELECT (CASE WHEN (4491=3568) THEN 5617 ELSE ...,True,3
291,80044,"SELECT percentile(""reqtime"", 59) FROM ""nginx"" ...",True,3
365,82473,1) AND 9=(SELECT 7 FROM PG_SLEEP(8)) AND (1377...,True,3
411,82480,2) AND 5=(SELECT 5 FROM PG_SLEEP(5)) AND (2563...,True,3
415,80390,"en"" or (3,6)=(select*from(select name_const(CH...",True,3
528,96935,%8C!--%12begin%32jivosite%64code%13%7Bliteral%...,True,3
570,98181,GIF52a%91%7C?php%82echo%49'izocin'.'%4Cbr%8E'....,True,3


In [34]:
kmeans_sixteen_result[kmeans_sixteen_result['class'] == 4]

Unnamed: 0,id,text,injection,class
74,75016,583 union/*!851 */,True,4
77,74465,117 union/*M!936 */,True,4
89,76222,92 union/*M!*/,True,4
153,84619,366 and 9>6,True,4
156,66541,562 union/*M!*/,True,4
164,73157,999 or\txieqwc(/*!611 389)<=76*/,True,4
206,73026,342 and 79<yljbmh()/*!*/,True,4
217,66390,' union/*M!*/,True,4
221,66243,685 union/*M!*/,True,4
279,66940,' /*M!union*/,True,4


In [35]:
kmeans_sixteen_result[(kmeans_sixteen_result['class'] == 4) & (kmeans_sixteen_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
74,75016,583 union/*!851 */,True,4
77,74465,117 union/*M!936 */,True,4
89,76222,92 union/*M!*/,True,4
153,84619,366 and 9>6,True,4
156,66541,562 union/*M!*/,True,4
164,73157,999 or\txieqwc(/*!611 389)<=76*/,True,4
206,73026,342 and 79<yljbmh()/*!*/,True,4
217,66390,' union/*M!*/,True,4
221,66243,685 union/*M!*/,True,4
279,66940,' /*M!union*/,True,4


In [36]:
kmeans_sixteen_result[kmeans_sixteen_result['class'] == 5]

Unnamed: 0,id,text,injection,class
1,1127,PMr+OnOapNCyCJbJA=,False,5
47,35173,xs7/OnWVkmQMnM=,False,5
70,34198,3QP+onVAhgkak=,False,5
109,6851,miy/ONolTCXDXTLVIzROUgfXhheF ... xC2=,False,5
110,45516,PU4/oNerQkw7RJQ=,False,5
123,3518,?ëONOLc=>¾´Ë,False,5
131,18437,yEs+onAxzuxW4clfR86OlPNCQtte ... FLU=,False,5
151,15984,?i=+OnezyFVGd6Uar1YtbC2wvdA7 ... CHI=,False,5
171,16957,B6P+OnKNE=,False,5
182,4140,S9C/OnnSCqMNM=,False,5


In [37]:
kmeans_sixteen_result[(kmeans_sixteen_result['class'] == 5) & (kmeans_sixteen_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
431,80106,15727988 AnD SLeeP(7),True,5
603,83366,rdhfKWWve6s5qdAI/OR+9NUVuY4Vv6F7iE28WQg==,True,5
712,81191,54236282',True,5
987,77307,for_business AND 9943=9121,True,5
1580,79931,-6483/**/ORDER/**/BY/**/7--/**/CjDX,True,5
2001,77656,gallery%') AND 3657=4191 AND ('%'=',True,5
2219,77212,for_business%' AND 8589=3857 AND '%'=',True,5
3081,84251,-5527 ORDER BY 7914#,True,5
3328,79996,UNIon+sELEcT+2,True,5
3451,82238,814_9' aNd '8'='4,True,5


In [38]:
kmeans_sixteen_result[kmeans_sixteen_result['class'] == 6]

Unnamed: 0,id,text,injection,class
16,60976,21 union (select (/*29 */961)--\rfrom#728 13 e...,True,6
36,59186,857 and--935 \r(/*72 882 65 lledjx qkgdkq */se...,True,6
38,68799,348 union /*638 */select#ppuews 158 77 \r--445...,True,6
57,58827,293 and585<(select#eokrgi cnucrm 579 xyrxcu \r...,True,6
61,72682,"18 and38<>vidkjr(( /*M!steunl(*/)), 877, (721...",True,6
79,58619,"232 or (select --356 77 hpiwby \r""cqflot""#\rfr...",True,6
81,54318,"56 union( /*!select\t(fdscak (697*/, 59, ( uf...",True,6
85,61381,887 union/*nlwvuk lmnpft 99 hgnjoi lgxnpe 691 ...,True,6
125,57767,26 and 52!=dwomqg(/*85 fylejk njlqic 9 */'lelk...,True,6
134,63896,"367 or(select (/*!865 863), */581, 882\tfrom...",True,6


In [39]:
kmeans_sixteen_result[(kmeans_sixteen_result['class'] == 6) & (kmeans_sixteen_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
16,60976,21 union (select (/*29 */961)--\rfrom#728 13 e...,True,6
36,59186,857 and--935 \r(/*72 882 65 lledjx qkgdkq */se...,True,6
38,68799,348 union /*638 */select#ppuews 158 77 \r--445...,True,6
57,58827,293 and585<(select#eokrgi cnucrm 579 xyrxcu \r...,True,6
61,72682,"18 and38<>vidkjr(( /*M!steunl(*/)), 877, (721...",True,6
79,58619,"232 or (select --356 77 hpiwby \r""cqflot""#\rfr...",True,6
81,54318,"56 union( /*!select\t(fdscak (697*/, 59, ( uf...",True,6
85,61381,887 union/*nlwvuk lmnpft 99 hgnjoi lgxnpe 691 ...,True,6
125,57767,26 and 52!=dwomqg(/*85 fylejk njlqic 9 */'lelk...,True,6
134,63896,"367 or(select (/*!865 863), */581, 882\tfrom...",True,6


In [40]:
kmeans_sixteen_result[(kmeans_sixteen_result['class'] == 6) & (kmeans_sixteen_result['injection'] == False)]

Unnamed: 0,id,text,injection,class
377,43994,"±98 ml and 31ÐÂ±71 cm ... es, and obesity (Rap",False,6
4362,13555,SN597c567852761=pefum6jgpa2iai32uu6dgflrg9; sb...,False,6
20516,36812,186795</p>\n\n\t\t\t\t\t\n\n\t\t\t\t\t<span id...,False,6
21385,3275,"41994266"" union select unhex(hex(version())) -...",False,6
21672,25439,"264659181, your order # 1 is placed! It will b...",False,6
22161,4533,"877189<meta name=""yandex-verification"" content...",False,6
28134,15884,"455292"" height=""4"" width=""8"" alt=""""></noscript...",False,6
31207,27155,"295422966617221577521461624915, your order # 1...",False,6
32232,40729,237752-tablette-ipad-mini-6-769go-or-715817185...,False,6


In [38]:
kmeans_sixteen_result[kmeans_sixteen_result['class'] == 6]

Unnamed: 0,id,text,injection,class
16,60976,21 union (select (/*29 */961)--\rfrom#728 13 e...,True,6
36,59186,857 and--935 \r(/*72 882 65 lledjx qkgdkq */se...,True,6
38,68799,348 union /*638 */select#ppuews 158 77 \r--445...,True,6
57,58827,293 and585<(select#eokrgi cnucrm 579 xyrxcu \r...,True,6
61,72682,"18 and38<>vidkjr(( /*M!steunl(*/)), 877, (721...",True,6
79,58619,"232 or (select --356 77 hpiwby \r""cqflot""#\rfr...",True,6
81,54318,"56 union( /*!select\t(fdscak (697*/, 59, ( uf...",True,6
85,61381,887 union/*nlwvuk lmnpft 99 hgnjoi lgxnpe 691 ...,True,6
125,57767,26 and 52!=dwomqg(/*85 fylejk njlqic 9 */'lelk...,True,6
134,63896,"367 or(select (/*!865 863), */581, 882\tfrom...",True,6


In [39]:
kmeans_sixteen_result[(kmeans_sixteen_result['class'] == 6) & (kmeans_sixteen_result['injection'] == True)]

Unnamed: 0,id,text,injection,class
16,60976,21 union (select (/*29 */961)--\rfrom#728 13 e...,True,6
36,59186,857 and--935 \r(/*72 882 65 lledjx qkgdkq */se...,True,6
38,68799,348 union /*638 */select#ppuews 158 77 \r--445...,True,6
57,58827,293 and585<(select#eokrgi cnucrm 579 xyrxcu \r...,True,6
61,72682,"18 and38<>vidkjr(( /*M!steunl(*/)), 877, (721...",True,6
79,58619,"232 or (select --356 77 hpiwby \r""cqflot""#\rfr...",True,6
81,54318,"56 union( /*!select\t(fdscak (697*/, 59, ( uf...",True,6
85,61381,887 union/*nlwvuk lmnpft 99 hgnjoi lgxnpe 691 ...,True,6
125,57767,26 and 52!=dwomqg(/*85 fylejk njlqic 9 */'lelk...,True,6
134,63896,"367 or(select (/*!865 863), */581, 882\tfrom...",True,6
