In [205]:
import sys
import os
import json
import pandas
import numpy
import optparse
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict
from sklearn.model_selection import train_test_split 

In [206]:
requests_data = pandas.read_csv('Datasets/all_requests_csv_formatted_v3.csv', engine='python', quotechar='|', error_bad_lines=False, header=None)
requests_data.sample(5)

Unnamed: 0,0,1
8266,"{""searchText=%2e%2e%2f%2e%2e%2f%2e%2e%2f%2e%2e...",1
535,"{""searchText=modo=registro&login=naresh&passwo...",0
7453,"{""searchText=modo=registro&login=vish&password...",0
7189,"{""searchText=modo=registro&login=ansorger&pass...",0
13965,"{""searchText=&tenantKey=61&timeRangeScalar=1&t...",1


In [207]:
dataset = requests_data.sample(frac=1).values # convert data into array
dataset

array([['{"searchText=errorMsg=Credenciales+incorrectas&tenantKey=61&timeRangeScalar=1&timeRangeField=hours&timeRangeType=Last&timeRangeStart=&timeRangeEnd=&indexes=&filters=&resultSize=30"}',
        0],
       ['{"searchText=id=2&tenantKey=61&timeRangeScalar=1&timeRangeField=hours&timeRangeType=Last&timeRangeStart=&timeRangeEnd=&indexes=&filters=&resultSize=30"}',
        0],
       ['{"searchText=&tenantKey=61&timeRangeScalar=1&timeRangeField=hours&timeRangeType=Last&timeRangeStart=&timeRangeEnd=&indexes=&filters=%22%3e%3cscript%3ealert(1)%3c%2fscript%3e&resultSize=30"}',
        1],
       ...,
       ['{"searchText=modo=entrar&login=delbert&pwd=08enmand15a&remember=on&B1=Entrar&tenantKey=61&timeRangeScalar=1&timeRangeField=hours&timeRangeType=Last&timeRangeStart=&timeRangeEnd=&indexes=&filters=&resultSize=30"}',
        0],
       ['{"searchText=%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5

In [208]:
# Preprocess dataset
X = dataset[:,0]
X

array(['{"searchText=errorMsg=Credenciales+incorrectas&tenantKey=61&timeRangeScalar=1&timeRangeField=hours&timeRangeType=Last&timeRangeStart=&timeRangeEnd=&indexes=&filters=&resultSize=30"}',
       '{"searchText=id=2&tenantKey=61&timeRangeScalar=1&timeRangeField=hours&timeRangeType=Last&timeRangeStart=&timeRangeEnd=&indexes=&filters=&resultSize=30"}',
       '{"searchText=&tenantKey=61&timeRangeScalar=1&timeRangeField=hours&timeRangeType=Last&timeRangeStart=&timeRangeEnd=&indexes=&filters=%22%3e%3cscript%3ealert(1)%3c%2fscript%3e&resultSize=30"}',
       ...,
       '{"searchText=modo=entrar&login=delbert&pwd=08enmand15a&remember=on&B1=Entrar&tenantKey=61&timeRangeScalar=1&timeRangeField=hours&timeRangeType=Last&timeRangeStart=&timeRangeEnd=&indexes=&filters=&resultSize=30"}',
       '{"searchText=%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%25%5c%2e%2e%%20%20%20%2025%5c%2e%2e%25%5c%2e%2e%255cboot%

In [209]:
Y = dataset[:,1]
Y

array([0, 0, 1, ..., 0, 1, 1], dtype=object)

In [210]:
#for index, item in enumerate(X):
#        # Quick hack to space out json elements
#        reqJson = json.loads(item, object_pairs_hook=OrderedDict)
#        del reqJson['contentLength']
#        del reqJson['cacheControl']
#        del reqJson['index']
#        del reqJson['label']
#        X[index] = json.dumps(reqJson, separators=(',', ':'))

In [211]:
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer.fit_on_texts(X)

In [212]:
tokenizer.word_index

{'!': 77,
 '"': 31,
 '#': 70,
 '$': 83,
 '%': 10,
 '&': 7,
 "'": 58,
 '(': 55,
 ')': 52,
 '*': 84,
 '+': 37,
 ',': 59,
 '-': 56,
 '.': 62,
 '0': 21,
 '1': 20,
 '2': 14,
 '3': 25,
 '4': 43,
 '5': 36,
 '6': 32,
 '7': 44,
 '8': 46,
 '9': 45,
 '=': 4,
 '@': 82,
 'A': 23,
 'B': 48,
 'C': 47,
 'D': 60,
 'E': 34,
 'F': 35,
 'G': 68,
 'H': 72,
 'I': 57,
 'J': 73,
 'K': 39,
 'L': 33,
 'M': 53,
 'N': 54,
 'O': 69,
 'P': 65,
 'Q': 76,
 'R': 17,
 'S': 22,
 'T': 29,
 'U': 61,
 'V': 64,
 'W': 79,
 'X': 71,
 'Y': 78,
 'Z': 75,
 '[': 80,
 '\\': 51,
 ']': 81,
 '_': 67,
 'a': 2,
 'b': 42,
 'c': 16,
 'd': 15,
 'e': 1,
 'f': 19,
 'g': 13,
 'h': 26,
 'i': 5,
 'j': 63,
 'k': 66,
 'l': 12,
 'm': 11,
 'n': 6,
 'o': 18,
 'p': 27,
 'q': 74,
 'r': 8,
 's': 9,
 't': 3,
 'u': 24,
 'v': 49,
 'w': 50,
 'x': 28,
 'y': 30,
 'z': 38,
 '{': 40,
 '}': 41,
 '~': 85}

In [213]:
# Extract and save word dictionary
word_dict_file = 'build/word-dictionary.json'

In [214]:
if not os.path.exists(os.path.dirname(word_dict_file)):
    os.makedirs(os.path.dirname(word_dict_file))

with open(word_dict_file, 'w') as outfile:
    json.dump(tokenizer.word_index, outfile, ensure_ascii=False)

In [215]:
num_words = len(tokenizer.word_index)+1
num_words

86

In [216]:
X = tokenizer.texts_to_sequences(X)

In [217]:
max_log_length = 1024
train_size = int(len(dataset) * .75)

X_processed = sequence.pad_sequences(X, maxlen=max_log_length)

In [218]:
# split data into training set and a temporary set using sklearn.model_selection.traing_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=23)  

In [219]:
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)
X_train, X_test = X_processed[0:train_size], X_processed[train_size:len(X_processed)]
Y_train, Y_test = Y[0:train_size], Y[train_size:len(Y)]

In [220]:
tb_callback = TensorBoard(log_dir='./logs', embeddings_freq=1)

In [225]:
model = Sequential()
model.add(Embedding(num_words, 32, input_length=max_log_length))

#model.add(Dropout(0.5))
#model.add(LSTM(64, recurrent_dropout=0.5))
#model.add(Dropout(0.5))

model.add(LSTM(64, dropout_U = 0.2, dropout_W = 0.2, return_sequences=True))
model.add(LSTM(64))
model.add(Dropout(0.5))

model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, Y_train, validation_split=0.25, epochs=5, batch_size=128, callbacks=[tb_callback])

  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 1024, 32)          2752      
_________________________________________________________________
lstm_8 (LSTM)                (None, 1024, 64)          24832     
_________________________________________________________________
lstm_9 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_15 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 65        
Total params: 60,673
Trainable params: 60,673
Non-trainable params: 0
_________________________________________________________________
None
Train on 8118 samples, validate on 2706 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f375b98c978>

In [226]:
# Evaluate model
score, acc = model.evaluate(numpy.array(X_test), numpy.array(Y_test), verbose=1, batch_size=128)



In [227]:
print("Model Accuracy: {:0.2f}%".format(acc * 100))

Model Accuracy: 98.20%


In [228]:
# Save model
model.save_weights('model-weights.h5')
model.save('model.h5')
with open('model.json', 'w') as outfile:
    outfile.write(model.to_json())

In [245]:
X_prediction = tokenizer.texts_to_sequences(["searchText=test=123&filters=test&resultSize=20"])
X_prediction_processed = sequence.pad_sequences(X_prediction, maxlen=max_log_length)
print(X_prediction_processed)

prediction = model.predict_classes(X_prediction_processed, verbose=0)
prediction_proba = model.predict_proba(X_prediction_processed, verbose=0)
#model.predict(X)
#model.predict_proba(X)
print(prediction)
print(prediction_proba)

[[ 0  0  0 ...  4 14 21]]
[[0]]
[[0.07512387]]
