In [2]:
import sys
import os
import json
import pandas
import numpy
import optparse
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict
from sklearn.model_selection import train_test_split 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
requests_data = pandas.read_csv('Datasets/csic_2010_csv.csv', engine='python', quotechar='|', header=None)
requests_data.head(5)

Unnamed: 0,0,1
0,"{""cacheControl"": ""no-cache"", ""index"": ""0"", ""co...",1
1,"{""cacheControl"": ""no-cache"", ""index"": ""0"", ""co...",1
2,"{""cacheControl"": ""no-cache"", ""index"": ""0"", ""co...",1
3,"{""cacheControl"": ""no-cache"", ""index"": ""0"", ""co...",1
4,"{""cacheControl"": ""no-cache"", ""index"": ""0"", ""co...",1


In [4]:
dataset = requests_data.sample(frac=1).values # convert data into array
dataset

array([['{"cacheControl": "no-cache", "index": "23798", "cookie": "JSESSIONID=BBF5977E13C1F6DA72F305A81E87B8FB", "connection": "close", "contentLength": "277", "host": "localhost:8080", "payload": "nombre=Jer%F3nimo", "accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5", "acceptLanguage": "en", "acceptCharset": "utf-8, utf-8;q=0.5, *;q=0.5", "label": "anom", "pragma": "no-cache", "protocol": "HTTP/1.1", "userAgent": "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko)", "acceptEncoding": "x-gzip, x-deflate, gzip, deflate", "url": "http://localhost:8080/tienda1/publico/registro.jsp", "method": "POST", "contentType": "application/x-www-form-urlencoded"}',
        1],
       ['{"cacheControl": "no-cache", "index": "24498", "cookie": "JSESSIONID=C31D3D3093B8C64686DBA3A5459D4E85", "connection": "close", "contentLength": "63", "host": "localhost:8080", "payload": "pwd=mentirilLa", "accept": "text/xml,application/x

In [5]:
# Preprocess dataset
X = dataset[:,0]
X

array(['{"cacheControl": "no-cache", "index": "23798", "cookie": "JSESSIONID=BBF5977E13C1F6DA72F305A81E87B8FB", "connection": "close", "contentLength": "277", "host": "localhost:8080", "payload": "nombre=Jer%F3nimo", "accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5", "acceptLanguage": "en", "acceptCharset": "utf-8, utf-8;q=0.5, *;q=0.5", "label": "anom", "pragma": "no-cache", "protocol": "HTTP/1.1", "userAgent": "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko)", "acceptEncoding": "x-gzip, x-deflate, gzip, deflate", "url": "http://localhost:8080/tienda1/publico/registro.jsp", "method": "POST", "contentType": "application/x-www-form-urlencoded"}',
       '{"cacheControl": "no-cache", "index": "24498", "cookie": "JSESSIONID=C31D3D3093B8C64686DBA3A5459D4E85", "connection": "close", "contentLength": "63", "host": "localhost:8080", "payload": "pwd=mentirilLa", "accept": "text/xml,application/xml,application

In [6]:
Y = dataset[:,1]
Y

array([1, 1, 1, ..., 1, 0, 0], dtype=object)

In [7]:
for index, item in enumerate(X):
        # Quick hack to space out json elements
        reqJson = json.loads(item, object_pairs_hook=OrderedDict)
        del reqJson['contentLength']
        del reqJson['cacheControl']
        del reqJson['index']
        del reqJson['label']
        X[index] = json.dumps(reqJson, separators=(',', ':'))

In [8]:
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer.fit_on_texts(X)

In [9]:
tokenizer.word_index

{' ': 16,
 '"': 1,
 '%': 69,
 '(': 57,
 ')': 58,
 '*': 44,
 '+': 63,
 ',': 8,
 '-': 29,
 '.': 15,
 '/': 13,
 '0': 14,
 '1': 31,
 '2': 45,
 '3': 35,
 '4': 47,
 '5': 24,
 '6': 48,
 '7': 46,
 '8': 20,
 '9': 37,
 ':': 12,
 ';': 27,
 '=': 28,
 'A': 39,
 'B': 49,
 'C': 38,
 'D': 40,
 'E': 33,
 'F': 51,
 'G': 59,
 'H': 55,
 'I': 54,
 'J': 65,
 'K': 56,
 'L': 43,
 'M': 52,
 'N': 64,
 'O': 61,
 'P': 60,
 'Q': 74,
 'R': 71,
 'S': 36,
 'T': 32,
 'U': 73,
 'V': 72,
 'W': 78,
 'X': 77,
 'Y': 79,
 'Z': 76,
 '_': 75,
 'a': 5,
 'b': 50,
 'c': 6,
 'd': 26,
 'e': 3,
 'f': 34,
 'g': 21,
 'h': 22,
 'i': 11,
 'j': 66,
 'k': 42,
 'l': 7,
 'm': 18,
 'n': 10,
 'o': 4,
 'p': 9,
 'q': 30,
 'r': 17,
 's': 25,
 't': 2,
 'u': 23,
 'v': 70,
 'w': 62,
 'x': 19,
 'y': 53,
 'z': 41,
 '{': 67,
 '}': 68,
 '~': 80}

In [10]:
# Extract and save word dictionary
word_dict_file = 'build/word-dictionary.json'

In [11]:
if not os.path.exists(os.path.dirname(word_dict_file)):
    os.makedirs(os.path.dirname(word_dict_file))

with open(word_dict_file, 'w') as outfile:
    json.dump(tokenizer.word_index, outfile, ensure_ascii=False)

In [12]:
num_words = len(tokenizer.word_index)+1
num_words

81

In [13]:
X = tokenizer.texts_to_sequences(X)
#X

In [14]:
max_log_length = 1024
train_size = int(len(dataset) * .75)

X_processed = sequence.pad_sequences(X, maxlen=max_log_length)
#X = sequence.pad_sequences(X)

In [14]:
# split data into training set and a temporary set using sklearn.model_selection.traing_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=23)  

In [15]:
X_train, X_test = X_processed[0:train_size], X_processed[train_size:len(X_processed)]
Y_train, Y_test = Y[0:train_size], Y[train_size:len(Y)]

In [16]:
tb_callback = TensorBoard(log_dir='./logs', embeddings_freq=1)

In [17]:
len(X)

223577

In [27]:
model = Sequential()
model.add(Embedding(num_words, 32, input_length=max_log_length, dropout = 0.2))
#model.add(LSTM(32, dropout_U = 0.2, dropout_W = 0.2, return_sequences=True))
model.add(LSTM(64, dropout_U = 0.2, dropout_W = 0.2, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, Y_train, validation_split=0.25, epochs=1, batch_size=128, callbacks=[tb_callback])

  
  after removing the cwd from sys.path.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 1024, 32)          2592      
_________________________________________________________________
lstm_13 (LSTM)               (None, 1024, 64)          24832     
_________________________________________________________________
lstm_14 (LSTM)               (None, 64)                33024     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 60,513
Trainable params: 60,513
Non-trainable params: 0
_________________________________________________________________
None
Train on 125761 samples, validate on 41921 samples
Epoch 1/1

KeyboardInterrupt: 

In [None]:
# Evaluate model
score, acc = model.evaluate(numpy.array(X_test), numpy.array(y_test), verbose=1, batch_size=128)

In [None]:
print("Model Accuracy: {:0.2f}%".format(acc * 100))

In [None]:
# Save model
model.save_weights('model-weights.h5')
model.save('model.h5')
with open('model_json', 'w') as outfile:
    outfile.write(model.to_json())