# Project Description

Objective:

- Detect security breaches by predicting suspicious access using an RNN model.

Methods Used:

- Simple RNN
- LSTM
- Dropout
- GRU

Result:

- 97% accuracy achieved on the test set (LSTM + Dropout model)

# Part 1: Data Processing

a. import necessary libraries

In [1]:
import sys

In [2]:
import os

In [3]:
import json

In [4]:
import pandas as pd
import numpy as np

In [5]:
import optparse

In [6]:
from keras.callbacks import TensorBoard

Using TensorFlow backend.


In [7]:
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout

In [8]:
from keras.layers.embeddings import Embedding

In [9]:
from keras.preprocessing import sequence

In [10]:
from keras.preprocessing.text import Tokenizer

In [11]:
from collections import OrderedDict

b. read data in

In [15]:
# Location of the access-log CSV. Set the DEV_ACCESS_CSV environment variable
# to point at a copy elsewhere; the fallback is the original local path, so
# behavior is unchanged when the variable is not set.
data_path = os.environ.get('DEV_ACCESS_CSV', r'C:\Users\mhime\Downloads\dev-access.csv')
# header=None: the first row of the file is data, not column names.
# NOTE(review): quotechar='|' presumably keeps the commas inside each JSON
# record from being treated as column separators - confirm against the raw file.
dataframe = pd.read_csv(data_path, engine = 'python', quotechar = '|', header = None)

In [16]:
dataframe.head()

Unnamed: 0,0,1
0,"{""timestamp"":1502738402847,""method"":""post"",""qu...",0
1,"{""timestamp"":1502738402849,""method"":""post"",""qu...",0
2,"{""timestamp"":1502738402852,""method"":""post"",""qu...",0
3,"{""timestamp"":1502738402852,""method"":""post"",""qu...",0
4,"{""timestamp"":1502738402853,""method"":""post"",""qu...",0


c. convert to array

In [17]:
dataset = dataframe.values

d. check shape

In [21]:
dataset.shape

(26773, 2)

e. create feature dataset

In [22]:
X = dataset[:,0]

In [26]:
X

array(['{"timestamp":1502738402847,"method":"post","query":{},"path":"/login","statusCode":401,"source":{"remoteAddress":"88.141.113.237","referer":"http://localhost:8002/enter"},"route":"/login","headers":{"host":"localhost:8002","accept-language":"en-us","accept-encoding":"gzip, deflate","connection":"keep-alive","accept":"*/*","referer":"http://localhost:8002/enter","cache-control":"no-cache","x-requested-with":"XMLHttpRequest","content-type":"application/json","content-length":"36"},"requestPayload":{"username":"Carl2","password":"bo"},"responsePayload":{"statusCode":401,"error":"Unauthorized","message":"Invalid Login"}}',
       '{"timestamp":1502738402849,"method":"post","query":{},"path":"/login","statusCode":401,"source":{"remoteAddress":"88.141.113.237"},"route":"/login","headers":{"host":"localhost:8002","connection":"keep-alive","cache-control":"no-cache","accept":"*/*","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6","content-type":"applicat

f. create target variable

In [23]:
Y = dataset[:,1]

In [27]:
Y

array([0, 0, 0, ..., 1, 1, 1], dtype=object)

g. Clean up the predictor dataset by removing fields that are not useful as model input

In [28]:
# Drop the fields that are not used as model input and re-serialize every
# record as compact JSON (no spaces after ',' or ':'). X is updated in place.
dropped_keys = ('timestamp', 'headers', 'source', 'route', 'responsePayload')
for index, item in enumerate(X):
    # OrderedDict keeps the remaining keys in their original order.
    reqJson = json.loads(item, object_pairs_hook = OrderedDict)
    for key in dropped_keys:
        # pop() with a default tolerates records that lack a field. Because X
        # is modified in place, this also makes the cell safe to re-run:
        # `del` would raise KeyError on the already-cleaned records.
        reqJson.pop(key, None)
    X[index] = json.dumps(reqJson, separators = (',', ':'))

In [29]:
X

array(['{"method":"post","query":{},"path":"/login","statusCode":401,"requestPayload":{"username":"Carl2","password":"bo"}}',
       '{"method":"post","query":{},"path":"/login","statusCode":401,"requestPayload":{"username":"pafzah","password":"worldburn432"}}',
       '{"method":"post","query":{},"path":"/login","statusCode":401,"requestPayload":{"username":"Panos1","password":"najrijkom"}}',
       ...,
       '{"method":"post","query":{},"path":"/checkout","statusCode":400,"requestPayload":{"creditCard":"<script src=\\"http://attacker/malicious\\u00e2\\u20ac\\u2018script.js\\"></script>"}}',
       '{"method":"post","query":{},"path":"/checkout","statusCode":400,"requestPayload":{"creditCard":"<meta http-equiv=\\"refresh\\">"}}',
       '{"method":"post","query":{},"path":"/checkout","statusCode":400,"requestPayload":{"creditCard":"<meta http-equiv=\\"refresh\\">"}}'],
      dtype=object)

h. tokenize/vectorize our text for entry into RNN.  Tokenize every character (char_level = True)

In [30]:
tokenizer = Tokenizer(filters = '\t\n', char_level = True)

In [31]:
tokenizer.fit_on_texts(X)

In [32]:
num_words = len(tokenizer.word_index)+1

In [33]:
X = tokenizer.texts_to_sequences(X)

i. Pad our data because each observation has a different length

In [34]:
max_log_length = 1024
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)

j. create train set to be 75% of data and test set to be 25% of data

In [41]:
X_processed.shape

(26773, 1024)

In [43]:
from sklearn.model_selection import train_test_split

In [45]:
Y

array([0, 0, 0, ..., 1, 1, 1], dtype=object)

In [46]:
# Y is a dtype=object array (see its displayed output above) because it was
# sliced out of a mixed-type DataFrame. Cast the labels to integers here so the
# model receives a plain numeric array; object arrays are rejected by newer
# TensorFlow/Keras versions when converting to tensors.
# 75% train / 25% test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_processed, Y.astype('int32'), test_size = 0.25, random_state = 42)

# Part 2: Model 1 - RNN

a. Your first model will be a pretty minimal RNN with only an embedding layer, simple RNN and Dense layer.  Start by creating an instance of a Sequential model

In [47]:
model = Sequential()

b. add an Embedding layer

In [48]:
input_dim = num_words
output_dim = 32
input_length = max_log_length

In [50]:
model.add(Embedding(input_dim = input_dim, output_dim = output_dim, input_length = input_length ))

c. Add a SimpleRNN layer

In [52]:
from keras.layers import SimpleRNN

In [53]:
model.add(SimpleRNN(32, activation = 'relu'))

d. Add a Dense layer

In [54]:
model.add(Dense(1, activation = 'sigmoid'))

e. Compile model

In [56]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

f. print model summary

In [57]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 4,129
Trainable params: 4,129
Non-trainable params: 0
_________________________________________________________________


g. Use the fit method to fit the model on the train data. Use a validation split of 0.25, epochs = 3, batch size = 128

In [58]:
batch_size = 128
epochs = 3
validation_split = 0.25

In [59]:
model.fit(X_train, y_train, batch_size= batch_size, epochs = epochs, validation_split = validation_split )

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x2886b5cb748>

h. Use evaluate method to get the loss value and accuracy value on the test data. use batch size of 128 again.

In [60]:
model.evaluate(X_test, y_test, batch_size = batch_size)



[0.16269680272653134, 0.9263519644737244]

loss value of 0.16, accuracy value of 0.926 on test data

# Part 3: LSTM + Dropout Layers

a. Add a few new layers to our RNN and incorporate the more powerful LSTM.  This is a new model, so give it a different name than the one in Part 2.  It needs the following layers: Embedding (same params as before), LSTM (units = 64, recurrent_dropout = 0.5), Dropout (rate of 0.5), Dense (same params as before).

In [61]:
lstm_model = Sequential()

In [62]:
lstm_model.add(Embedding(input_dim = input_dim, output_dim = output_dim, input_length = input_length))

In [63]:
lstm_model.add(LSTM(64, recurrent_dropout =0.5))

In [64]:
lstm_model.add(Dropout(0.5))

In [65]:
lstm_model.add(Dense(1, activation = 'sigmoid'))

b. Compile model using compile method with params(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [66]:
lstm_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

c. Print model summary

In [67]:
lstm_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 26,913
Trainable params: 26,913
Non-trainable params: 0
_________________________________________________________________


d. Fit model on train data. Use validation split of 0.25, epochs =3, batch size = 128

In [68]:
lstm_model.fit(X_train, y_train, validation_split = validation_split, epochs = epochs, batch_size = batch_size)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x288749b2b38>

e. Use the evaluate method to get the loss value and the accuracy value on the test data.  Use batch size of 128 again.

In [69]:
lstm_model.evaluate(X_test, y_test, batch_size = batch_size)



[0.13517956480201337, 0.972662091255188]

loss of 0.135 and accuracy of 0.97 on test data

# Part 4: RNN Model 3

In [70]:
my_model = Sequential()

In [71]:
my_model.add(Embedding(input_dim = input_dim, output_dim = output_dim, input_length = input_length))

In [None]:
# GRU is not among the layers imported earlier in the notebook (LSTM, Dense,
# Dropout and SimpleRNN are), so import it here, next to its first use.
from keras.layers import GRU
# add() expects a layer instance, not the GRU class itself.
# NOTE(review): 64 units and recurrent_dropout = 0.5 mirror the LSTM model in
# Part 3 - confirm these are the intended hyperparameters for this model.
my_model.add(GRU(64, recurrent_dropout = 0.5))