# ABD final project: Model training
## Part 1: Data encoding
### Python version
3.5 onwards

### Modules needed
* pandas
* numpy
* xlrd
* pickle (part of the Python standard library, no installation needed)

In [1]:
# Uncomment to install modules inside the notebook
# !pip install xlrd
# !pip install pandas
# !pip install pickle

import numpy as np
import pandas as pd
import pickle
import random
import re

from io import StringIO
from pandas import Series, DataFrame
from collections import Counter

import abd_utils

In [2]:
# Load the labelled request log (the first spreadsheet column becomes the index),
# order the rows by that index and discard the user-agent column, which is not used.
df_train = (
    pd.read_excel('data/log_data_evil.xlsx', index_col=0)
    .sort_index(ascending=True)
    .drop(['http_user_agent'], axis=1)
)

In [3]:
# Filtering and formatting the data with regex.
#
# Every substitution is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df_train.request_url`: the in-place form is a
# chained assignment, which no longer updates `df_train` under pandas
# Copy-on-Write. Patterns are raw strings so that `\.`, `\?` and `\|` are not
# treated as (invalid) Python escape sequences.
#
# The rules are applied in order, so later patterns see the output of earlier ones.
url_rewrite_rules = [
    (r'^/$', '<BASE_URL>'),                     # a URL that is exactly "/"
    (r'\%+[\%0-9A-Za-z]*', ' <PERCENT_URL> '),  # runs starting with one or more "%"
    (r'\w+\d+\w+', ' '),                        # word characters with digits in between
    (r'\d{2,}', ' '),                           # two or more consecutive digits
    (r'\[\d*\]', ' '),                          # bracketed (possibly empty) digit groups
]

# URL separators that are turned into spaces so the URL can be split into words
url_separators = {'/': ' ', ':': ' ', r'\.': ' ', r'\?': ' ', '=': ' ', r'\|': ' ', '&': ' '}

cleaned_urls = df_train['request_url']
for pattern, replacement in url_rewrite_rules:
    cleaned_urls = cleaned_urls.replace({pattern: replacement}, regex=True)
cleaned_urls = cleaned_urls.replace(url_separators, regex=True)

df_train['request_url'] = cleaned_urls

In [4]:
# Collapse every run of whitespace inside each URL string to a single space
# (also trimming both ends) and lower-case the result.
list_aux = []
for raw_url in df_train['request_url'].tolist():
    list_aux.append(' '.join(raw_url.split()).lower())

In [5]:
# Build the vocabulary for the neural network from the cleaned URL strings.

# Join all the URLs into one string and split it into individual words
as_one = ' '.join(list_aux)
words = as_one.split()

# Count the words; the unique words are ordered from least to most frequent
counts = Counter(words)
vocab = sorted(counts, key=lambda token: counts[token])

# Reserved tokens take the indices 0..8 (their position in this list);
# the words found in the URLs are numbered right after them.
reserved_tokens = [
    '<PAD>',
    "<URL_START>",
    '<GET>',
    '<POST>',
    '<TEMPORARY_REDIRECT>',
    '<BAD_REQUEST>',
    '<NOT_FOUND>',
    '<OK>',
    '<MOVED_PERMANTLY>',
]
first_word_code = len(reserved_tokens)

# Word -> index dictionary: URL words first, then the reserved tokens
word_index = dict(zip(vocab, range(first_word_code, first_word_code + len(vocab))))
word_index.update({token: code for code, token in enumerate(reserved_tokens)})

# Index -> word dictionary (the reverse vocabulary)
reverse_word_index = dict((code, token) for token, code in word_index.items())

In [6]:
# Persist the vocabularies to disk: one file holding both dictionaries as a
# list, plus one file for each dictionary on its own.
pickle_targets = [
    ('model/abd_variables.pkl', [word_index, reverse_word_index]),
    ('model/abd_w_index.pkl', word_index),
    ('model/abd_rw_index.pkl', reverse_word_index),
]

for pickle_path, payload in pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

In [7]:
# Instantiate the project's ABDUtils helper with both vocabularies.
# Its `encode` and `encode_single` methods are used by the following cells
# to encode the URLs and the status codes.
# NOTE(review): ABDUtils lives in abd_utils (not shown in this notebook) — presumably it
# maps words to/from their vocabulary indices; confirm against that module.
utils = abd_utils.ABDUtils(word_index, reverse_word_index)

In [8]:
# Lower-case every URL, encode it with the vocabulary helper and finally pass
# the encoded URL through abd_utils.insert_start.
# NOTE(review): insert_start presumably prepends the encoded '<URL_START>' token — confirm in abd_utils.
df_train['request_url'] = (
    df_train['request_url']
    .str.lower()
    .apply(utils.encode)
    .apply(abd_utils.insert_start)
)

In [9]:
# Encode the request method of every row in the DataFrame.
# The codes are the vocabulary indices reserved for '<GET>' (2) and '<POST>' (3).
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on `df_train['request_method']` is a chained assignment that does not update
# `df_train` under pandas Copy-on-Write.
df_train['request_method'] = df_train['request_method'].replace({'GET': 2, 'POST': 3})

In [10]:
# Translate each HTTP status code into its reserved vocabulary token, then
# encode that token with the vocabulary helper.
# The replacement is assigned back to the column rather than done with
# `inplace=True` on `df_train['status']`, which is a chained assignment that
# does not update `df_train` under pandas Copy-on-Write.
status_tokens = {
    307: '<TEMPORARY_REDIRECT>',
    400: '<BAD_REQUEST>',
    404: '<NOT_FOUND>',
    200: '<OK>',
    301: '<MOVED_PERMANTLY>',
}
df_train['status'] = df_train['status'].replace(status_tokens).apply(utils.encode_single)

In [11]:
# Take the training labels out of the DataFrame: `pop` removes the 'is_evil'
# column from df_train and hands it back, and its values become the label array.
total_labels = df_train.pop('is_evil').values

In [12]:
# Getting the length of the largest encoded URL in the DataFrame.
# The column is read by name instead of by position in `df_train.values`, so the
# result does not depend on the column order; `default=0` keeps the previous
# result (0) when the DataFrame has no rows.
max_length = max((len(encoded_url) for encoded_url in df_train['request_url']), default=0)

print('the largest url have', max_length, 'words')

the largest url have 11 words


# ABD final project: Model training
## Part 2: Model definition and training
### Python version
3.5 onwards

### Modules needed
* tensorflow
* keras

In [13]:
# Model training: model definition and training

# Importing modules

# Uncomment to install modules inside the notebook
# !pip install tensorflow

from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras

print('Tensorflow version:', tf.__version__)

# Printing available computing devices 
from tensorflow.python.client import device_lib

def get_available_devices():
    """Return the `name` of every local device listed by `device_lib.list_local_devices()`."""
    device_names = []
    for device in device_lib.list_local_devices():
        device_names.append(device.name)
    return device_names

print('Available computing devices:', get_available_devices())

Tensorflow version: 1.14.0
Available computing devices: ['/device:CPU:0', '/device:XLA_CPU:0']


In [14]:
# Normalizing the url length: every encoded URL is padded at the end with the
# '<PAD>' index up to `max_length` entries.
aux_request_data = keras.preprocessing.sequence.pad_sequences(
    df_train['request_url'].values,
    value=word_index["<PAD>"],
    padding='post',
    maxlen=max_length)

# The padded rows are in the same order as df_train, so the new Series must carry
# df_train's own index. A Series with the default 0..n-1 index would be aligned
# by label on assignment and end up shifted or NaN whenever the DataFrame index
# (read from the spreadsheet) is not exactly 0..n-1.
df_train['request_url'] = Series(aux_request_data.tolist(), index=df_train.index)

In [15]:
# Spread the encoded URL over `max_length` new columns, one encoded word per column.
#
# The padded URLs are stacked into a (rows, max_length) integer matrix and each
# matrix column is assigned in one vectorised step. Assigning a NumPy array is
# positional, so it does not depend on the DataFrame index labels and avoids
# rows * max_length scalar `.at` writes.
url_matrix = np.asarray(df_train['request_url'].tolist(), dtype=np.int64)
# Fixing the shape also covers an empty DataFrame and fails loudly if the URLs
# do not all hold exactly `max_length` entries.
url_matrix = url_matrix.reshape(len(df_train), max_length)

for j in range(max_length):
    df_train['request_url' + str(j)] = url_matrix[:, j]

# Dropping the request url column in the DataFrame
df_train.drop(['request_url'], axis=1, inplace=True)

# Getting the total data Array from the values of the DataFrame
total_data = df_train.values

In [16]:
# Show the shapes of the feature matrix and of the label vector
print(total_data.shape, total_labels.shape)

# The vocabulary size is the number of entries in the word -> index dictionary
vocab_size = len(word_index)
print(vocab_size)

(7716, 13) (7716,)
727


In [17]:
# Define the neural network model as a stack of four layers:
# embedding -> global average pooling -> dense (ReLU) -> dense (sigmoid).
model = keras.Sequential([
    # One 16-dimensional vector per vocabulary index
    keras.layers.Embedding(vocab_size, 16),
    # Average the embedded vectors over the sequence axis
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation=tf.nn.relu),
    # Single sigmoid unit as the output layer
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

# Print a summary of the model
model.summary()

W0718 19:50:44.891616 140694330087232 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0718 19:50:44.922627 140694330087232 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          11632     
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 11,921
Trainable params: 11,921
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Compile the model: Adam optimizer, binary cross-entropy loss (the output layer
# is a single sigmoid unit) and accuracy reported under the 'acc' metric name.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

W0718 19:50:44.979377 140694330087232 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [19]:
# Row position that marks 75% of the total data
half_length_data = int(0.75 * len(total_data))

# Split data and labels at that position, keeping the original row order:
# the first 75% is used for training, the remaining 25% for testing.
train_data, test_data = total_data[:half_length_data], total_data[half_length_data:]
train_labels, test_labels = total_labels[:half_length_data], total_labels[half_length_data:]

In [20]:
# Show how many rows ended up in the train and test splits (data and labels)
print('Train data lengths:', train_data.shape[0], train_labels.shape[0])
print('Test data lengths:', test_data.shape[0], test_labels.shape[0])

Train data lengths: 5787 5787
Test data lengths: 1929 1929


In [21]:
# Row position that marks 40% of the train data
train_length = int(0.4 * len(train_data))

# Split the train data: the first 40% is kept for validation,
# the remaining 60% is what the model is fitted on.
x_val, partial_x_train = train_data[:train_length], train_data[train_length:]

# Split the train labels at the same position
y_val, partial_y_train = train_labels[:train_length], train_labels[train_length:]

In [22]:
# Show how many rows are used to fit and to validate the model (data and labels)
print('Fit data and label lengths:', partial_x_train.shape[0], partial_y_train.shape[0])
print('Validate data and label lengths:', x_val.shape[0], y_val.shape[0])

Fit data and label lengths: 3473 3473
Validate data and label lengths: 2314 2314


In [23]:
# Training the model

# Number of epochs: one epoch is one forward and one backward pass over all the
# training examples. Here it is set to the number of fit + validation rows
# (a fixed `epochs=250` was the previous alternative).
# NOTE(review): tying the epoch count to the number of samples looks unintentional — confirm.
fit_epochs = len(partial_x_train) + len(x_val)

# Number of samples per gradient update (Keras defaults to 32). Using the whole
# fit set as a single batch means one gradient update per epoch; larger batches
# need more memory, smaller ones give a noisier gradient estimate.
fit_batch_size = len(partial_x_train)

history = model.fit(
    partial_x_train,
    partial_y_train,
    epochs=fit_epochs,
    batch_size=fit_batch_size,
    validation_data=(x_val, y_val),
    # Progress output: 0 for silent, 1 for progress bar and 2 for one line per epoch
    verbose=0,
)

In [24]:
# Evaluating the model on the test data (the last 25% of the rows, which were
# not used for fitting or validation).
# The return value is stored in `results`; this cell does not display it.
# NOTE(review): with the loss and the 'acc' metric compiled above this is presumably [loss, accuracy] — confirm.
results = model.evaluate(test_data, test_labels)



In [25]:
# Saving the model to disk.
# Saving stays best-effort (a failure does not abort the notebook), but the
# exception is reported so the cause is not hidden behind a generic message.
try:
    model.save("model/abd_model.h5")
    print("model saved to disk")
except Exception as exc:
    print("something went wrong:", repr(exc))

model saved to disk
