# ABD final project: Model implementation
## Part 1: Data encoding
### Python version
3.5 onwards

### Modules needed
* keras
* numpy
* pandas
* pickle

In [1]:
# Uncoment to install modules inside the notebook
# !pip install numpy
# !pip install pandas
# !pip install tensorflow
# !pip install pickle

from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

from io import StringIO
from pandas import Series, DataFrame

import numpy as np
import pandas as pd
import pickle

import abd_utils

In [2]:
# Loading the model from disk
model = load_model('model/abd_model.h5')

# Loading the vocabularys from disk
with open('model/abd_variables.pkl', 'rb') as f:
    word_index, reverse_word_index = pickle.load(f)

# Loading functions for handling (encoding and decoding) the urls based on the vocabulary: ABDUtils class
utils = abd_utils.ABDUtils(word_index, reverse_word_index)

W0718 19:49:48.157860 139964249794368 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0718 19:49:48.177998 139964249794368 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0718 19:49:48.178653 139964249794368 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init

In [3]:
# Defining a function to evaluate a single log (String) and return the probability of it being evil
def evaluate_log(log):
    # Load the log in a way that read_csv can use it as a parameter
    test = StringIO(log)
    
    # String format error handling
    try:
        df_evaluate = pd.read_csv(test, sep=" ", header=None)
        df_evaluate[11], df_evaluate[12], df_evaluate[13] = df_evaluate[5].str.split(' ', 2).str
        df_evaluate.drop([0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 13], axis=1, inplace=True)
    except ValueError:
        return -1
    except KeyError:
        return -1
    
    # URL request method format error handling
    if df_evaluate.at[0,11] != 'GET' and df_evaluate.at[0,11] != 'POST':
        return -1
    
    # Formating the String
    df_evaluate[12].replace(r'^/$', '<BASE_URL>', regex=True, inplace=True)
    df_evaluate[12].replace(r'\%+[\%0-9A-Za-z]*', ' <PERCENT_URL> ',regex=True, inplace=True)
    df_evaluate[12].replace(r'\w+\d+\w+', ' ',regex=True, inplace=True)
    df_evaluate[12].replace(r'\d{2,}', ' ',regex=True, inplace=True)
    df_evaluate[12].replace(r'\[\d*\]', ' ',regex=True, inplace=True)
    df_evaluate[12].replace({'/': ' ', ':': ' ', '\.': ' ', '\?': ' ', '=': ' ', '\|': ' ', '&': ' '},regex=True, inplace=True)
    
    # Formating the HTTP request method
    df_evaluate[11].replace({'GET': 2, 'POST': 3}, inplace=True)
    
    # Encoding the HTTP request method
    df_evaluate[12] = df_evaluate[12].str.lower()
    df_evaluate[12] = df_evaluate[12].apply(utils.encode)
    df_evaluate[12] = df_evaluate[12].apply(abd_utils.insert_start)
    
    # Encoding the HTTP response status
    df_evaluate[6].replace({307: '<TEMPORARY_REDIRECT>', 400: '<BAD_REQUEST>', 404: '<NOT_FOUND>', 200: '<OK>', 301: '<MOVED_PERMANTLY>'}, inplace=True)
    df_evaluate[6] = df_evaluate[6].apply(utils.encode_single)
    
    # Reordering the columns
    df_evaluate.columns = ['status', 'method', 'r_url']
    
    # Normalizing the request url length
    if len(df_evaluate.at[0, 'r_url']) < 11:
        for i in range (len(df_evaluate.at[0,'r_url']), 11):
            df_evaluate.at[0,'r_url'].append(0)
    
    # Moving the request url encoded data from its column to new columns, one encoded String per column
    for i in range(11):
        df_evaluate.at[0, 'r_url' + str(i)] = df_evaluate.at[0, 'r_url'][i]
    
    # Dropping the request url column
    df_evaluate.drop(['r_url'], axis=1, inplace=True)
    
    # Getting the single log data Array from the values of the DataFrame
    single_evaluate = df_evaluate.values[0]
    
    # Printing the encoded, then decoded request url
    print('Handled URL:')
    print(utils.decode(single_evaluate))
    
    # Adding a dimension (to fit the model input dimension) to the log, then evaluate it
    single_evaluate = (np.expand_dims(single_evaluate, 0))
    
    # Returning the 4 digits rounded percentage prediction
    return round(model.predict(single_evaluate)[0][0], 4)

In [4]:
log = '''
88.147.113.229 - - [06/Jul/2019:03:53:25 -0500] "GET /favicon.ico.php HTTP/1.1" 400 173 "-" "Mozilla/5.0" "-"
'''

evaluate_log(log)

Handled URL:
<BAD_REQUEST> <GET> <URL_START> favicon ico php <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


0.9999

In [5]:
def evaluate_logs(logs_filename):
    max_lenght = 11
    
    df_evaluate = pd.read_csv(logs_filename, sep=" ", header=None)
    len_before = len(df_evaluate)
    
    df_evaluate[11], df_evaluate[12], df_evaluate[13] = df_evaluate[5].str.split(' ', 2).str
    
    df_evaluate.drop([0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 13], axis=1, inplace=True)
    
    df_evaluate[11].replace(r'^(?!GET$|POST$).*', pd.np.nan, inplace=True, regex=True)
    
    df_evaluate[12].replace({r'^/$': '<BASE_URL>'}, regex=True, inplace=True)
    df_evaluate[12].replace({r' \%[\%0-9A-Za-z]*': ' <PERCENT_URL> '},regex=True, inplace=True)
    df_evaluate[12].replace({r'\w+\d+\w+': ''},regex=True, inplace=True)
    df_evaluate[12].replace({r'\d{2,}': ''},regex=True, inplace=True)
    df_evaluate[12].replace({r'\[\d*\]': ' '},regex=True, inplace=True)
    df_evaluate[12].replace({'/': ' ', ':': ' ', '\.': ' ', '\?': ' ', '=': ' ', '\|': ' ', '&': ' '},regex=True, inplace=True)
    df_evaluate[11].replace({'GET': 2, 'POST': 3}, inplace=True)
    
    df_aux = df_evaluate[df_evaluate.isna().any(axis=1)]
    len_aux = len(df_aux)
    df_evaluate.dropna(inplace=True)
    
    df_evaluate[12] = df_evaluate[12].str.lower()
    df_evaluate[12] = df_evaluate[12].apply(encode)
    df_evaluate[12] = df_evaluate[12].apply(insert_start)
    
    df_evaluate[6].replace({307: '<TEMPORARY_REDIRECT>', 400: '<BAD_REQUEST>', 404: '<NOT_FOUND>', 200: '<OK>', 301: '<MOVED_PERMANTLY>'}, inplace=True)
    df_evaluate[6] = df_evaluate[6].apply(encode_single)
    
    df_evaluate.columns = ['status', 'method', 'r_url']
    aux_request_data = pad_sequences(df_evaluate['r_url'].values,
                                     value=word_index["<PAD>"],
                                     padding='post',
                                     maxlen=max_lenght)
    df_evaluate['r_url'] = Series(aux_request_data.tolist())
    
    
    df_aux = df_evaluate[df_evaluate.isna().any(axis=1)]
    len_aux += len(df_aux)
    df_evaluate.dropna(inplace=True)
    
    len_total = len(df_evaluate)
    
    for i in range(max_lenght):
        df_evaluate['r_url' + str(i)] = 0

    for i in df_evaluate.itertuples():
        for j in range(max_lenght):
            df_evaluate.at[i[0], 'r_url' + str(j)] = i[3][j]

    df_evaluate.drop(['r_url'], axis=1, inplace=True)

    to_evaluate_data = df_evaluate.values
    
    result = model.predict(to_evaluate_data)
    
    bad_logs = 0
    good_logs = 0
    for i in result:
        if round(i[0] * 100, 4) > 50:
            bad_logs += 1
        else:
            good_logs += 1
            
    print(len_before, len_total, len_aux)    
    print(bad_logs, good_logs)
    


In [6]:
evaluate_logs('data/partial_log.log')

NameError: name 'encode' is not defined