### LOADING ALL LIBRARIES

In [7]:
# Load Libraries - Make sure to run this cell!
import pandas as pd
import numpy as np
import re, os
from string import printable
from sklearn import model_selection

#import gensim
import tensorflow as tf
from keras.models import Sequential, Model, model_from_json, load_model
from keras import regularizers
from keras.layers.core import Dense, Dropout, Activation, Lambda, Flatten
from keras.layers import Input, ELU, LSTM, Embedding, Convolution2D, MaxPooling2D, \
BatchNormalization, Convolution1D, MaxPooling1D, concatenate
from keras.preprocessing import sequence
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils
from keras import backend as K

from pathlib import Path
import json

import warnings
warnings.filterwarnings("ignore")


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### LOADING DATA

In [6]:
## Load data 
rel_path = 'Data/'
df = pd.read_csv(rel_path + 'url_data_mega_deep_learning.csv')
df.sample(n=25).head(25)

Unnamed: 0,url,isMalicious
32528,dsm.com/corporate/science/bright-science-award...,0
155808,cba.pl/lean,1
62150,taz.de/Wahrheit/!p4644,0
98007,oh081zmdg.top/index.php?xH2AcreVKBzPCIA=l3SMfP...,1
183214,cloudfile1012.com,1
72014,letao.com//static/ff80808157ffc1c2015800534d4c...,0
125066,afrodome.com/sites/default/files/js/login/cob/...,1
67367,aoyou.com/hd/weiyvxing,0
123609,acclot.co.za/part/portfolio_zip_0/dropbox,1
186396,homespottersf.com/sm/googledocs1/index.html,1


### DATA PREPROCESSING

In [8]:
# Initial Data Preparation URL

# Step 1: Convert raw URL string in list of lists where characters that are contained in "printable" are stored encoded as integer 
url_int_tokens = [[printable.index(x) + 1 for x in url if x in printable] for url in df.url]

# Step 2: Cut URL string at max_len or pad with zeros if shorter
max_len=75
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)
 
# Step 3: Extract labels form df to numpy array
target = np.array(df.isMalicious)

print('Matrix dimensions of X: ', X.shape, 'Vector dimension of target: ', target.shape)

Matrix dimensions of X:  (194798, 75) Vector dimension of target:  (194798,)


In [9]:
#Split the data set into training and test data, 70-30
X_train, X_test, target_train, target_test = model_selection.train_test_split(X, target, test_size=0.30, random_state=42)

In [10]:
# GENERAL get layer dimensions for any model!
def print_layers_dims(model):
    l_layers = model.layers
    # Note None is ALWAYS batch_size
    for i in range(len(l_layers)):
        print(l_layers[i])
        print('Input Shape: ', l_layers[i].input_shape, 'Output Shape: ', l_layers[i].output_shape)

# GENERAL save model to disk function!
def save_model(fileModelJSON,fileWeights):
    #print("Saving model to disk: ",fileModelJSON,"and",fileWeights)
    #have h5py installed
    if Path(fileModelJSON).is_file():
        os.remove(fileModelJSON)
    json_string = model.to_json()
    with open(fileModelJSON,'w' ) as f:
        json.dump(json_string, f)
    if Path(fileWeights).is_file():
        os.remove(fileWeights)
    model.save_weights(fileWeights)
    

# GENERAL load model from disk function!
def load_model(fileModelJSON,fileWeights):
    #print("Saving model to disk: ",fileModelJSON,"and",fileWeights)
    with open(fileModelJSON, 'r') as f:
         model_json = json.load(f)
         model = model_from_json(model_json)
    
    model.load_weights(fileWeights)
    return model

### MODEL 1: LSTM 

In [11]:
def simple_lstm(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32, W_reg=regularizers.l2(1e-4)):
    # Input Layer
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    
    # Embedding layer
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                dropout=0.2, W_regularizer=W_reg)(main_input) 

    # LSTM layer
    lstm = LSTM(lstm_output_size)(emb)
    lstm = Dropout(0.5)(lstm)
    
    # Output layer (last fully connected layer)
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    # Compile model and define optimizer
    model = Model(input=[main_input], output=[output])
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [12]:
# Fit model and Cross-Validation, Model 1 SIMPLE LSTM
epochs = 5
batch_size = 32

model = simple_lstm()
model.fit(X_train, target_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
print_layers_dims(model)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Final Cross-Validation Accuracy 0.8461156741957563 

<keras.engine.topology.InputLayer object at 0x127c266d8>
Input Shape:  (None, 75) Output Shape:  (None, 75)
<keras.layers.embeddings.Embedding object at 0x127a781d0>
Input Shape:  (None, 75) Output Shape:  (None, 75, 32)
<keras.layers.recurrent.LSTM object at 0x127c26cc0>
Input Shape:  (None, 75, 32) Output Shape:  (None, 32)
<keras.layers.core.Dropout object at 0x11062ae10>
Input Shape:  (None, 32) Output Shape:  (None, 32)
<keras.layers.core.Dense object at 0x14186be80>
Input Shape:  (None, 32) Output Shape:  (None, 1)


In [13]:
# SAVING MODEL AS h5
model_name = "deeplearning_LSTM"
rel_path= 'Models/'
save_model(rel_path + model_name + ".json", rel_path + model_name + ".h5")
model = load_model(rel_path + model_name + ".json", rel_path + model_name + ".h5")

### MODEL 2: 1D CONVOLUTION and LSTM

In [15]:
def lstm_conv(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32, W_reg=regularizers.l2(1e-4)):
    
    # Input Layer
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    
    # Embedding layer
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                W_regularizer=W_reg)(main_input) 
    emb = Dropout(0.25)(emb)

    # Conv layer
    conv = Convolution1D(kernel_size=5, filters=256, \
                     border_mode='same')(emb)
    conv = ELU()(conv)

    conv = MaxPooling1D(pool_size=4)(conv)
    conv = Dropout(0.5)(conv)

    # LSTM layer
    lstm = LSTM(lstm_output_size)(conv)
    lstm = Dropout(0.5)(lstm)
    
    # Output layer (last fully connected layer)
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    # Compile model and define optimizer
    model = Model(input=[main_input], output=[output])
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [16]:
# Fit model and Cross-Validation, Model CONV + LSTM
epochs = 5
batch_size = 32

model = lstm_conv()
model.fit(X_train, target_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
print_layers_dims(model)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Final Cross-Validation Accuracy 0.9013518138261465 

<keras.engine.topology.InputLayer object at 0x14252f6a0>
Input Shape:  (None, 75) Output Shape:  (None, 75)
<keras.layers.embeddings.Embedding object at 0x14252f7b8>
Input Shape:  (None, 75) Output Shape:  (None, 75, 32)
<keras.layers.core.Dropout object at 0x14252f748>
Input Shape:  (None, 75, 32) Output Shape:  (None, 75, 32)
<keras.layers.convolutional.Conv1D object at 0x142525320>
Input Shape:  (None, 75, 32) Output Shape:  (None, 75, 256)
<keras.layers.advanced_activations.ELU object at 0x14288e518>
Input Shape:  (None, 75, 256) Output Shape:  (None, 75, 256)
<keras.layers.pooling.MaxPooling1D object at 0x14252f940>
Input Shape:  (None, 75, 256) Output Shape:  (None, 18, 256)
<keras.layers.core.Dropout object at 0x142519cc0>
Input Shape:  (None, 18, 256) Output Shape:  (None, 18, 256)
<keras.layers.recurrent.LSTM object at 0x14198da58>
Input Shape:  (None, 18, 256) Output Shape:

In [17]:
model_name = "deeplearning_1DConvLSTM"
save_model(rel_path + model_name + ".json", rel_path + model_name + ".h5")
model = load_model(rel_path + model_name + ".json", rel_path + model_name + ".h5")

### PREDICTIONS

In [18]:
# get probabilities of target predictions
target_proba = model.predict(X_test, batch_size=1)

In [19]:
target_proba[0:10]

array([[0.00548537],
       [0.4545563 ],
       [0.13824415],
       [0.9486797 ],
       [0.75061285],
       [0.04453439],
       [0.8875102 ],
       [0.08649283],
       [0.99714416],
       [0.4230319 ]], dtype=float32)

In [21]:
def print_result(proba):
    if proba > 0.5:
        return "malicious"
    else:
        return "benign"

### TESTING SOME KNOWN RESULTS

In [27]:
test_urls= ["naureen.net/etisalat.ae/index2.php", "sixt.com/php/reservation?language=en_US"]

In [32]:
# preprocess and create features from URL
url_int_tokens = [[printable.index(x) + 1 for x in url if x in printable] for url in test_urls]
max_len=75
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)

In [33]:
target_proba = model.predict(X, batch_size=1)

for i,x in enumerate(target_proba):
    print("Test URL:", test_urls[i], "is", print_result(x))

Test URL: naureen.net/etisalat.ae/index2.php is malicious
Test URL: sixt.com/php/reservation?language=en_US is benign
