In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import keras
from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense
from keras.layers import Input
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import model_from_json

from keras.regularizers import l1

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

import datetime

import plotly as py
import plotly.offline as pyo
import plotly.graph_objs as go
from plotly.graph_objs import Data,Figure

from sklearn.preprocessing import  StandardScaler, MinMaxScaler



Using TensorFlow backend.


In [2]:
# PREPROCESSING

# attach the column names to the dataset
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]

# open the csv dataset
df = pd.read_csv("kddcup.data_10_percent.gz",compression='gzip',header=None, names = col_names)

# Remove smurf and neptune attacks
df = df[df.label != 'smurf.']
df = df[df.label != 'neptune.']

# Remove duplicates
df.drop_duplicates(subset=None, keep='first', inplace=True)


#data = df.drop("label", axis=1) #labels serviront pour évaluer la qualité du modèle
target = df["label"]



# One hot encoding
cols_to_dummify = ['protocol_type','service','flag']
data_dummy = pd.get_dummies(df, columns=cols_to_dummify, prefix=cols_to_dummify)
target_dummy = pd.get_dummies(target)
target_categories = target_dummy.columns

# Splitting train/test
X = data_dummy.values
y = target_dummy.values
RANDOM_SEED = 87 #penser a changer la seed a posteriori pour voir
X_train, X_test = train_test_split(data_dummy, test_size=0.2, random_state = RANDOM_SEED)

X_train = X_train[X_train['label'] == 'normal.' ] # Only train on normal data (no intrusion)
# X_test = X_test[X_test['label'] == 'normal.']
X_train = X_train.drop(['label'], axis=1)
y_test  = X_test['label']
X_test  = X_test.drop(['label'], axis=1)
X_train = X_train.values
X_test  = X_test.values

# Standardize/Normalize dataset?
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

print('Training data size   :', X_train.shape)
print('Validation(test) data size :', X_test.shape)
data_dummy.head()


Training data size   : (70306, 109)
Validation(test) data size : (18625, 109)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,181,5450,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,0,239,486,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,0,235,1337,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,0,219,1337,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,217,2032,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [3]:
#MODEL BUILDING
input_dim = X_train.shape[1]
encoding_dim = 6

# Sequential
autoencoder_seq = Sequential()
autoencoder_seq.add(Dense(input_dim, input_shape=(input_dim,)))
autoencoder_seq.add(Dense(encoding_dim, activation="tanh"))
autoencoder_seq.add(Dense(int(encoding_dim/2),activation="tanh"))
autoencoder_seq.add(Dense(2,activation="tanh"))
autoencoder_seq.add(Dense(int(encoding_dim/2),activation="tanh"))
autoencoder_seq.add(Dense(encoding_dim,activation="tanh"))
autoencoder_seq.add(Dense(input_dim,activation="tanh"))
autoencoder_seq.compile(optimizer='adam', loss='binary_crossentropy',
             metrics=['accuracy'])
autoencoder_seq.summary()

# Functional 
input_layer = Input(shape=(input_dim, ))
encoder = Dense(encoding_dim, activation="tanh",activity_regularizer=l1(10e-5))(input_layer)
encoder = Dense(int(encoding_dim / 2), activation="tanh")(encoder)
encoder = Dense(int(2), activation="tanh")(encoder)
decoder = Dense(int(encoding_dim/ 2), activation='tanh')(encoder)
decoder = Dense(int(encoding_dim), activation='tanh')(decoder)
decoder = Dense(input_dim, activation='tanh')(decoder)
autoencoder_func = Model(inputs=input_layer, outputs=decoder)
autoencoder_func.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'] )
autoencoder_func.summary()


Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 109)               11990     
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 660       
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 21        
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 8         
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 9         
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 24        
_________________________________________________________________
dens

In [4]:
# MODEL TRAINING

nb_epoch = 20
batch_size = 50


t_ini = datetime.datetime.now()
history = autoencoder_func.fit(X_train, X_train,
                        epochs=nb_epoch,
                        batch_size=batch_size,
                        validation_split=0.1,
                        shuffle=True
                        )

t_fin = datetime.datetime.now()
print('Time to run the model: {} Sec.'.format((t_fin - t_ini).total_seconds()))


df_history = pd.DataFrame(history.history)

Instructions for updating:
Use tf.cast instead.
Train on 63275 samples, validate on 7031 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Time to run the model: 53.011124 Sec.


In [5]:
# MODEL EVALUATION

pyo.init_notebook_mode(connected=True)
trace = []
for label, loss in zip(['Train', 'Validation'], ['loss', 'val_loss']):
    trace0 = {'type' : 'scatter', 
              'x'    : df_history.index.tolist(),
              'y'    : df_history[loss].tolist(),
              'name' : label,
              'mode' : 'lines'
              }
        
    trace.append(trace0)
data = Data(trace)
layout = {'title' : 'Model train-vs-validation loss', 'titlefont':{'size' : 30},
          'xaxis' : {'title':  '<b> Epochs', 'titlefont':{ 'size' : 25}},
          'yaxis' : {'title':  '<b> Loss', 'titlefont':{ 'size' : 25}},
          }
fig = Figure(data = data, layout = layout)
    
pyo.iplot(fig)


plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.




In [22]:
# PREDICTION

def line_plot(df, col):
    
    x = df.index.tolist()
    y = df[col].tolist()
    
    trace = {'type':  'scatter', 
             'x'   :  x,
             'y'   :  y,
             'mode' : 'markers'
            }
    data   = Data([trace])
    layout = {'title': 'Line plot of {}'.format(col), 'titlefont': {'size': 30},
              'xaxis' : {'title' :'Data Index', 'titlefont': {'size' : 20}},
              'yaxis' : {'title': col, 'titlefont' : {'size': 20}},
              'hovermode': 'closest'
             }
    fig = Figure(data = data, layout = layout)
    return fig

predictions = autoencoder_func.predict(X_test)
predictions_train = autoencoder_func.predict(X_train[:18625])

mse = np.mean(np.power(X_test - predictions, 2), axis=1)
mse_train = np.mean(np.power(X_train[18625] - predictions_train, 2), axis=1)

df_error = pd.DataFrame({'reconstruction_error': mse, 'Label': y_test, 
                         'reconstruction_error_train':mse_train}, index=y_test.index)
print(df_error.describe())

outliers = df_error.index[df_error.reconstruction_error > 1000000].tolist()
print(len(outliers))

pyo.iplot(line_plot(df_error,'reconstruction_error'))


       reconstruction_error  reconstruction_error_train
count          1.862500e+04                18625.000000
mean           1.992263e+08                11779.619814
std            6.675690e+09                    0.000002
min            2.149359e-01                11779.619689
25%            2.574442e+03                11779.619814
50%            1.393621e+04                11779.619814
75%            8.736975e+04                11779.619815
max            2.436821e+11                11779.619817
1680


In [58]:
# INTERPRETATION

data_n = pd.DataFrame(X_test, index= y_test.index)

def compute_error_per_dim(point):
    
    initial_pt = np.array(data_n.loc[point,:]).reshape(1,109)
    reconstructed_pt = autoencoder_func.predict(initial_pt)
    
    return abs(np.array(initial_pt  - reconstructed_pt)[0])

def bar_plot(df, data_pt):
    x = df.columns.tolist()
    y = df.loc[data_pt]
    
    trace = {'type': 'bar',
             'x'   : x,
             'y'   : y}
    data = Data([trace])
    layout = {'title' : "<b>Reconstruction error in each dimension for data poitn {}".format(data_pt),
              'titlefont':{'size' : 20},
              'xaxis' : {'title': '<b>Features',
                         'titlefont':{'size' : 20},
                         'tickangle': -45, 'tickfont': {'size':15} },
              'yaxis' : {'title': '<b>Reconstruction Error',
                         'titlefont':{'size' : 20},
                         'tickfont': {'size':15}},
              'margin' : {'l':100, 'r' : 1, 'b': 200, 't': 100, 'pad' : 1},
              'height' : 600, 'width' : 800,
             }
    
    fig = Figure(data = data, layout = layout)
    
    return pyo.iplot(fig)

RE_per_dim = {}
for ind in outliers:
    RE_per_dim[ind] = compute_error_per_dim(ind)
    
RE_per_dim = pd.DataFrame(RE_per_dim).T


for pt in outliers[0:5]:
    bar_plot(RE_per_dim,pt)



        0        1        2         3         4         5         6    \
77047   1.0  14051.0      1.0  0.073441  0.035558  0.083261  0.124111   
33859   1.0    308.0  11732.0  0.073440  0.035557  0.083258  0.124108   
79567   1.0    221.0  15198.0  0.073440  0.035557  0.083258  0.124108   
23748   1.0    203.0  14948.0  0.073440  0.035557  0.083258  0.124108   
452728  1.0    182.0  13193.0  0.073440  0.035557  0.083258  0.124108   

             7         8         9    ...       99        100       101  \
77047   0.035953  0.782878  0.116673  ...  0.724956  0.082072  0.051608   
33859   0.035950  0.217106  0.116680  ...  0.724972  0.082069  0.051614   
79567   0.035950  0.217106  0.116680  ...  0.724972  0.082069  0.051613   
23748   0.035950  0.217106  0.116680  ...  0.724972  0.082069  0.051613   
452728  0.035950  0.217106  0.116680  ...  0.724972  0.082069  0.051613   

             102       103       104       105       106       107       108  
77047   0.204384  0.090460  0.1

Conclusion: les caractéristiques permettant de discriminer un point comme intrusion sont: src_bytes, dst_bytes, dst_host_srv_count, et dst_host_count 
