# Test models using new data

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
# Record the library versions this notebook was executed with.
for label, module in (
    ('pandas', pd),
    ('matplotlib', matplotlib),
    ('numpy', np),
    ('scikit-learn', sklearn),
):
    print(label, module.__version__)

pandas 0.22.0
matplotlib 2.2.2
numpy 1.14.1
scikit-learn 0.19.1


# 1. Load and Prepare Data

In [2]:
# Restore the `properties` variable previously saved with IPython's %store.
# NOTE(review): presumably the ordered list of column names, since it is
# passed as `names=` when the CSV is read — confirm in the notebook that stored it.
%store -r properties

In [3]:
# Load only the first rows of the test file; column names are supplied
# through `properties`.
TEST_FILE = "../data/corrected"
SAMPLE_ROWS = 10000

data = pd.read_csv(TEST_FILE, names=properties, nrows=SAMPLE_ROWS)
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
3,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
4,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.


In [4]:
# Turn the text columns into pandas categoricals; the three feature columns
# are then replaced by their integer category codes, while the target keeps
# its labels.
# NOTE(review): the codes are derived from the categories present in this
# sample only — confirm they match the encoding used when the model was trained.
categorical_features = ['protocol_type', 'service', 'flag']

data['attack_type'] = data['attack_type'].astype('category')
for column in categorical_features:
    data[column] = data[column].astype('category').cat.codes

In [5]:
# Distinct attack labels found in the loaded sample, in sorted order.
attack_categories = sorted(set(data['attack_type']))

In [6]:
# Work on an independent copy of the loaded frame. A bare `data[:]` slice is
# not a real copy, so assigning a column to it goes through pandas'
# chained-assignment path (SettingWithCopyWarning) and leaves it unclear
# whether `data` is affected.
normal_test = data.copy()

# Encode the target as a two-element one-hot list:
#   [1, 0] -> 'normal.' traffic
#   [0, 1] -> any attack
attacks = [
    [1, 0] if label == 'normal.' else [0, 1]
    for label in normal_test['attack_type'].tolist()
]

# Pin the new Series to the frame's own index so each label lands on its row
# regardless of how the frame is indexed.
normal_test['attack_type'] = pd.Series(attacks, index=normal_test.index)
normal_test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,0,2,12,5,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,"[1, 0]"
1,0,2,12,5,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,"[1, 0]"
2,0,2,12,5,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,"[1, 0]"
3,0,2,12,5,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,"[0, 1]"
4,0,2,12,5,105,146,0,0,0,0,...,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,"[0, 1]"


Data normalization

In [7]:
# Rescale every feature column to the [0, 1] range. The last entry of
# `properties` (attack_type, the output) is excluded.
feature_columns = list(properties[:-1])

# MinMaxScaler scales each column independently, so a single fit over the
# whole feature matrix replaces one scaler per column.
# NOTE(review): the scaler is fitted on this test sample itself; if the model
# was trained on data scaled with a scaler fitted on the training set, that
# fitted scaler should be reused here instead — confirm against the training
# notebook.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(normal_test[feature_columns].values)

# Attach the frame's own index and column labels before assigning, so the
# result is aligned explicitly instead of relying on a default 0..n-1 index.
normal_test[feature_columns] = pd.DataFrame(
    x_scaled, index=normal_test.index, columns=feature_columns
)
normal_test.head()



Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,0.0,1.0,0.8,1.0,3e-06,0.000381,0.0,0.0,0.0,0.0,...,0.996063,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,"[1, 0]"
1,0.0,1.0,0.8,1.0,3e-06,0.000381,0.0,0.0,0.0,0.0,...,0.996063,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,"[1, 0]"
2,0.0,1.0,0.8,1.0,3e-06,0.000381,0.0,0.0,0.0,0.0,...,0.996063,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,"[1, 0]"
3,0.0,1.0,0.8,1.0,3e-06,0.000381,0.0,0.0,0.0,0.0,...,0.996063,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,"[0, 1]"
4,0.0,1.0,0.8,1.0,3e-06,0.000381,0.0,0.0,0.0,0.0,...,0.996063,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,"[0, 1]"


# 2. Load Model and Evaluate

In [8]:
from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [9]:
# Separate the feature columns (X) from the encoded target column (Y).
feature_names, target_name = properties[:-1], properties[-1]
X_test = normal_test[feature_names]

# Every target cell holds a two-element list; going through tolist() stacks
# them into a 2-D array with one row per sample and two columns.
Y_test = np.array(normal_test[target_name].tolist())


In [10]:
# Persist the prepared test inputs and labels with IPython's %store so that
# another notebook or kernel can restore them via `%store -r`.
%store X_test
%store Y_test

Stored 'X_test' (DataFrame)
Stored 'Y_test' (ndarray)


In [11]:
# Load the saved Keras model from disk.
MODEL_PATH = '../logs/models/ids-model-9973.h5'
model = load_model(MODEL_PATH)

In [12]:
# Score the model on the prepared test set.
# NOTE(review): unpacking into two names assumes evaluate() returns the loss
# plus exactly one metric (accuracy) — confirm against how the model was compiled.
evaluation = model.evaluate(x=X_test, y=Y_test)
loss, acc = evaluation



In [13]:
# Report the accuracy value unpacked from model.evaluate().
print("Accuracy: ", acc)

Accuracy:  0.9805


In [15]:
# Manual spot check of a single sample.
index = 3104  # 1-based line number in the corrected.csv file
row = slice(index - 1, index)  # one-row selection that keeps the 2-D shape
sample = X_test[row]

print(sample)
print("Actual: ")
print(Y_test[row])
print("Predicted: ")
print(model.predict(sample))

      duration  protocol_type  service  flag  src_bytes  dst_bytes  land  \
3103       0.0            0.0      0.4   1.0   0.000033        0.0   0.0   

      wrong_fragment  urgent  hot            ...             dst_host_count  \
3103             0.0     0.0  0.0            ...                        1.0   

      dst_host_srv_count  dst_host_same_srv_rate  dst_host_diff_srv_rate  \
3103                 1.0                     1.0                     0.0   

      dst_host_same_src_port_rate  dst_host_srv_diff_host_rate  \
3103                          1.0                          0.0   

      dst_host_serror_rate  dst_host_srv_serror_rate  dst_host_rerror_rate  \
3103                   0.0                       0.0                   0.0   

      dst_host_srv_rerror_rate  
3103                       0.0  

[1 rows x 41 columns]
Actual: 
[[0 1]]
Predicted: 
[[7.832229e-14 1.000000e+00]]


In [17]:
# Manual spot check of a single sample (this one is a new attack type).
index = 4  # 1-based line number in the corrected.csv file
row = slice(index - 1, index)  # one-row selection that keeps the 2-D shape
sample = X_test[row]

print(sample)
print("Actual: ")
print(Y_test[row])
print("Predicted: ")
print(model.predict(sample))

   duration  protocol_type  service  flag  src_bytes  dst_bytes  land  \
3       0.0            1.0      0.8   1.0   0.000003   0.000381   0.0   

   wrong_fragment  urgent  hot            ...             dst_host_count  \
3             0.0     0.0  0.0            ...                        1.0   

   dst_host_srv_count  dst_host_same_srv_rate  dst_host_diff_srv_rate  \
3            0.996063                     1.0                    0.01   

   dst_host_same_src_port_rate  dst_host_srv_diff_host_rate  \
3                          0.0                          0.0   

   dst_host_serror_rate  dst_host_srv_serror_rate  dst_host_rerror_rate  \
3                   0.0                       0.0                   0.0   

   dst_host_srv_rerror_rate  
3                       0.0  

[1 rows x 41 columns]
Actual: 
[[0 1]]
Predicted: 
[[1.0000000e+00 1.2794821e-09]]


It seems the model predicts with high accuracy the attack types it was trained on, but classifies attack types it has not seen before (e.g. `snmpgetattack`) as normal traffic.

Next, we will try to find out which properties are relevant in determining a specific type of attack.