In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline 
%config IPCompleter.greedy=True

import collections
import io
import math
import tensorflow as tf
from IPython import display
from sklearn import metrics


## Reading data from file

In [2]:
from collections import Counter

### main dataset ###
data = pd.read_csv("20151219.txt", sep = "\t", header = None)
data.columns = ["conn_len", "service", "src_bytes", "dst_bytes", "conn_numb", "Same_srv_rate", "Serror_rate", "Srv_serror_rate", "Dst_host_count", "Dst_host_srv_count", "Dst_host_same_src_port_rate", "Dst_host_serror_rate", "Dst_host_srv_serror_rate", "Conn_state", "IDS_detection", "Malware_detection", "Ashula_detection", "attack_flag", "src_IP", "src_port", "dst_IP", "dst_port", "start_time", "proto"]

print(Counter(data["attack_flag"]))

Counter({-1: 149915, 1: 67713})


## Transform to binary

In [3]:
data.loc[:, "attack_flag"] = data.loc[:,"attack_flag"].replace(-2, -1)
data.loc[:, "attack_flag"] = data.loc[:, "attack_flag"] * -1
data.loc[:, "attack_flag"] = data.loc[:,"attack_flag"].replace(-1, 0)

print(Counter(data["attack_flag"]))

Counter({1: 149915, 0: 67713})


In [4]:
# removing NaNs
data = data.dropna()
data.shape

(217628, 24)

# a bit of feature engineering

## port numbers classification

In [5]:
t1 = 1023  # well-known port numbers
t2 = 49151 # registered ports
t3 = 65535 # client ports

def wk(data_row):
    if (data_row["src_port"] <= t1): 
        value = 1
    elif ((data_row["src_port"] > t1) and (data_row["src_port"] <= t2)):
        value = 0
    elif ((data_row["src_port"] > t2) and (data_row["src_port"] <= t3)):
        value = 0
    return value

def reg(data_row):
    if (data_row["src_port"] <= t1): 
        value = 0
    elif ((data_row["src_port"] > t1) and (data_row["src_port"] <= t2)):
        value = 1
    elif ((data_row["src_port"] > t2) and (data_row["src_port"] <= t3)):
        value = 0
    return value

def cli(data_row):
    if (data_row["src_port"] <= t1): 
        value = 0
    elif ((data_row["src_port"] > t1) and (data_row["src_port"] <= t2)):
        value = 0
    elif ((data_row["src_port"] > t2) and (data_row["src_port"] <= t3)):
        value = 1
    return value

data["well_known_src_pool"] = data.apply(wk, axis=1)
data["registered_src_pool"] = data.apply(reg, axis=1)
data["cli_src_pool"] = data.apply(cli, axis=1)

In [6]:
data["src_port"] = data["src_port"].apply(str)
data["dst_port"] = data["dst_port"].apply(str)

In [7]:
train, validate, test = np.split(data, [int(.7*len(data)), int(.85*len(data))])
train = train.sample(frac=1)
validate = validate.sample(frac=1)
test = test.sample(frac=1)
y_train = train.loc[:, "attack_flag"].values.ravel()
X_train = train.drop(["attack_flag"], axis=1)
y_validate = validate.loc[:, "attack_flag"].values.ravel()
X_validate = validate.drop(["attack_flag"], axis=1)
y_test = test.loc[:, "attack_flag"].values.ravel()
X_test = test.drop(["attack_flag"], axis=1)
print(train.shape)
print(validate.shape)
print(test.shape)

(152339, 27)
(32644, 27)
(32645, 27)


In [None]:
deep_numeric_feat = ["Same_srv_rate", "Serror_rate", "Srv_serror_rate", "Dst_host_count", "Dst_host_srv_count", "Dst_host_same_src_port_rate", "Dst_host_serror_rate", "Dst_host_srv_serror_rate", "well_known_src_pool", "registered_src_pool", "cli_src_pool"]
wide_numeric_feat = ["conn_len",  "src_bytes", "dst_bytes", "conn_numb"]
categorical_feat = ["service", "Conn_state", "src_port", "dst_port", "proto"]

In [None]:
deep_numeric_feat_tf = [tf.feature_column.numeric_column(k) for k in deep_numeric_feat]
wide_numeric_feat_tf = [tf.feature_column.numeric_column(k) for k in wide_numeric_feat]

In [None]:
#data["src_port"].unique().tolist()

In [None]:
src_port_mod = ["src_" + s for s in data["src_port"].unique().tolist()]
src_port = tf.feature_column.categorical_column_with_vocabulary_list(
    'src_port', src_port_mod)

dst_port_mod = ["dst_" + s for s in data["dst_port"].unique().tolist()]
dst_port = tf.feature_column.categorical_column_with_vocabulary_list(
    'dst_port', dst_port_mod)

service = tf.feature_column.categorical_column_with_vocabulary_list(
    'service', data["service"].unique().tolist())

Conn_state = tf.feature_column.categorical_column_with_vocabulary_list(
    'Conn_state', data["Conn_state"].unique().tolist())

proto = tf.feature_column.categorical_column_with_vocabulary_list(
    'proto', data["proto"].unique().tolist())

In [361]:
deep_columns = [
    tf.feature_column.indicator_column(service),
    tf.feature_column.indicator_column(Conn_state),
    tf.feature_column.indicator_column(proto),
    tf.feature_column.embedding_column(src_port, dimension=150),
    tf.feature_column.embedding_column(dst_port, dimension=100),
]

In [443]:
crossed_columns = [
      tf.feature_column.crossed_column(
          ['src_port', 'service'], hash_bucket_size=10000),
      tf.feature_column.crossed_column(
          ['dst_port', 'service'], hash_bucket_size=10000),
      tf.feature_column.crossed_column(
          ['conn_len', 'src_bytes'], hash_bucket_size=10000),
      tf.feature_column.crossed_column(
          ['conn_len', 'dst_bytes'], hash_bucket_size=10000),
      tf.feature_column.crossed_column(
          ['dst_port', 'conn_numb'], hash_bucket_size=10000),  
      tf.feature_column.crossed_column(
          ['service', 'proto'], hash_bucket_size=100),
    
  ]

In [444]:
model = tf.estimator.DNNLinearCombinedClassifier(
    model_dir = "/home/matz/Desktop/ml_ids/NN_model2",
    dnn_activation_fn=tf.nn.selu,
    linear_feature_columns = deep_columns + deep_numeric_feat_tf + wide_numeric_feat_tf + crossed_columns,
    dnn_feature_columns = deep_columns + deep_numeric_feat_tf + wide_numeric_feat_tf,
    dnn_hidden_units = [450, 250, 125, 75, 55, 35, 20]
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/matz/Desktop/ml_ids/NN_model2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3925f8b5c0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [445]:
feature_set= deep_numeric_feat + wide_numeric_feat + categorical_feat

In [446]:
def get_input_fn(data_set, num_epochs=None, n_batch = 256, shuffle=True):
    return tf.estimator.inputs.pandas_input_fn(
       x=pd.DataFrame({k: data_set[k].values for k in feature_set}),
       y = pd.Series(data_set["attack_flag"].values),
       batch_size=n_batch,   
       num_epochs=num_epochs,
       shuffle=shuffle)

In [447]:
model.train(input_fn=get_input_fn(train, n_batch=180), steps=20000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /home/matz/Desktop/ml_ids/NN_model2/model.ckpt.
INFO:tensorflow:loss = 133.64708, step = 1
INFO:tensorflow:global_step/sec: 60.8993
INFO:tensorflow:loss = 9.242481, step = 101 (1.644 sec)
INFO:tensorflow:global_step/sec: 89.7141
INFO:tensorflow:loss = 2.0768383, step = 201 (1.114 sec)
INFO:tensorflow:global_step/sec: 91.6082
INFO:tensorflow:loss = 9.181526, step = 301 (1.092 sec)
INFO:tensorflow:global_step/sec: 93.5229
INFO:tensorflow:loss = 6.9554467, step = 401 (1.069 sec)
INFO:tensorflow:global_step/sec: 105.136
INFO:tensorflow:loss = 1.8151029, step = 501 (0.951 sec)
INFO:tensorflow:global_step/sec: 105.988
INFO:tensorflow:loss = 6.348154, step = 601 (0.944 sec)
INFO:tensorflow:global_step/sec: 99.9904


INFO:tensorflow:global_step/sec: 78.4764
INFO:tensorflow:loss = 2.0542085, step = 8101 (1.274 sec)
INFO:tensorflow:global_step/sec: 76.7568
INFO:tensorflow:loss = 40.443428, step = 8201 (1.303 sec)
INFO:tensorflow:global_step/sec: 80.931
INFO:tensorflow:loss = 24.6836, step = 8301 (1.236 sec)
INFO:tensorflow:global_step/sec: 77.9019
INFO:tensorflow:loss = 7.632496, step = 8401 (1.284 sec)
INFO:tensorflow:global_step/sec: 78.3253
INFO:tensorflow:loss = 0.8809751, step = 8501 (1.276 sec)
INFO:tensorflow:global_step/sec: 76.7198
INFO:tensorflow:loss = 2.6999104, step = 8601 (1.304 sec)
INFO:tensorflow:global_step/sec: 79.9785
INFO:tensorflow:loss = 2.7021062, step = 8701 (1.250 sec)
INFO:tensorflow:global_step/sec: 77.2571
INFO:tensorflow:loss = 5.454455, step = 8801 (1.295 sec)
INFO:tensorflow:global_step/sec: 75.9718
INFO:tensorflow:loss = 10.937195, step = 8901 (1.317 sec)
INFO:tensorflow:global_step/sec: 77.4359
INFO:tensorflow:loss = 1.9573661, step = 9001 (1.291 sec)
INFO:tensorflow

INFO:tensorflow:global_step/sec: 79.0182
INFO:tensorflow:loss = 0.9240111, step = 16401 (1.266 sec)
INFO:tensorflow:global_step/sec: 79.0216
INFO:tensorflow:loss = 2.1085324, step = 16501 (1.265 sec)
INFO:tensorflow:global_step/sec: 78.7327
INFO:tensorflow:loss = 1.0253159, step = 16601 (1.270 sec)
INFO:tensorflow:global_step/sec: 79.4943
INFO:tensorflow:loss = 1.2692142, step = 16701 (1.260 sec)
INFO:tensorflow:global_step/sec: 77.604
INFO:tensorflow:loss = 1.4126577, step = 16801 (1.286 sec)
INFO:tensorflow:global_step/sec: 80.0176
INFO:tensorflow:loss = 9.675607, step = 16901 (1.250 sec)
INFO:tensorflow:global_step/sec: 70.5089
INFO:tensorflow:loss = 6.668715, step = 17001 (1.418 sec)
INFO:tensorflow:global_step/sec: 78.1752
INFO:tensorflow:loss = 7.7922974, step = 17101 (1.279 sec)
INFO:tensorflow:global_step/sec: 78.8548
INFO:tensorflow:loss = 2.433801, step = 17201 (1.268 sec)
INFO:tensorflow:global_step/sec: 74.6523
INFO:tensorflow:loss = 4.1493754, step = 17301 (1.340 sec)
INFO

<tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier at 0x7f3925f8b4e0>

In [448]:
eval_metrics = model.evaluate(input_fn=get_input_fn(validate, n_batch=validate.shape[0], shuffle=False), steps=1)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-05-15-22:15:13
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /home/matz/Desktop/ml_ids/NN_model2/model.ckpt-20000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-05-15-22:15:18
INFO:tensorflow:Saving dict for global step 20000: accuracy = 0.9975493, accuracy_baseline = 0.7026406, auc = 0.9998531, auc_precision_recall = 0.9996146, average_loss = 0.0074825445, global_step = 20000, label/mean = 0.2973594, loss = 244.26018, precision = 0.9936417, prediction/mean = 0.2977546, recall = 0.99814564
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 20000: /home/matz/Desktop/ml_ids/NN_model2/model.ckpt-20000


In [449]:
for key,value in sorted(eval_metrics.items()):
  print('%s: %s' % (key, value))

accuracy: 0.9975493
accuracy_baseline: 0.7026406
auc: 0.9998531
auc_precision_recall: 0.9996146
average_loss: 0.0074825445
global_step: 20000
label/mean: 0.2973594
loss: 244.26018
precision: 0.9936417
prediction/mean: 0.2977546
recall: 0.99814564


In [450]:
results = []
results = list(model.predict(input_fn=get_input_fn(test, num_epochs=1, n_batch = X_test.shape[0], shuffle=False)))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /home/matz/Desktop/ml_ids/NN_model2/model.ckpt-20000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [451]:
y_pred = []
for el in results:
    y_pred.append(el["class_ids"])

In [452]:
name = "NN"

Acc = {}
F1S = {}
Prec = {}
Rec = {}
FPR = {}

Acc[name] = metrics.accuracy_score(test["attack_flag"], y_pred)
F1S[name] = metrics.f1_score(test["attack_flag"], y_pred)
Prec[name] = metrics.precision_score(test["attack_flag"], y_pred)
Rec[name] = metrics.recall_score(test["attack_flag"], y_pred)
tn, fp, fn, tp = metrics.confusion_matrix(test["attack_flag"], y_pred).ravel()
FPR[name] = fp/(fp+tn)

print("{0:2} Accuracy: {1:.5f}, F1-score: {2:.5f}, Precision: {3:.5f}, Recall: {4:.5f}, FPR: {5:.5f}".format(name, Acc[name], F1S[name], Prec[name], Rec[name], FPR[name]))
print("TN: {0:3}; FP: {1:3}; FN: {2:4}; TP: {3:3}\n".format(tn, fp, fn, tp))

NN Accuracy: 0.98484, F1-score: 0.99199, Precision: 0.99313, Recall: 0.99086, FPR: 0.12478
TN: 1487; FP: 212; FN:  283; TP: 30663



## New data prediction

In [286]:
test_set = pd.read_csv("20151220.txt", sep = "\t", header = None)
test_set.columns = ["conn_len", "service", "src_bytes", "dst_bytes", "conn_numb", "Same_srv_rate", "Serror_rate", "Srv_serror_rate", "Dst_host_count", "Dst_host_srv_count", "Dst_host_same_src_port_rate", "Dst_host_serror_rate", "Dst_host_srv_serror_rate", "Conn_state", "IDS_detection", "Malware_detection", "Ashula_detection", "attack_flag", "src_IP", "src_port", "dst_IP", "dst_port", "start_time", "proto"]
print(Counter(test_set["attack_flag"]))

Counter({-1: 189082, 1: 12436})


In [65]:
test_set.loc[:, "attack_flag"] = test_set.loc[:,"attack_flag"].replace(-2, -1)
test_set.loc[:, "attack_flag"] = test_set.loc[:, "attack_flag"] * -1
test_set.loc[:, "attack_flag"] = test_set.loc[:,"attack_flag"].replace(-1, 0)

In [27]:
# removing NaNs
test_set = test_set.dropna()
test_set.shape

(201518, 24)

In [28]:
test_set["well_known_src_pool"] = test_set.apply(wk, axis=1)
test_set["registered_src_pool"] = test_set.apply(reg, axis=1)
test_set["cli_src_pool"] = test_set.apply(cli, axis=1)

In [29]:
test_set["src_port"] = test_set["src_port"].apply(str)
test_set["dst_port"] = test_set["dst_port"].apply(str)

In [30]:
#test_new_data = test_set.sample(n=30000, random_state=1)
test_new_data = test_set.loc[range(30000), :]
test_new_data = test_new_data.sample(frac=1)
y_test_new_data = test_new_data.loc[:, "attack_flag"].values.ravel()
X_test_new_data = test_new_data.drop(["attack_flag"], axis=1)
print(test_new_data.shape)

(30000, 27)


In [31]:
print(Counter(test_new_data["attack_flag"]))

Counter({1: 29540, 0: 460})


In [32]:
results = []
results = list(model.predict(input_fn=get_input_fn(test_new_data, num_epochs=1, n_batch = X_test.shape[0], shuffle=False)))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /home/matz/Desktop/ml_ids/NN_model/model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [33]:
y_pred = []
for el in results:
    y_pred.append(el["class_ids"])

In [34]:
name = "NN"

Acc = {}
F1S = {}
Prec = {}
Rec = {}
FPR = {}

Acc[name] = metrics.accuracy_score(y_test_new_data, y_pred)
F1S[name] = metrics.f1_score(y_test_new_data, y_pred)
Prec[name] = metrics.precision_score(y_test_new_data, y_pred)
Rec[name] = metrics.recall_score(y_test_new_data, y_pred)
tn, fp, fn, tp = metrics.confusion_matrix(y_test_new_data, y_pred).ravel()
FPR[name] = fp/(fp+tn)

print("{0:2} Accuracy: {1:.5f}, F1-score: {2:.5f}, Precision: {3:.5f}, Recall: {4:.5f}, FPR: {5:.5f}".format(name, Acc[name], F1S[name], Prec[name], Rec[name], FPR[name]))
print("TN: {0:3}; FP: {1:3}; FN: {2:4}; TP: {3:3}\n".format(tn, fp, fn, tp))

NN Accuracy: 0.98410, F1-score: 0.99192, Precision: 0.99284, Recall: 0.99100, FPR: 0.45870
TN: 249; FP: 211; FN:  266; TP: 29274

