# Importing libraries

In [None]:
import numpy as np
import csv
import collections
import time
from sklearn.metrics import classification_report
import tensorflow as tf

# Open dataset files

In [None]:
# Lightweight container pairing a feature matrix with its label vector.
Dataset = collections.namedtuple('Dataset', ['data', 'target'])

# Numeric id of every traffic category; 0 is reserved for normal traffic.
label_numbers = {'normal': 0, 'dos': 1, 'probe': 2, 'u2r': 3, 'r2l': 4}

# Dataset locations on the mounted drive.
# Spaces in a Python path must NOT be backslash-escaped: "\ " is a shell
# convention, and inside a string literal it leaves a literal backslash in
# the file name (and is an invalid escape sequence).
TRAINING_FILE_20P = '/content/drive/My Drive/20 Percent Training Set.csv'
TRAINING_FILE_SMALL = '/content/drive/My Drive/Small Training Set.csv'
TRAINING_FILE_FULL = '/content/drive/My Drive/KDDTrain+.txt'
TEST_FILE = '/content/drive/My Drive/KDDTest+.txt'


# Read CSV files into arrays

In [None]:
def simple_csv_to_array(csv_file):
    """Read a comma-separated file into a list of rows.

    Args:
        csv_file: Path of the file to read.

    Returns:
        A list with one entry per record. Each entry is the list of that
        record's fields, kept as strings (no type conversion is done here).
    """
    # The context manager guarantees the handle is closed even if parsing
    # fails; newline='' lets the csv module do its own line-ending handling.
    with open(csv_file, newline='') as handle:
        return [list(packet) for packet in csv.reader(handle, delimiter=',')]

# Feature Extraction

In [None]:
# Convert raw label strings to binary class numbers
def binarize_labels(raw_labels,label_lookups):
    """Map each raw label to 0 (normal) or 1 (any kind of attack).

    Args:
        raw_labels: Iterable of label strings, either 'normal' or an
            attack name.
        label_lookups: Mapping from attack name to attack group name.

    Returns:
        A 1-D integer numpy array with one 0/1 entry per input label.

    Raises:
        KeyError: If a non-'normal' label is missing from label_lookups,
            or its group is missing from the module-level label_numbers.
    """
    labels = []
    for label in raw_labels:
        # 'normal' is already a group name; anything else is an attack name
        # that has to be translated to its group first.
        group = label if label == 'normal' else label_lookups[label]
        label_number = label_numbers[group]
        # Every non-normal group is collapsed into a single anomaly class.
        # A plain int is used because the np.int alias no longer exists in
        # current NumPy releases.
        labels.append(0 if label_number == 0 else 1)
    # An explicit dtype keeps the result integral even for empty input.
    return np.array(labels, dtype=int)

In [None]:
# Convert features to numbers
def binarize_features(raw_features,conversion_lookups):
    """Turn raw packet records into numeric feature vectors.

    Columns listed in conversion_lookups are one-hot encoded; every other
    column is converted with np.float32.

    Args:
        raw_features: Iterable of records, each an indexable sequence of
            raw (string) feature values.
        conversion_lookups: Dict mapping a column index to a dict of
            {category value: position inside that column's one-hot block}.

    Returns:
        A numpy array with one encoded row per input record.

    Raises:
        ValueError: If a column without a lookup holds a value that cannot
            be converted to a float.
    """
    features = []

    for packet in raw_features:
        row = []
        for feature_index, value in enumerate(packet):
            lookup = conversion_lookups.get(feature_index)
            if lookup is None:
                # quantitative column: just convert it
                row.append(np.float32(value))
                continue

            # qualitative column: one slot per known category, ex. 'tcp'
            one_hot = [np.float32(0.0)] * len(lookup)
            position = lookup.get(value)
            # Compare against None rather than truthiness: position 0 is a
            # valid slot and must not be mistaken for "category unknown".
            if position is not None:
                one_hot[position] = np.float32(1.0)
            # categories absent from the lookup stay all-zero
            row.extend(one_hot)

        features.append(np.array(row))

    return np.array(features)



In [None]:
def map_for_labels(labels):
    """Assign consecutive integers to the distinct values of `labels`.

    Values are numbered in order of first appearance,
    i.e. ['tcp','ftp','tcp'] -> {'tcp':0,'ftp':1}
    """
    numbering = {}
    for value in labels:
        # setdefault leaves already-numbered values untouched
        numbering.setdefault(value, len(numbering))
    return numbering


In [None]:
def feature_names(): 
    """Return the KDD features, in column order, as dictionaries.

    Returns:
        A list of 41 dicts, one per feature, each with the keys
        'name' (feature name) and 'type' ('c' = continuous,
        'd' = discrete). The list position is the feature's column index.
    """

    return [
        # c = continuous feature
        # d = discrete feature
        # basic features (0-8)
        {'name':'duration','type':'c'},
        {'name':'protocol_type','type':'d'},
        {'name':'service','type':'d'},
        {'name':'flag','type':'d'},
        {'name':'src_bytes','type':'c'},
        {'name':'dst_bytes','type':'c'},
        {'name':'land','type':'d'},
        {'name':'wrong_fragment','type':'c'},
        {'name':'urgent','type':'c'},

        # (9-21)
        {'name':'hot','type':'c'},
        {'name':'num_failed_logins','type':'c'},
        {'name':'logged_in','type':'d'},
        {'name':'num_compromised','type':'c'},
        {'name':'root_shell','type':'d'},
        {'name':'su_attempted','type':'d'},
        {'name':'num_root','type':'c'},
        {'name':'num_file_creations','type':'c'},
        {'name':'num_shells','type':'c'},
        {'name':'num_access_files','type':'c'},
        {'name':'num_outbound_cmds','type':'c'},
        # NOTE(review): the KDD documentation appears to spell this
        # 'is_host_login' -- confirm before renaming, callers may use the name
        {'name':'is_hot_login','type':'d'},
        {'name':'is_guest_login','type':'d'},

        # (22-30)
        {'name':'count','type':'d'},
        {'name':'srv_count','type':'d'},
        {'name':'serror_rate','type':'c'},
        {'name':'srv_serror_rate','type':'c'},
        {'name':'rerror_rate','type':'c'},
        {'name':'srv_rerror_rate','type':'c'},
        {'name':'same_srv_rate','type':'c'},
        {'name':'diff_srv_rate','type':'c'},
        {'name':'srv_diff_host_rate','type':'c'},
        # host based traffic features (31-40)
        {'name':'dst_host_count','type':'d'},
        {'name':'dst_host_srv_count','type':'d'},
        {'name':'dst_host_same_srv_rate','type':'c'},
        {'name':'dst_host_diff_srv_rate','type':'c'},
        {'name':'dst_host_same_src_port_rate','type':'c'},
        {'name':'dst_host_srv_diff_host_rate','type':'c'},
        {'name':'dst_host_serror_rate','type':'c'},
        {'name':'dst_host_srv_serror_rate','type':'c'},
        # listed exactly once: a duplicate entry here would shift the index
        # of the final feature to 41
        {'name':'dst_host_rerror_rate','type':'c'},
        {'name':'dst_host_srv_rerror_rate','type':'c'}
    ]



In [None]:
def get_label_groups():
    """Return a dict mapping every known attack name to its attack group."""

    # (group, attacks belonging to that group)
    attacks_by_group = (
        # denial of service attacks
        ('dos', (
            'back', 'land', 'neptune', 'pod', 'smurf', 'teardrop',
            'apache2', 'udpstorm', 'processtable', 'worm', 'mailbomb',
        )),
        # probe attacks
        ('probe', (
            'satan', 'ipsweep', 'nmap', 'portsweep', 'mscan', 'saint',
        )),
        # r2l attacks
        ('r2l', (
            'guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop',
            'warezmaster', 'warezclient', 'spy', 'xlock', 'xsnoop',
            'snmpguess', 'snmpgetattack', 'httptunnel', 'sendmail', 'named',
        )),
        # u2r attacks
        ('u2r', (
            'buffer_overflow', 'loadmodule', 'rootkit', 'perl',
            'sqlattack', 'xterm', 'ps',
        )),
    )

    # Invert the grouping into a flat attack -> group lookup.
    return {
        attack: group
        for group, attacks in attacks_by_group
        for attack in attacks
    }


# Deep Neural Network Implementation

In [None]:
def run_dnn_with_units_steps(training_set,test_set,units_array,num_steps):
    """Train a DNN classifier, predict on the test set and print a report.

    Args:
        training_set: Object with `data` (feature rows) and `target`
            (labels) attributes, used for fitting.
        test_set: Object with `data` and `target` attributes, used for
            evaluation.
        units_array: Passed to DNNClassifier as `hidden_units`.
        num_steps: Passed to `fit` as `steps`.

    Returns:
        None. The configuration, elapsed seconds and sklearn's
        classification report are printed.

    NOTE(review): relies on the tf.contrib API, which presumably limits
    this to TensorFlow 1.x -- confirm the runtime version.
    """

    x_train, y_train = training_set.data, training_set.target
    x_test, y_test = test_set.data, test_set.target

    #Build a DNN!
    # perf_counter replaces time.clock, which no longer exists in
    # current Python versions.
    start = time.perf_counter()
    # Declare the real width of a feature row instead of a hard-coded 1.
    feature_columns = [tf.contrib.layers.real_valued_column("", dimension=len(x_train[0]))]

    classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns, hidden_units=units_array)
    classifier.fit(x=x_train, y=y_train, steps=num_steps)
    # Materialise the predictions before stopping the timer: if predict
    # hands back a lazy iterable, the work would otherwise only happen
    # while the report is being built.
    test_pred = list(classifier.predict(x_test))

    stop = time.perf_counter()

    print('---------------------------------------------')
    print('DNN with hidden units: ' + str(units_array))
    print('Number of steps: ' + str(num_steps))
    print('Seconds elapsed: {}'.format(stop - start))
    print(classification_report(list(y_test),test_pred))
    print('---------------------------------------------')
  



In [None]:

# Grab the raw data at face value
training_file = np.array(simple_csv_to_array(TRAINING_FILE_FULL))
testing_file = np.array(simple_csv_to_array(TEST_FILE))

# Determine the number of columns
columns = len(training_file[0])
# Delete the last column
# (presumably the NSL-KDD difficulty score -- confirm against the data files)
training_file = np.delete(training_file,columns-1,1)
testing_file = np.delete(testing_file,columns-1,1)

# Separate labels and features
# After the deletion above the label sits in the last remaining column.
training_labels = training_file[:,columns-2]
testing_labels = testing_file[:,columns-2]
training_features = np.delete(training_file,columns-2,1)
testing_features = np.delete(testing_file,columns-2,1)

# Determine which features need lookups
# that is, which features are qualitative
# ex. 'tcp'
# Only the label-free feature columns are scanned: scanning training_file
# would also pick up the label column and one-hot encode it as a feature.
feature_lookups = {}
for i, first_value in enumerate(training_features[0]):
    try:
        np.float32(first_value)
    except ValueError:
        feature_lookups[i] = map_for_labels(training_features[:,i])


label_lookups = get_label_groups()

# Encode the label-stripped arrays, never the arrays that still contain
# the label column, so the classifier cannot read the answer from its input.
training_features = binarize_features(training_features,feature_lookups)
test_features = binarize_features(testing_features,feature_lookups)
training_labels = binarize_labels(training_labels,label_lookups)
test_labels = binarize_labels(testing_labels,label_lookups)
print("NUMBER OF FEATURES AFTER BINARIZATION:")
print(len(training_features[0]))



NUMBER OF FEATURES AFTER BINARIZATION:
145


In [None]:
# Bundle each feature matrix with its labels
the_training_set = Dataset(data=training_features, target=training_labels)
the_test_set = Dataset(data=test_features, target=test_labels)

# Hidden-layer layouts to try, all sized relative to the input width
num_features = len(training_features[0])
unit_trials = [
    [num_features],  # baseline
    [num_features, 2*num_features],
    [num_features, 2*num_features, num_features],
]

# Training step counts to try
step_trials = [100, 200, 300]

In [None]:
# Run the dnn with every possible configuration:
# each hidden-layer layout is trained once per step count.
for trial in unit_trials:
    for num_steps in step_trials:
        run_dnn_with_units_steps(the_training_set,the_test_set,trial,num_steps)



INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe8e7ffac88>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_eval_distribute': None, '_experimental_max_worker_delay_secs': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_protocol': None, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/tmp/tmpq2e1gyek', '_session_creation_timeout_secs': 7200}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done run