In [1]:
import math
import numpy as np
import h5py
import tensorflow as tf
from tensorflow.python.framework import ops
import pandas as pd
import time
#import matplotlib.pyplot as plt
#%matplotlib inline

np.random.seed(1)

In [2]:
# LOAD FILES INTO DATAFRAME

tic = time.time()

# Local path - prefix string with r to avoid unicode escape on '\U'
# path = r'C:\Users\Nick Bashour\Documents\Personal\14. Stanford\2. Academics\3. 2021 Spring\1. CS 230\2. Project\2. Data\MachineLearningCVE\\'
# AWS EC2 path
path = 'data/'

files = [
    'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
    'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    'Friday-WorkingHours-Morning.pcap_ISCX.csv',
    'Monday-WorkingHours.pcap_ISCX.csv',
    'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    'Tuesday-WorkingHours.pcap_ISCX.csv',
    'Wednesday-workingHours.pcap_ISCX.csv'
]

pd_files = {} # empty dict of pd objects

# Read in every file
for i in range(len(files)):    
    files[i] = path + files[i]   
    pd_files[i] = pd.read_csv(files[i])
    pd_files[i]["File #"] = i  # a column to track where each file came from
    
df = pd.concat(pd_files)
df.columns = df.columns.str.strip()

toc = time.time()
print("Data loaded in", toc-tic, "seconds")

Data loaded in 21.8242244720459 seconds


In [3]:
# EXTRACT LABELS
labels = df['Label'].unique()
labels = np.sort(labels)
print("Labels are:\n", labels)
print("\n# of unique labels: ", len(labels))

Labels are:
 ['BENIGN' 'Bot' 'DDoS' 'DoS GoldenEye' 'DoS Hulk' 'DoS Slowhttptest'
 'DoS slowloris' 'FTP-Patator' 'Heartbleed' 'Infiltration' 'PortScan'
 'SSH-Patator' 'Web Attack - Brute Force' 'Web Attack - Sql Injection'
 'Web Attack - XSS']

# of unique labels:  15


In [4]:
# CAPTURE LABEL AND FILE FREQUENCY
label_freq = df.groupby("Label").size()
print(label_freq)

file_freq = df.groupby("File #").size()
print(file_freq)

Label
BENIGN                        2273097
Bot                              1966
DDoS                           128027
DoS GoldenEye                   10293
DoS Hulk                       231073
DoS Slowhttptest                 5499
DoS slowloris                    5796
FTP-Patator                      7938
Heartbleed                         11
Infiltration                       36
PortScan                       158930
SSH-Patator                      5897
Web Attack - Brute Force         1507
Web Attack - Sql Injection         21
Web Attack - XSS                  652
dtype: int64
File #
0    225745
1    286467
2    191033
3    529918
4    288602
5    170366
6    445909
7    692703
dtype: int64


In [5]:
# CONVERT DATAFRAME LABELS INTO INDICES FOR ONE-HOT REPRESENTATION

# Create a dictionary mapping labels to indices
labels_to_indices = {}
for i in range(len(labels)):
    labels_to_indices[labels[i]] = i
print("Lables to indices is:\n", labels_to_indices, "\n")

# Convert the dataframe column to indices
df["Label"] = df["Label"].map(labels_to_indices)

Lables to indices is:
 {'BENIGN': 0, 'Bot': 1, 'DDoS': 2, 'DoS GoldenEye': 3, 'DoS Hulk': 4, 'DoS Slowhttptest': 5, 'DoS slowloris': 6, 'FTP-Patator': 7, 'Heartbleed': 8, 'Infiltration': 9, 'PortScan': 10, 'SSH-Patator': 11, 'Web Attack - Brute Force': 12, 'Web Attack - Sql Injection': 13, 'Web Attack - XSS': 14} 



In [13]:
# CHECK INDEX / LABEL MAPPING WORKED
index_freq = df.groupby("Label").size()
print(index_freq)
for i in range(len(labels)):
    assert(label_freq[i] == index_freq[i])

Label
0     2273097
1        1966
2      128027
3       10293
4      231073
5        5499
6        5796
7        7938
8          11
9          36
10     158930
11       5897
12       1507
13         21
14        652
dtype: int64


In [6]:
# CREATE TEST & TRAINING DATA: iterate through each label type. Place 1,000 of each label type
# into the test file and the remaining into training.
tic = time.time()

test_samples_per_label = 1000

# Select features
feature_cols = df.columns.tolist()
feature_cols.remove('Label')
feature_cols.remove('File #')
feature_cols.remove('Destination Port')

# Empty dataframes to store test and training variables
X_train = pd.DataFrame()
X_test = pd.DataFrame()

# Iterate through files
for i in range(len(labels)):
    data = df[df["Label"] == i]
    data = data.sample(frac=1) # random sort

    # Add the first 1000 (or as many as available) datapoints to test, remaining to train
    test_rows = min(test_samples_per_label, len(data))
    test = data.head(test_rows)
    train = data.tail(len(data) - test_rows)
    assert(len(test) + len(train) == len(data)) 
        
    # Append X features and labels to train / test sets
    X_train = X_train.append(train[feature_cols])
    X_test = X_test.append(test[feature_cols])
    if i == 0:
        Y_train = train['Label']
        Y_test = test['Label']
    else:
        Y_train = Y_train.append(train['Label'])
        Y_test = Y_test.append(test['Label'])
    
# Transpose labels so their shape is (# samples, 1)
Y_train = Y_train.transpose()
Y_test = Y_test.transpose()
                      
toc = time.time()
print("Time elapsed:", toc-tic, "seconds")

Time elapsed: 70.18049430847168 seconds


In [15]:
# Check # of test samples corresponds to sum of min(1000,# of samples) across labels
n_test_samples = 0
for i in range(len(labels)):
    n_test_samples += min(1000, index_freq[i])
assert n_test_samples == len(X_test)

In [7]:
# CONVERT PANDAS INTO NP ARRAYS AND RUN CHECKS
X_train, X_test, Y_train, Y_test = X_train.to_numpy(), X_test.to_numpy(), Y_train.to_numpy(), Y_test.to_numpy()

for i in [X_train, X_test, Y_train, Y_test]:
    print(type(i))
    print(i.shape)    
    # Replace NaN's with 0's
    print("# of Nan's:", np.count_nonzero(np.isnan(i)))
    i[np.isnan(i)] = 0

<class 'numpy.ndarray'>
(2819023, 77)
# of Nan's: 1355
<class 'numpy.ndarray'>
(11720, 77)
# of Nan's: 3
<class 'numpy.ndarray'>
(2819023,)
# of Nan's: 0
<class 'numpy.ndarray'>
(11720,)
# of Nan's: 0


In [8]:
# CONVERT Y TO ONE-HOT ARRAYS

def one_hot(y, classes):
    # Input: array y of shape (m,) or (m,1) whose values represent indices
    # Output: one-hot matrix of shape (m, classes)
    y_OH = np.zeros((len(y), classes))
    for i in range(len(y)):
        y_OH[i,int(y[i])] = 1
    return y_OH

Y_train = one_hot(Y_train, len(labels))
Y_test = one_hot(Y_test, len(labels))
print(Y_train.shape)
print(Y_test.shape)

(2819023, 15)
(11720, 15)


In [9]:
# Normalize training & test sets according to training set data
mean = np.mean(X_train, axis=0)
epsilon = pow(10,-7)
std = np.std(X_train, axis=0) + epsilon
mean = mean.reshape(1, mean.shape[0])
std = std.reshape(1, std.shape[0])
assert(X_train.shape[1] == mean.shape[1]) # ensure proper sizes
assert(X_train.shape[1] == std.shape[1])
X_train = (X_train - mean)/std
X_test = (X_test - mean)/std

  x = asanyarray(arr - arrmean)
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [10]:
# Check again for NaN's and remove
for i in [X_train, X_test]:
    print(i.shape)
    
    # Replace NaN's with 0's
    print("# of Nan's:", np.count_nonzero(np.isnan(i)))
    i[np.isnan(i)] = 0

(2819023, 77)
# of Nan's: 5638046
(11720, 77)
# of Nan's: 23440


In [20]:
# Write files to CSV
tic = time.time()

path = "clean_data/"
out_X_train = open(path+"X_train.csv", "w")
np.savetxt(path+'X_train.csv', X_train, delimiter=',')
out_Y_train = open(path+"Y_train.csv", "w")
np.savetxt(path+'Y_train.csv', Y_train, delimiter=',')

out_X_test = open(path+"X_test.csv", "w")
np.savetxt(path+'X_test.csv', X_test, delimiter=',')
out_Y_test = open(path+"Y_test.csv", "w")
np.savetxt(path+'Y_test.csv', Y_test, delimiter=',')

toc = time.time()
print("Data output in", toc-tic, "seconds")

Data output in 252.58813571929932 seconds
