In [None]:
import os
import numpy as np
import pandas as pd

# Directory holding the raw CSV files and the order in which they are read.
database_directory = '../dataset'
file_list = ['Tuesday-WorkingHours.csv',
             'Wednesday-WorkingHours.csv',
             'Thursday-WorkingHours-Morning-WebAttacks.csv',
             'Thursday-WorkingHours-Afternoon-Infilteration.csv',
             'Friday-WorkingHours-Morning.csv',
             'Friday-WorkingHours-Afternoon-PortScan.csv',
             'Friday-WorkingHours-Afternoon-DDos.csv',
             'Monday-WorkingHours.csv']

# Read every file into its own frame and concatenate once at the end.
# `DataFrame.append` no longer exists in pandas >= 2.0, and calling it inside
# the loop re-copied all previously loaded rows on each iteration.
frames = []
for f in file_list:
    file_name = os.path.join(database_directory, f)
    frames.append(pd.read_csv(file_name, header=0, encoding='unicode_escape'))
    print(f, 'is read.')

# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
df = pd.concat(frames, ignore_index=True)
del frames  # release the per-file frames; only the combined frame is needed

print('All files are read.')
print('Number of rows:', len(df))

In [None]:
# Empty strings count as missing values; rows in which every field is
# missing carry no information and are removed.
df = df.replace('', np.nan).dropna(how='all')
print('Empty lines dropped.')
print('Number of rows:', len(df))

# Keep only the first occurrence of rows that are identical in every column,
# then renumber the remaining rows from zero.
df = df.drop_duplicates().reset_index(drop=True)
print('Duplicated lines dropped.')
print('Number of rows:', len(df))

# Replace the header read from the CSV files with snake_case names. The
# assignment is positional: the list holds 85 names and has to match the
# frame's column count and order exactly (pandas raises on a length mismatch).
# NOTE: the later NaN/inf clean-up addresses flow_bytes_s and flow_packets_s by
# the numeric positions 14 and 15, which is where they end up once the
# identifier columns are dropped below -- reordering this list breaks that.
df.columns = [
    'flow_id',
    'source_ip',
    'source_port',
    'destination_ip',
    'destination_port',
    'protocol',
    'timestamp',
    'flow_duration',
    'total_fwd_packets',
    'total_backward_packets',
    'total_length_of_fwd_packets',
    'total_length_of_bwd_packets',
    'fwd_packet_length_max',
    'fwd_packet_length_min',
    'fwd_packet_length_mean',
    'fwd_packet_length_std',
    'bwd_packet_length_max',
    'bwd_packet_length_min',
    'bwd_packet_length_mean',
    'bwd_packet_length_std',
    'flow_bytes_s',
    'flow_packets_s',
    'flow_iat_mean',
    'flow_iat_std',
    'flow_iat_max',
    'flow_iat_min',
    'fwd_iat_total',
    'fwd_iat_mean',
    'fwd_iat_std',
    'fwd_iat_max',
    'fwd_iat_min',
    'bwd_iat_total',
    'bwd_iat_mean',
    'bwd_iat_std',
    'bwd_iat_max',
    'bwd_iat_min',
    'fwd_psh_flags',
    'bwd_psh_flags',
    'fwd_urg_flags',
    'bwd_urg_flags',
    'fwd_header_length',
    'bwd_header_length',
    'fwd_packets_s',
    'bwd_packets_s',
    'min_packet_length',
    'max_packet_length',
    'packet_length_mean',
    'packet_length_std',
    'packet_length_variance',
    'fin_flag_count',
    'syn_flag_count',
    'rst_flag_count',
    'psh_flag_count',
    'ack_flag_count',
    'urg_flag_count',
    'cwe_flag_count',
    'ece_flag_count',
    'down_up_ratio',
    'average_packet_size',
    'avg_fwd_segment_size',
    'avg_bwd_segment_size',
    # Second fwd_header_length column; it is dropped right after this assignment.
    'fwd_header_length_duplicate',
    'fwd_avg_bytes_bulk',
    'fwd_avg_packets_bulk',
    'fwd_avg_bulk_rate',
    'bwd_avg_bytes_bulk',
    'bwd_avg_packets_bulk',
    'bwd_avg_bulk_rate',
    'subflow_fwd_packets',
    'subflow_fwd_bytes',
    'subflow_bwd_packets',
    'subflow_bwd_bytes',
    'init_win_bytes_forward',
    'init_win_bytes_backward',
    'act_data_pkt_fwd',
    'min_seg_size_forward',
    'active_mean',
    'active_std',
    'active_max',
    'active_min',
    'idle_mean',
    'idle_std',
    'idle_max',
    'idle_min',
    # Last column: split off as the label vector in the following cell.
    'label'
    ]

# Discard the repeated fwd_header_length column together with the per-flow
# identifiers (flow id, addresses, ports and timestamp) in a single call.
df.drop(
    columns=[
        'fwd_header_length_duplicate',
        'flow_id',
        'source_ip',
        'source_port',
        'destination_ip',
        'destination_port',
        'timestamp',
    ],
    inplace=True,
)
print('6 columns (flow_id, source_ip, source_port, destination_ip, destination_port, timestamp) are dropped')

In [None]:
# Split the frame into a label vector (last column) and a float feature matrix
# (all remaining columns).
df = df.to_numpy()
labels = df[:,-1]
df = df[:,:-1].astype(float)

# Column averages of flow_bytes_s (column 14) and flow_packets_s (column 15),
# taken only over rows in which every feature is finite (no nan, no +/-inf).
# A single boolean row mask replaces the two intermediate filtered copies.
finite_rows = np.isfinite(df).all(axis=1)
average_flow_bytes_s = np.average(df[finite_rows, 14])
average_flow_packets_s = np.average(df[finite_rows, 15])
del finite_rows

# Vectorised replacement instead of a Python loop over every row:
# nan -> 0 and +/-inf -> the column average computed above.
for column, column_average in ((14, average_flow_bytes_s),
                               (15, average_flow_packets_s)):
    values = df[:, column]  # basic slice: a view, so writes land in df
    values[np.isnan(values)] = 0
    values[np.isinf(values)] = column_average

print('Nan entries are replaced with zeros and inf entries with the column average.')

# Make sure the target folder exists so the save cannot fail after all the
# processing above has already been done.
os.makedirs('../data', exist_ok=True)
np.save('../data/features.npy', df)
np.save('../data/labels.npy', labels)

print('Numpy files saved into the data folder.')

In [None]:
extra_benign = np.load('../data/extra_benign.npy')
print('Number of extra benign test samples:', len(extra_benign))

# Per-class sample counts of the train and test split for each of the 5 folds.
# Keys are '<class>_tr' / '<class>_te'; each value is a list with one count per fold.
# Counts are looked up by class name over the full set of classes, so a class
# that is absent from one split of a fold is recorded as 0 instead of shifting
# the counts of the other classes (or raising IndexError), and every list ends
# up with the same length for the DataFrame below.
all_classes = np.unique(labels)
numbers = {}
for fold in range(1,6):
    # assumes test_index<fold>.npy holds row positions into `labels` -- TODO confirm
    test_index = np.load('../data/test_index' + str(fold) + '.npy')
    mask = np.ones(len(labels), dtype=bool)
    mask[test_index,] = False
    y_train, y_test_uniform = labels[mask], labels[~mask]

    unique_train, counts_train = np.unique(y_train, return_counts=True)
    unique_test_uniform, counts_test_uniform = np.unique(y_test_uniform, return_counts=True)
    train_counts = dict(zip(unique_train, counts_train))
    test_counts = dict(zip(unique_test_uniform, counts_test_uniform))
    for class_name in all_classes:
        train_key = '_'.join([class_name, 'tr'])
        test_key = '_'.join([class_name, 'te'])
        numbers.setdefault(train_key, []).append(train_counts.get(class_name, 0))
        numbers.setdefault(test_key, []).append(test_counts.get(class_name, 0))

# One row per '<class>_tr' / '<class>_te' key, one column per fold.
print(pd.DataFrame(numbers).transpose())
# Raw test-split counts of the last fold processed.
print(counts_test_uniform)