In [1]:
import numpy as np
import pandas as pd
import json
import os
from datetime import datetime
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

### Execution parameters

In [2]:
SEED = 10
limit_rows = 100000

In [3]:
def get_time():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

### Read individual data sets

In [4]:
base_folder = 'D:/Workspace/IoT-23 Dataset/iot_23_datasets_small/opt/Malware-Project/BigDataset/IoTScenarios/'
sub_folders = filter(lambda x : ('bro' in x), [x[0] for x in os.walk(base_folder)])
base_filename = 'conn.log.labeled'

In [5]:
data_frames = []

for folder in sub_folders:  
    
    full_filename = folder.replace('\\','/') + '/' + base_filename
    
    print(f"{get_time()} Processing folder '{folder}' started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}.")
    
    df = pd.read_table(filepath_or_buffer=full_filename, skiprows=8, nrows=limit_rows)
    
    df.columns = [
        'ts',
        'uid',
        'id.orig_h',
        'id.orig_p',
        'id.resp_h',
        'id.resp_p',
        'proto',
        'service',
        'duration',
        'orig_bytes',
        'resp_bytes',
        'conn_state',
        'local_orig',
        'local_resp',
        'missed_bytes',
        'history',
        'orig_pkts',
        'orig_ip_bytes',
        'resp_pkts',
        'resp_ip_bytes',
        'label'
    ]
    
    df.drop(columns=['ts','uid','service','local_orig','local_resp','history','id.orig_h','id.resp_h'], inplace=True)
        
    df.drop(df.tail(1).index, inplace=True)
    
    print(f'{get_time()} Duplicated rows (before removal):\t{df.duplicated().sum()}')
    df.drop_duplicates(inplace=True)
    print(f'{get_time()} Duplicated rows (after removal):\t{df.duplicated().sum()}')
    
    data_frames.append(df)
        
    print(f"{get_time()} Processing finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}; DF shape: {df.shape}.\n")

2022-03-03 13:58:28 Processing folder 'D:/Workspace/IoT-23 Dataset/iot_23_datasets_small/opt/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-4-1\bro' started at 2022-03-03 13:58:28.
2022-03-03 13:58:28 Duplicated rows (before removal):	0
2022-03-03 13:58:28 Duplicated rows (after removal):	0
2022-03-03 13:58:28 Processing finished at 2022-03-03 13:58:28; DF shape: (451, 13).

2022-03-03 13:58:28 Processing folder 'D:/Workspace/IoT-23 Dataset/iot_23_datasets_small/opt/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-5-1\bro' started at 2022-03-03 13:58:28.
2022-03-03 13:58:28 Duplicated rows (before removal):	144
2022-03-03 13:58:28 Duplicated rows (after removal):	0
2022-03-03 13:58:28 Processing finished at 2022-03-03 13:58:28; DF shape: (1229, 13).

2022-03-03 13:58:28 Processing folder 'D:/Workspace/IoT-23 Dataset/iot_23_datasets_small/opt/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-7-1\Somfy-01\bro' started at 2022-03-03 13:58:28.
2022-03-0

2022-03-03 13:58:31 Duplicated rows (before removal):	29547
2022-03-03 13:58:31 Duplicated rows (after removal):	0
2022-03-03 13:58:31 Processing finished at 2022-03-03 13:58:31; DF shape: (70452, 13).



### Concatenate data sets

In [6]:
df_c = pd.concat(data_frames)

In [7]:
df_c

Unnamed: 0,id.orig_p,id.resp_p,proto,duration,orig_bytes,resp_bytes,conn_state,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label
0,1900.0,1900.0,udp,160.367579,7536,0,S0,0.0,24.0,8208.0,0.0,0.0,- benign -
1,32893.0,123.0,udp,0.016986,48,48,SF,0.0,1.0,76.0,1.0,76.0,- benign -
2,53395.0,443.0,tcp,0.003497,0,0,SF,0.0,5.0,212.0,3.0,144.0,- benign -
3,52801.0,53.0,udp,0.036724,34,311,SF,0.0,1.0,62.0,1.0,339.0,- benign -
4,1900.0,1900.0,udp,384.518261,15072,0,S0,0.0,48.0,16416.0,0.0,0.0,- benign -
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99989,6279.0,23.0,tcp,-,-,-,S0,0.0,1.0,40.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
99990,29978.0,23.0,tcp,-,-,-,S0,0.0,1.0,40.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
99991,22165.0,23.0,tcp,-,-,-,S0,0.0,1.0,40.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
99996,41762.0,23.0,tcp,-,-,-,S0,0.0,1.0,40.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan


### Standardize labels

In [8]:
df_c['label'].value_counts()

(empty)   Malicious   PartOfAHorizontalPortScan    180019
-   Malicious   PartOfAHorizontalPortScan          105223
-   Benign   -                                      72630
-   Malicious   DDoS                                40449
(empty)   Benign   -                                35540
-   Malicious   C&C                                  6818
(empty)   Malicious   C&C                            4119
(empty)   Malicious   Attack                         3814
-   benign   -                                       1804
-   Malicious   C&C-HeartBeat                         228
(empty)   Malicious   Okiru                           129
(empty)   Malicious   C&C-HeartBeat                   122
-   Malicious   Okiru                                 112
-   Malicious   Attack                                101
-   Malicious   C&C-FileDownload                       45
-   Malicious   C&C-Torii                              23
-   Malicious   FileDownload                           15
-   Malicious 

In [9]:
def replace_label(src,dest):
    df_c.loc[(df_c.label == src), 'label'] = dest

In [10]:
replace_label('-   Malicious   Attack',                                  'Attack')
replace_label('(empty)   Malicious   Attack',                            'Attack')
replace_label('-   benign   -',                                          'Benign')
replace_label('-   Benign   -',                                          'Benign')
replace_label('(empty)   Benign   -',                                    'Benign')
replace_label('CARhxZ3hLNVO3xYFok   Benign   -',                         'Benign')
replace_label('COLnd035cNITygYHp3   Benign   -',                         'Benign')
replace_label('-   Malicious   C&C',                                     'C&C')
replace_label('(empty)   Malicious   C&C',                               'C&C')
replace_label('-   Malicious   C&C-FileDownload',                        'C&C-FileDownload')
replace_label('-   Malicious   C&C-HeartBeat',                           'C&C-HeartBeat')
replace_label('(empty)   Malicious   C&C-HeartBeat',                     'C&C-HeartBeat')
replace_label('-   Malicious   C&C-HeartBeat-Attack',                    'C&C-HeartBeat-Attack')
replace_label('-   Malicious   C&C-HeartBeat-FileDownload',              'C&C-HeartBeat-FileDownload')
replace_label('-   Malicious   C&C-HeartBeat-PartOfAHorizontalPortScan', 'C&C-HeartBeat-PartOfAHorizontalPortScan')
replace_label('-   Malicious   C&C-Mirai',                               'C&C-Mirai')
replace_label('-   Malicious   C&C-PartOfAHorizontalPortScan',           'C&C-PartOfAHorizontalPortScan')
replace_label('-   Malicious   C&C-Torii',                               'C&C-Torii')
replace_label('-   Malicious   DDoS',                                    'DDoS')
replace_label('(empty)   Malicious   DDoS',                              'DDoS')
replace_label('-   Malicious   FileDownload',                            'FileDownload')
replace_label('-   Malicious   Okiru',                                   'Okiru')
replace_label('(empty)   Malicious   Okiru',                             'Okiru')
replace_label('-   Malicious   Okiru-Attack',                            'Okiru-Attack')
replace_label('-   Malicious   PartOfAHorizontalPortScan',               'PartOfAHorizontalPortScan')
replace_label('(empty)   Malicious   PartOfAHorizontalPortScan',         'PartOfAHorizontalPortScan')
replace_label('-   Malicious   PartOfAHorizontalPortScan-Attack',        'PartOfAHorizontalPortScan-Attack')

In [11]:
df_c['label'].value_counts()

PartOfAHorizontalPortScan     285242
Benign                        109974
DDoS                           40449
C&C                            10937
Attack                          3915
C&C-HeartBeat                    350
Okiru                            241
C&C-FileDownload                  45
C&C-Torii                         23
FileDownload                      15
C&C-HeartBeat-FileDownload        10
C&C-Mirai                          1
Name: label, dtype: int64

### Filter rows by label

In [12]:
vc = df_c['label'].value_counts(normalize=True)
print(f'**** Value counts (after) ****\n{vc}')
relevant_labels = ['Benign','DDoS','PartOfAHorizontalPortScan']

print(f'\n**** Dropping rows with label not in {relevant_labels} ****\n')
filtered_labels = df_c['label'].value_counts().index.drop(relevant_labels)
for label in filtered_labels:
    df_c.drop(df_c[df_c.label == label].index, inplace=True)

vc = df_c['label'].value_counts(normalize=True)
print(f'**** Value counts (after) ****\n{vc}')

**** Value counts (after) ****
PartOfAHorizontalPortScan     0.632182
Benign                        0.243736
DDoS                          0.089647
C&C                           0.024240
Attack                        0.008677
C&C-HeartBeat                 0.000776
Okiru                         0.000534
C&C-FileDownload              0.000100
C&C-Torii                     0.000051
FileDownload                  0.000033
C&C-HeartBeat-FileDownload    0.000022
C&C-Mirai                     0.000002
Name: label, dtype: float64

**** Dropping rows with label not in ['Benign', 'DDoS', 'PartOfAHorizontalPortScan'] ****

**** Value counts (after) ****
PartOfAHorizontalPortScan    0.674731
Benign                       0.234785
DDoS                         0.090484
Name: label, dtype: float64


In [13]:
df_c['label'].value_counts()

PartOfAHorizontalPortScan    241700
Benign                        84104
DDoS                          32413
Name: label, dtype: int64

### Replace missing values

In [14]:
df_c.loc[(df_c.duration == '-'), 'duration'] = 0.0
df_c.loc[(df_c.orig_bytes == '-'), 'orig_bytes'] = 0
df_c.loc[(df_c.resp_bytes == '-'), 'resp_bytes'] = 0

### Remove NaN values

In [15]:
print('NaN values (before removal):','\n',df_c.isna().sum())
df_c = df_c.dropna()
print('\nNaN values (after removal):','\n',df_c.isna().sum())

NaN values (before removal): 
 id.orig_p        0
id.resp_p        0
proto            0
duration         0
orig_bytes       0
resp_bytes       0
conn_state       0
missed_bytes     0
orig_pkts        0
orig_ip_bytes    0
resp_pkts        0
resp_ip_bytes    0
label            0
dtype: int64

NaN values (after removal): 
 id.orig_p        0
id.resp_p        0
proto            0
duration         0
orig_bytes       0
resp_bytes       0
conn_state       0
missed_bytes     0
orig_pkts        0
orig_ip_bytes    0
resp_pkts        0
resp_ip_bytes    0
label            0
dtype: int64


### Remove duplicate rows

In [16]:
current_cols = [x for x in df_c.columns]
print('Duplicated rows (before removal):\t',df_c.duplicated().sum())
df_c['count'] = 1
df_c = df_c.groupby(current_cols)['count'].count().reset_index().drop_duplicates()
print('Duplicated rows (after removal):\t',df_c.duplicated().sum())

Duplicated rows (before removal):	 120
Duplicated rows (after removal):	 0


### One-hot-encode categorical features

In [17]:
to_be_encoded = ['proto','conn_state']#,'id.orig_p','id.resp_p']
for col in to_be_encoded:
    df_c = pd.get_dummies(df_c, columns=[col])
    print(f'Column \'{col}\' successfully one-hot-encoded; new DF shape: {df_c.shape}.')

Column 'proto' successfully one-hot-encoded; new DF shape: (358097, 16).
Column 'conn_state' successfully one-hot-encoded; new DF shape: (358097, 28).


### Reorder columns to [..., count, label]

In [18]:
final_cols = [x for x in df_c.columns.values if x != 'label' and x != 'count']
final_cols.append('count')
final_cols.append('label')
print(final_cols,type(final_cols))
df_c = df_c.reindex(columns=final_cols)

['id.orig_p', 'id.resp_p', 'duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'proto_icmp', 'proto_tcp', 'proto_udp', 'conn_state_OTH', 'conn_state_REJ', 'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR', 'conn_state_RSTRH', 'conn_state_S0', 'conn_state_S1', 'conn_state_S2', 'conn_state_S3', 'conn_state_SF', 'conn_state_SH', 'conn_state_SHR', 'count', 'label'] <class 'list'>


### Define data types

In [19]:
df_c = df_c.astype({
    'id.orig_p'     : 'uint64',
    'id.resp_p'     : 'uint64',
    'duration'      : float,
    'orig_bytes'    : 'uint64',
    'resp_bytes'    : 'uint64',
    'missed_bytes'  : 'uint64',
    'orig_pkts'     : 'uint64',
    'orig_ip_bytes' : 'uint64',
    'resp_pkts'     : 'uint64',
    'resp_ip_bytes' : 'uint64',
    'label'         : 'category'
}).infer_objects()

In [20]:
df_c

Unnamed: 0,id.orig_p,id.resp_p,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,proto_icmp,proto_tcp,proto_udp,conn_state_OTH,conn_state_REJ,conn_state_RSTO,conn_state_RSTOS0,conn_state_RSTR,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR,count,label
0,0,23,0.000000,0,0,0,1,40,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,PartOfAHorizontalPortScan
1,0,53,2.751432,0,0,0,3069,122760,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,Benign
2,0,80,0.000000,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,DDoS
3,0,80,0.000000,0,0,0,1,40,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,DDoS
4,0,80,0.008748,0,0,0,19,760,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,DDoS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358092,65532,81,0.000000,0,0,0,1,40,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,PartOfAHorizontalPortScan
358093,65533,23,0.000000,0,0,0,1,40,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,PartOfAHorizontalPortScan
358094,65534,81,0.000000,0,0,0,1,40,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,PartOfAHorizontalPortScan
358095,65534,62336,0.000000,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,DDoS


In [21]:
print(json.dumps(dict(df_c.dtypes), indent=4, default=str))

{
    "id.orig_p": "uint64",
    "id.resp_p": "uint64",
    "duration": "float64",
    "orig_bytes": "uint64",
    "resp_bytes": "uint64",
    "missed_bytes": "uint64",
    "orig_pkts": "uint64",
    "orig_ip_bytes": "uint64",
    "resp_pkts": "uint64",
    "resp_ip_bytes": "uint64",
    "proto_icmp": "uint8",
    "proto_tcp": "uint8",
    "proto_udp": "uint8",
    "conn_state_OTH": "uint8",
    "conn_state_REJ": "uint8",
    "conn_state_RSTO": "uint8",
    "conn_state_RSTOS0": "uint8",
    "conn_state_RSTR": "uint8",
    "conn_state_RSTRH": "uint8",
    "conn_state_S0": "uint8",
    "conn_state_S1": "uint8",
    "conn_state_S2": "uint8",
    "conn_state_S3": "uint8",
    "conn_state_SF": "uint8",
    "conn_state_SH": "uint8",
    "conn_state_SHR": "uint8",
    "count": "int64",
    "label": "category"
}


### Split data intro train/test subsets

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_c.drop(labels=['label'], axis=1),
                                                    df_c['label'],
                                                    test_size=0.2,
                                                    random_state=SEED)

### Select relevant features

In [23]:
from sklearn.feature_selection import VarianceThreshold

constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(X_train)

print('\nNon quasi-constant columns:')
non_constant_columns = [column for column in X_train.columns if column not in X_train.columns[constant_filter.get_support()]]
print(non_constant_columns)

print('\nQuasi-constant columns:')
constant_columns = [column for column in X_train.columns if column in X_train.columns[constant_filter.get_support()]]
print(constant_columns)

X_train = constant_filter.transform(X_train)
X_test = constant_filter.transform(X_test)


Non quasi-constant columns:
['proto_icmp', 'conn_state_REJ', 'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR', 'conn_state_RSTRH', 'conn_state_S1', 'conn_state_S2', 'conn_state_S3', 'conn_state_SF', 'conn_state_SH', 'conn_state_SHR', 'count']

Quasi-constant columns:
['id.orig_p', 'id.resp_p', 'duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'proto_tcp', 'proto_udp', 'conn_state_OTH', 'conn_state_S0']


### Scale data to [0,1] range

In [24]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train)

print(f'MinMax scaler producted the following scale:\n\n{scaler.scale_}')

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

MinMax scaler producted the following scale:

[1.52590219e-05 1.52590219e-05 3.33254216e-05 1.32368589e-08
 9.07350446e-06 1.72651934e-04 4.46754882e-08 1.54053408e-09
 2.87356322e-03 8.73889069e-06 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00]


### Label-encode target feature

In [25]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(y_train)

print(f'Label encoder found the following classes:\n\n{encoder.classes_}')

y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

Label encoder found the following classes:

['Benign' 'DDoS' 'PartOfAHorizontalPortScan']


### Persist train and test data subsets

In [26]:
def persist_csv(df,name):
    if limit_rows is None:        
        pd.DataFrame(df).to_csv(f'pruned-by-label/iot23_combined_{name}.csv',
                                float_format='%g',
                                header=None,
                                index=None)
    else:
        pd.DataFrame(df).round(6).to_csv(f'pruned-by-label/iot23_combined_{int(limit_rows/1000)}k_{name}.csv',
                                float_format='%g',
                                header=None,
                                index=None)

In [27]:
persist_csv(X_train,'X_train')
persist_csv(y_train,'y_train')
print('X_train',X_train.shape,'\ny_train',y_train.shape)

persist_csv(X_test,'X_test')
persist_csv(y_test,'y_test')
print('X_test',X_test.shape,'\ny_test',y_test.shape)

X_train (286477, 14) 
y_train (286477,)
X_test (71620, 14) 
y_test (71620,)
