In [1]:
import numpy as np
import pandas as pd
import json
import os
from datetime import datetime
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

### Execution parameters

In [2]:
SEED = 10
limit_rows = (1)*1000

In [3]:
def get_time():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

### Read individual data sets

In [4]:
base_folder = 'D:/Workspace/IoT-23 Dataset/iot_23_datasets_small/opt/Malware-Project/BigDataset/IoTScenarios/'
sub_folders = filter(lambda x : ('Honeypot' not in x and 'bro' in x), [x[0] for x in os.walk(base_folder)])
base_filename = 'conn.log.labeled'

In [5]:
data_frames = []

for folder in sub_folders:  
    
    full_filename = folder.replace('\\','/') + '/' + base_filename
    
    print(f"{get_time()} Processing folder '{folder}' started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}.")
    
    df = pd.read_table(filepath_or_buffer=full_filename, skiprows=8, nrows=limit_rows)
    
    df.columns = [
        'ts',
        'uid',
        'id.orig_h',
        'id.orig_p',
        'id.resp_h',
        'id.resp_p',
        'proto',
        'service',
        'duration',
        'orig_bytes',
        'resp_bytes',
        'conn_state',
        'local_orig',
        'local_resp',
        'missed_bytes',
        'history',
        'orig_pkts',
        'orig_ip_bytes',
        'resp_pkts',
        'resp_ip_bytes',
        'label'
    ]
    
    df.drop(columns=['ts','uid','service','local_orig','local_resp','history','id.orig_h','id.resp_h'], inplace=True)
        
    df.drop(df.tail(1).index, inplace=True)
    
    print(f'{get_time()} Duplicated rows (before removal):\t{df.duplicated().sum()}')
    df.drop_duplicates(inplace=True)
    print(f'{get_time()} Duplicated rows (after removal):\t{df.duplicated().sum()}')
    
    data_frames.append(df)
        
    print(f"{get_time()} Processing finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}; DF shape: {df.shape}.\n")

2022-03-01 19:39:02 Processing folder 'D:/Workspace/IoT-23 Dataset/iot_23_datasets_small/opt/Malware-Project/BigDataset/IoTScenarios/CTU-IoT-Malware-Capture-1-1\bro' started at 2022-03-01 19:39:02.
2022-03-01 19:39:02 Duplicated rows (before removal):	216
2022-03-01 19:39:02 Duplicated rows (after removal):	0
2022-03-01 19:39:02 Processing finished at 2022-03-01 19:39:02; DF shape: (783, 13).

2022-03-01 19:39:02 Processing folder 'D:/Workspace/IoT-23 Dataset/iot_23_datasets_small/opt/Malware-Project/BigDataset/IoTScenarios/CTU-IoT-Malware-Capture-17-1\bro' started at 2022-03-01 19:39:02.
2022-03-01 19:39:02 Duplicated rows (before removal):	951
2022-03-01 19:39:02 Duplicated rows (after removal):	0
2022-03-01 19:39:02 Processing finished at 2022-03-01 19:39:02; DF shape: (48, 13).

2022-03-01 19:39:02 Processing folder 'D:/Workspace/IoT-23 Dataset/iot_23_datasets_small/opt/Malware-Project/BigDataset/IoTScenarios/CTU-IoT-Malware-Capture-20-1\bro' started at 2022-03-01 19:39:02.
2022-03

### Concatenate data sets

In [6]:
df_c = pd.concat(data_frames)

In [7]:
df_c

Unnamed: 0,id.orig_p,id.resp_p,proto,duration,orig_bytes,resp_bytes,conn_state,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label
0,56305.0,23.0,tcp,-,-,-,S0,0.0,1.0,60.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
1,41101.0,23.0,tcp,-,-,-,S0,0.0,1.0,60.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
2,60905.0,23.0,tcp,2.998796,0,0,S0,0.0,3.0,180.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
3,44301.0,23.0,tcp,-,-,-,S0,0.0,1.0,60.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
4,50244.0,23.0,tcp,-,-,-,S0,0.0,1.0,60.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,53429.0,23.0,tcp,-,-,-,S0,0.0,1.0,40.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
995,63832.0,23.0,tcp,-,-,-,S0,0.0,1.0,40.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
996,51275.0,23.0,tcp,-,-,-,S0,0.0,1.0,40.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
997,16324.0,23.0,tcp,-,-,-,S0,0.0,1.0,40.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan


### Standardize labels

In [8]:
df_c['label'].value_counts()

-   Benign   -                                     3631
-   Malicious   PartOfAHorizontalPortScan          2060
(empty)   Malicious   PartOfAHorizontalPortScan    1656
(empty)   Benign   -                               1205
-   Malicious   C&C                                 887
-   Malicious   DDoS                                464
(empty)   Malicious   C&C                           371
-   Malicious   C&C-HeartBeat                        96
-   Malicious   C&C-FileDownload                     41
-   Malicious   Okiru                                15
-   Malicious   C&C-HeartBeat-FileDownload           10
-   Malicious   C&C-Torii                             8
-   Malicious   FileDownload                          7
(empty)   Malicious   C&C-HeartBeat                   6
(empty)   Malicious   Attack                          4
(empty)   Malicious   Okiru                           3
-   Malicious   C&C-Mirai                             1
Name: label, dtype: int64

In [9]:
def replace_label(src,dest):
    df_c.loc[(df_c.label == src), 'label'] = dest

In [10]:
replace_label('-   Malicious   Attack',                                  'Attack')
replace_label('(empty)   Malicious   Attack',                            'Attack')
replace_label('-   Benign   -',                                          'Benign')
replace_label('(empty)   Benign   -',                                    'Benign')
replace_label('COLnd035cNITygYHp3   Benign   -',                         'Benign')
replace_label('-   Malicious   C&C',                                     'C&C')
replace_label('(empty)   Malicious   C&C',                               'C&C')
replace_label('-   Malicious   C&C-FileDownload',                        'C&C-FileDownload')
replace_label('-   Malicious   C&C-HeartBeat',                           'C&C-HeartBeat')
replace_label('(empty)   Malicious   C&C-HeartBeat',                     'C&C-HeartBeat')
replace_label('-   Malicious   C&C-HeartBeat-Attack',                    'C&C-HeartBeat-Attack')
replace_label('-   Malicious   C&C-HeartBeat-FileDownload',              'C&C-HeartBeat-FileDownload')
replace_label('-   Malicious   C&C-HeartBeat-PartOfAHorizontalPortScan', 'C&C-HeartBeat-PartOfAHorizontalPortScan')
replace_label('-   Malicious   C&C-Mirai',                               'C&C-Mirai')
replace_label('-   Malicious   C&C-PartOfAHorizontalPortScan',           'C&C-PartOfAHorizontalPortScan')
replace_label('-   Malicious   C&C-Torii',                               'C&C-Torii')
replace_label('-   Malicious   DDoS',                                    'DDoS')
replace_label('(empty)   Malicious   DDoS',                              'DDoS')
replace_label('-   Malicious   FileDownload',                            'FileDownload')
replace_label('-   Malicious   Okiru',                                   'Okiru')
replace_label('(empty)   Malicious   Okiru',                             'Okiru')
replace_label('-   Malicious   Okiru-Attack',                            'Okiru-Attack')
replace_label('-   Malicious   PartOfAHorizontalPortScan',               'PartOfAHorizontalPortScan')
replace_label('(empty)   Malicious   PartOfAHorizontalPortScan',         'PartOfAHorizontalPortScan')
replace_label('-   Malicious   PartOfAHorizontalPortScan-Attack',        'PartOfAHorizontalPortScan-Attack')

In [11]:
df_c['label'].value_counts()

Benign                        4836
PartOfAHorizontalPortScan     3716
C&C                           1258
DDoS                           464
C&C-HeartBeat                  102
C&C-FileDownload                41
Okiru                           18
C&C-HeartBeat-FileDownload      10
C&C-Torii                        8
FileDownload                     7
Attack                           4
C&C-Mirai                        1
Name: label, dtype: int64

### Remove less frequent classes

In [12]:
vc = df_c['label'].value_counts(normalize=True)
print(f'**** Value counts (before) ****\n{vc}')
threshold = vc.quantile(0.75)
print(f'\n**** Dropping rows with less than {threshold:g} occurrences ****\n')
df_c = df_c[df_c['label'].isin(vc.index[vc.gt(threshold)])]
vc = df_c['label'].value_counts(normalize=True)
print(f'**** Value counts (after) ****\n{vc}')

**** Value counts (before) ****
Benign                        0.462112
PartOfAHorizontalPortScan     0.355088
C&C                           0.120210
DDoS                          0.044338
C&C-HeartBeat                 0.009747
C&C-FileDownload              0.003918
Okiru                         0.001720
C&C-HeartBeat-FileDownload    0.000956
C&C-Torii                     0.000764
FileDownload                  0.000669
Attack                        0.000382
C&C-Mirai                     0.000096
Name: label, dtype: float64

**** Dropping rows with less than 0.0633063 occurrences ****

**** Value counts (after) ****
Benign                       0.492966
PartOfAHorizontalPortScan    0.378797
C&C                          0.128236
Name: label, dtype: float64


### Replace missing values

In [13]:
df_c.loc[(df_c.duration == '-'), 'duration'] = 0.0
df_c.loc[(df_c.orig_bytes == '-'), 'orig_bytes'] = 0
df_c.loc[(df_c.resp_bytes == '-'), 'resp_bytes'] = 0

### Remove NaN values

In [14]:
print('NaN values (before removal):','\n',df_c.isna().sum())
df_c = df_c.dropna()
print('\nNaN values (after removal):','\n',df_c.isna().sum())

NaN values (before removal): 
 id.orig_p        0
id.resp_p        0
proto            0
duration         0
orig_bytes       0
resp_bytes       0
conn_state       0
missed_bytes     0
orig_pkts        0
orig_ip_bytes    0
resp_pkts        0
resp_ip_bytes    0
label            0
dtype: int64

NaN values (after removal): 
 id.orig_p        0
id.resp_p        0
proto            0
duration         0
orig_bytes       0
resp_bytes       0
conn_state       0
missed_bytes     0
orig_pkts        0
orig_ip_bytes    0
resp_pkts        0
resp_ip_bytes    0
label            0
dtype: int64


### Remove duplicate rows

In [15]:
current_cols = [x for x in df_c.columns]
print('Duplicated rows (before removal):\t',df_c.duplicated().sum())
df_c['count'] = 1
df_c = df_c.groupby(current_cols)['count'].count().reset_index().drop_duplicates()
print('Duplicated rows (after removal):\t',df_c.duplicated().sum())

Duplicated rows (before removal):	 167
Duplicated rows (after removal):	 0


### One-hot-encode categorical features

In [16]:
to_be_encoded = ['proto','conn_state']#,'id.orig_p','id.resp_p']
for col in to_be_encoded:
    df_c = pd.get_dummies(df_c, columns=[col])
    print(f'Column \'{col}\' successfully one-hot-encoded; new DF shape: {df_c.shape}.')

Column 'proto' successfully one-hot-encoded; new DF shape: (9643, 16).
Column 'conn_state' successfully one-hot-encoded; new DF shape: (9643, 25).


### Reorder columns to [..., count, label]

In [17]:
final_cols = [x for x in df_c.columns.values if x != 'label' and x != 'count']
final_cols.append('count')
final_cols.append('label')
print(final_cols,type(final_cols))
df_c = df_c.reindex(columns=final_cols)

['id.orig_p', 'id.resp_p', 'duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'proto_icmp', 'proto_tcp', 'proto_udp', 'conn_state_OTH', 'conn_state_REJ', 'conn_state_RSTO', 'conn_state_RSTR', 'conn_state_S0', 'conn_state_S1', 'conn_state_S3', 'conn_state_SF', 'conn_state_SH', 'conn_state_SHR', 'count', 'label'] <class 'list'>


### Define data types

In [18]:
df_c = df_c.astype({
    'id.orig_p'     : 'uint64',
    'id.resp_p'     : 'uint64',
    'duration'      : float,
    'orig_bytes'    : 'uint64',
    'resp_bytes'    : 'uint64',
    'missed_bytes'  : 'uint64',
    'orig_pkts'     : 'uint64',
    'orig_ip_bytes' : 'uint64',
    'resp_pkts'     : 'uint64',
    'resp_ip_bytes' : 'uint64',
    'label'         : 'category'
}).infer_objects()

In [19]:
df_c

Unnamed: 0,id.orig_p,id.resp_p,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,proto_icmp,proto_tcp,proto_udp,conn_state_OTH,conn_state_REJ,conn_state_RSTO,conn_state_RSTR,conn_state_S0,conn_state_S1,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR,count,label
0,0,8,0.000000,0,0,0,1,84,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,Benign
1,3,1,0.000000,0,0,0,1,56,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,2,Benign
2,3,1,0.000000,0,0,0,1,88,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,2,Benign
3,3,1,0.000001,120,0,0,2,176,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,Benign
4,3,1,0.000004,120,0,0,2,176,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9638,64543,81,0.000000,0,0,0,1,40,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,PartOfAHorizontalPortScan
9639,64923,22,0.000000,0,0,0,0,0,1,88,0,1,0,1,0,0,0,0,0,0,0,0,0,1,Benign
9640,65091,81,0.000000,0,0,0,1,40,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,PartOfAHorizontalPortScan
9641,65440,23,0.000000,0,0,0,1,40,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,PartOfAHorizontalPortScan


In [20]:
print(json.dumps(dict(df_c.dtypes), indent=4, default=str))

{
    "id.orig_p": "uint64",
    "id.resp_p": "uint64",
    "duration": "float64",
    "orig_bytes": "uint64",
    "resp_bytes": "uint64",
    "missed_bytes": "uint64",
    "orig_pkts": "uint64",
    "orig_ip_bytes": "uint64",
    "resp_pkts": "uint64",
    "resp_ip_bytes": "uint64",
    "proto_icmp": "uint8",
    "proto_tcp": "uint8",
    "proto_udp": "uint8",
    "conn_state_OTH": "uint8",
    "conn_state_REJ": "uint8",
    "conn_state_RSTO": "uint8",
    "conn_state_RSTR": "uint8",
    "conn_state_S0": "uint8",
    "conn_state_S1": "uint8",
    "conn_state_S3": "uint8",
    "conn_state_SF": "uint8",
    "conn_state_SH": "uint8",
    "conn_state_SHR": "uint8",
    "count": "int64",
    "label": "category"
}


### Split data intro train/test subsets

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_c.drop(labels=['label'], axis=1),
                                                    df_c['label'],
                                                    test_size=0.2,
                                                    random_state=SEED)

### Select relevant features

In [22]:
from sklearn.feature_selection import VarianceThreshold

constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(X_train)

print('\nNon quasi-constant columns:')
non_constant_columns = [column for column in X_train.columns if column not in X_train.columns[constant_filter.get_support()]]
print(non_constant_columns)

print('\nQuasi-constant columns:')
constant_columns = [column for column in X_train.columns if column in X_train.columns[constant_filter.get_support()]]
print(constant_columns)

X_train = constant_filter.transform(X_train)
X_test = constant_filter.transform(X_test)


Non quasi-constant columns:
['proto_icmp', 'conn_state_OTH', 'conn_state_REJ', 'conn_state_RSTO', 'conn_state_RSTR', 'conn_state_S1', 'conn_state_SH', 'conn_state_SHR']

Quasi-constant columns:
['id.orig_p', 'id.resp_p', 'duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'proto_tcp', 'proto_udp', 'conn_state_S0', 'conn_state_S3', 'conn_state_SF', 'count']


### Scale data to [0,1] range

In [23]:
from sklearn.preprocessing import MinMaxScaler

range_scaler = MinMaxScaler()
range_scaler.fit(X_train)

X_train = range_scaler.transform(X_train)
X_test = range_scaler.transform(X_test)

### Persist train and test data subsets

In [24]:
def persist_csv(df,name):
    if limit_rows is None:        
        pd.DataFrame(df).to_csv(f'sklearn/iot23_combined_{name}.csv',
                                float_format='%g',
                                header=None,
                                index=None)
    else:
        pd.DataFrame(df).round(6).to_csv(f'sklearn/iot23_combined_{int(limit_rows/1000)}k_{name}.csv',
                                float_format='%g',
                                header=None,
                                index=None)

In [25]:
persist_csv(X_train,'X_train')
persist_csv(y_train,'y_train')
print('X_train',X_train.shape,'\ny_train',y_train.shape)

persist_csv(X_test,'X_test')
persist_csv(y_test,'y_test')
print('X_test',X_test.shape,'\ny_test',y_test.shape)

X_train (7714, 16) 
y_train (7714,)
X_test (1929, 16) 
y_test (1929,)
