In [1]:
import numpy as np
import pandas as pd
import json
import os
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

### Execution parameters

In [2]:
# pseudorandom number generator seed
SEED = 10

# number of rows to be read from each data set (None = all rows)
limit_rows     = 1000

# can be 'by_frequency' (min_rel_freq), 'by_label' (labels_to_keep), 'by_quantile' (quantile_pct) or None (no filtering)
filter_mode    = 'by_label'
min_rel_freq   = 0.01
labels_to_keep = ['Benign', 'DDoS', 'Okiru', 'PartOfAHorizontalPortScan']
quantile_pct   = 0.75

In [3]:
from datetime import datetime

def get_time():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [4]:
from collections import OrderedDict
from operator import getitem

def pretty_print_value_counts(df_,lpad=64,rpad=12):
    vc_norm_f = df_['label'].value_counts(normalize=False).to_dict()
    vc_norm_t = df_['label'].value_counts(normalize=True).to_dict()
    ans = {}
    for label in df_['label'].unique():
        ans[label] = {'abs' : vc_norm_f[label], 'rel' : vc_norm_t[label]}        
    ans = OrderedDict(sorted(ans.items(), key=lambda x: getitem(x[1], 'abs'), reverse=True))
    output = []
    output.append(f"+-{'-----'.ljust(lpad,'-')}-+-{'-----'.rjust(rpad,'-')}-+-{'---------'.rjust(rpad,'-')}-+")
    output.append(f"| {'Label'.center(lpad)} | {'Count'.center(rpad)} | {'Count (%)'.center(rpad)} |")
    output.append(f"+-{'-----'.ljust(lpad,'-')}-+-{'-----'.rjust(rpad,'-')}-+-{'---------'.rjust(rpad,'-')}-+")
    for key,value in ans.items():
        col_1 = f"{key.ljust(lpad)}"
        col_2 = f"{value['abs']}".rjust(rpad)
        col_3 = f"{(100.0 * value['rel']):.06f}".rjust(rpad)
        output.append(f'| {col_1} | {col_2} | {col_3} |')
    output.append(f"+-{'-----'.ljust(lpad,'-')}-+-{'-----'.rjust(rpad,'-')}-+-{'---------'.rjust(rpad,'-')}-+")
    for line in output:
        print(line)
    return output

In [5]:
print(f"{' Execution parameters '.center(77,'*')}"            +
      '\n* ' + f'SEED           = {SEED}'.ljust(73)           + ' *' \
      '\n* ' + f'limit_rows     = {limit_rows}'.ljust(73)     + ' *' \
      '\n* ' + f'filter_mode    = {filter_mode}'.ljust(73)    + ' *' \
      '\n* ' + f'min_rel_freq   = {min_rel_freq}'.ljust(73) + ' *' \
      '\n* ' + f'labels_to_keep = {labels_to_keep}'.ljust(73) + ' *' \
      '\n'.ljust(80,'*'))

**************************** Execution parameters ***************************
* SEED           = 10                                                       *
* limit_rows     = 1000                                                     *
* filter_mode    = by_label                                                 *
* min_rel_freq   = 0.01                                                     *
* labels_to_keep = ['Benign', 'DDoS', 'Okiru', 'PartOfAHorizontalPortScan'] *
*****************************************************************************


### Read individual data sets

In [6]:
base_folder = 'D:/Workspace/IoT-23 Dataset/iot_23_datasets_small/opt/Malware-Project/BigDataset/IoTScenarios/'
sub_folders = filter(lambda x : ('bro' in x), [x[0] for x in os.walk(base_folder)])
base_filename = 'conn.log.labeled'

In [7]:
data_frames = []

for folder in sub_folders:  

    full_filename = folder.replace('\\','/') + '/' + base_filename

    print(f"{get_time()} Processing folder '{folder}' started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}.")

    df = pd.read_table(filepath_or_buffer=full_filename, skiprows=8, nrows=limit_rows)

    df.columns = [
        'ts',
        'uid',
        'id.orig_h',
        'id.orig_p',
        'id.resp_h',
        'id.resp_p',
        'proto',
        'service',
        'duration',
        'orig_bytes',
        'resp_bytes',
        'conn_state',
        'local_orig',
        'local_resp',
        'missed_bytes',
        'history',
        'orig_pkts',
        'orig_ip_bytes',
        'resp_pkts',
        'resp_ip_bytes',
        'label'
    ]

    df.drop(columns=['ts','uid','service','local_orig','local_resp','history','id.orig_h','id.resp_h'], inplace=True)

    df.drop(df.tail(1).index, inplace=True)

    print(f'{get_time()} Duplicated rows (before removal):\t{df.duplicated().sum()}')
    df.drop_duplicates(inplace=True)
    print(f'{get_time()} Duplicated rows (after removal):\t{df.duplicated().sum()}')

    data_frames.append(df)

    print(f"{get_time()} Processing finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}; DF shape: {df.shape}.\n")

2022-03-11 01:22:25 Processing folder 'D:/Workspace/IoT-23 Dataset/iot_23_datasets_small/opt/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-4-1\bro' started at 2022-03-11 01:22:25.
2022-03-11 01:22:25 Duplicated rows (before removal):	0
2022-03-11 01:22:25 Duplicated rows (after removal):	0
2022-03-11 01:22:25 Processing finished at 2022-03-11 01:22:25; DF shape: (451, 13).

2022-03-11 01:22:25 Processing folder 'D:/Workspace/IoT-23 Dataset/iot_23_datasets_small/opt/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-5-1\bro' started at 2022-03-11 01:22:25.
2022-03-11 01:22:25 Duplicated rows (before removal):	115
2022-03-11 01:22:25 Duplicated rows (after removal):	0
2022-03-11 01:22:25 Processing finished at 2022-03-11 01:22:25; DF shape: (884, 13).

2022-03-11 01:22:25 Processing folder 'D:/Workspace/IoT-23 Dataset/iot_23_datasets_small/opt/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-7-1\Somfy-01\bro' started at 2022-03-11 01:22:25.
2022-03-11

### Concatenate data sets

In [8]:
df_c = pd.concat(data_frames)

In [9]:
df_c

Unnamed: 0,id.orig_p,id.resp_p,proto,duration,orig_bytes,resp_bytes,conn_state,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label
0,1900.0,1900.0,udp,160.367579,7536,0,S0,0.0,24.0,8208.0,0.0,0.0,- benign -
1,32893.0,123.0,udp,0.016986,48,48,SF,0.0,1.0,76.0,1.0,76.0,- benign -
2,53395.0,443.0,tcp,0.003497,0,0,SF,0.0,5.0,212.0,3.0,144.0,- benign -
3,52801.0,53.0,udp,0.036724,34,311,SF,0.0,1.0,62.0,1.0,339.0,- benign -
4,1900.0,1900.0,udp,384.518261,15072,0,S0,0.0,48.0,16416.0,0.0,0.0,- benign -
...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,53429.0,23.0,tcp,-,-,-,S0,0.0,1.0,40.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
995,63832.0,23.0,tcp,-,-,-,S0,0.0,1.0,40.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
996,51275.0,23.0,tcp,-,-,-,S0,0.0,1.0,40.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
997,16324.0,23.0,tcp,-,-,-,S0,0.0,1.0,40.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan


### Standardize labels

In [10]:
pretty_print_value_counts(df_c)

+------------------------------------------------------------------+--------------+--------------+
|                              Label                               |    Count     |  Count (%)   |
+------------------------------------------------------------------+--------------+--------------+
| -   Benign   -                                                   |         3631 |    30.451191 |
| -   Malicious   PartOfAHorizontalPortScan                        |         2060 |    17.276082 |
| (empty)   Malicious   PartOfAHorizontalPortScan                  |         1656 |    13.887957 |
| -   benign   -                                                   |         1459 |    12.235827 |
| (empty)   Benign   -                                             |         1205 |    10.105669 |
| -   Malicious   C&C                                              |          887 |     7.438779 |
| -   Malicious   DDoS                                             |          464 |     3.891312 |
| (empty) 

['+------------------------------------------------------------------+--------------+--------------+',
 '|                              Label                               |    Count     |  Count (%)   |',
 '+------------------------------------------------------------------+--------------+--------------+',
 '| -   Benign   -                                                   |         3631 |    30.451191 |',
 '| -   Malicious   PartOfAHorizontalPortScan                        |         2060 |    17.276082 |',
 '| (empty)   Malicious   PartOfAHorizontalPortScan                  |         1656 |    13.887957 |',
 '| -   benign   -                                                   |         1459 |    12.235827 |',
 '| (empty)   Benign   -                                             |         1205 |    10.105669 |',
 '| -   Malicious   C&C                                              |          887 |     7.438779 |',
 '| -   Malicious   DDoS                                             |   

In [11]:
def replace_label(df_,src_,dest_):
    df_.loc[(df_.label == src_), 'label'] = dest_

In [12]:
replace_label(df_c, '-   Malicious   Attack',                                  'Attack')
replace_label(df_c, '(empty)   Malicious   Attack',                            'Attack')
replace_label(df_c, '-   benign   -',                                          'Benign')
replace_label(df_c, '-   Benign   -',                                          'Benign')
replace_label(df_c, '(empty)   Benign   -',                                    'Benign')
replace_label(df_c, 'CARhxZ3hLNVO3xYFok   Benign   -',                         'Benign')
replace_label(df_c, 'COLnd035cNITygYHp3   Benign   -',                         'Benign')
replace_label(df_c, '-   Malicious   C&C',                                     'C&C')
replace_label(df_c, '(empty)   Malicious   C&C',                               'C&C')
replace_label(df_c, '-   Malicious   C&C-FileDownload',                        'C&C-FileDownload')
replace_label(df_c, '-   Malicious   C&C-HeartBeat',                           'C&C-HeartBeat')
replace_label(df_c, '(empty)   Malicious   C&C-HeartBeat',                     'C&C-HeartBeat')
replace_label(df_c, '-   Malicious   C&C-HeartBeat-Attack',                    'C&C-HeartBeat-Attack')
replace_label(df_c, '-   Malicious   C&C-HeartBeat-FileDownload',              'C&C-HeartBeat-FileDownload')
replace_label(df_c, '-   Malicious   C&C-HeartBeat-PartOfAHorizontalPortScan', 'C&C-HeartBeat-PartOfAHorizontalPortScan')
replace_label(df_c, '-   Malicious   C&C-Mirai',                               'C&C-Mirai')
replace_label(df_c, '-   Malicious   C&C-PartOfAHorizontalPortScan',           'C&C-PartOfAHorizontalPortScan')
replace_label(df_c, '-   Malicious   C&C-Torii',                               'C&C-Torii')
replace_label(df_c, '-   Malicious   DDoS',                                    'DDoS')
replace_label(df_c, '(empty)   Malicious   DDoS',                              'DDoS')
replace_label(df_c, '-   Malicious   FileDownload',                            'FileDownload')
replace_label(df_c, '-   Malicious   Okiru',                                   'Okiru')
replace_label(df_c, '(empty)   Malicious   Okiru',                             'Okiru')
replace_label(df_c, '-   Malicious   Okiru-Attack',                            'Okiru-Attack')
replace_label(df_c, '-   Malicious   PartOfAHorizontalPortScan',               'PartOfAHorizontalPortScan')
replace_label(df_c, '(empty)   Malicious   PartOfAHorizontalPortScan',         'PartOfAHorizontalPortScan')
replace_label(df_c, '-   Malicious   PartOfAHorizontalPortScan-Attack',        'PartOfAHorizontalPortScan-Attack')

In [13]:
pretty_print_value_counts(df_c)

+------------------------------------------------------------------+--------------+--------------+
|                              Label                               |    Count     |  Count (%)   |
+------------------------------------------------------------------+--------------+--------------+
| Benign                                                           |         6295 |    52.792687 |
| PartOfAHorizontalPortScan                                        |         3716 |    31.164039 |
| C&C                                                              |         1258 |    10.550151 |
| DDoS                                                             |          464 |     3.891312 |
| C&C-HeartBeat                                                    |          102 |     0.855418 |
| C&C-FileDownload                                                 |           41 |     0.343844 |
| Okiru                                                            |           18 |     0.150956 |
| C&C-Hear

['+------------------------------------------------------------------+--------------+--------------+',
 '|                              Label                               |    Count     |  Count (%)   |',
 '+------------------------------------------------------------------+--------------+--------------+',
 '| Benign                                                           |         6295 |    52.792687 |',
 '| PartOfAHorizontalPortScan                                        |         3716 |    31.164039 |',
 '| C&C                                                              |         1258 |    10.550151 |',
 '| DDoS                                                             |          464 |     3.891312 |',
 '| C&C-HeartBeat                                                    |          102 |     0.855418 |',
 '| C&C-FileDownload                                                 |           41 |     0.343844 |',
 '| Okiru                                                            |   

### Remove less relevant rows (if specified)

In [14]:
if filter_mode == 'by_frequency':
    vcd = df_c['label'].value_counts(normalize=True).to_dict()
    relevant_labels = [key for key,value in vcd.items() if value > min_rel_freq]
    print(f'\n**** Dropping rows with relative frequency inferior to {100*min_rel_freq:.2f}% ****\n')
    filtered_labels = df_c['label'].value_counts().index.drop(relevant_labels)
    for label in filtered_labels:
        df_c.drop(df_c[df_c.label == label].index, inplace=True)

elif filter_mode == 'by_label':
    vc = df_c['label'].value_counts(normalize=False)
    print(f'\n**** Dropping rows with label not in {labels_to_keep} ****\n')
    filtered_labels = df_c['label'].value_counts().index.drop(labels_to_keep)
    for label in filtered_labels:
        df_c.drop(df_c[df_c.label == label].index, inplace=True)        

elif filter_mode == 'by_quantile':
    vc = df_c['label'].value_counts(normalize=False)
    threshold = int(vc.quantile(quantile_pct))
    print(f'\n**** Dropping rows with number of occurrences inferior to the {int(100*quantile_pct)}% quantile ****\n')
    df_c = df_c[df_c['label'].isin(vc.index[vc.gt(threshold)])]

pretty_print_value_counts(df_c)


**** Dropping rows with label not in ['Benign', 'DDoS', 'Okiru', 'PartOfAHorizontalPortScan'] ****

+------------------------------------------------------------------+--------------+--------------+
|                              Label                               |    Count     |  Count (%)   |
+------------------------------------------------------------------+--------------+--------------+
| Benign                                                           |          447 |    62.605042 |
| PartOfAHorizontalPortScan                                        |          239 |    33.473389 |
| DDoS                                                             |           26 |     3.641457 |
| Okiru                                                            |            2 |     0.280112 |
+------------------------------------------------------------------+--------------+--------------+


['+------------------------------------------------------------------+--------------+--------------+',
 '|                              Label                               |    Count     |  Count (%)   |',
 '+------------------------------------------------------------------+--------------+--------------+',
 '| Benign                                                           |          447 |    62.605042 |',
 '| PartOfAHorizontalPortScan                                        |          239 |    33.473389 |',
 '| DDoS                                                             |           26 |     3.641457 |',
 '| Okiru                                                            |            2 |     0.280112 |',
 '+------------------------------------------------------------------+--------------+--------------+']

### Replace missing values

In [15]:
df_c.loc[(df_c.duration == '-'), 'duration'] = 0.0
df_c.loc[(df_c.orig_bytes == '-'), 'orig_bytes'] = 0
df_c.loc[(df_c.resp_bytes == '-'), 'resp_bytes'] = 0

### Remove NaN values

In [16]:
print('NaN values (before removal):','\n',df_c.isna().sum())
df_c = df_c.dropna()
print('\nNaN values (after removal):','\n',df_c.isna().sum())

NaN values (before removal): 
 id.orig_p        0
id.resp_p        0
proto            0
duration         0
orig_bytes       0
resp_bytes       0
conn_state       0
missed_bytes     0
orig_pkts        0
orig_ip_bytes    0
resp_pkts        0
resp_ip_bytes    0
label            0
dtype: int64

NaN values (after removal): 
 id.orig_p        0
id.resp_p        0
proto            0
duration         0
orig_bytes       0
resp_bytes       0
conn_state       0
missed_bytes     0
orig_pkts        0
orig_ip_bytes    0
resp_pkts        0
resp_ip_bytes    0
label            0
dtype: int64


### Remove duplicate rows

In [17]:
current_cols = [x for x in df_c.columns]
print('Duplicated rows (before removal):\t',df_c.duplicated().sum())
df_c['count'] = 1
df_c = df_c.groupby(current_cols)['count'].count().reset_index().drop_duplicates()
print('Duplicated rows (after removal):\t',df_c.duplicated().sum())

Duplicated rows (before removal):	 3
Duplicated rows (after removal):	 0


### One-hot-encode categorical features

In [18]:
to_be_encoded = ['proto','conn_state']#,'id.orig_p','id.resp_p']
for col in to_be_encoded:
    df_c = pd.get_dummies(df_c, columns=[col])
    print(f'Column \'{col}\' successfully one-hot-encoded; new DF shape: {df_c.shape}.')

Column 'proto' successfully one-hot-encoded; new DF shape: (711, 16).
Column 'conn_state' successfully one-hot-encoded; new DF shape: (711, 23).


### Reorder columns to [..., count, label]

In [19]:
final_cols = [x for x in df_c.columns.values if x != 'label' and x != 'count']
final_cols.append('count')
final_cols.append('label')
print(final_cols,type(final_cols))
df_c = df_c.reindex(columns=final_cols)

['id.orig_p', 'id.resp_p', 'duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'proto_icmp', 'proto_tcp', 'proto_udp', 'conn_state_OTH', 'conn_state_REJ', 'conn_state_RSTR', 'conn_state_S0', 'conn_state_S1', 'conn_state_S3', 'conn_state_SF', 'conn_state_SH', 'count', 'label'] <class 'list'>


### Define data types

In [20]:
df_c = df_c.astype({
    'id.orig_p'     : 'uint64',
    'id.resp_p'     : 'uint64',
    'duration'      : float,
    'orig_bytes'    : 'uint64',
    'resp_bytes'    : 'uint64',
    'missed_bytes'  : 'uint64',
    'orig_pkts'     : 'uint64',
    'orig_ip_bytes' : 'uint64',
    'resp_pkts'     : 'uint64',
    'resp_ip_bytes' : 'uint64',
    'label'         : 'category'
}).infer_objects()

In [21]:
df_c

Unnamed: 0,id.orig_p,id.resp_p,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,proto_icmp,proto_tcp,proto_udp,conn_state_OTH,conn_state_REJ,conn_state_RSTR,conn_state_S0,conn_state_S1,conn_state_S3,conn_state_SF,conn_state_SH,count,label
0,3,1,0.000000,0,0,0,1,56,0,0,1,0,0,1,0,0,0,0,0,0,0,1,Benign
1,8,0,0.014866,560,560,0,10,840,10,840,1,0,0,1,0,0,0,0,0,0,0,1,Benign
2,8,0,0.015672,560,560,0,10,840,10,840,1,0,0,1,0,0,0,0,0,0,0,1,Benign
3,68,67,90.034713,3300,0,0,11,3608,0,0,0,0,1,0,0,0,1,0,0,0,0,1,Benign
4,68,67,58.133510,3000,0,0,10,3280,0,0,0,0,1,0,0,0,1,0,0,0,0,1,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
706,60818,22,113.222444,440,23464,0,201,10892,285,38284,0,1,0,1,0,0,0,0,0,0,0,1,Benign
707,60902,80,125.214360,910,182,0,7,1202,6,434,0,1,0,0,0,1,0,0,0,0,0,1,Benign
708,60917,53,0.000498,45,45,0,1,73,1,73,0,0,1,0,0,0,0,0,0,1,0,1,Benign
709,61822,81,0.000000,0,0,0,1,40,0,0,0,1,0,0,0,0,1,0,0,0,0,1,PartOfAHorizontalPortScan


In [22]:
print(json.dumps(dict(df_c.dtypes), indent=4, default=str))

{
    "id.orig_p": "uint64",
    "id.resp_p": "uint64",
    "duration": "float64",
    "orig_bytes": "uint64",
    "resp_bytes": "uint64",
    "missed_bytes": "uint64",
    "orig_pkts": "uint64",
    "orig_ip_bytes": "uint64",
    "resp_pkts": "uint64",
    "resp_ip_bytes": "uint64",
    "proto_icmp": "uint8",
    "proto_tcp": "uint8",
    "proto_udp": "uint8",
    "conn_state_OTH": "uint8",
    "conn_state_REJ": "uint8",
    "conn_state_RSTR": "uint8",
    "conn_state_S0": "uint8",
    "conn_state_S1": "uint8",
    "conn_state_S3": "uint8",
    "conn_state_SF": "uint8",
    "conn_state_SH": "uint8",
    "count": "int64",
    "label": "category"
}


### Persist prettified value counts

In [23]:
folder = 'unfiltered' if filter_mode is None else f'filtered_{filter_mode}'
filename = f'{folder}/value_counts.txt' if limit_rows is None else f'{folder}/value_counts_{int(limit_rows/1000)}k.txt'
if not os.path.exists(folder):
    os.makedirs(folder)
with open(filename, 'w') as file:
    for line in pretty_print_value_counts(df_c):
        file.write(line + '\n')

+------------------------------------------------------------------+--------------+--------------+
|                              Label                               |    Count     |  Count (%)   |
+------------------------------------------------------------------+--------------+--------------+
| Benign                                                           |          444 |    62.447257 |
| PartOfAHorizontalPortScan                                        |          239 |    33.614627 |
| DDoS                                                             |           26 |     3.656821 |
| Okiru                                                            |            2 |     0.281294 |
+------------------------------------------------------------------+--------------+--------------+


### Split data intro train/test subsets

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_c.drop(labels=['label'], axis=1),
                                                    df_c['label'],
                                                    test_size=0.2,
                                                    random_state=SEED)

### Select relevant features

In [25]:
from sklearn.feature_selection import VarianceThreshold

constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(X_train)

print('\nNon quasi-constant columns:')
non_constant_columns = [column for column in X_train.columns if column not in X_train.columns[constant_filter.get_support()]]
print(non_constant_columns)

print('\nQuasi-constant columns:')
constant_columns = [column for column in X_train.columns if column in X_train.columns[constant_filter.get_support()]]
print(constant_columns)

X_train = constant_filter.transform(X_train)
X_test = constant_filter.transform(X_test)


Non quasi-constant columns:
['proto_icmp', 'conn_state_REJ', 'conn_state_RSTR', 'conn_state_S1', 'conn_state_SH', 'count']

Quasi-constant columns:
['id.orig_p', 'id.resp_p', 'duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'proto_tcp', 'proto_udp', 'conn_state_OTH', 'conn_state_S0', 'conn_state_S3', 'conn_state_SF']


### Scale data to [0,1] range

In [26]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train)

print(f'MinMax scaler producted the following scale:\n\n{scaler.scale_}')

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

MinMax scaler producted the following scale:

[1.54942671e-05 1.60420945e-05 3.33326051e-04 4.46754882e-08
 9.07350446e-06 1.72651934e-04 4.46754882e-08 1.54053408e-09
 1.23456790e-02 8.73889069e-06 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00]


### Label-encode target feature

In [27]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(df_c['label']) #!!!

print(f'Label encoder found the following classes:\n\n{encoder.classes_}')

y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

Label encoder found the following classes:

['Benign' 'DDoS' 'Okiru' 'PartOfAHorizontalPortScan']


### Persist train and test data subsets

In [28]:
def persist_csv(df_,name_):
    suffix = f'{name_}' if limit_rows is None else f'{int(limit_rows/1000)}k_{name_}'    
    pd.DataFrame(df_).round(6).to_csv(f'{folder}/iot23_combined_{suffix}.csv', float_format='%g', header=None, index=None)

In [29]:
persist_csv(X_train,'X_train')
persist_csv(y_train,'y_train')
print('X_train',X_train.shape,'\ny_train',y_train.shape)

persist_csv(X_test,'X_test')
persist_csv(y_test,'y_test')
print('X_test',X_test.shape,'\ny_test',y_test.shape)

X_train (568, 16) 
y_train (568,)
X_test (143, 16) 
y_test (143,)
