In [456]:
import graphviz
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree, metrics
from imblearn.under_sampling import RandomUnderSampler

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [457]:
# Explicit per-column dtypes for the HIKARI-2021 flow-meter CSV, passed to
# pd.read_csv. Column names are listed under the dtype they are parsed as; the
# comprehension flips that grouping into the {column: dtype} mapping.
dtype_hikari = {
    column: dtype
    for dtype, columns in {
        'uint32': ['Unnamed: 0'],
        'str': ['uid'],
        'category': ['originh', 'responh', 'traffic_category'],
        'bool': ['Label'],
        'uint8': [
            'fwd_header_size_min', 'fwd_header_size_max',
            'bwd_header_size_min', 'bwd_header_size_max',
        ],
        'uint16': [
            'originp', 'responp',
            'fwd_pkts_payload.min', 'fwd_pkts_payload.max',
            'bwd_pkts_payload.min', 'bwd_pkts_payload.max',
            'flow_pkts_payload.min', 'flow_pkts_payload.max',
            'fwd_init_window_size', 'bwd_init_window_size',
            'fwd_last_window_size',
        ],
        'uint64': [
            'fwd_pkts_tot', 'bwd_pkts_tot',
            'fwd_data_pkts_tot', 'bwd_data_pkts_tot',
            'fwd_header_size_tot', 'bwd_header_size_tot',
            'flow_FIN_flag_count', 'flow_SYN_flag_count', 'flow_RST_flag_count',
            'fwd_PSH_flag_count', 'bwd_PSH_flag_count', 'flow_ACK_flag_count',
            'fwd_URG_flag_count', 'bwd_URG_flag_count',
            'flow_CWR_flag_count', 'flow_ECE_flag_count',
        ],
        'float32': ['down_up_ratio', 'fwd_bulk_packets', 'bwd_bulk_packets'],
        'float64': [
            'flow_duration',
            'fwd_pkts_per_sec', 'bwd_pkts_per_sec', 'flow_pkts_per_sec',
            'fwd_pkts_payload.tot', 'fwd_pkts_payload.avg', 'fwd_pkts_payload.std',
            'bwd_pkts_payload.tot', 'bwd_pkts_payload.avg', 'bwd_pkts_payload.std',
            'flow_pkts_payload.tot', 'flow_pkts_payload.avg', 'flow_pkts_payload.std',
            'fwd_iat.min', 'fwd_iat.max', 'fwd_iat.tot', 'fwd_iat.avg', 'fwd_iat.std',
            'bwd_iat.min', 'bwd_iat.max', 'bwd_iat.tot', 'bwd_iat.avg', 'bwd_iat.std',
            'flow_iat.min', 'flow_iat.max', 'flow_iat.tot', 'flow_iat.avg', 'flow_iat.std',
            'payload_bytes_per_second',
            'fwd_subflow_pkts', 'bwd_subflow_pkts',
            'fwd_subflow_bytes', 'bwd_subflow_bytes',
            'fwd_bulk_bytes', 'bwd_bulk_bytes',
            'fwd_bulk_rate', 'bwd_bulk_rate',
            'active.min', 'active.max', 'active.tot', 'active.avg', 'active.std',
            'idle.min', 'idle.max', 'idle.tot', 'idle.avg', 'idle.std',
        ],
    }.items()
    for column in columns
}

In [458]:
# Load the flow records: the first CSV column becomes the row index and the
# remaining columns are parsed with the dtypes declared in dtype_hikari.
data = pd.read_csv(
    '../datasets/HIKARI-2021/ALLFLOWMETER_HIKARI2021.csv',
    index_col=0,
    dtype=dtype_hikari,
)

In [459]:
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0.1,Unnamed: 0,uid,originh,originp,responh,responp,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,fwd_pkts_per_sec,bwd_pkts_per_sec,flow_pkts_per_sec,down_up_ratio,fwd_header_size_tot,fwd_header_size_min,fwd_header_size_max,bwd_header_size_tot,bwd_header_size_min,bwd_header_size_max,flow_FIN_flag_count,flow_SYN_flag_count,flow_RST_flag_count,fwd_PSH_flag_count,bwd_PSH_flag_count,flow_ACK_flag_count,fwd_URG_flag_count,bwd_URG_flag_count,flow_CWR_flag_count,flow_ECE_flag_count,fwd_pkts_payload.min,fwd_pkts_payload.max,fwd_pkts_payload.tot,fwd_pkts_payload.avg,fwd_pkts_payload.std,bwd_pkts_payload.min,bwd_pkts_payload.max,bwd_pkts_payload.tot,bwd_pkts_payload.avg,bwd_pkts_payload.std,flow_pkts_payload.min,flow_pkts_payload.max,flow_pkts_payload.tot,flow_pkts_payload.avg,flow_pkts_payload.std,fwd_iat.min,fwd_iat.max,fwd_iat.tot,fwd_iat.avg,fwd_iat.std,bwd_iat.min,bwd_iat.max,bwd_iat.tot,bwd_iat.avg,bwd_iat.std,flow_iat.min,flow_iat.max,flow_iat.tot,flow_iat.avg,flow_iat.std,payload_bytes_per_second,fwd_subflow_pkts,bwd_subflow_pkts,fwd_subflow_bytes,bwd_subflow_bytes,fwd_bulk_bytes,bwd_bulk_bytes,fwd_bulk_packets,bwd_bulk_packets,fwd_bulk_rate,bwd_bulk_rate,active.min,active.max,active.tot,active.avg,active.std,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,traffic_category,Label
0,0,Cg61Jch3vdz9DBptj,103.255.15.23,13316,128.199.242.104,443,2.207588,15,14,6,6,6.794746,6.341763,13.136509,0.933333,464,20,40,492,32,44,2,2,2,6,5,26,0,0,0,0,0,742,1826.0,121.733333,220.736581,0,1448,5025.0,358.928571,552.23984,0,1448,6851.0,236.241379,424.859275,18.119812,1963762.0,2207603.0,157685.9,520505.2,7.867813,2032929.0,2177950.0,167534.6,560626.7,7.867813,1963762.0,2207603.0,78842.963491,369637.8,3103.387105,7.5,7.0,913.0,2512.5,0.0,0.0,0.0,0.0,0.0,0.0,2207603.0,2207603.0,2207603.0,2207603.0,0.0,0.0,0.0,0.0,0.0,0.0,29200,65160,0,Bruteforce-XML,True
1,1,CdRIlqLWdj35Y9vW9,103.255.15.23,13318,128.199.242.104,443,15.624266,15,14,6,6,0.960045,0.896042,1.856087,0.933333,488,20,44,468,32,44,2,2,2,6,5,26,0,0,0,0,0,745,1829.0,121.933333,221.339257,0,1448,5025.0,358.928571,552.23984,0,1448,6854.0,236.344828,424.987166,20.980835,15343000.0,15624280.0,1116020.0,4094889.0,20.980835,15411440.0,15595170.0,1199628.0,4270148.0,10.01358,15343000.0,15624280.0,558009.89696,2897622.0,438.676603,7.5,7.0,914.5,2512.5,0.0,0.0,0.0,0.0,0.0,0.0,28837.92,252438.1,281276.0,140638.0,158109.181742,15343000.0,15343000.0,15343000.0,15343000.0,0.0,29200,65160,0,Bruteforce-XML,True
2,2,CLzp9Khd0Y09Qkgrg,103.255.15.23,13320,128.199.242.104,443,12.203357,14,13,6,5,1.147225,1.065281,2.212506,0.928571,432,20,40,448,32,44,2,2,2,6,5,24,0,0,0,0,0,744,1828.0,130.571429,226.803444,0,2896,5025.0,386.538462,817.479013,0,2896,6853.0,253.814815,592.570284,36.001205,11968140.0,12203380.0,938721.6,3314032.0,15.02037,12036740.0,12174820.0,1014569.0,3471107.0,15.02037,11968140.0,12203380.0,469360.81006,2345336.0,561.566789,7.0,6.5,914.0,2512.5,0.0,0.0,0.0,0.0,0.0,0.0,28913.02,206325.1,235238.1,117619.0,125449.251656,11968140.0,11968140.0,11968140.0,11968140.0,0.0,29200,65160,0,Bruteforce-XML,True
3,3,Cnf1YA4iLB4CSNWB88,103.255.15.23,13322,128.199.242.104,443,9.992448,14,13,6,5,1.401058,1.300983,2.702041,0.928571,432,20,40,436,32,44,2,2,2,6,5,24,0,0,0,0,0,744,1828.0,130.571429,226.803444,0,2896,5025.0,386.538462,817.479013,0,2896,6853.0,253.814815,592.570284,50.067902,9759205.0,9992470.0,768651.5,2701448.0,20.980835,9828447.0,9963348.0,830279.0,2833716.0,20.980835,9759205.0,9992470.0,384325.770231,1912152.0,685.81794,7.0,6.5,914.0,2512.5,0.0,0.0,0.0,0.0,0.0,0.0,29529.09,203736.1,233265.2,116632.6,123182.931318,9759205.0,9759205.0,9759205.0,9759205.0,0.0,29200,65160,0,Bruteforce-XML,True
4,4,C4ZKvv3fpO72EAOsJ6,103.255.15.23,13324,128.199.242.104,443,7.780611,14,14,6,5,1.799345,1.799345,3.598689,1.0,432,20,40,480,32,44,2,2,2,6,5,25,0,0,0,0,0,744,1828.0,130.571429,226.803444,0,2896,5025.0,358.928571,792.173394,0,2896,6853.0,244.75,583.468215,16.927719,7545305.0,7780620.0,598509.2,2087417.0,15.02037,7613719.0,7750841.0,596218.5,2108534.0,9.059906,7545305.0,7780620.0,288171.114745,1450411.0,880.779153,7.0,7.0,914.0,2512.5,0.0,0.0,0.0,0.0,0.0,0.0,28550.15,206764.9,235315.1,117657.5,126016.885411,7545305.0,7545305.0,7545305.0,7545305.0,0.0,29200,65160,0,Bruteforce-XML,True


In [460]:
# Boolean feature: True when the flow originates from 103.255.15.150,
# the host the original note identifies as the probing attacker.
data['probing_1'] = data['originh'].eq('103.255.15.150')

In [461]:
# miner attacker
#data['miner_1'] = data['originh'] == '103.255.15.42'

In [462]:
# Boolean feature: True when the responding host is 128.199.242.104.
# Per the original note this server is hit by the bruteforce,
# bruteforce-XML and probing traffic (TODO confirm against the dataset docs).
data['server_1'] = data['responh'].eq('128.199.242.104')

In [463]:
# bruteforce, bruteforce-xml
#data['server_2'] = data['responh'] == '128.199.88.81'

In [464]:
# miner victim
# data['miner_victim'] = data['responh'] == '103.255.15.255'

In [465]:
# Boolean feature: True when the responder port is 443, which the original
# note describes as the port used by all the attacks.
data['responp_443'] = data['responp'].eq(443)

In [466]:
# the attacker machine for all attacks except the crypto miner and probing
#data['attacker_1'] = data['originh'] == '103.255.15.23'

In [467]:
# cryptominer ports
# data['miner_port'] = (data['responp'] == 137) | (data['responp'] == 138)

In [468]:
# Model inputs: every column except the identifier/endpoint columns and the
# two target columns. Selecting by name (rather than by position) keeps the
# targets out of the feature set even if the column layout changes, and
# automatically picks up any engineered flag columns added to `data` above.
non_feature_columns = {
    'Unnamed: 0', 'uid',                          # leading unnamed column and flow uid
    'originh', 'originp', 'responh', 'responp',   # raw endpoint hosts and ports
    'traffic_category', 'Label',                  # prediction targets
}
x_features = [column for column in data.columns if column not in non_feature_columns]

# Guard against target leakage and against an accidentally empty selection.
assert x_features, "no feature columns selected"
assert not {'traffic_category', 'Label'} & set(x_features), "target column leaked into features"


In [469]:
# Number of flows in each traffic category (shows the class imbalance).
data['traffic_category'].value_counts()

traffic_category
Benign                 347431
Background             170151
Probing                 23388
Bruteforce               5884
Bruteforce-XML           5145
XMRIGCC CryptoMiner      3279
Name: count, dtype: int64

In [470]:
# Build a balanced two-class subset: keep every 'Probing' flow, draw the same
# number of 'Benign' flows at random, and drop all other categories (target 0).
# The per-class size is derived from the data instead of being hard-coded, so
# the classes stay balanced if the dataset changes.
n_probing = int((data['traffic_category'] == 'Probing').sum())
sampling_weights = {
    'Background': 0,
    'Benign': n_probing,
    'XMRIGCC CryptoMiner': 0,
    'Probing': n_probing,
    'Bruteforce': 0,
    'Bruteforce-XML': 0,
}

# Fixed random_state makes the random draw of 'Benign' rows repeatable.
rus = RandomUnderSampler(random_state=42, sampling_strategy=sampling_weights)
X_res, y_res = rus.fit_resample(data[x_features], data['traffic_category'])

In [492]:
# Shallow decision tree (depth capped at 4) with a fixed seed, fitted on the
# under-sampled features and their traffic-category labels.
clf = DecisionTreeClassifier(max_depth=4, random_state=0)
clf.fit(X=X_res, y=y_res)

In [493]:
# Name of the attack class under study: used below as the positive label for
# the precision score and as the file name of the rendered tree.
attack = 'Probing'

In [494]:
# Depth of the fitted tree.
clf.get_depth()

4

In [495]:
# Predict the traffic category for each resampled row.
# NOTE(review): X_res is the same data the tree was fitted on, so any metric
# computed from `predicted` is a training-set score, not a held-out estimate.
predicted = clf.predict(X_res)

In [496]:
# Fraction of resampled rows whose predicted category matches the true one.
metrics.accuracy_score(y_true=y_res, y_pred=predicted)

0.9129254318453908

In [497]:
# Precision for the attack class: of the rows predicted as `attack`,
# the share that truly belong to it.
metrics.precision_score(y_true=y_res, y_pred=predicted, pos_label=attack)

0.8516805651651433

In [498]:
# Export the fitted tree to Graphviz DOT text and render it to '<attack>.pdf'.
# class_names must be given in the same (sorted) order the classifier stores
# its classes, so they are read from clf.classes_ rather than hard-coded;
# a hand-written ['Benign', attack] list would mislabel the leaves for any
# attack name that sorts before 'Benign'.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=x_features,
    class_names=[str(label) for label in clf.classes_],
)
graph = graphviz.Source(dot_data)
graph.render(attack)

'Probing.pdf'