In [1]:
data_dir = 'data'
result_dir = 'result'

# Import

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy

import warnings
warnings.filterwarnings("ignore")

# Read data

In [3]:
af = 'vv'
dt = 'v'
pf = '__1__'

Xval = np.load(f'{data_dir}/{af}3.v.X.{pf}.npy', allow_pickle=True)
y_val = np.load(f'{data_dir}/{af}3.v.y.npy', allow_pickle=True)

#? features names
fts_names = [line.strip() for line in open(f'{data_dir}/{af}3.fts_cols.{pf}.txt').readlines()]

In [4]:
# for i in range(0, len(fts_names)):
#     print(i, fts_names[i])

In [5]:
list(np.where(y_val == 1)[0])

[352983,
 353215,
 353220,
 389317,
 389342,
 400584,
 400603,
 406683,
 411244,
 417088,
 420926,
 420959,
 422844,
 422847,
 424163,
 424210,
 425306,
 425995,
 426050,
 426125,
 426149,
 426150,
 426163,
 426199,
 426248,
 426250,
 426256,
 426286,
 426291,
 426314,
 426348,
 426355,
 426357,
 426361,
 426388,
 426389,
 426390,
 426391,
 426415,
 426454,
 426504,
 426517,
 426564,
 426580,
 426643,
 426648,
 426660,
 426676,
 426730,
 426796,
 426849,
 426925,
 427075,
 427096,
 427099,
 427218,
 427259,
 427518,
 427521,
 427573,
 427577,
 427597,
 427870,
 427934,
 427947,
 427948,
 427981,
 428003,
 428034,
 428211,
 428213,
 428239,
 428266,
 428325,
 428610,
 428614,
 428615,
 428616,
 428658,
 428667,
 428668,
 428700,
 428901,
 428940,
 428969,
 428972,
 428994,
 428996,
 429004,
 429005,
 429012,
 429041,
 429060,
 429070,
 429125,
 429126,
 429132,
 429154,
 429167,
 429185,
 429195,
 429196,
 429197,
 429214,
 429223,
 429224,
 429225,
 429226,
 429266,
 429267,
 429268,
 

In [6]:
#? test
sidx = 352983
eidx = 426248
Xval = Xval[sidx:,:]
y_val = y_val[sidx:]

# Functions

In [7]:
from sklearn.metrics import classification_report, confusion_matrix, PrecisionRecallDisplay, f1_score, roc_auc_score, roc_curve, auc

def evaluate(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    print(cm)
    print(classification_report(y_true, y_pred, digits=4))
    roc_auc = roc_auc_score(y_true, y_pred)
    print(roc_auc)

    # display = PrecisionRecallDisplay.from_predictions(y_true, y_pred)
    # _ = display.ax_.set_title("2-class Precision-Recall curve")
    return roc_auc

In [8]:
import pickle

def save_result(model, Xtr, Xt, model_name, selected_fts_names, roc_auc, expname='', config=None):
    ra = round(roc_auc * 100, 2)
    # if ra <= 70: #? not good enough to even bother
    #     return
    
    #? save model
    pickle.dump(model, open(f'{result_dir}/{af}4.{pf}.{model_name}.{expname}.model.__{ra}__.pkl','wb'))

    #? save transformed X as well
    np.save(f'{result_dir}/{af}4.{pf}.{model_name}.{expname}.__{ra}__.data.tr.X.npy', Xtr)
    np.save(f'{result_dir}/{af}4.{pf}.{model_name}.{expname}.__{ra}__.data.t.X.npy', Xt)
    # np.save(f'{result_dir}/{af}4.{pf}.{model_name}.{expname}.__{ra}__.data.v.X.npy', Xv)

    #? save fts
    open(f'{result_dir}/{af}4.{pf}.{model_name}.{expname}.fts.__{ra}__.txt', 'w').write('\n'.join(selected_fts_names))
    
    #? save config
    if config is not None:
        open(f'{result_dir}/{af}4.{pf}.{model_name}.{expname}.config.__{ra}__.txt', 'w').write(config)

In [9]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

def test_param(x, y, eps, min_samples):
    # eps = 0.0002 #0.0375
    # min_samples = 3
    model = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=6)

    """ cluster """
    model.fit(x)

    """ find clusters """
    y_pred = copy.deepcopy(model.labels_)
    print(y_pred.shape, y.shape)

    """ -1 are outliers (not belong to any clusters) """
    #? relabel to use classification metrics
    y_pred[y_pred != -1] = 0
    y_pred[y_pred == -1] = 1
    #? evaluate
    print('Evaluate')
    evaluate(y, y_pred)

    """ silhouette score 
        A silhouette score ranges from -1 to 1, with -1 being the worst score possible and 1 being the best score. Silhouette scores of 0 suggest overlapping clusters. """
    # score = silhouette_score(x, model.labels_)
    # print('score', score)

    """ save to a df """
    df_cluster__01 = pd.DataFrame(model.labels_, columns=['clusters'])
    df_pred__01 = pd.DataFrame(y_pred, columns=['Label_Pred'])
    df_ft__01 = pd.DataFrame(x)
    df_y = pd.DataFrame(y, columns=['Label'])
    df = pd.concat([df_ft__01, df_y, df_cluster__01, df_pred__01], axis=1)

    print('n_clusters =', len(np.unique(model.labels_)))
    print('\n---------------')
    print('Stat clusters')
    print('All clusters')
    print(df['clusters'].value_counts(), '\n')
    print('Label = 1 (true bots) clusters')
    print(df.loc[df['Label'] == 1]['clusters'].value_counts(), '\n')
    print('Label = 0 (true normal) clusters')
    print(df.loc[df['Label'] == 0]['clusters'].value_counts())

    print('\n---------------')
    print('Stat labels')
    for i in np.unique(model.labels_):
        print(f'cluster {i} labels')
        print(df.loc[df['clusters'] == i]['Label'].value_counts(), '\n')
    
    del x, y, y_pred, model
    del df_ft__01, df_y, df_cluster__01, df_pred__01

    return df

## Exp 03: Run on all features



In [10]:
""" select features """
selected_fts_names = fts_names[:]
', '.join(selected_fts_names)

'Dur, sTos, dTos, Sport, Dport, PktsPerSec, BytesPerSec, SrcBytesPerSec, BytesPerPkt, TotPkts, TotBytes, SrcBytes, DstBytes, DstBytesPerSec, State_CON, State_alltcp, State_INT, State_S_, State_URP, State_ECO, State_RED, State_REQ, State_ECR, State_URH, State_TXD, State_URFIL, State_R_, State_URN, State_RSP, State_URHPRO, State_A_, State_other, Flag_nan, Flag_S, Flag_A, Flag_P, Flag_R, Flag_F, Proto_udp, Proto_tcp, Proto_icmp, Proto_rtp, Proto_rtcp, Proto_igmp, Proto_arp, Proto_other, Service_80, Service_443, Service_21, Service_22, Service_25, Service_6667, Service_other, sTos_0, sTos_2, sTos_3, sTos_1, sTos_-1, dTos_0, dTos_-1, dTos_3, dTos_2, dTos_1'

In [11]:
X__02 = Xval[:,:]
print(X__02.shape)

(411649, 63)


In [12]:
df__02_2 = test_param(X__02, y_val, eps=0.07, min_samples=2)

In [80]:
df__02 = test_param(X__02, y_val, eps=0.000006, min_samples=3)

(73265,) (73265,)
Evaluate
[[  501 72740]
 [    0    24]]
              precision    recall  f1-score   support

           0     1.0000    0.0068    0.0136     73241
           1     0.0003    1.0000    0.0007        24

    accuracy                         0.0072     73265
   macro avg     0.5002    0.5034    0.0071     73265
weighted avg     0.9997    0.0072    0.0136     73265

0.5034202154530932
score 0.4155019720905571

---------------
Stat clusters
All clusters
-1     72764
 2       133
 1        50
 14       40
 6        26
 5        25
 0        20
 11       20
 22       15
 9        15
 25       11
 12       10
 17       10
 20        8
 29        6
 34        6
 3         6
 10        6
 36        5
 28        5
 26        5
 4         5
 24        4
 35        4
 27        4
 33        4
 41        4
 40        4
 18        4
 13        4
 38        3
 37        3
 39        3
 21        3
 32        3
 31        3
 30        3
 23        3
 19        3
 16        3
 15    

In [59]:
df__02

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,Label,clusters,Label_Pred
0,4.156271e-08,0.25,0.25,0.015640,9.832680e-08,9.757955e-03,0.026795,1.662827e-03,0.207530,2.449149e-07,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,1,1
1,8.145977e-01,0.25,0.25,0.683431,2.478948e-05,2.489370e-09,0.021701,3.035730e-10,0.174295,1.224574e-06,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1,1
2,3.364123e-07,0.25,0.25,0.690587,2.478577e-05,2.411131e-03,0.022151,2.664263e-04,0.171521,4.898298e-07,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1,1
3,1.855692e-04,0.25,0.25,0.038299,8.218636e-07,1.055611e-03,0.024958,4.119428e-05,0.482754,1.182939e-04,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1,1
4,2.239193e-07,0.25,0.25,0.935777,2.478577e-05,3.622440e-03,0.022339,3.568709e-04,0.170400,4.898298e-07,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73260,5.161115e-01,0.25,0.25,0.336985,2.478763e-05,2.357439e-09,0.021701,3.222167e-10,0.176892,7.347447e-07,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1,1
73261,0.000000e+00,0.25,0.00,0.154081,1.158976e-04,0.000000e+00,0.021701,0.000000e+00,0.151485,0.000000e+00,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,0
73262,2.685851e-07,0.25,0.25,0.730686,2.479134e-05,1.510013e-03,0.022245,3.136055e-04,0.190156,2.449149e-07,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1,1
73263,2.426909e-01,0.25,0.00,0.915132,7.184350e-05,1.336901e-08,0.021701,5.877872e-09,0.197547,1.959319e-06,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1,1


In [12]:
df__02_2 = test_param(X__02, y_val, eps=0.001, min_samples=3)

(73265,) (73265,)
Evaluate
[[27461 45780]
 [    5    19]]
              precision    recall  f1-score   support

           0     0.9998    0.3749    0.5454     73241
           1     0.0004    0.7917    0.0008        24

    accuracy                         0.3751     73265
   macro avg     0.5001    0.5833    0.2731     73265
weighted avg     0.9995    0.3751    0.5452     73265

0.5833034661824206
score -0.6005066321920994

---------------
Stat clusters
All clusters
-1       45799
 8        1195
 45       1183
 15       1049
 13        780
         ...  
 910         3
 913         3
 1750        3
 922         3
 1769        3
Name: clusters, Length: 1773, dtype: int64 

Label = 1 (true bots) clusters
-1       19
 1541     3
 476      2
Name: clusters, dtype: int64
Label = 0 (true normal) clusters
-1       45780
 8        1195
 45       1183
 15       1049
 13        780
         ...  
 913         3
 1750        3
 922         3
 1769        3
 476         1
Name: clusters, Length

In [14]:
df__02_2 = test_param(X__02, y_val, eps=0.003, min_samples=3)

(73265,) (73265,)
Evaluate
[[49040 24201]
 [   10    14]]
              precision    recall  f1-score   support

           0     0.9998    0.6696    0.8020     73241
           1     0.0006    0.5833    0.0012        24

    accuracy                         0.6695     73265
   macro avg     0.5002    0.6265    0.4016     73265
weighted avg     0.9995    0.6695    0.8018     73265

0.6264518279834156

---------------
Stat clusters
All clusters
 0       34682
-1       24215
 11        531
 15        438
 10        220
         ...  
 1610        3
 491         3
 1228        3
 488         3
 1716        3
Name: clusters, Length: 1725, dtype: int64 

Label = 1 (true bots) clusters
-1       14
 0        4
 1499     3
 387      2
 32       1
Name: clusters, dtype: int64
Label = 0 (true normal) clusters
 0       34678
-1       24201
 11        531
 15        438
 10        220
         ...  
 491         3
 1228        3
 488         3
 1716        3
 387         1
Name: clusters, Length: 

In [15]:
df__02_2 = test_param(X__02, y_val, eps=0.006, min_samples=3)

(73265,) (73265,)
Evaluate
[[59083 14158]
 [   15     9]]
              precision    recall  f1-score   support

           0     0.9997    0.8067    0.8929     73241
           1     0.0006    0.3750    0.0013        24

    accuracy                         0.8066     73265
   macro avg     0.5002    0.5908    0.4471     73265
weighted avg     0.9994    0.8066    0.8926     73265

0.5908464862576972

---------------
Stat clusters
All clusters
 0       42553
-1       14167
 23       1518
 8         855
 22        561
         ...  
 450         3
 830         3
 829         3
 462         3
 1217        3
Name: clusters, Length: 1313, dtype: int64 

Label = 1 (true bots) clusters
-1       9
 0       7
 1175    4
 38      2
 8       1
 22      1
Name: clusters, dtype: int64
Label = 0 (true normal) clusters
 0       42546
-1       14158
 23       1518
 8         854
 13        561
         ...  
 830         3
 829         3
 462         3
 1217        3
 38          1
Name: clusters, Le

In [27]:
df__02_2 = test_param(X__02, y_val, eps=0.01, min_samples=2)

(73265,) (73265,)
Evaluate
[[66712  6529]
 [   21     3]]
              precision    recall  f1-score   support

           0     0.9997    0.9109    0.9532     73241
           1     0.0005    0.1250    0.0009        24

    accuracy                         0.9106     73265
   macro avg     0.5001    0.5179    0.4771     73265
weighted avg     0.9994    0.9106    0.9529     73265

0.5179279706721646
n_clusters = 2132

---------------
Stat clusters
All clusters
 0       46616
-1        6532
 1        3739
 20       1280
 7        1177
         ...  
 1389        2
 1390        2
 509         2
 1392        2
 1065        2
Name: clusters, Length: 2132, dtype: int64 

Label = 1 (true bots) clusters
 0       9
 20      6
 1922    4
-1       3
 37      2
Name: clusters, dtype: int64 

Label = 0 (true normal) clusters
 0       46607
-1        6529
 1        3739
 20       1274
 7        1177
         ...  
 1387        2
 1388        2
 1389        2
 1390        2
 37          1
Name: clu

In [29]:
df__02_2 = test_param(X__02, y_val, eps=0.06, min_samples=3)

(73265,) (73265,)
Evaluate
[[72525   716]
 [   23     1]]
              precision    recall  f1-score   support

           0     0.9997    0.9902    0.9949     73241
           1     0.0014    0.0417    0.0027        24

    accuracy                         0.9899     73265
   macro avg     0.5005    0.5159    0.4988     73265
weighted avg     0.9994    0.9899    0.9946     73265

0.5159453607496712
n_clusters = 202

---------------
Stat clusters
All clusters
 0      58599
 4       2462
 5       2296
 9       1447
-1        717
        ...  
 164        3
 185        3
 141        3
 198        3
 167        3
Name: clusters, Length: 202, dtype: int64 

Label = 1 (true bots) clusters
 0      9
 9      6
 194    4
 19     2
-1      1
 184    1
 14     1
Name: clusters, dtype: int64 

Label = 0 (true normal) clusters
 0      58590
 4       2462
 5       2296
 9       1441
-1        716
        ...  
 137        3
 185        3
 141        3
 167        3
 184        2
Name: clusters, Le