In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report, precision_recall_curve
from sklearn.metrics import confusion_matrix, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from collections import Counter

import seaborn as sns

In [2]:
train_dataset = sorted([x for x in Path('train-supervised/').glob("*.csv")])
train_dataset

[WindowsPath('train-supervised/train1.csv'),
 WindowsPath('train-supervised/train2.csv'),
 WindowsPath('train-supervised/train3.csv'),
 WindowsPath('train-supervised/train4.csv'),
 WindowsPath('train-supervised/train5.csv')]

In [56]:
test_dataset = sorted([x for x in Path('test/').glob("*.csv")])
test_dataset

[WindowsPath('test/Golden.csv'),
 WindowsPath('test/hulk.csv'),
 WindowsPath('test/slowloris.csv'),
 WindowsPath('test/test2.csv')]

In [4]:
def dataframe_from_csv(target):
    return pd.read_csv(target, encoding = 'CP949').rename(columns=lambda x: x.strip())

def dataframe_from_csvs(targets):
    return pd.concat([dataframe_from_csv(x) for x in targets])

In [5]:
train_df = dataframe_from_csvs(train_dataset)
train_df = train_df.drop(['Timestamp'],axis = 1)
train_df = train_df[train_df['Dst Port'] == 80]
train_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,6,476513,5,3,211,463,211,0,42.200000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
1,80,6,475048,5,3,220,472,220,0,44.000000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
2,80,6,474926,5,3,220,472,220,0,44.000000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
3,80,6,477471,5,3,209,461,209,0,41.800000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
4,80,6,512758,5,3,211,463,211,0,42.200000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181010,80,6,61369938,11,9,444,788,438,0,40.363636,...,20,23522.333333,29492.486648,83721,11294,9.995354e+06,20778.055219,10004022,9952945,Benign
181011,80,6,102508170,15,13,303,384,293,0,20.200000,...,20,24066.200000,23439.923240,90755,16347,1.000813e+07,19441.796925,10014622,9952829,Benign
181012,80,6,4627404,4,5,1264,3143,1264,0,316.000000,...,20,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
181013,80,6,18892988,6,4,435,706,434,0,72.500000,...,20,48659.500000,52535.912522,85808,11511,9.397834e+06,781851.503492,9950687,8844982,Benign


In [57]:
test_df = dataframe_from_csvs(test_dataset)
# test_df = pd.read_csv('test/5th_label.csv', encoding = 'CP949')
test_df = test_df.drop(['Timestamp'], axis = 1)
# test_df = test_df.rename(columns = {'anormal':'Label'})
# test_df = test_df[(test_df['Dst Port'] == 80)]
test_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,6,120059,9820.171749,83.292381,83.292381,0,10,0,1179.000000,...,0.0,0.000000e+00,1.179000e+02,0.0,0.0,10.0,0.000000e+00,1179.0,0.0,1
1,80,6,117362,8878.512636,76.685810,76.685810,0,9,0,1042.000000,...,0.0,0.000000e+00,1.157778e+02,0.0,0.0,9.0,0.000000e+00,1042.0,0.0,1
2,80,6,177022,4666.086701,39.543108,39.543108,0,7,0,826.000000,...,0.0,0.000000e+00,1.180000e+02,0.0,0.0,7.0,0.000000e+00,826.0,0.0,1
3,80,6,175774,4585.433568,39.823865,39.823865,0,7,0,806.000000,...,0.0,0.000000e+00,1.151429e+02,0.0,0.0,7.0,0.000000e+00,806.0,0.0,1
4,80,6,111320,8093.783687,71.864894,71.864894,0,8,0,901.000000,...,0.0,0.000000e+00,1.126250e+02,0.0,0.0,8.0,0.000000e+00,901.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6361,80,6,20076819,12.000000,0.000000,744.000000,0,66,60,62.000000,...,20.0,6.666264e+06,5.526124e+06,13532530.0,816.0,6688774.0,5.499402e+06,13532731.0,67527.0,0
6362,80,6,52482590,18.000000,0.000000,1128.000000,0,66,60,62.666667,...,20.0,1.289096e+07,1.177616e+07,31554068.0,1444.0,13116401.0,1.153343e+07,31554064.0,902857.0,0
6363,80,6,52491268,18.000000,0.000000,1128.000000,0,66,60,62.666667,...,20.0,1.284374e+07,1.173353e+07,31374266.0,1651.0,13118750.0,1.143800e+07,31373448.0,1101719.0,0
6364,80,6,2987,6.000000,0.000000,384.000000,0,66,60,64.000000,...,20.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0


In [7]:
train_df.loc[(train_df['Label'] != 'Benign'), 'Label'] = 1
train_df.loc[(train_df['Label'] == 'Benign'), 'Label'] = 0
print("Class distribution: {}".format(Counter(train_df['Label'])))

Class distribution: Counter({0: 678174, 1: 514918})


In [8]:
train_df = train_df.dropna()

In [9]:
TAG_MIN = train_df[train_df.columns].min()
TAG_MAX = train_df[train_df.columns].max()

In [10]:
def normalize(df):
    ndf = df.copy()
    for c in df.columns:
        if TAG_MIN[c] == TAG_MAX[c]:
            ndf[c] = df[c] - TAG_MIN[c]
        else:
            ndf[c] = (df[c] - TAG_MIN[c]) / (TAG_MAX[c] - TAG_MIN[c])
    return ndf

In [11]:
train_df = normalize(train_df[train_df.columns])
train_df = train_df.dropna()
train_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,0.0,0.003971,0.000069,0.000024,0.002454,0.000003,0.045376,0.0,0.035884,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
1,0.0,0.0,0.003959,0.000069,0.000024,0.002559,0.000003,0.047312,0.0,0.037415,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
2,0.0,0.0,0.003958,0.000069,0.000024,0.002559,0.000003,0.047312,0.0,0.037415,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
3,0.0,0.0,0.003979,0.000069,0.000024,0.002431,0.000003,0.044946,0.0,0.035544,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
4,0.0,0.0,0.004273,0.000069,0.000024,0.002454,0.000003,0.045376,0.0,0.035884,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181010,0.0,0.0,0.511416,0.000172,0.000073,0.005164,0.000005,0.094194,0.0,0.034323,...,0.0,0.000735,0.001115,0.002226,0.000353,0.083668,0.000451,0.083741,0.083313,0
181011,0.0,0.0,0.854235,0.000241,0.000106,0.003524,0.000002,0.063011,0.0,0.017177,...,0.0,0.000752,0.000886,0.002413,0.000511,0.083775,0.000422,0.083829,0.083312,0
181012,0.0,0.0,0.038562,0.000052,0.000041,0.014701,0.000020,0.271828,0.0,0.268707,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
181013,0.0,0.0,0.157442,0.000086,0.000032,0.005059,0.000005,0.093333,0.0,0.061650,...,0.0,0.001520,0.001986,0.002282,0.000360,0.078666,0.016963,0.083294,0.074039,0


In [18]:
train_df_x=train_df.drop(['Label'],axis=1)
train_df_y=train_df['Label']

In [19]:
train_df = train_df.astype({'Label':'int'})

In [20]:
train_df_x, valid_df_x, train_df_y, valid_df_y = train_test_split(train_df_x, train_df_y, 
                                                                  test_size = 0.3,
                                                                  shuffle = True,
                                                                 stratify = train_df_y,
                                                                 random_state = 30)

In [75]:
%%time
rf=RandomForestClassifier(n_estimators = 10)
rf.fit(train_df_x,train_df_y)
rf_pred=rf.predict(train_df_x)

Wall time: 11.9 s


In [26]:
def get_clf_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    AUC = roc_auc_score(y_test, y_pred)
    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(F1))
    print('AUC: {:.4f}'.format(AUC))

In [70]:
get_clf_eval(train_df_y, rf_pred)

오차행렬:
 [[474722      0]
 [   129 360313]]

정확도: 0.9998
정밀도: 1.0000
재현율: 0.9996
F1: 0.9998
AUC: 0.9998


In [71]:
rf_pred_valid = rf.predict(valid_df_x)

In [29]:
get_clf_eval(valid_df_y, rf_pred_valid)

오차행렬:
 [[203446      6]
 [   101 154375]]

정확도: 0.9997
정밀도: 1.0000
재현율: 0.9993
F1: 0.9997
AUC: 0.9997


In [52]:
fi = []
for i in range(len(train_df.columns)-1):
#     print((list(train_df.columns)[i],list(rf.feature_importances_)[i]))
    fi.append((list(train_df.columns)[i],round(list(rf.feature_importances_)[i]*100,2)))
fi

[('Dst Port', 0.0),
 ('Protocol', 0.0),
 ('Flow Duration', 1.52),
 ('Tot Fwd Pkts', 0.16),
 ('Tot Bwd Pkts', 3.0),
 ('TotLen Fwd Pkts', 0.29),
 ('TotLen Bwd Pkts', 0.65),
 ('Fwd Pkt Len Max', 0.17),
 ('Fwd Pkt Len Min', 0.01),
 ('Fwd Pkt Len Mean', 0.76),
 ('Fwd Pkt Len Std', 0.67),
 ('Bwd Pkt Len Max', 1.43),
 ('Bwd Pkt Len Min', 0.0),
 ('Bwd Pkt Len Mean', 1.53),
 ('Bwd Pkt Len Std', 0.89),
 ('Flow Byts/s', 0.01),
 ('Flow Pkts/s', 4.27),
 ('Flow IAT Mean', 1.49),
 ('Flow IAT Std', 0.18),
 ('Flow IAT Max', 1.17),
 ('Flow IAT Min', 0.99),
 ('Fwd IAT Tot', 0.78),
 ('Fwd IAT Mean', 0.81),
 ('Fwd IAT Std', 0.24),
 ('Fwd IAT Max', 0.65),
 ('Fwd IAT Min', 1.04),
 ('Bwd IAT Tot', 0.23),
 ('Bwd IAT Mean', 1.2),
 ('Bwd IAT Std', 0.04),
 ('Bwd IAT Max', 1.06),
 ('Bwd IAT Min', 0.02),
 ('Fwd PSH Flags', 0.02),
 ('Bwd PSH Flags', 0.0),
 ('Fwd URG Flags', 0.0),
 ('Bwd URG Flags', 0.0),
 ('Fwd Header Len', 7.39),
 ('Bwd Header Len', 1.84),
 ('Fwd Pkts/s', 3.47),
 ('Bwd Pkts/s', 0.04),
 ('Pkt Len Mi

In [53]:
fi = sorted(fi,key = lambda x : x[1], reverse = True)
fi

[('Init Fwd Win Byts', 24.76),
 ('Fwd Seg Size Min', 22.31),
 ('Fwd Header Len', 7.39),
 ('Flow Pkts/s', 4.27),
 ('Fwd Pkts/s', 3.47),
 ('Tot Bwd Pkts', 3.0),
 ('Init Bwd Win Byts', 2.36),
 ('Bwd Seg Size Avg', 1.89),
 ('Subflow Bwd Pkts', 1.85),
 ('Bwd Header Len', 1.84),
 ('PSH Flag Cnt', 1.77),
 ('Bwd Pkt Len Mean', 1.53),
 ('Flow Duration', 1.52),
 ('Subflow Bwd Byts', 1.52),
 ('Flow IAT Mean', 1.49),
 ('Bwd Pkt Len Max', 1.43),
 ('Bwd IAT Mean', 1.2),
 ('Flow IAT Max', 1.17),
 ('Bwd IAT Max', 1.06),
 ('Fwd IAT Min', 1.04),
 ('Flow IAT Min', 0.99),
 ('Bwd Pkt Len Std', 0.89),
 ('Pkt Size Avg', 0.87),
 ('Fwd Seg Size Avg', 0.83),
 ('Fwd IAT Mean', 0.81),
 ('Fwd IAT Tot', 0.78),
 ('Fwd Pkt Len Mean', 0.76),
 ('Pkt Len Std', 0.73),
 ('Fwd Pkt Len Std', 0.67),
 ('TotLen Bwd Pkts', 0.65),
 ('Fwd IAT Max', 0.65),
 ('Pkt Len Max', 0.56),
 ('ACK Flag Cnt', 0.51),
 ('TotLen Fwd Pkts', 0.29),
 ('Fwd Act Data Pkts', 0.28),
 ('Fwd IAT Std', 0.24),
 ('Bwd IAT Tot', 0.23),
 ('Pkt Len Mean', 0.23

In [54]:
colums_choosed = []
for i in range(len(fi)):
    if fi[i][1] >= 1.0:
#     if fi[i][1] >= 50:
        colums_choosed.append(fi[i][0])
len(colums_choosed)

20

In [55]:
train_df_x = train_df_x[colums_choosed]
valid_df_x = valid_df_x[colums_choosed]

In [56]:
%%time
rf=RandomForestClassifier()
rf.fit(train_df_x,train_df_y)
rf_pred=rf.predict(train_df_x)

Wall time: 1min 39s


In [61]:
get_clf_eval(train_df_y, rf_pred)

오차행렬:
 [[360412     30]
 [     0 474722]]

정확도: 1.0000
정밀도: 0.9999
재현율: 1.0000
F1: 1.0000
AUC: 1.0000


In [62]:
rf_pred_valid=rf.predict(valid_df_x)

In [63]:
get_clf_eval(valid_df_y, rf_pred_valid)

오차행렬:
 [[154451     25]
 [     5 203447]]

정확도: 0.9999
정밀도: 0.9999
재현율: 1.0000
F1: 0.9999
AUC: 0.9999


In [30]:
Counter(test_df['Label'])

Counter({1: 33513, 0: 34131})

In [59]:
test_df = test_df.dropna()
test_df_y = test_df['Label']
test_df = test_df.drop(['Label'],axis = 1)

In [373]:
test_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,80,6,472,7,0,579,0,213,60,82.714286,...,6,20,0.000000e+00,0.000000e+00,0,0,0.000000e+00,0.000000e+00,0,0
1,21,6,17256142,38,0,2748,0,80,66,72.315789,...,8,20,1.915990e+06,2.654916e+06,8359606,209,1.917127e+06,2.654152e+06,8359737,8714
2,21,6,7918278,54,0,3940,0,94,66,72.962963,...,12,20,6.083749e+05,9.364406e+05,2686378,159,6.089205e+05,9.361127e+05,2686394,7122
3,21,6,20178,15,0,1008,0,78,60,67.200000,...,7,20,0.000000e+00,0.000000e+00,0,0,9.855000e+03,2.312000e+03,12167,7543
4,21,6,1084200,44,0,2818,0,68,60,64.045455,...,22,20,1.076035e+05,2.981989e+05,1002080,90,1.082798e+05,2.979648e+05,1002089,6655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168155,80,6,4486,6,0,519,0,213,60,86.500000,...,5,20,0.000000e+00,0.000000e+00,0,0,0.000000e+00,0.000000e+00,0,0
168156,80,6,22303,4,0,412,0,226,60,103.000000,...,3,20,0.000000e+00,0.000000e+00,0,0,1.094350e+04,2.355000e+02,11179,10708
168157,21,6,161193,12,0,768,0,66,60,64.000000,...,4,20,5.500500e+04,5.481600e+04,109821,189,7.870450e+04,3.093850e+04,109643,47766
168158,21,6,192319,12,0,768,0,66,60,64.000000,...,4,20,5.515950e+04,5.503750e+04,110197,122,9.581550e+04,1.436150e+04,110177,81454


In [49]:
TAG_MIN = test_df[test_df.columns].min()
TAG_MAX = test_df[test_df.columns].max()

In [54]:
test_df = normalize(test_df[test_df.columns])
test_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,0.0,0.001000,0.169034,0.000677,0.000969,0.0,0.002151,0.00000,1.002551,...,-1.0,0.000000,0.000004,0.000000,0.000000,8.370701e-08,0.000000,0.000010,0.000000,1.0
1,0.0,0.0,0.000978,0.152823,0.000623,0.000892,0.0,0.001935,0.00000,0.886054,...,-1.0,0.000000,0.000004,0.000000,0.000000,7.533631e-08,0.000000,0.000009,0.000000,1.0
2,0.0,0.0,0.001475,0.080308,0.000321,0.000460,0.0,0.001505,0.00000,0.702381,...,-1.0,0.000000,0.000004,0.000000,0.000000,5.859491e-08,0.000000,0.000007,0.000000,1.0
3,0.0,0.0,0.001465,0.078919,0.000323,0.000463,0.0,0.001505,0.00000,0.685374,...,-1.0,0.000000,0.000004,0.000000,0.000000,5.859491e-08,0.000000,0.000007,0.000000,1.0
4,0.0,0.0,0.000928,0.139315,0.000584,0.000836,0.0,0.001720,0.00000,0.766156,...,-1.0,0.000000,0.000004,0.000000,0.000000,6.696561e-08,0.000000,0.000008,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6361,0.0,0.0,0.167307,0.000189,0.000000,0.008653,0.0,0.014194,0.05102,0.052721,...,0.0,0.208208,0.208914,0.359813,0.000025,5.598973e-02,0.119316,0.113278,0.000565,0.0
6362,0.0,0.0,0.437355,0.000293,0.000000,0.013119,0.0,0.014194,0.05102,0.053288,...,0.0,0.402624,0.445194,0.838983,0.000045,1.097935e-01,0.250232,0.264130,0.007558,0.0
6363,0.0,0.0,0.437427,0.000293,0.000000,0.013119,0.0,0.014194,0.05102,0.053288,...,0.0,0.401149,0.443583,0.834202,0.000052,1.098131e-01,0.248162,0.262618,0.009222,0.0
6364,0.0,0.0,0.000025,0.000086,0.000000,0.004466,0.0,0.014194,0.05102,0.054422,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.0


In [64]:
test_df=test_df[colums_choosed]
test_df

Unnamed: 0,Init Fwd Win Byts,Fwd Seg Size Min,Fwd Header Len,Flow Pkts/s,Fwd Pkts/s,Tot Bwd Pkts,Init Bwd Win Byts,Bwd Seg Size Avg,Subflow Bwd Pkts,Bwd Header Len,PSH Flag Cnt,Bwd Pkt Len Mean,Flow Duration,Subflow Bwd Byts,Flow IAT Mean,Bwd Pkt Len Max,Bwd IAT Mean,Flow IAT Max,Bwd IAT Max,Fwd IAT Min
0,1.0,0.0,0.000436,0.536016,0.536016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000003,0.0,0.000005,0.0,0.0,0.000005,0.0,0.000050
10,1.0,0.0,0.010198,0.000195,0.000195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.256954,0.0,0.010888,0.0,0.0,0.106565,0.0,0.000000
18,1.0,0.0,0.000174,0.007321,0.007321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000229,0.0,0.000387,0.0,0.0,0.000260,0.0,0.000014
29,1.0,0.0,0.000436,0.693150,0.693150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000001,0.0,0.000004,0.0,0.0,0.000002,0.0,0.000053
40,1.0,0.0,0.000174,0.006270,0.006270,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000268,0.0,0.000452,0.0,0.0,0.000317,0.0,0.000459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168140,1.0,0.0,0.066940,0.000525,0.000525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.621412,0.0,0.004040,0.0,0.0,0.696292,0.0,0.000000
168142,1.0,0.0,0.026236,0.000856,0.000856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150143,0.0,0.002486,0.0,0.0,0.147936,0.0,0.000000
168145,1.0,0.0,0.000174,0.006335,0.006335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000265,0.0,0.000447,0.0,0.0,0.000366,0.0,0.000270
168155,1.0,0.0,0.000349,0.048339,0.048339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000050,0.0,0.000053,0.0,0.0,0.000055,0.0,0.000086


In [39]:
test_df = test_df.dropna()

In [76]:
test_pred = rf.predict(test_df)
print("Class distribution: {}".format(Counter(test_pred))) ## 1:BENIGN, 0: ATTACK

Class distribution: Counter({0: 36801, 1: 3038})


In [79]:
get_clf_eval(test_df_y,test_pred)

오차행렬:
 [[ 3016  3350]
 [   22 33451]]

정확도: 0.9154
정밀도: 0.9090
재현율: 0.9993
F1: 0.9520
AUC: 0.7366
