In [571]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from pathlib import Path
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score ,precision_score, recall_score, f1_score, roc_auc_score,  classification_report, precision_recall_curve
from sklearn.metrics import confusion_matrix, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from collections import Counter

import seaborn as sns

In [572]:
train_dataset = sorted([x for x in Path('train-supervised/').glob("*.csv")])
train_dataset

[WindowsPath('train-supervised/train1.csv'),
 WindowsPath('train-supervised/train2.csv'),
 WindowsPath('train-supervised/train3.csv'),
 WindowsPath('train-supervised/train4.csv'),
 WindowsPath('train-supervised/train5.csv')]

In [943]:
test_dataset = sorted([x for x in Path('test/').glob("*.csv")])
test_dataset

[WindowsPath('test/Golden.csv'),
 WindowsPath('test/hulk.csv'),
 WindowsPath('test/slowloris.csv'),
 WindowsPath('test/test_normal.csv')]

In [741]:
def dataframe_from_csv(target):
    return pd.read_csv(target, encoding = 'CP949').rename(columns=lambda x: x.strip())

def dataframe_from_csvs(targets):
    return pd.concat([dataframe_from_csv(x) for x in targets])

In [966]:
train_df = dataframe_from_csvs(train_dataset)
train_df = train_df.drop(['Timestamp'],axis = 1)
train_df = train_df[(train_df['Dst Port'] == 80)]
# train_df = train_df[(train_df['Label'] == 'Benign')|
#                     (train_df['Label'] == 'DoS attacks-Hulk')|
#                     (train_df['Label'] == 'DoS attacks-Slowloris')|
#                    (train_df['Label'] == 'DoS attacks-GoldenEye')]
                    
train_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,6,476513,5,3,211,463,211,0,42.200000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
1,80,6,475048,5,3,220,472,220,0,44.000000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
2,80,6,474926,5,3,220,472,220,0,44.000000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
3,80,6,477471,5,3,209,461,209,0,41.800000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
4,80,6,512758,5,3,211,463,211,0,42.200000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181010,80,6,61369938,11,9,444,788,438,0,40.363636,...,20,23522.333333,29492.486648,83721,11294,9.995354e+06,20778.055219,10004022,9952945,Benign
181011,80,6,102508170,15,13,303,384,293,0,20.200000,...,20,24066.200000,23439.923240,90755,16347,1.000813e+07,19441.796925,10014622,9952829,Benign
181012,80,6,4627404,4,5,1264,3143,1264,0,316.000000,...,20,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
181013,80,6,18892988,6,4,435,706,434,0,72.500000,...,20,48659.500000,52535.912522,85808,11511,9.397834e+06,781851.503492,9950687,8844982,Benign


In [967]:
train_df.to_csv('train_only80.csv', index = False)

In [901]:
Counter(train_df['Dst Port'])

Counter({80: 1193092})

In [941]:
Counter(train_df['Label'])

Counter({'Benign': 678174,
         'DoS attacks-GoldenEye': 41508,
         'DoS attacks-Slowloris': 10990,
         'DoS attacks-Hulk': 461648})

In [944]:
test_df = dataframe_from_csvs(test_dataset)
# test_df = pd.read_csv('test/5th_label.csv', encoding = 'CP949')
test_df = test_df.drop(['Timestamp'], axis = 1)
# test_df = test_df.rename(columns = {'anormal':'Label'})
# test_df = test_df[(test_df['Dst Port'] == 80)]
test_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,6,120059,9820.171749,83.292381,83.292381,0,10,0,1179.000000,...,0.0,0.000000e+00,1.179000e+02,0.0,0.0,10.0,0.000000e+00,1179.0,0.0,1
1,80,6,117362,8878.512636,76.685810,76.685810,0,9,0,1042.000000,...,0.0,0.000000e+00,1.157778e+02,0.0,0.0,9.0,0.000000e+00,1042.0,0.0,1
2,80,6,177022,4666.086701,39.543108,39.543108,0,7,0,826.000000,...,0.0,0.000000e+00,1.180000e+02,0.0,0.0,7.0,0.000000e+00,826.0,0.0,1
3,80,6,175774,4585.433568,39.823865,39.823865,0,7,0,806.000000,...,0.0,0.000000e+00,1.151429e+02,0.0,0.0,7.0,0.000000e+00,806.0,0.0,1
4,80,6,111320,8093.783687,71.864894,71.864894,0,8,0,901.000000,...,0.0,0.000000e+00,1.126250e+02,0.0,0.0,8.0,0.000000e+00,901.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6361,80,6,20076819,12.000000,0.000000,744.000000,0,66,60,62.000000,...,20.0,6.666264e+06,5.526124e+06,13532530.0,816.0,6688774.0,5.499402e+06,13532731.0,67527.0,0
6362,80,6,52482590,18.000000,0.000000,1128.000000,0,66,60,62.666667,...,20.0,1.289096e+07,1.177616e+07,31554068.0,1444.0,13116401.0,1.153343e+07,31554064.0,902857.0,0
6363,80,6,52491268,18.000000,0.000000,1128.000000,0,66,60,62.666667,...,20.0,1.284374e+07,1.173353e+07,31374266.0,1651.0,13118750.0,1.143800e+07,31373448.0,1101719.0,0
6364,80,6,2987,6.000000,0.000000,384.000000,0,66,60,64.000000,...,20.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0


In [945]:
train_df.loc[(train_df['Label'] != 'Benign'), 'Label'] = 1
train_df.loc[(train_df['Label'] == 'Benign'), 'Label'] = 0
print("Class distribution: {}".format(Counter(train_df['Label']))) ## 1:BENIGN, 0: ATTACK

Class distribution: Counter({0: 678174, 1: 514146})


In [803]:
le=LabelEncoder()

In [804]:
train_df['Label']=le.fit_transform(train_df['Label'])
print("Class distribution: {}".format(Counter(train_df['Label']))) ## 1:BENIGN, 0: ATTACK

Class distribution: Counter({1: 678174, 0: 514918})


In [946]:
train_df = train_df.dropna()

In [947]:
TAG_MIN = train_df[train_df.columns].min()
TAG_MAX = train_df[train_df.columns].max()

In [948]:
def normalize(df):
    ndf = df.copy()
    for c in df.columns:
        if TAG_MIN[c] == TAG_MAX[c]:
            ndf[c] = df[c] - TAG_MIN[c]
        else:
            ndf[c] = (df[c] - TAG_MIN[c]) / (TAG_MAX[c] - TAG_MIN[c])
    return ndf

In [949]:
train_df = normalize(train_df[train_df.columns])
train_df = train_df.dropna()
train_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,0.0,0.003971,0.000069,0.000024,0.002454,0.000003,0.045376,0.0,0.035884,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
1,0.0,0.0,0.003959,0.000069,0.000024,0.002559,0.000003,0.047312,0.0,0.037415,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
2,0.0,0.0,0.003958,0.000069,0.000024,0.002559,0.000003,0.047312,0.0,0.037415,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
3,0.0,0.0,0.003979,0.000069,0.000024,0.002431,0.000003,0.044946,0.0,0.035544,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
4,0.0,0.0,0.004273,0.000069,0.000024,0.002454,0.000003,0.045376,0.0,0.035884,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181010,0.0,0.0,0.511416,0.000172,0.000073,0.005164,0.000005,0.094194,0.0,0.034323,...,0.0,0.000735,0.001115,0.002226,0.000353,0.083668,0.000451,0.083741,0.083313,0
181011,0.0,0.0,0.854235,0.000241,0.000106,0.003524,0.000002,0.063011,0.0,0.017177,...,0.0,0.000752,0.000886,0.002413,0.000511,0.083775,0.000422,0.083829,0.083312,0
181012,0.0,0.0,0.038562,0.000052,0.000041,0.014701,0.000020,0.271828,0.0,0.268707,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
181013,0.0,0.0,0.157442,0.000086,0.000032,0.005059,0.000005,0.093333,0.0,0.061650,...,0.0,0.001520,0.001986,0.002282,0.000360,0.078666,0.016963,0.083294,0.074039,0


In [950]:
train_df = train_df.astype({'Label':'int'})

In [951]:
train_df_x=train_df.drop(['Label'],axis=1)
train_df_y=train_df['Label']

In [952]:
train_df_x, valid_df_x, train_df_y, valid_df_y = train_test_split(train_df_x, train_df_y, 
                                                                  test_size = 0.3,
                                                                  shuffle = True,
                                                                 stratify = train_df_y,
                                                                 random_state = 30)

In [953]:
%%time
lg = lgb.LGBMClassifier()
lg.fit(train_df_x,train_df_y)
lg_pred = lg.predict(train_df_x)

Wall time: 3 s


In [954]:
print('결정계수: {:.3f}'.format(lg.score(train_df_x,train_df_y)))

결정계수: 1.000


In [955]:
def get_clf_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
#     AUC = roc_auc_score(y_test, y_pred)
    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(F1))
#     print('AUC: {:.4f}'.format(AUC))

In [956]:
get_clf_eval(train_df_y, lg_pred)

오차행렬:
 [[474722      0]
 [     0 359902]]

정확도: 1.0000
정밀도: 1.0000
재현율: 1.0000
F1: 1.0000


In [957]:
lg_pred_valid = lg.predict(valid_df_x)

In [958]:
get_clf_eval(valid_df_y, lg_pred_valid)

오차행렬:
 [[203450      2]
 [     1 154243]]

정확도: 1.0000
정밀도: 1.0000
재현율: 1.0000
F1: 1.0000


In [918]:
fi = []
for i in range(len(train_df.columns)-1):
#     print((list(train_df.columns)[i],list(rf.feature_importances_)[i]))
    fi.append((list(train_df.columns)[i],round(list(lg.feature_importances_)[i]*100,2)))
fi

[('Dst Port', 0),
 ('Protocol', 0),
 ('Flow Duration', 11100),
 ('Tot Fwd Pkts', 3300),
 ('Tot Bwd Pkts', 10200),
 ('TotLen Fwd Pkts', 6200),
 ('TotLen Bwd Pkts', 4100),
 ('Fwd Pkt Len Max', 5400),
 ('Fwd Pkt Len Min', 0),
 ('Fwd Pkt Len Mean', 4400),
 ('Fwd Pkt Len Std', 4800),
 ('Bwd Pkt Len Max', 4000),
 ('Bwd Pkt Len Min', 0),
 ('Bwd Pkt Len Mean', 0),
 ('Bwd Pkt Len Std', 700),
 ('Flow Byts/s', 4600),
 ('Flow Pkts/s', 1700),
 ('Flow IAT Mean', 9600),
 ('Flow IAT Std', 1500),
 ('Flow IAT Max', 8400),
 ('Flow IAT Min', 36500),
 ('Fwd IAT Tot', 1700),
 ('Fwd IAT Mean', 500),
 ('Fwd IAT Std', 1200),
 ('Fwd IAT Max', 1900),
 ('Fwd IAT Min', 30000),
 ('Bwd IAT Tot', 100),
 ('Bwd IAT Mean', 600),
 ('Bwd IAT Std', 0),
 ('Bwd IAT Max', 2300),
 ('Bwd IAT Min', 1200),
 ('Fwd PSH Flags', 0),
 ('Bwd PSH Flags', 0),
 ('Fwd URG Flags', 0),
 ('Bwd URG Flags', 0),
 ('Fwd Header Len', 200),
 ('Bwd Header Len', 2300),
 ('Fwd Pkts/s', 11200),
 ('Bwd Pkts/s', 1000),
 ('Pkt Len Min', 0),
 ('Pkt Len Max

In [663]:
fi = sorted(fi,key = lambda x : x[1], reverse = True)
fi

[('Flow IAT Min', 30000),
 ('Fwd IAT Min', 25100),
 ('Init Bwd Win Byts', 15200),
 ('Flow Duration', 14300),
 ('TotLen Fwd Pkts', 14100),
 ('Idle Min', 13700),
 ('Tot Fwd Pkts', 12400),
 ('Flow Pkts/s', 12100),
 ('Fwd Pkts/s', 11300),
 ('Idle Std', 10600),
 ('Flow IAT Mean', 10400),
 ('Flow IAT Std', 10200),
 ('Fwd Header Len', 9100),
 ('Active Mean', 8800),
 ('Fwd IAT Mean', 8700),
 ('Fwd IAT Std', 8500),
 ('Flow IAT Max', 8200),
 ('SYN Flag Cnt', 8200),
 ('Active Max', 8100),
 ('Fwd Pkt Len Max', 8000),
 ('Flow Byts/s', 7700),
 ('Idle Max', 6000),
 ('Bwd Pkts/s', 5400),
 ('Active Min', 5300),
 ('FIN Flag Cnt', 5100),
 ('Idle Mean', 3700),
 ('Pkt Len Std', 3300),
 ('Fwd IAT Max', 2700),
 ('Pkt Len Max', 2400),
 ('Fwd Pkt Len Mean', 2300),
 ('Fwd IAT Tot', 2200),
 ('Pkt Len Mean', 2000),
 ('Fwd Pkt Len Std', 1600),
 ('Fwd Act Data Pkts', 1500),
 ('ACK Flag Cnt', 500),
 ('Active Std', 500),
 ('Bwd Header Len', 400),
 ('Pkt Size Avg', 400),
 ('Dst Port', 0),
 ('Protocol', 0),
 ('Tot Bwd 

In [664]:
colums_choosed = []
for i in range(len(fi)):
#     if fi[i][1] >= 1.0:
    if fi[i][1] >= 50:
        colums_choosed.append(fi[i][0])
len(colums_choosed)

38

In [676]:
train_df_x = train_df_x[colums_choosed]
valid_df_x = valid_df_x[colums_choosed]

In [677]:
%%time
lg = lgb.LGBMClassifier()
lg.fit(train_df_x,train_df_y)
lg_pred = lg.predict(train_df_x)

Wall time: 3 s


In [678]:
get_clf_eval(train_df_y, lg_pred)

오차행렬:
 [[360271    171]
 [    10 474712]]

정확도: 0.9998
정밀도: 0.9996
재현율: 1.0000
F1: 0.9998


In [679]:
lg_pred_valid=lg.predict(valid_df_x)
get_clf_eval(valid_df_y, lg_pred_valid)

오차행렬:
 [[154389     87]
 [     9 203443]]

정확도: 0.9997
정밀도: 0.9996
재현율: 1.0000
F1: 0.9998


In [425]:
test_df.columns = train_df.columns

In [818]:
test_df.loc[(test_df['Label'] != 'normal'), 'Label'] = 'Attack'

In [959]:
Counter(test_df['Label'])

Counter({1: 33513, 0: 6366})

In [920]:
test_df['Label']=le.fit_transform(test_df['Label'])
print("Class distribution: {}".format(Counter(test_df['Label']))) ## 1:BENIGN, 0: ATTACK

Class distribution: Counter({1: 33513, 0: 6366})


In [960]:
test_df_y = test_df['Label']
test_df = test_df.drop(['Label'],axis = 1)

In [932]:
test_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,80,6,120059,9820.171749,83.292381,83.292381,0,10,0,1179.0,...,0,0.0,0.0,117.900000,0.0,0.0,10.0,0.000,1179.0,0.0
1,80,6,117362,8878.512636,76.685810,76.685810,0,9,0,1042.0,...,0,0.0,0.0,115.777778,0.0,0.0,9.0,0.000,1042.0,0.0
2,80,6,177022,4666.086701,39.543108,39.543108,0,7,0,826.0,...,0,0.0,0.0,118.000000,0.0,0.0,7.0,0.000,826.0,0.0
3,80,6,175774,4585.433568,39.823865,39.823865,0,7,0,806.0,...,0,0.0,0.0,115.142857,0.0,0.0,7.0,0.000,806.0,0.0
4,80,6,111320,8093.783687,71.864894,71.864894,0,8,0,901.0,...,0,0.0,0.0,112.625000,0.0,0.0,8.0,0.000,901.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11989,80,6,15578941,4.000000,1.000000,0.000000,0,0,0,0.0,...,0,0.0,7045631.0,0.000000,7045631.0,7045631.0,8533310.0,0.000,8533310.0,8533310.0
11990,80,6,15005991,4.000000,1.000000,0.000000,0,0,0,0.0,...,0,0.0,7000819.0,0.000000,7000819.0,7000819.0,8005172.0,0.000,8005172.0,8005172.0
11991,80,6,15528404,5.000000,1.000000,344.000000,0,280,0,68.8,...,4,0.0,235901.5,61649.104720,279494.0,192309.0,7528300.5,2748008.578,9471436.0,5585165.0
11992,80,6,2342803,8.000000,1.000000,944.000000,0,236,0,118.0,...,4,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0


In [893]:
TAG_MIN = test_df[test_df.columns].min()
TAG_MAX = test_df[test_df.columns].max()

In [961]:
test_df = normalize(test_df[test_df.columns])
test_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,0.0,0.0,0.001000,0.169034,0.000677,0.000969,0.0,0.002151,0.00000,1.002551,...,0.000000,-1.0,0.000000,0.000004,0.000000,0.000000,8.370701e-08,0.000000,0.000010,0.000000
1,0.0,0.0,0.000978,0.152823,0.000623,0.000892,0.0,0.001935,0.00000,0.886054,...,0.000000,-1.0,0.000000,0.000004,0.000000,0.000000,7.533631e-08,0.000000,0.000009,0.000000
2,0.0,0.0,0.001475,0.080308,0.000321,0.000460,0.0,0.001505,0.00000,0.702381,...,0.000000,-1.0,0.000000,0.000004,0.000000,0.000000,5.859491e-08,0.000000,0.000007,0.000000
3,0.0,0.0,0.001465,0.078919,0.000323,0.000463,0.0,0.001505,0.00000,0.685374,...,0.000000,-1.0,0.000000,0.000004,0.000000,0.000000,5.859491e-08,0.000000,0.000007,0.000000
4,0.0,0.0,0.000928,0.139315,0.000584,0.000836,0.0,0.001720,0.00000,0.766156,...,0.000000,-1.0,0.000000,0.000004,0.000000,0.000000,6.696561e-08,0.000000,0.000008,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6361,0.0,0.0,0.167307,0.000189,0.000000,0.008653,0.0,0.014194,0.05102,0.052721,...,0.062500,0.0,0.208208,0.208914,0.359813,0.000025,5.598973e-02,0.119316,0.113278,0.000565
6362,0.0,0.0,0.437355,0.000293,0.000000,0.013119,0.0,0.014194,0.05102,0.053288,...,0.078125,0.0,0.402624,0.445194,0.838983,0.000045,1.097935e-01,0.250232,0.264130,0.007558
6363,0.0,0.0,0.437427,0.000293,0.000000,0.013119,0.0,0.014194,0.05102,0.053288,...,0.078125,0.0,0.401149,0.443583,0.834202,0.000052,1.098131e-01,0.248162,0.262618,0.009222
6364,0.0,0.0,0.000025,0.000086,0.000000,0.004466,0.0,0.014194,0.05102,0.054422,...,0.015625,0.0,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000


In [797]:
test_df=test_df[colums_choosed]
test_df

Unnamed: 0,Flow IAT Min,Fwd IAT Min,Init Bwd Win Byts,Flow Duration,TotLen Fwd Pkts,Idle Min,Tot Fwd Pkts,Flow Pkts/s,Fwd Pkts/s,Idle Std,...,Pkt Len Max,Fwd Pkt Len Mean,Fwd IAT Tot,Pkt Len Mean,Fwd Pkt Len Std,Fwd Act Data Pkts,ACK Flag Cnt,Active Std,Bwd Header Len,Pkt Size Avg
0,4.822562e-05,0.000000e+00,0.0,0.000007,0.199560,0.000000,0.272524,0.0,0.000000e+00,0.000000,...,0.000000,0.596180,3.944964e-07,0.000000,0.000000,0.000000,0.0,0.000002,0.000011,0.000000
1,4.822562e-05,0.000000e+00,0.0,0.000008,0.158971,0.000000,0.211465,0.0,0.000000e+00,0.000000,...,0.000000,0.580719,3.842653e-07,0.000000,0.000000,0.000000,0.0,0.000002,0.000013,0.000000
2,4.822562e-05,0.000000e+00,0.0,0.000011,0.121022,0.000000,0.164447,0.0,0.000000e+00,0.000000,...,0.000000,0.630286,3.925316e-07,0.000000,0.000000,0.000000,0.0,0.000002,0.000022,0.000000
3,4.822562e-05,0.000000e+00,0.0,0.000014,0.092423,0.000000,0.126214,0.0,0.000000e+00,0.000000,...,0.000000,0.596180,3.944964e-07,0.000000,0.000000,0.000000,0.0,0.000002,0.000025,0.000000
4,4.822562e-05,0.000000e+00,0.0,0.000021,0.062876,0.000000,0.087633,0.0,0.000000e+00,0.000000,...,0.000000,0.608458,4.026210e-07,0.000000,0.000000,0.000000,0.0,0.000002,0.000034,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6361,0.000000e+00,0.000000e+00,0.0,0.096662,0.012648,0.002141,0.000003,0.0,2.988521e-07,0.143479,...,0.118919,0.028195,9.666187e-02,0.260504,0.014184,0.133333,0.0,0.151296,0.000000,0.173669
6362,8.037603e-07,4.307760e-07,0.0,0.252683,0.019176,0.028629,0.000005,0.0,1.714854e-07,0.300907,...,0.118919,0.028498,2.526827e-01,0.263305,0.014952,0.166667,0.0,0.322411,0.000000,0.175537
6363,3.536545e-05,1.895414e-05,0.0,0.252725,0.019176,0.034934,0.000005,0.0,1.714571e-07,0.298418,...,0.118919,0.028498,2.527245e-01,0.263305,0.014952,0.166667,0.0,0.321244,0.000000,0.175537
6364,8.037603e-07,4.307760e-07,0.0,0.000014,0.006528,0.000000,0.000001,0.0,1.004352e-03,0.000000,...,0.118919,0.029104,1.438121e-05,0.268908,0.014184,0.033333,0.0,0.000000,0.000000,0.179272


In [962]:
test_pred = lg.predict(test_df)
print("Class distribution: {}".format(Counter(test_pred))) ## 1:BENIGN, 0: ATTACK

Class distribution: Counter({0: 36538, 1: 3341})


In [965]:
get_clf_eval(test_df_y,test_pred)

오차행렬:
 [[ 3325  3041]
 [   16 33497]]

정확도: 0.9233
정밀도: 0.9168
재현율: 0.9995
F1: 0.9564


In [964]:
for i in range(len(test_pred)):
    if test_pred[i] == 1:
        test_pred[i] = 0
    else:
        test_pred[i] = 1

In [629]:
test_pred = list(test_pred)
for i in range(len(test_pred)):
    if test_pred[i] == 1.0:
        test_pred[i] = 'BENIGN'
    else:
        test_pred[i] = 'ATTACK'
Counter(test_pred)

Counter({'BENIGN': 501922, 'ATTACK': 54})

In [438]:
answer = pd.DataFrame({'index' : np.arange(len(test_pred)),
                       'answer' : test_pred})
answer.to_csv('answer.csv', 
              index = False)
answer

Unnamed: 0,index,answer
0,0,BENIGN
1,1,BENIGN
2,2,BENIGN
3,3,BENIGN
4,4,BENIGN
...,...,...
322931,322931,BENIGN
322932,322932,BENIGN
322933,322933,BENIGN
322934,322934,BENIGN


In [116]:
df = pd.read_csv('RF_answer.csv', index = False)
df

TypeError: read_csv() got an unexpected keyword argument 'index'

In [659]:
%%time
lg = lgb.LGBMClassifier()
lg.fit(test_df,test_df_y)
lg_pred = lg.predict(test_df)

Wall time: 572 ms


In [124]:
train_df.columns[:78]

Index(['Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
       'Fwd Seg Siz