In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from pathlib import Path
# import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score ,precision_score, recall_score, f1_score, roc_auc_score,  classification_report, precision_recall_curve
from sklearn.metrics import confusion_matrix, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from collections import Counter

import seaborn as sns

In [2]:
# Collect every training CSV under train-supervised/ in sorted (deterministic) order.
train_dataset = sorted(Path('train-supervised/').glob("*.csv"))
train_dataset

[WindowsPath('train-supervised/train1.csv'),
 WindowsPath('train-supervised/train2.csv'),
 WindowsPath('train-supervised/train3.csv'),
 WindowsPath('train-supervised/train4.csv'),
 WindowsPath('train-supervised/train5.csv')]

In [3]:
# Collect every test CSV under test/ in sorted (deterministic) order.
test_dataset = sorted(Path('test/').glob("*.csv"))
test_dataset

[WindowsPath('test/Golden.csv'),
 WindowsPath('test/hulk.csv'),
 WindowsPath('test/slowloris.csv'),
 WindowsPath('test/test_normal.csv')]

In [4]:
def dataframe_from_csv(target):
    """Read one CP949-encoded CSV file and strip whitespace around its column names.

    Parameters
    ----------
    target : path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents with leading/trailing whitespace removed from
        every column name.
    """
    frame = pd.read_csv(target, encoding='CP949')
    cleaned_names = {name: name.strip() for name in frame.columns}
    return frame.rename(columns=cleaned_names)

def dataframe_from_csvs(targets):
    """Load several CSV files and stack them row-wise into a single DataFrame.

    Parameters
    ----------
    targets : iterable of path-like
        CSV files to load with ``dataframe_from_csv``; read in the given order.

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation of all files. ``pd.concat`` is called with
        its defaults, so each file keeps its own row index and index labels
        repeat across files.

    Raises
    ------
    ValueError
        If ``targets`` is empty (for example, the glob matched no files).
    """
    # Materialise first so a generator can be checked for emptiness and still be consumed.
    paths = list(targets)
    if not paths:
        # pd.concat([]) would also raise ValueError, but with a message that
        # does not point at the real cause (a wrong or empty data directory).
        raise ValueError(
            "dataframe_from_csvs: no CSV files to load (check the data directory path)"
        )
    return pd.concat([dataframe_from_csv(path) for path in paths])

In [5]:
# Load all training CSVs, discard the Timestamp column, then keep only
# flows whose destination port is 80.
train_df = dataframe_from_csvs(train_dataset).drop(columns=['Timestamp'])
train_df = train_df[train_df['Dst Port'] == 80]
train_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,6,476513,5,3,211,463,211,0,42.200000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
1,80,6,475048,5,3,220,472,220,0,44.000000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
2,80,6,474926,5,3,220,472,220,0,44.000000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
3,80,6,477471,5,3,209,461,209,0,41.800000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
4,80,6,512758,5,3,211,463,211,0,42.200000,...,32,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181010,80,6,61369938,11,9,444,788,438,0,40.363636,...,20,23522.333333,29492.486648,83721,11294,9.995354e+06,20778.055219,10004022,9952945,Benign
181011,80,6,102508170,15,13,303,384,293,0,20.200000,...,20,24066.200000,23439.923240,90755,16347,1.000813e+07,19441.796925,10014622,9952829,Benign
181012,80,6,4627404,4,5,1264,3143,1264,0,316.000000,...,20,0.000000,0.000000,0,0,0.000000e+00,0.000000,0,0,Benign
181013,80,6,18892988,6,4,435,706,434,0,72.500000,...,20,48659.500000,52535.912522,85808,11511,9.397834e+06,781851.503492,9950687,8844982,Benign


In [6]:
# Sanity check: after the port filter, 80 should be the only destination port left.
Counter(train_df['Dst Port'])

Counter({80: 1193092})

In [8]:
# Number of training rows per raw label value (before the labels are binarised).
Counter(train_df['Label'])

Counter({'Benign': 678174,
         'DoS attacks-GoldenEye': 41508,
         'DoS attacks-Slowloris': 10990,
         'DoS attacks-Hulk': 461648,
         'Brute Force -Web': 460,
         'Brute Force -XSS': 225,
         'SQL Injection': 87})

In [9]:
# Load all test CSVs and drop the Timestamp column, as is done for the training data.
test_df = dataframe_from_csvs(test_dataset)
# (disabled) alternative: read a single pre-labelled test file instead.
# test_df = pd.read_csv('test/5th_label.csv', encoding = 'CP949')
test_df = test_df.drop(['Timestamp'], axis = 1)
# (disabled) rename that goes with the alternative file above.
# test_df = test_df.rename(columns = {'anormal':'Label'})
# NOTE(review): train_df is filtered to Dst Port == 80 but the same filter is
# disabled here — confirm the test files only contain port-80 flows.
# test_df = test_df[(test_df['Dst Port'] == 80)]
test_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,6,120059,9820.171749,83.292381,83.292381,0,10,0,1179.000000,...,0.0,0.00,1.179000e+02,0.0,0.0,10.00,0.00,1179.0,0.0,1
1,80,6,117362,8878.512636,76.685810,76.685810,0,9,0,1042.000000,...,0.0,0.00,1.157778e+02,0.0,0.0,9.00,0.00,1042.0,0.0,1
2,80,6,177022,4666.086701,39.543108,39.543108,0,7,0,826.000000,...,0.0,0.00,1.180000e+02,0.0,0.0,7.00,0.00,826.0,0.0,1
3,80,6,175774,4585.433568,39.823865,39.823865,0,7,0,806.000000,...,0.0,0.00,1.151429e+02,0.0,0.0,7.00,0.00,806.0,0.0,1
4,80,6,111320,8093.783687,71.864894,71.864894,0,8,0,901.000000,...,0.0,0.00,1.126250e+02,0.0,0.0,8.00,0.00,901.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34126,80,6,63996942,6.000000,0.000000,372.000000,0,66,60,62.000000,...,20.0,0.00,0.000000e+00,0.0,0.0,12318913.67,13718531.89,31461694.0,15739.0,0
34127,80,6,63997315,6.000000,0.000000,372.000000,0,66,60,62.000000,...,20.0,0.00,0.000000e+00,0.0,0.0,12323277.33,13714246.71,31458691.0,17359.0,0
34128,80,6,51992527,6.000000,0.000000,372.000000,0,66,60,62.000000,...,20.0,0.00,0.000000e+00,0.0,0.0,15781525.00,15771850.00,31553375.0,9675.0,0
34129,80,6,51991936,6.000000,0.000000,372.000000,0,66,60,62.000000,...,20.0,0.00,0.000000e+00,0.0,0.0,15780461.00,15769271.00,31549732.0,11190.0,0


In [10]:
# Binarise the label column: 'Benign' -> 0, every other label (any attack class) -> 1.
# Values that are already 0/1 are passed through unchanged, so re-running this
# cell is harmless; comparing already-encoded values against 'Benign' again
# would otherwise turn every row into class 1.
train_df['Label'] = train_df['Label'].map(
    lambda label: label if label in (0, 1) else int(label != 'Benign')
)
print("Class distribution: {}".format(Counter(train_df['Label']))) ## 0: BENIGN, 1: ATTACK

Class distribution: Counter({0: 678174, 1: 514918})


In [11]:
# Drop rows containing missing values before the per-column min/max are computed.
train_df = train_df.dropna()

In [12]:
# Per-column minimum and maximum of the training frame; normalize() reads
# these two Series as its scaling bounds.
TAG_MIN = train_df.min()
TAG_MAX = train_df.max()

In [13]:
def normalize(df, tag_min=None, tag_max=None):
    """Min-max scale every column of ``df`` using per-column bounds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale; it is not modified.
    tag_min, tag_max : mapping or pandas.Series, optional
        Per-column lower and upper bounds, indexed by column name. When
        omitted, the notebook-level ``TAG_MIN`` / ``TAG_MAX`` are used, so
        existing ``normalize(df)`` calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each column ``c`` becomes
        ``(df[c] - min[c]) / (max[c] - min[c])``. A column whose two bounds
        are equal is only shifted by ``min[c]`` to avoid dividing by zero.

    Notes
    -----
    Values are not clipped: data lying outside the bounds is mapped outside
    the [0, 1] interval.
    """
    lower = TAG_MIN if tag_min is None else tag_min
    upper = TAG_MAX if tag_max is None else tag_max

    ndf = df.copy()
    for c in df.columns:
        if lower[c] == upper[c]:
            # Constant column in the reference data: shifting is all we can do.
            ndf[c] = df[c] - lower[c]
        else:
            ndf[c] = (df[c] - lower[c]) / (upper[c] - lower[c])
    return ndf

In [14]:
# Scale every training column with the stored min/max bounds, then discard
# any rows that contain NaN afterwards.
train_df = normalize(train_df).dropna()
train_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,0.0,0.003971,0.000069,0.000024,0.002454,0.000003,0.045376,0.0,0.035884,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
1,0.0,0.0,0.003959,0.000069,0.000024,0.002559,0.000003,0.047312,0.0,0.037415,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
2,0.0,0.0,0.003958,0.000069,0.000024,0.002559,0.000003,0.047312,0.0,0.037415,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
3,0.0,0.0,0.003979,0.000069,0.000024,0.002431,0.000003,0.044946,0.0,0.035544,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
4,0.0,0.0,0.004273,0.000069,0.000024,0.002454,0.000003,0.045376,0.0,0.035884,...,0.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181010,0.0,0.0,0.511416,0.000172,0.000073,0.005164,0.000005,0.094194,0.0,0.034323,...,0.0,0.000735,0.001115,0.002226,0.000353,0.083668,0.000451,0.083741,0.083313,0
181011,0.0,0.0,0.854235,0.000241,0.000106,0.003524,0.000002,0.063011,0.0,0.017177,...,0.0,0.000752,0.000886,0.002413,0.000511,0.083775,0.000422,0.083829,0.083312,0
181012,0.0,0.0,0.038562,0.000052,0.000041,0.014701,0.000020,0.271828,0.0,0.268707,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
181013,0.0,0.0,0.157442,0.000086,0.000032,0.005059,0.000005,0.093333,0.0,0.061650,...,0.0,0.001520,0.001986,0.002282,0.000360,0.078666,0.016963,0.083294,0.074039,0


In [15]:
# normalize() rescales every column, Label included, so cast Label back to
# an integer dtype before it is used as the classification target.
train_df = train_df.astype({'Label':'int'})

In [16]:
# Separate the target column from the feature matrix.
train_df_y = train_df['Label']
train_df_x = train_df.drop(columns=['Label'])

In [17]:
# Hold out 30% of the rows for validation, stratified on the label, with a
# fixed seed. NOTE: this overwrites train_df_x / train_df_y, so re-running
# the cell splits the already-reduced training portion again.
(train_df_x, valid_df_x,
 train_df_y, valid_df_y) = train_test_split(
    train_df_x,
    train_df_y,
    test_size=0.3,
    shuffle=True,
    stratify=train_df_y,
    random_state=30,
)

In [40]:
%%time
lr = LogisticRegression(C = 1, penalty = 'l2')
lr.fit(train_df_x,train_df_y)
lr_pred = lr.predict(train_df_x)

Wall time: 8.45 s


In [36]:
def get_clf_eval(y_test, y_pred):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Parameters
    ----------
    y_test : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_test``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    # Output template paired with the scikit-learn metric that fills it.
    # ROC AUC (roc_auc_score) is currently not reported.
    report = (
        ('\n정확도: {:.4f}', accuracy_score),
        ('정밀도: {:.4f}', precision_score),
        ('재현율: {:.4f}', recall_score),
        ('F1: {:.4f}', f1_score),
    )

    # Compute everything up front so nothing is printed if a metric fails.
    confusion = confusion_matrix(y_test, y_pred)
    lines = [(template, metric(y_test, y_pred)) for template, metric in report]

    print('오차행렬:\n', confusion)
    for template, value in lines:
        print(template.format(value))

In [21]:
# Training-split metrics: lr_pred was predicted on the same rows the model was fit on.
get_clf_eval(train_df_y, lr_pred)

오차행렬:
 [[474545    177]
 [   366 360076]]

정확도: 0.9993
정밀도: 0.9995
재현율: 0.9990
F1: 0.9992


In [22]:
# Predict on the validation split held out by train_test_split.
lr_pred_valid = lr.predict(valid_df_x)

In [23]:
# Validation-split metrics.
get_clf_eval(valid_df_y, lr_pred_valid)

오차행렬:
 [[203372     80]
 [   137 154339]]

정확도: 0.9994
정밀도: 0.9995
재현율: 0.9991
F1: 0.9993


In [24]:
# Number of test rows per label value.
Counter(test_df['Label'])

Counter({1: 33513, 0: 34131})

In [25]:
# Drop test rows containing missing values before the features are scaled.
test_df = test_df.dropna()

In [26]:
# Split off the ground-truth labels; from here on test_df holds only feature columns.
test_df_y = test_df['Label']
test_df = test_df.drop(columns=['Label'])

In [27]:
# Scale the test features with the bounds computed from the training data
# (TAG_MIN / TAG_MAX), so scaled values may fall outside [0, 1].
test_df = normalize(test_df)
test_df

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,0.0,0.0,0.001000,0.169034,0.000677,0.000969,0.0,0.002151,0.00000,1.002551,...,0.000000,-1.0,0.000000,0.000004,0.000000,0.000000,8.370701e-08,0.000000,0.000010,0.000000
1,0.0,0.0,0.000978,0.152823,0.000623,0.000892,0.0,0.001935,0.00000,0.886054,...,0.000000,-1.0,0.000000,0.000004,0.000000,0.000000,7.533631e-08,0.000000,0.000009,0.000000
2,0.0,0.0,0.001475,0.080308,0.000321,0.000460,0.0,0.001505,0.00000,0.702381,...,0.000000,-1.0,0.000000,0.000004,0.000000,0.000000,5.859491e-08,0.000000,0.000007,0.000000
3,0.0,0.0,0.001465,0.078919,0.000323,0.000463,0.0,0.001505,0.00000,0.685374,...,0.000000,-1.0,0.000000,0.000004,0.000000,0.000000,5.859491e-08,0.000000,0.000007,0.000000
4,0.0,0.0,0.000928,0.139315,0.000584,0.000836,0.0,0.001720,0.00000,0.766156,...,0.000000,-1.0,0.000000,0.000004,0.000000,0.000000,6.696561e-08,0.000000,0.000008,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34126,0.0,0.0,0.533308,0.000086,0.000000,0.004326,0.0,0.014194,0.05102,0.052721,...,0.031250,0.0,0.000000,0.000000,0.000000,0.000000,1.031179e-01,0.297641,0.263356,0.000132
34127,0.0,0.0,0.533311,0.000086,0.000000,0.004326,0.0,0.014194,0.05102,0.052721,...,0.031250,0.0,0.000000,0.000000,0.000000,0.000000,1.031545e-01,0.297548,0.263331,0.000145
34128,0.0,0.0,0.433271,0.000086,0.000000,0.004326,0.0,0.014194,0.05102,0.052721,...,0.031250,0.0,0.000000,0.000000,0.000000,0.000000,1.321024e-01,0.342190,0.264124,0.000081
34129,0.0,0.0,0.433266,0.000086,0.000000,0.004326,0.0,0.014194,0.05102,0.052721,...,0.031250,0.0,0.000000,0.000000,0.000000,0.000000,1.320935e-01,0.342134,0.264093,0.000094


In [28]:
# Predict on the scaled test features. The model was trained with
# Benign encoded as 0 and every attack label encoded as 1.
test_pred = lr.predict(test_df)
print("Class distribution: {}".format(Counter(test_pred))) ## 0: BENIGN, 1: ATTACK

Class distribution: Counter({1: 38568, 0: 29036})


In [31]:
# Test-set metrics: ground-truth labels versus the model's predictions.
get_clf_eval(test_df_y,test_pred)

오차행렬:
 [[32349  1782]
 [ 6219 27254]]

정확도: 0.8816
정밀도: 0.9386
재현율: 0.8142
F1: 0.8720
