In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split

In [2]:
import joblib
import sklearn.metrics
import sklearn.model_selection
import sklearn.neural_network

# MLP 모델

In [3]:
#train and evaluate

def train_and_evaluate(X_train, y_train, X_test, y_test, random_state=None):
    """Fit an MLP classifier, save it, and print train/test metrics.

    Side effects: the fitted model is written to ``mlp_classifier.jbl`` and
    the test-set predictions, side by side with the true labels, are written
    to ``MLP_predictions.csv``. Both files land in the current directory.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels used for the test report.
    random_state : int or None, optional
        Seed forwarded to ``MLPClassifier``. The default ``None`` leaves the
        classifier unseeded, so results differ from run to run; pass an
        integer for repeatable results.

    Returns
    -------
    The fitted ``MLPClassifier`` instance.
    """
    # Rich display only exists under IPython/Jupyter; fall back to print so
    # the function also works when the code is run as a plain script.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    # First model: the MLP classifier from sklearn.neural_network.
    model = sklearn.neural_network.MLPClassifier(
        hidden_layer_sizes=(100,), activation='relu', solver='adam',
        alpha=0.0001, batch_size='auto', learning_rate='constant',
        learning_rate_init=0.001, power_t=0.5, max_iter=1000, shuffle=True,
        random_state=random_state, tol=0.0001, verbose=False,
        warm_start=False, momentum=0.9, nesterovs_momentum=True,
        early_stopping=False, validation_fraction=0.1, beta_1=0.9,
        beta_2=0.999, epsilon=1e-08, n_iter_no_change=10)

    # Train on the training split only.
    model.fit(X_train, y_train)

    # Persist the fitted model.
    joblib.dump(model, 'mlp_classifier.jbl')

    # Evaluate on the training data.
    print('\n -- Training data --')

    train_predictions = model.predict(X_train)
    accuracy = sklearn.metrics.accuracy_score(y_train, train_predictions)
    print('Accuracy: {0:.2f}'.format(accuracy*100.0))
    print('Classification Report:')

    print(sklearn.metrics.classification_report(y_train, train_predictions))
    print('')

    # Evaluate on the test data.
    print('\n -- Test data --')
    test_predictions = model.predict(X_test)

    # Show predictions next to the real values. np.asarray drops any pandas
    # index on y_test so the two columns line up by position.
    df = pd.DataFrame({'predictions': test_predictions,
                       'y_test': np.asarray(y_test)})
    show(df)
    df.to_csv("MLP_predictions.csv")

    # Accuracy and detailed evaluation on the test data.
    accuracy = sklearn.metrics.accuracy_score(y_test, test_predictions)
    print('Accuracy : {0:.2f}'.format(accuracy*100.0))
    print('Classification Report:')
    print(sklearn.metrics.classification_report(y_test, test_predictions))
    print('Confusion Matrix:')
    print(sklearn.metrics.confusion_matrix(y_test, test_predictions))

    return model


## 교차검증

In [4]:
from sklearn.model_selection import KFold, cross_val_score

def cross_validate(features, label, model_path='mlp_classifier.jbl',
                   n_splits=10, seed=1771):
    """Run shuffled k-fold cross-validation with the saved model and print the scores.

    Parameters
    ----------
    features, label
        Full feature table and labels handed to ``cross_val_score``.
    model_path : str, optional
        File the estimator is loaded from with ``joblib.load``.
    n_splits : int, optional
        Number of folds (default 10).
    seed : int, optional
        ``random_state`` for the shuffled ``KFold``; the value itself is
        arbitrary, it only fixes the fold assignment.

    Returns
    -------
    The array of per-fold scores returned by ``cross_val_score``.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files that come from a trusted source.
    model = joblib.load(model_path)

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    results = cross_val_score(model, features, label, cv=kfold)

    print(" -- Cross Validation -- ")
    print(results)

    return results

## 인코딩

In [5]:
from sklearn.preprocessing import LabelEncoder

def labeling(target):
    """Return ``target`` encoded by a freshly fitted ``LabelEncoder``.

    The encoder is fitted on ``target`` and applied to the same values in a
    single step; it is discarded afterwards, so the mapping is not kept.
    """
    return LabelEncoder().fit_transform(target)

# 메인함수

In [6]:
#The main entry point for this module
def main(csv_path="Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"):
    """Load the flow CSV, preprocess it, train/evaluate the MLP and cross-validate.

    Parameters
    ----------
    csv_path : str, optional
        CSV file to read (header row included). The last column is used as
        the label and every other column as a feature.

    Raises
    ------
    ValueError
        If the label column holds anything other than ``'BENIGN'`` or
        ``'DDoS'``.
    """
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk, avoiding mixed-type columns.
    dataset = pd.read_csv(csv_path, low_memory=False)

    # -- preprocessing --

    # Split label / features. Copy so later edits never touch `dataset`.
    features = dataset.iloc[:, :-1].copy()
    raw_label = dataset.iloc[:, -1]

    # Map the class names to 0/1 and fail loudly on anything unexpected.
    label = raw_label.map({'BENIGN': 0, 'DDoS': 1})
    if label.isna().any():
        unexpected = sorted(raw_label[label.isna()].astype(str).unique())
        raise ValueError('Unexpected label values: {}'.format(unexpected))
    label = label.astype(int)

    # Infinite values become the sentinel -1. Depending on the pandas
    # version they arrive as the string 'Infinity' or as a float inf, so
    # both forms are handled.
    features = features.replace([np.inf, -np.inf, 'Infinity'], -1)

    # Encode the non-numeric identifier columns as integer codes.
    for column in ("Flow ID", " Source IP", " Destination IP", " Timestamp"):
        features[column] = labeling(features[column])

    # NaN -> 0 (whether zero-filling is statistically acceptable here is an
    # open question carried over from the original author).
    features = features.fillna(0)

    # -- train & test split --
    # NOTE(review): test_size=0.8 trains on only 20% of the rows; confirm
    # this is intended rather than a swapped 0.2.
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        features, label, test_size=0.8, random_state=1886, stratify=label)

    # -- evaluation --
    # train & evaluate - MLP(model)
    print("MLP 모델 학습 및 테스트 결과.")
    train_and_evaluate(X_train, y_train, X_test, y_test)

    # Cross-validation on the full data set.
    cross_validate(features, label)

In [7]:
# Run the whole pipeline only when this code is executed directly
# (notebook cell or script), not when it is imported as a module.
if __name__ == "__main__":
    main()

  if (await self.run_code(code, result,  async_=asy)):


MLP 모델 학습 및 테스트 결과.

 -- Training data --
Accuracy: 99.16
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     19544
           1       0.99      1.00      0.99     25605

    accuracy                           0.99     45149
   macro avg       0.99      0.99      0.99     45149
weighted avg       0.99      0.99      0.99     45149



 -- Test data --


Unnamed: 0,predictions,y_test
0,0,0
1,0,0
2,1,1
3,0,0
4,0,0
...,...,...
180591,1,1
180592,0,0
180593,1,1
180594,0,0


Accuracy : 99.18
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     78174
           1       0.99      1.00      0.99    102422

    accuracy                           0.99    180596
   macro avg       0.99      0.99      0.99    180596
weighted avg       0.99      0.99      0.99    180596

Confusion Matrix:
[[ 76817   1357]
 [   115 102307]]
 -- Cross Validation -- 
[0.9986711  0.9862237  0.99269103 0.98848283 0.98671096 0.90692832
 0.92247719 0.90294144 0.91042793 0.99437406]
