In [None]:
import itertools
import time
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sklearn.metrics as sm

In [None]:
# Class names handed to plot_confusion_matrix by the classifier helpers below.
# NOTE(review): HandleData collapses 'Label' to 0/1, so Train produces two
# classes while this list has twelve entries -- confirm which labelling is intended.
label = ['BENIGN', 'Bot', 'DDos', 'GlodenEye', 'Dos Hulk',
         'Slowhttp', 'SSH', 'FTP', 'PortScan', 'slowloris', 'BruteForce', 'XSS']
# Column names that HandleData assigns to the loaded frame; the last entry,
# 'Label', is the target column that Train separates from the features.
# NOTE(review): a few names are spelled irregularly ('Packet _Length_Std',
# ' Packet_Length_Variance', '_Fwd_Avg_Packets/Bulk', 'Active Min'); they are
# kept as-is because they are runtime column keys.
columns = ['Destination_Port', 'Flow_Duration', 'Total_Fwd_Packets',
           'Total_Backward_Packets', 'Total_Length_of_Fwd_Packets',
           'Total_Length_of_Bwd_Packets', 'Fwd_Packet_Length_Max',
           'Fwd_Packet_Length_Min', 'Fwd_Packet_Length_Mean',
           'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max',
           'Bwd_Packet_Length_Min', 'Bwd_Packet_Length_Mean',
           'Bwd_Packet_Length_Std', 'Flow_Bytes/s', 'Flow_Packets/s',
           'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max', 'Flow_IAT_Min',
           'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std', 'Fwd_IAT_Max',
           'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean', 'Bwd_IAT_Std',
           'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_PSH_Flags', 'Bwd_PSH_Flags',
           'Fwd_URG_Flags', 'Bwd_URG_Flags', 'Fwd_Header_Length',
           'Bwd_Header_Length', 'Fwd_Packets/s', 'Bwd_Packets/s',
           'Min_Packet_Length', 'Max_Packet_Length', 'Packet_Length_Mean',
           'Packet _Length_Std', ' Packet_Length_Variance', 'FIN_Flag_Count',
           'SYN_Flag_Count', 'RST_Flag_Count', 'PSH_Flag_Count',
           'ACK_Flag_Count', 'URG_Flag_Count', 'CWE_Flag_Count',
           'ECE_Flag_Count', 'Down/Up_Ratio', 'Average_Packet_Size',
           'Avg_Fwd_Segment_Size', 'Avg_Bwd_Segment_Size',
           'Fwd_Header_Length.1', 'Fwd_Avg_Bytes/Bulk', '_Fwd_Avg_Packets/Bulk',
           'Fwd_Avg_Bulk_Rate', 'Bwd_Avg_Bytes/Bulk', 'Bwd_Avg_Packets/Bulk',
           'Bwd_Avg_Bulk_Rate', 'Subflow_Fwd_Packets', 'Subflow_Fwd_Bytes',
           'Subflow_Bwd_Packets', 'Subflow_Bwd_Bytes', 'Init_Win_bytes_forward',
           'Init_Win_bytes_backward', 'act_data_pkt_fwd',
           'min_seg_size_forward', 'Active_Mean', 'Active_Std', 'Active_Max',
           'Active Min', 'Idle_Mean', 'Idle_Std', 'Idle_Max', 'Idle_Min',
           'Label']


In [None]:
def HandleData(path):
    """Load every CSV file in ``path`` and return one cleaned, binary-labelled frame.

    Parameters
    ----------
    path : str
        Directory containing the CSV files. Sub-directories are skipped.

    Returns
    -------
    pandas.DataFrame
        All files concatenated, renamed to the module-level ``columns``, with
        rows containing missing values and duplicate rows removed, infinite
        values replaced by 0, and ``Label`` encoded as 0 for ``'BENIGN'`` and
        1 for anything else.

    Raises
    ------
    ValueError
        If ``path`` contains no files, or (raised by pandas) if a file does
        not have exactly ``len(columns)`` columns.
    """
    frames = []
    # Sorted so the row order does not depend on the filesystem's listing order.
    for name in sorted(os.listdir(path)):
        file_path = os.path.join(path, name)
        if not os.path.isfile(file_path):
            continue
        frame = pd.read_csv(file_path)
        # Rename before concatenating so files whose headers differ only in
        # spelling or whitespace still line up column-for-column.
        frame.columns = columns
        frames.append(frame)
    if not frames:
        raise ValueError('no CSV files found in ' + repr(path))

    # Use every file, not only the last one that was read.
    data = pd.concat(frames, ignore_index=True)

    data = data.dropna(axis=0, how='any')
    data = data.replace(',,', np.nan)
    data = data.replace('Infinity', 0)
    data = data.replace('NaN', 0.0)
    data = data.replace([np.inf, -np.inf], 0)
    data = data.dropna().drop_duplicates()

    # Build the binary target in one vectorised step; chained assignment
    # (data.Label[mask] = ...) may write to a temporary copy instead.
    return data.assign(Label=(data['Label'] != 'BENIGN').astype(int))


In [None]:
# Load and clean the raw CSV files into the notebook-level `data` frame.
# NOTE(review): this is an absolute, machine-specific path, and main() below
# reads './input' instead -- consider a single configurable data directory.
data=HandleData("/root/data/fxg/csv")

In [None]:
def Train(data, decomponent=False):
    """Split ``data`` into train/test arrays and scale the features.

    The split is made first; the per-column max-abs scale (and the optional
    PCA) are then fitted on the training rows only and applied to the test
    rows, so no test-set statistics leak into preprocessing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'Label'`` column; every other column is a feature.
    decomponent : bool, default False
        When True, project the scaled features onto 20 principal components.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` where the ``y`` arrays are the
        one-hot encoding of ``'Label'`` produced by ``pandas.get_dummies``.
    """
    x_columns = data.columns.drop('Label')
    x = data[x_columns].values.astype(float)

    dummies = pd.get_dummies(data['Label'])
    outcomes = dummies.columns
    print(outcomes)
    num_classes = len(outcomes)
    print('[traffic] 类别数:', num_classes)
    y = dummies.values

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=20)

    # Same formula as normalize(axis=0, norm='max'), but using training rows only.
    scale = np.abs(x_train).max(axis=0)
    scale[scale == 0] = 1.0  # all-zero columns stay zero instead of dividing by 0
    x_train = x_train / scale
    x_test = x_test / scale

    if decomponent:
        pca = PCA(n_components=20)
        x_train = pca.fit_transform(x_train)
        x_test = pca.transform(x_test)

    return x_train, y_train, x_test, y_test


In [None]:
# Show how many rows fall into each value of the Label column.
data.Label.value_counts()

In [None]:
def RF(train_X, train_Y, test_X, test_Y):
    """Fit a default random forest and print its evaluation on the test split.

    Prints accuracy, elapsed time, the confusion matrix (taken over the argmax
    of the indicator rows) and the classification report, then draws the
    matrix with ``plot_confusion_matrix``.

    The label arrays are expected to be 2-D indicator arrays such as those
    returned by ``Train``. ``plot_confusion_matrix`` is not defined in the
    visible cells and must already be in scope.
    """
    print('[RF] train...')
    started = time.time()
    forest = RandomForestClassifier()
    forest.fit(train_X, train_Y)
    predicted = forest.predict(test_X)
    acc = accuracy_score(test_Y, predicted)
    finished = time.time()
    print('acc:', acc)
    print('using time:', finished - started, 'sec')

    true_idx = test_Y.argmax(axis=1)
    pred_idx = predicted.argmax(axis=1)
    matrix = sm.confusion_matrix(true_idx, pred_idx)
    print(matrix)
    print(classification_report(test_Y, predicted))
    print('-' * 20)
    plot_confusion_matrix(matrix, label, True, 'RF Confusion matrix')

In [None]:
def KNN(train_X, train_Y, test_X, test_Y):
    """Fit a 5-nearest-neighbours classifier and print its evaluation.

    Prints accuracy, elapsed time, the confusion matrix (taken over the argmax
    of the indicator rows) and the classification report.

    Returns
    -------
    tuple
        ``(report, y_hat, acc)``: the classification-report text, the raw
        predictions for ``test_X`` and the accuracy score.
    """
    print('[KNN] train...')
    started = time.time()
    classifier = KNeighborsClassifier(n_neighbors=5)
    classifier.fit(train_X, train_Y)
    y_hat = classifier.predict(test_X)
    acc = accuracy_score(test_Y, y_hat)
    finished = time.time()
    print('acc:', acc)
    print('using time:', finished - started, 'sec')

    true_idx = test_Y.argmax(axis=1)
    pred_idx = y_hat.argmax(axis=1)
    matrix = sm.confusion_matrix(true_idx, pred_idx)
    print(matrix)
    report = classification_report(test_Y, y_hat)
    print(report)
    print('-' * 20)
    return report, y_hat, acc


In [None]:
# Split the cleaned frame with Train, then fit and evaluate the KNN baseline.
train_X, train_Y, test_X, test_Y = Train(data)
KNN(train_X, train_Y, test_X, test_Y)

In [1]:
def SVM(train_X, train_Y, test_X, test_Y):
    """Fit an RBF-kernel SVC (at most 300 iterations) and print its evaluation.

    Each indicator row of the label arrays is first converted to the index of
    its first entry equal to 1. Prints accuracy, elapsed time, the confusion
    matrix and the classification report, then draws the matrix with
    ``plot_confusion_matrix`` (not defined in the visible cells; it must
    already be in scope).
    """
    print('[SVM] train ...')
    train_Y = [np.flatnonzero(row == 1)[0] for row in train_Y]
    test_Y = [np.flatnonzero(row == 1)[0] for row in test_Y]

    started = time.time()
    classifier = svm.SVC(decision_function_shape='ovr', max_iter=300, kernel='rbf')
    classifier.fit(train_X, train_Y)
    y_hat = classifier.predict(test_X)
    acc = accuracy_score(test_Y, y_hat)
    finished = time.time()
    print('acc:', acc)
    print('using time:', finished - started, 'sec')

    matrix = sm.confusion_matrix(test_Y, y_hat)
    print(matrix)
    print(classification_report(test_Y, y_hat))
    print('-' * 20)
    plot_confusion_matrix(matrix, label, True, 'SVM Confusion matrix')

In [None]:
def NaiveBayes(train_X, train_Y, test_X, test_Y):
    """Fit a default Bernoulli naive Bayes model and print its evaluation.

    Each indicator row of the label arrays is first converted to the index of
    its first entry equal to 1. Prints accuracy, elapsed time, the confusion
    matrix and the classification report, then draws the matrix with
    ``plot_confusion_matrix`` (not defined in the visible cells; it must
    already be in scope).
    """
    print('[Naive Bayes] train ...')
    train_Y = [np.flatnonzero(row == 1)[0] for row in train_Y]
    test_Y = [np.flatnonzero(row == 1)[0] for row in test_Y]

    started = time.time()
    classifier = BernoulliNB()
    classifier.fit(train_X, train_Y)
    y_hat = classifier.predict(test_X)
    acc = accuracy_score(test_Y, y_hat)
    finished = time.time()
    print('acc:', acc)
    print('using time:', finished - started, 'sec')

    matrix = sm.confusion_matrix(test_Y, y_hat)
    print(matrix)
    print(classification_report(test_Y, y_hat))
    print('-' * 20)
    plot_confusion_matrix(matrix, label, True, 'NB Confusion matrix')

In [None]:
def MLP(train_X, train_Y, test_X, test_Y):
    """Fit a single-hidden-layer MLPClassifier and print its evaluation.

    The network has one hidden layer of 100 logistic units and is trained
    with Adam at a learning rate of 1e-4 for up to 2000 iterations. Prints
    accuracy, elapsed time, the confusion matrix (taken over the argmax of
    the indicator rows) and the classification report, then draws the matrix
    with ``plot_confusion_matrix`` (not defined in the visible cells; it must
    already be in scope).
    """
    print('[MLP] train ...')
    started = time.time()
    settings = dict(
        hidden_layer_sizes=(100,),
        activation='logistic',
        solver='adam',
        learning_rate_init=0.0001,
        max_iter=2000,
    )
    network = MLPClassifier(**settings)
    network.fit(train_X, train_Y)
    y_hat = network.predict(test_X)
    acc = accuracy_score(test_Y, y_hat)
    finished = time.time()
    print('acc:', acc)
    print('using time:', finished - started, 'sec')

    true_idx = test_Y.argmax(axis=1)
    pred_idx = y_hat.argmax(axis=1)
    matrix = sm.confusion_matrix(true_idx, pred_idx)
    print(matrix)
    print(classification_report(test_Y, y_hat))
    print('-' * 20)
    plot_confusion_matrix(matrix, label, True, 'MLP Confusion matrix')

In [None]:
def DT(train_X, train_Y, test_X, test_Y):
    """Fit a depth-6 decision tree and print its evaluation on the test split.

    Prints accuracy, elapsed time, the confusion matrix (taken over the argmax
    of the indicator rows) and the classification report, then draws the
    matrix with ``plot_confusion_matrix`` (not defined in the visible cells;
    it must already be in scope).
    """
    started = time.time()
    tree = DecisionTreeClassifier(max_depth=6)
    tree.fit(train_X, train_Y)
    y_hat = tree.predict(test_X)
    acc = accuracy_score(test_Y, y_hat)
    finished = time.time()
    print('acc:', acc)
    print('using time:', finished - started, 'sec')

    true_idx = test_Y.argmax(axis=1)
    pred_idx = y_hat.argmax(axis=1)
    matrix = sm.confusion_matrix(true_idx, pred_idx)
    print(matrix)
    print(classification_report(test_Y, y_hat))
    print('-' * 20)
    plot_confusion_matrix(matrix, label, True, 'DT Confusion matrix')

In [None]:
def DNN(train_X, train_Y, test_X, test_Y):
    """Train a small fully connected network for 40 epochs and print its evaluation.

    Layer widths are 16 -> 64 -> 16 -> 1 -> ``train_Y.shape[1]`` with a
    softmax output, trained with Adam on categorical cross-entropy. Prints
    accuracy, elapsed time, the confusion matrix and the classification
    report (all over argmax class indices), then draws the matrix with
    ``plot_confusion_matrix``.

    ``Sequential``, ``Dense`` and ``plot_confusion_matrix`` are not imported
    or defined in the visible cells and must already be in scope.
    """
    started = time.time()
    n_features = train_X.shape[1]
    n_outputs = train_Y.shape[1]
    model = Sequential([
        Dense(16, input_dim=n_features, kernel_initializer='normal', activation='relu'),
        Dense(64, kernel_initializer='normal', activation='relu'),
        Dense(16, kernel_initializer='normal', activation='relu'),
        # NOTE(review): a single linear unit feeding the softmax squeezes all
        # features into one scalar -- confirm this bottleneck is intended.
        Dense(1, kernel_initializer='normal'),
        Dense(n_outputs, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.summary()
    model.fit(train_X, train_Y, epochs=40, verbose=2)

    y_hat = model.predict(test_X)
    true_idx = test_Y.argmax(axis=1)
    acc = accuracy_score(true_idx, y_hat.argmax(axis=1))
    finished = time.time()
    print('acc:', acc)
    print('using time:', finished - started, 'sec')

    matrix = sm.confusion_matrix(test_Y.argmax(axis=1), y_hat.argmax(axis=1))
    print(matrix)
    print(classification_report(test_Y.argmax(axis=1), y_hat.argmax(axis=1)))
    print('-' * 20)
    plot_confusion_matrix(matrix, label, True, 'DNN Confusion matrix')

In [None]:
def CNN(train_X, train_Y, test_X, test_Y):
    """Train a small 2-D CNN for 30 epochs, save it, and print its evaluation.

    The features are first passed through ``ImageHandle``; the network expects
    inputs of shape ``(10, 10, 1)`` and emits ``test_Y.shape[1]`` logits. The
    trained model is written to ``'model.h5'`` in the working directory.
    Prints the training history, accuracy, elapsed time, the confusion matrix
    and the classification report (all over argmax class indices), then draws
    the matrix with ``plot_confusion_matrix``.

    ``tf``, ``ImageHandle`` and ``plot_confusion_matrix`` are not imported or
    defined in the visible cells and must already be in scope.
    """
    print('[CNN] train ...')
    # NOTE(review): presumably reshapes each row to 10x10x1 to match
    # input_shape below -- verify against ImageHandle's definition.
    train_X, test_X = ImageHandle(train_X, test_X, test_Y)
    t1 = time.time()
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(
            32, (3, 3), activation='relu', input_shape=(10, 10, 1)),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (2, 2), activation='relu'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        # No activation: the loss below is configured with from_logits=True.
        tf.keras.layers.Dense(test_Y.shape[1])
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss=tf.keras.losses.CategoricalCrossentropy(
                      from_logits=True),
                  metrics=['accuracy'])
    model.summary()
    his = model.fit(train_X, train_Y, batch_size=128,
                    verbose=2, epochs=30, validation_split=0.1)
    # Save only after fitting so the file holds the trained weights, not the
    # freshly initialised ones.
    model.save('model.h5')
    print(his.history)
    y_hat = model.predict(test_X)
    acc = accuracy_score(test_Y.argmax(axis=1), y_hat.argmax(axis=1))
    t2 = time.time()
    print('acc:', acc)
    print('using time:', t2 - t1, 'sec')
    matrix = sm.confusion_matrix(test_Y.argmax(axis=1), y_hat.argmax(axis=1))
    print(matrix)
    report = classification_report(test_Y.argmax(axis=1), y_hat.argmax(axis=1))
    print(report)
    print('-' * 20)
    plot_confusion_matrix(matrix, label, True, 'CNN Confusion matrix')

In [None]:
def main():
    """Load the data from './input', split it, and run the selected models.

    Every model call is currently commented out, so this only loads and
    splits the data. Uncomment a line to run that model on the split; the
    calls rely on the local names ``train_X``, ``train_Y``, ``test_X`` and
    ``test_Y`` defined here.

    NOTE(review): this reads './input', whereas the earlier notebook cell
    loads from an absolute path -- confirm which location is intended.
    """
    data = HandleData('./input')
    train_X, train_Y, test_X, test_Y = Train(data)
    # RF(train_X, train_Y, test_X, test_Y)
    # KNN(train_X, train_Y, test_X, test_Y)
    # SVM(train_X, train_Y, test_X, test_Y)
    # MLP(train_X, train_Y, test_X, test_Y)
    # NaiveBayes(train_X, train_Y, test_X, test_Y)
    # DT(train_X, train_Y, test_X, test_Y)
    # DNN(train_X, train_Y, test_X, test_Y)
    # CNN(train_X, train_Y, test_X, test_Y)


In [None]:
# Script-style entry point.
# NOTE(review): inside a notebook kernel __name__ is normally '__main__', so
# running this cell calls main() and loads the data again from './input'.
if __name__ == '__main__':
    main()