In [1]:
import matplotlib as mpl
mpl.use('TkAgg')
import numpy as np
import pandas as pd
import torch as th
from torch.autograd import Variable as V
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from preprocessing import Preprocess_GAN,CreateBatch_GAN
# from model.model_class import Blackbox_IDS,Generator,Discriminator
import matplotlib.pyplot as plt
import pickle

# https://github.com/HongQuangDevVN/IDSGAN-on-SDN/blob/master/model/save/BlackBox/IDS.pth

In [2]:
 # feature_names of the CICIDS 2017 dataset
feature_names = ['FlowDuration', 'TotFwdPkts', 'TotBwdPkts', 'TotLenFwdPkts',
    'TotLenBwdPkts', 'FwdPktLenMin', 'FwdPktLenStd', 'BwdPktLenMax',
    'BwdPktLenMean', 'BwdPktLenStd', 'FlowByts/s', 'FlowPkts/s',
    'FlowIATStd', 'FwdIATTot', 'FwdIATMean', 'FwdIATMax', 'BwdIATMean',
    'BwdIATStd', 'BwdIATMax', 'BwdIATMin', 'BwdPSHFlags', 'FwdHeaderLen',
    'BwdHeaderLen', 'FwdPkts/s', 'BwdPkts/s', 'PktLenMax', 'PktLenStd',
    'FINFlagCnt', 'SYNFlagCnt', 'ACKFlagCnt', 'Down/UpRatio',
    'BwdSegSizeAvg', 'FwdHeaderLen.1', 'SubflowFwdPkts', 'SubflowFwdByts',
    'IdleStd', 'SubflowBwdPkts', 'SubflowBwdByts', 'InitBwdWinByts',
    'FwdActDataPkts', 'ActiveStd', 'ActiveMax']

In [3]:
# Load data and do train test split
from sklearn.model_selection import KFold, cross_val_score, train_test_split


def load_data(filename, trainLabel, data_ml):
    # Load the data
    df = pd.read_csv(filename)
    X_predicted = df.drop(columns=['Label', trainLabel], axis=1)
    # change column name
    df.rename(columns={'Label': 'attack_type', trainLabel: 'Label'}, inplace=True)
    y_predicted = df['Label']
    



    # Initial train test split set
    X_train_predicted, X_test_predicted, y_train_predicted, y_test_predicted = train_test_split(X_predicted, y_predicted, test_size=0.2, random_state=42, stratify=y_predicted)
    
    # Further split training set into training (70%) and validation (10%) sets
    X_train_predicted, X_val_predicted, y_train_predicted, y_val_predicted = train_test_split(X_train_predicted, y_train_predicted, test_size=0.1, random_state=42, stratify=y_train_predicted)
    
    # save datasets
    train_predicted = pd.concat([X_train_predicted, y_train_predicted], axis=1)
    val_predicted = pd.concat([X_val_predicted, y_val_predicted], axis=1)
    test_predicted = pd.concat([X_test_predicted, y_test_predicted], axis=1)
    #../datasets/surrogate_model/CICIDS2017/split_dataset/train_{data_ml}_predicted.csv
    train_predicted.to_csv(f'datasets/surrogate_model/CICIDS2017/data_for_gan/train_{data_ml}_predicted.csv', index=False) 
    val_predicted.to_csv(f'datasets/surrogate_model/CICIDS2017/data_for_gan/val_{data_ml}_predicted.csv', index=False)
    test_predicted.to_csv(f'datasets/surrogate_model/CICIDS2017/data_for_gan/test_{data_ml}_predicted.csv', index=False)
    
    return X_train_predicted, X_val_predicted, X_test_predicted, y_train_predicted, y_val_predicted, y_test_predicted



In [4]:
# # Load data and do train test split
# X_train_lr_predicted, X_val_lr_predicted, X_test_lr_predicted, y_train_lr_predicted, y_val_lr_predicted, y_test_lr_predicted = load_data('surrogate_model/data_for_training/0319/df_lr_predicted_0319.csv', 'LR_Predicted', 'lr')
# X_train_dt_predicted, X_val_dt_predicted, X_test_dt_predicted, y_train_dt_predicted, y_val_dt_predicted, y_test_dt_predicted = load_data('surrogate_model/data_for_training/0319/df_dt_predicted_0319.csv', 'DT_Predicted', 'dt')
# X_train_xgb_predicted, X_val_xgb_predicted, X_test_xgb_predicted, y_train_xgb_predicted, y_val_xgb_predicted, y_test_xgb_predicted = load_data('surrogate_model/data_for_training/0319/df_xgb_predicted_0319.csv', 'XGB_Predicted', 'xgb')
# X_train_dnn_predicted, X_val_dnn_predicted, X_test_dnn_predicted, y_train_dnn_predicted, y_val_dnn_predicted, y_test_dnn_predicted = load_data('surrogate_model/data_for_training/0319/df_dnn_predicted_0319.csv', 'DNN_Predicted', 'dnn')

In [5]:
# train_dataset = pd.read_csv("datasets/KDD_dataset/other_half_KDDTrain+.csv")
# test_dataset = pd.read_csv("datasets/KDD_dataset/KDDTest+.csv")
train_dataset = pd.read_csv("datasets/surrogate_model/CICIDS2017/data_for_gan/train_dt_predicted.csv")
val_dataset = pd.read_csv("datasets/surrogate_model/CICIDS2017/data_for_gan/val_dt_predicted.csv")
test_dataset = pd.read_csv("datasets/surrogate_model/CICIDS2017/data_for_gan/test_dt_predicted.csv")

In [6]:
train_dataset.shape, val_dataset.shape, test_dataset.shape

((86705, 43), (9634, 43), (24085, 43))

In [7]:
train_dataset['Label'].value_counts(), val_dataset['Label'].value_counts(), test_dataset['Label'].value_counts()    

(Label
 0    82355
 1     4350
 Name: count, dtype: int64,
 Label
 0    9151
 1     483
 Name: count, dtype: int64,
 Label
 0    22877
 1     1208
 Name: count, dtype: int64)

In [8]:


def Preprocess_GAN(train):
        
    # min max standardization
    numeric_columns = list(train.select_dtypes(include=['int', "float"]).columns) # select all columns that are numeric
    numeric_columns.remove("BwdPSHFlags")
    for c in numeric_columns:
        max_ = train[c].max()
        min_ = train[c].min()
        if max_ == 0:
            max = 0.1
        train[c] = train[c].map(lambda x: (x - min_) / (max_ - min_))


    #  1: annomaly; 0: normaly
    # train["DT_Predicted"] = train["DT_Predicted"].map(lambda x: 1 if x == "anomaly" else 0)
    # get all rows of malicious traffic, and all columns except the last one
    raw_attack = np.array(train[train["Label"] == 1])[:, :-1]
    # get all rows of benign traffic, and all columns except the last one
    normal = np.array(train[train["Label"] == 0])[:, :-1]
    
    # get the true label of the train set
    true_label = train["Label"]

    del train["Label"]

    return train, raw_attack, normal, true_label

In [9]:
train_data,raw_attack,normal,true_label = Preprocess_GAN(test_dataset)
train_data.columns  

Index(['FlowDuration', 'TotFwdPkts', 'TotBwdPkts', 'TotLenFwdPkts',
       'TotLenBwdPkts', 'FwdPktLenMin', 'FwdPktLenStd', 'BwdPktLenMax',
       'BwdPktLenMean', 'BwdPktLenStd', 'FlowByts/s', 'FlowPkts/s',
       'FlowIATStd', 'FwdIATTot', 'FwdIATMean', 'FwdIATMax', 'BwdIATMean',
       'BwdIATStd', 'BwdIATMax', 'BwdIATMin', 'BwdPSHFlags', 'FwdHeaderLen',
       'BwdHeaderLen', 'FwdPkts/s', 'BwdPkts/s', 'PktLenMax', 'PktLenStd',
       'FINFlagCnt', 'SYNFlagCnt', 'ACKFlagCnt', 'Down/UpRatio',
       'BwdSegSizeAvg', 'FwdHeaderLen.1', 'SubflowFwdPkts', 'SubflowFwdByts',
       'IdleStd', 'SubflowBwdPkts', 'SubflowBwdByts', 'InitBwdWinByts',
       'FwdActDataPkts', 'ActiveStd', 'ActiveMax'],
      dtype='object')

In [10]:
# values of the true label distribution
true_label.value_counts()


Label
0.0    22877
1.0     1208
Name: count, dtype: int64

In [11]:
len(train_data.columns)

42

In [12]:
def CreateBatch_GAN(x, batch_size):
    #print("x.shape: ", x.shape) 
    # Comment - a là danh sách các số từ 0 -> len(x)
    a = list(range(len(x)))
   # print("a: ", a)
    # Comment - Xáo trộn a lên, đảo lộn vị trí các phần từ của a
    np.random.shuffle(a)
    # Comment - Xáo trộn các phần tử trong x
    x = x[a]
   # print("x: ", x)
    # Comment - Mảng các batch, mỗi batch có số phần tử là batch size
    batch_x = [x[batch_size * i: (i + 1) * batch_size, :] for i in range(len(x) // batch_size)]
    return batch_x

In [13]:
import torch as th
from torch import nn
from torch.autograd import Variable as V

class Generator(nn.Module):
    def __init__(self,input_dim, output_dim):
        super(Generator, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(input_dim, 21), #input_dim//2
            nn.ReLU(True),
            nn.Linear(21, 21),
            nn.ReLU(True),
            nn.Linear(21, 21),
            nn.ReLU(True),
            # nn.Linear(21, 21),
            # nn.ReLU(True),
            nn.Linear(21,output_dim),
        )
    def forward(self,x):
        x = self.layer(x)
        return th.clamp(x,0.,1.)

class Discriminator(nn.Module):
    def __init__(self,input_dim, output_dim):
        super(Discriminator, self).__init__()

        self.layer = nn.Sequential(
            nn.Linear(input_dim, input_dim*2),
            nn.LeakyReLU(True),
            nn.Linear(input_dim * 2, input_dim),
            nn.LeakyReLU(True),
            #nn.Linear(input_dim*2 , input_dim*2),
            #nn.LeakyReLU(True),
            nn.Linear(input_dim,input_dim//2),
            nn.LeakyReLU(True),
            nn.Linear(input_dim//2,output_dim),
        )

    def forward(self,x):
        return self.layer(x)

In [14]:
import torch

# check if raw_traffic is nan
print(np.isnan(raw_attack).any())
print(raw_attack.shape)
print(type(raw_attack))
np.argwhere(np.isnan(raw_attack))

False
(1208, 42)
<class 'numpy.ndarray'>


array([], shape=(0, 2), dtype=int64)

In [15]:
def compute_gradient_penalty(D, normal_t, attack_t):
    alpha = th.Tensor(np.random.random((normal_t.shape[0], 1)))
    between_n_a = (alpha * normal_t + ((1 - alpha) * attack_t)).requires_grad_(True)
    d_between_n_a = D(between_n_a)
    adv = V(th.Tensor(normal_t.shape[0], 1).fill_(1.0), requires_grad=False)
    gradients = autograd.grad(
        outputs=d_between_n_a,
        inputs=between_n_a,
        grad_outputs=adv,
        create_graph=True,
        retain_graph=True,
        only_inputs=True,
    )[0]
    gradients = gradients.view(gradients.size(0), -1)
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean()
    return gradient_penalty

# train_dataset = pd.read_csv("datasets/KDD_dataset/other_half_KDDTrain+.csv")
# test_dataset = pd.read_csv("datasets/KDD_dataset/KDDTest+.csv")

# train_data,raw_attack,normal,true_label = Preprocess_GAN(train_dataset)

#DEFINE
BATCH_SIZE = 64 # Batch size
CRITIC_ITERS = 15 # For WGAN and WGAN-GP, number of critic iters per gen iter
LAMBDA = 10     # Gradient penalty lambda hyperparameter
MAX_EPOCH = 50 # How many generator iterations to train for
D_G_INPUT_DIM = len(train_data.columns) # 9 features
G_OUTPUT_DIM = len(train_data.columns) # 9 features
D_OUTPUT_DIM = 1
CLAMP = 0.01
LEARNING_RATE_G=0.00001
LEARNING_RATE_D=0.00001
# print D_G_INPUT_DIM
#print("D_G_input DIM: ", D_G_INPUT_DIM)

# Load BlackBox IDS model 
# ids_model = Blackbox_IDS(D_G_INPUT_DIM,2)
# param = th.load('datasets/KDD_dataset/IDS.pth')
# ids_model.load_state_dict(param)
ids_model = pickle.load(open('surrogate_model/ml_model/dt_model_from_dtdata.pickle', 'rb')) #surrogate_model/ml_model/lr_model_from_dtdata.pickle


generator = Generator(D_G_INPUT_DIM,G_OUTPUT_DIM)
print(100*'=')
print(generator)

discriminator = Discriminator(D_G_INPUT_DIM,D_OUTPUT_DIM)
print(100*'=')
print(discriminator)


#Optimization. Similar to Gradient Descent. https://viblo.asia/p/thuat-toan-toi-uu-adam-aWj53k8Q56m
optimizer_G = optim.RMSprop(generator.parameters(), LEARNING_RATE_G)
optimizer_D = optim.RMSprop(discriminator.parameters(), LEARNING_RATE_D)

# 由於不可能放入整個資料集，因此資料集會分批輸出（更小、相等的部分）。
batch_attack = CreateBatch_GAN(raw_attack,BATCH_SIZE)
d_losses,g_losses = [],[] #loss status
#ids_model.eval()

generator.train()
discriminator.train()

cnt = -5
print("IDSGAN start training")
print("-"*100)
for epoch in range(MAX_EPOCH):
    # train one batch per epoch
    normal_batch = CreateBatch_GAN(normal,BATCH_SIZE)
    epoch_g_loss = 0.
    epoch_d_loss = 0.
    c=0
    for nb in normal_batch:
        normal_b = th.Tensor(nb)
        #  Train Generator
        for p in discriminator.parameters():
            p.requires_grad = False

        optimizer_G.zero_grad()
        
        # 將 raw_attack 中的隨機 n=BATCH_SIZE 個元素提取為 random_traffic
        random_attack_traffic = raw_attack[np.random.randint(0,len(raw_attack),BATCH_SIZE)]
        # 從 random_traffic 中提取，並添加來自0到1之間的隨機噪音值。
        ###!! random_traffic_noised - random_traffic_noised 的值可能大於 1
        random_traffic_noised = random_attack_traffic + np.random.uniform(0,1,(BATCH_SIZE,D_G_INPUT_DIM))

        z = V(th.Tensor(random_traffic_noised))
        adversarial_traffic = generator(z) #generate attack traffic

        # print("adversarial_traffic1: ", adversarial_traffic)
        D_pred= discriminator(adversarial_traffic) #discriminator generated output
        # g_loss = -th.mean(D_pred)
        g_loss = -1 * discriminator(generator(z)).mean()
        g_loss.backward()
        optimizer_G.step()

        epoch_g_loss += g_loss.item()
        # Train Discriminator
        for p in discriminator.parameters():
            p.requires_grad = True

        for c in range(CRITIC_ITERS): # update discriminator parameter per loop
            optimizer_D.zero_grad() # zero_grad() clears old gradients from the last step (otherwise you’d just accumulate the gradients from all loss.backward() calls).
            for p in discriminator.parameters(): #weighting clipping
                p.data.clamp_(-CLAMP, CLAMP)
            # generate adversarial traffic
            temp_data = raw_attack[np.random.randint(0,len(raw_attack),BATCH_SIZE)] + np.random.uniform(0, 1,(BATCH_SIZE,D_G_INPUT_DIM))
            
            # random_attack_traffic = raw_attack[np.random.randint(0,len(raw_attack),BATCH_SIZE)]
            # temp_data = random_attack_traffic + np.random.uniform(0,1,(BATCH_SIZE,D_G_INPUT_DIM))
            # print("temp_data: ", temp_data) 
            z = V(th.Tensor(temp_data))
            adversarial_traffic = generator(z).detach()
        
            #ids_input = th.cat((adversarial_traffic,normal_b))
            ids_input = adversarial_traffic
            # print("ids_input: ", ids_input)
            # 檢查 tensor 中是否所有的值都是 0 或 1
            # is_binary = ((ids_input == 0) | (ids_input == 1)).all()

            # print(is_binary)  # 如果 tensor 中所有的值都是 0 或 1，則輸出 True，否則輸出 False
            # print("epoch: ", epoch) 
            # print("critic: ", c)
            # if epoch == 0 and c == 0:
            #     print("adversarial_traffic: ", adversarial_traffic)
            #     # save the first batch of adversarial traffic
            #     adversarial_traffic_first_batch = pd.DataFrame(adversarial_traffic.detach().numpy(), columns=feature_names)

            #     adversarial_traffic_first_batch.to_csv('datasets/surrogate_model/CICIDS2017/data_for_gan/adversarial_traffic_first_batch.csv', index=False)  
                
            l = list(range(len(ids_input)))
            np.random.shuffle(l)
            ids_input = V(th.Tensor(ids_input[l]))

            # ids_pred = ids_model(ids_input)
            # ids_pred_label = th.argmax(nn.Sigmoid()(ids_pred),dim = 1).detach().numpy()
                        
            # 將 ids_input 轉換為 DataFrame，並設置特徵名稱
            ids_input_df = pd.DataFrame(ids_input.detach().numpy(), columns=feature_names)
            ids_pred_label = ids_model.predict(ids_input_df)   
            # print("ids_pred_label: ", ids_pred_label) 

            pred_normal = ids_input.numpy()[ids_pred_label==0]
            pred_attack = ids_input.numpy()[ids_pred_label==1]


            if len(pred_attack) == 0: #!!!!!why!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
                cnt += 1
                break

            D_normal = discriminator(V(th.Tensor(pred_normal)))
            D_attack= discriminator(V(th.Tensor(pred_attack)))
            # print("D_normal: ", D_normal)
            # prkint("D_attack: ", D_attack)
            loss_normal = th.mean(D_normal)
            loss_attack = th.mean(D_attack)
            #gradient_penalty = compute_gradient_penalty(discriminator, normal_b.data, adversarial_traffic.data)
            d_loss =  loss_attack - loss_normal #+ LAMBDA * gradient_penalty
            d_loss.backward()
            optimizer_D.step()
            epoch_d_loss += d_loss.item()

    d_losses.append(epoch_d_loss/CRITIC_ITERS)
    g_losses.append(epoch_g_loss)
    print(f"{epoch} : {epoch_g_loss} \t {epoch_d_loss/CRITIC_ITERS}")
'''
    if cnt >= 100:
        print("Not exist predicted attack traffic")
        break
'''

print("IDSGAN finish training")

th.save(generator.state_dict(), 'GAN_materials/testGAN/gan_model/from_dt_surrogate_model/generator_dt_model_from_dtdata_0319_2338.pth') # GAN_materials\testGAN\gan_model\from_dt_surrogate_model\discriminator_dt_model_from_dtdata_0319_2338.pth
th.save(discriminator.state_dict(), 'GAN_materials/testGAN//gan_model/from_dt_surrogate_model/discriminator_dt_model_from_dtdata_0319_2338.pth')

plt.plot(d_losses,label = "D_loss")
plt.plot(g_losses, label = "G_loss")
plt.legend()
plt.show()

Generator(
  (layer): Sequential(
    (0): Linear(in_features=42, out_features=21, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=21, out_features=21, bias=True)
    (3): ReLU(inplace=True)
    (4): Linear(in_features=21, out_features=21, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=21, out_features=42, bias=True)
  )
)
Discriminator(
  (layer): Sequential(
    (0): Linear(in_features=42, out_features=84, bias=True)
    (1): LeakyReLU(negative_slope=True)
    (2): Linear(in_features=84, out_features=42, bias=True)
    (3): LeakyReLU(negative_slope=True)
    (4): Linear(in_features=42, out_features=21, bias=True)
    (5): LeakyReLU(negative_slope=True)
    (6): Linear(in_features=21, out_features=1, bias=True)
  )
)
IDSGAN start training
----------------------------------------------------------------------------------------------------
0 : 5.409117531962693 	 nan
1 : 5.447414404712617 	 nan
2 : 4.863914705812931 	 -0.028019800037145614
3 : 4.71

KeyboardInterrupt: 