In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numpy as np
from scipy import interp
import pandas as pd


from sklearn.ensemble import GradientBoostingClassifier as gboost
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (precision_recall_curve, average_precision_score, 
                             roc_curve, auc, confusion_matrix, mean_squared_error,
                             classification_report)
from sklearn import metrics
from sklearn.model_selection import cross_validate

from keras.models import Model, Sequential
from keras.layers import Input, Dense, LSTM, RepeatVector, Lambda
from keras.layers import Activation
from keras.optimizers import RMSprop
from keras.layers.merge import _Merge
from keras import backend as K
from functools import partial
from keras.metrics import binary_crossentropy
from keras.layers import BatchNormalization
from keras import regularizers


pd.options.display.max_rows = 4000

# In[0]: Functions Definitions

def scalex(X):
    """ normalize between 0 and 1 the values in X """
    nmin, nmax = 0.0, 1.0
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (nmax - nmin) + nmin
    return X_scaled
#
# Last card of scalex.
#
def calcrmse(X_train, gensamples):
    """ compute mse for training """
    max_column = X_train.shape[1]
    rmse_lst = []
    for col in range(max_column):
        rmse_lst.append(np.sqrt(mean_squared_error(X_train[:,col], gensamples[:,col])))
    return np.sum(rmse_lst) / max_column
#
# Last card of calcrmse.
#
def wasserstein_loss(y_true, y_pred):
    """ Wasserstein distance """
    return K.mean(y_true * y_pred)
#
# Last card of wasserstein_loss.
#
def gradient_penalty_loss(y_true, y_pred, averaged_samples, lamba_reg):
    """ compute gradient penalty loss for GP-WGAN """
    gradients = K.gradients(y_pred, averaged_samples)[0]
    gradients_sqr = K.square(gradients)
    gradients_sqr_sum = K.sum(gradients_sqr,
                              axis=np.arange(1, len(gradients_sqr.shape)))
    gradient_l2_norm = K.sqrt(gradients_sqr_sum)
    gradient_penalty = lamba_reg * K.square(1 - gradient_l2_norm)
    return K.mean(gradient_penalty)
#
# Last card of gradient_penalty_loss.
#
class RandomWeightedAverage(_Merge):
    def _merge_function(self, inputs):
        weights = K.random_uniform((BATCH_SIZE, 1))
        return (weights * inputs[0]) + ((1 - weights) * inputs[1])
#
# Last card of RandomWeightedAverage.
#
def generate_samples(generator_model, noise_dim, num_samples):
    """ generate samples to be used for futher analysis """
    return generator_model.predict(np.random.rand(num_samples, noise_dim))
#
# Last card of generate_samples.
#
# Last card of generate_images2D.
#
#
# Last card of writetocsv.

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
df = pd.read_csv('./data/MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv', sep=",", header=None, low_memory=False)
print(df)

                       0               1                   2   \
0        Destination Port   Flow Duration   Total Fwd Packets   
1                      80           38308                   1   
2                     389             479                  11   
3                      88            1095                  10   
4                     389           15206                  17   
...                   ...             ...                 ...   
692699                 53           32215                   4   
692700                 53             324                   2   
692701              58030              82                   2   
692702                 53         1048635                   6   
692703                 53           94939                   4   

                             3                            4   \
0        Total Backward Packets  Total Length of Fwd Packets   
1                             1                            6   
2                          

In [3]:
df = df.dropna()

In [4]:
df.columns = df.iloc[0]
df = df[1:]

In [5]:
df = df.apply(pd.to_numeric, errors='ignore')
print(df.dtypes)

0
 Destination Port                 int64
 Flow Duration                    int64
 Total Fwd Packets                int64
 Total Backward Packets           int64
Total Length of Fwd Packets       int64
 Total Length of Bwd Packets      int64
 Fwd Packet Length Max            int64
 Fwd Packet Length Min            int64
 Fwd Packet Length Mean         float64
 Fwd Packet Length Std          float64
Bwd Packet Length Max             int64
 Bwd Packet Length Min            int64
 Bwd Packet Length Mean         float64
 Bwd Packet Length Std          float64
Flow Bytes/s                    float64
 Flow Packets/s                 float64
 Flow IAT Mean                  float64
 Flow IAT Std                   float64
 Flow IAT Max                     int64
 Flow IAT Min                     int64
Fwd IAT Total                     int64
 Fwd IAT Mean                   float64
 Fwd IAT Std                    float64
 Fwd IAT Max                      int64
 Fwd IAT Min                      int6

In [6]:
print(df.columns)

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

In [7]:
for column in df.columns:
    unique_values = np.unique(df[column])
    if len(unique_values) > 1:
        print(f"Column '{column}' has different values.")
    else:
        print(f"Column '{column}' has the same value in all rows.")
        df.drop(column, axis=1, inplace=True)


Column ' Destination Port' has different values.
Column ' Flow Duration' has different values.
Column ' Total Fwd Packets' has different values.
Column ' Total Backward Packets' has different values.
Column 'Total Length of Fwd Packets' has different values.
Column ' Total Length of Bwd Packets' has different values.
Column ' Fwd Packet Length Max' has different values.
Column ' Fwd Packet Length Min' has different values.
Column ' Fwd Packet Length Mean' has different values.
Column ' Fwd Packet Length Std' has different values.
Column 'Bwd Packet Length Max' has different values.
Column ' Bwd Packet Length Min' has different values.
Column ' Bwd Packet Length Mean' has different values.
Column ' Bwd Packet Length Std' has different values.
Column 'Flow Bytes/s' has different values.
Column ' Flow Packets/s' has different values.
Column ' Flow IAT Mean' has different values.
Column ' Flow IAT Std' has different values.
Column ' Flow IAT Max' has different values.
Column ' Flow IAT Min

In [8]:
label_counts = df[' Label'].value_counts()
print(label_counts)

BENIGN              439972
DoS Hulk            230124
DoS GoldenEye        10293
DoS slowloris         5796
DoS Slowhttptest      5499
Heartbleed              11
Name:  Label, dtype: int64


In [9]:
print(df.columns)

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count',
       ' SYN Flag Count', ' RST Flag Count',

In [10]:
print(len(df.columns))

69


In [11]:
lbl = df.columns

In [12]:
print(df.isna().sum())

0
 Destination Port               0
 Flow Duration                  0
 Total Fwd Packets              0
 Total Backward Packets         0
Total Length of Fwd Packets     0
 Total Length of Bwd Packets    0
 Fwd Packet Length Max          0
 Fwd Packet Length Min          0
 Fwd Packet Length Mean         0
 Fwd Packet Length Std          0
Bwd Packet Length Max           0
 Bwd Packet Length Min          0
 Bwd Packet Length Mean         0
 Bwd Packet Length Std          0
Flow Bytes/s                    0
 Flow Packets/s                 0
 Flow IAT Mean                  0
 Flow IAT Std                   0
 Flow IAT Max                   0
 Flow IAT Min                   0
Fwd IAT Total                   0
 Fwd IAT Mean                   0
 Fwd IAT Std                    0
 Fwd IAT Max                    0
 Fwd IAT Min                    0
Bwd IAT Total                   0
 Bwd IAT Mean                   0
 Bwd IAT Std                    0
 Bwd IAT Max                    0
 Bwd IAT Min

For Patator attack achieved better epoch 0 rmse by changing the learning rate with grad penalty 0.1

The model comverged around 0.3 rmse with kernel regulirization lr=0.005 

Epoch:  1800 	 rmse:  0.17842225006805915 - BATCH SIZE = 128 GPW = 10 LR = 0.001

In [13]:
warnings.filterwarnings("ignore", category=DeprecationWarning)


#def create_normalized_dataframe(df, attack_name):
#    df_new = df.loc[df[" Label"] == attack_name]

#   # normalize each field independently
#    df_attack_norm = df_new
#    for n in range(len(lbl)-1):
#        m = lbl[n] 
#        if (len(np.unique(df_attack_norm[m])) > 1):
#            df_attack_norm[m] = scalex(df_attack_norm[m])
#        else:
#            df_attack_norm[m] = np.int64(1)

#    return df_attack_norm

def create_normalized_dataframe(df, attack_name):
    df_new = df.loc[df[" Label"] == attack_name]

    features = []
    for n in range(len(lbl)-1):
        m = lbl[n]
        tmp = np.unique(df_new[m])
        if (len(tmp) > 1):
            features.append(m)

    # normalize each field independently
    df_attack_norm = df_new
    for n in range(len(lbl)-1):
        m = lbl[n]
        if (np.max(df_attack_norm[m]) > 1).any(): 
            if (len(np.unique(df_attack_norm[m])) > 1):
                df_attack_norm[m] = scalex(df_attack_norm[m])
            else:
                df_attack_norm[m] = np.int64(1)  

    return df_attack_norm

def kill_empty(df):
    for column in df.columns:
        unique_values = np.unique(df[column])
        if len(unique_values) > 1:
            print(f"Column '{column}' has different values.")
        else:
            print(f"Column '{column}' has the same value in all rows.")
            df.drop(column, axis=1, inplace=True)

    return df


#attack_name = "DoS Hulk"
#attack_name = "SSH-Patator"
attack_name = "DoS GoldenEye"
df_attack_norm = create_normalized_dataframe(df, attack_name)
#df_normal_norm = create_normalized_dataframe(df, "BENIGN")
print(df_attack_norm)
df_attack_norm = kill_empty(df_attack_norm)
print(df_attack_norm)


# In[3]: build GP-WGAN and generate adversarial samples
#
BATCH_SIZE = 256

GRADIENT_PENALTY_WEIGHT = 10 #0.1
MAX_SIM = 10000 #10000
X_train = np.asarray(df_attack_norm.iloc[:MAX_SIM, :-1])
np.random.shuffle(X_train)
#df_attack_norm.to_csv('output.csv', index=False)
#
MAX_EPOCH = 50000 #15000
TRAINING_RATIO = 5
#
NUM_SAMPLES = 2000 #5000
#
### Building the model
def make_generator(noise_dim):
    model = Sequential()
    model.add(Dense(256,  kernel_initializer='he_normal', input_dim=INPUT_DIM, kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128,  kernel_initializer='he_normal', kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128,  kernel_initializer='he_normal', kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128,  kernel_initializer='he_normal', kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(units=noise_dim, activation='linear', kernel_regularizer=regularizers.l1(0.01)))
    return model
#
#    Last card of make_generator.
#    
def make_discriminator():
    model = Sequential()
    model.add(Dense(256, kernel_initializer='he_normal', input_dim=INPUT_DIM, kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128, kernel_initializer='he_normal', input_dim=INPUT_DIM, kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128, kernel_initializer='he_normal', input_dim=INPUT_DIM, kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128, kernel_initializer='he_normal', input_dim=INPUT_DIM, kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(units=1, activation='linear'))
    return model
#
#    Last card of make_discriminator.
#
print("current_gradpenalty:", GRADIENT_PENALTY_WEIGHT)

INPUT_DIM = X_train.shape[1]
noise_dim = INPUT_DIM

generator = make_generator(noise_dim)
discriminator = make_discriminator()


#### for the generator it is mostly the same as WGAN std
for layer in discriminator.layers:
    layer.trainable = False
discriminator.trainable = False

generator_input = Input(shape=(noise_dim,))
generator_layers = generator(generator_input)
discriminator_layers_for_generator = discriminator(generator_layers)

generator_model = Model(inputs=[generator_input], outputs=[discriminator_layers_for_generator])
generator_model.compile(optimizer=RMSprop(lr = 0.005, rho = 0.9, epsilon=1e-6), loss = wasserstein_loss)


#### New discriminator model for GPWGAN
for layer in discriminator.layers:
    layer.trainable = True
for layer in generator.layers:
    layer.trainable = False
discriminator.trainable = True
generator.trainable = False 


real_samples = Input(shape=X_train.shape[1:])
generator_input_for_discriminator = Input(shape=(noise_dim,))
generated_samples_for_discriminator = generator(generator_input_for_discriminator)
discriminator_output_from_generator = discriminator(generated_samples_for_discriminator)
discriminator_output_from_real_samples = discriminator(real_samples)

averaged_samples = RandomWeightedAverage()([real_samples, generated_samples_for_discriminator])
averaged_samples_out = discriminator(averaged_samples)

discriminator_model = Model(inputs=[real_samples, generator_input_for_discriminator], 
                            outputs=[discriminator_output_from_real_samples, discriminator_output_from_generator, 
                                     averaged_samples_out])


### the loss function takes more inputs than the standard y_true and y_pred 
### values usually required for a loss function. Therefore, we will make it partial.
partial_gp_loss = partial(gradient_penalty_loss, averaged_samples=averaged_samples, lamba_reg=GRADIENT_PENALTY_WEIGHT)
partial_gp_loss.__name__ = 'gp_loss' 


# finally, we compile the model
discriminator_model.compile(optimizer=RMSprop(lr=0.005, rho=0.9, epsilon=1e-6), 
                            loss=[wasserstein_loss, wasserstein_loss, partial_gp_loss])



positive_y = np.ones((BATCH_SIZE, 1), dtype=np.float32)
negative_y = -positive_y
dummy_y = np.zeros((BATCH_SIZE, 1), dtype=np.float32) # dummy vector mandatory for the train on batch function

for epoch in range(MAX_EPOCH + 1):
    np.random.shuffle(X_train)

    minibatches_size = BATCH_SIZE * TRAINING_RATIO
    for i in range(int(X_train.shape[0] // (BATCH_SIZE * TRAINING_RATIO))):
        discriminator_minibatches = X_train[i * minibatches_size:(i + 1) * minibatches_size]
        for j in range(TRAINING_RATIO):
            sample_batch = discriminator_minibatches[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
            noise = np.random.rand(BATCH_SIZE, noise_dim).astype(np.float32)

            discriminator_model.train_on_batch([sample_batch, noise], [positive_y, negative_y, dummy_y])

        generator_model.train_on_batch(np.random.rand(BATCH_SIZE, noise_dim), positive_y)


    #Visualization of intermediate results
    if (epoch % 50 == 0):
        gensamples = generate_samples(generator, noise_dim, MAX_SIM)
        rmse_sofar = calcrmse(X_train, gensamples)
        print("Epoch: ", epoch, "\t", "rmse: ", rmse_sofar)

    #if (epoch % 1000 == 0):
        #try:
            #gensamples = generate_samples(generator, noise_dim, MAX_SIM)
            #rmse_sofar = calcrmse(X_train, gensamples)
            #print("Epoch: ", epoch, "\t", "rmse: ", rmse_sofar)
            #print(gensamples)
        #except ValueError:
            #print("ValueError encountered. Skipping this iteration.")
            #continue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0        Destination Port   Flow Duration   Total Fwd Packets  \
331034                  1        0.041863            0.105263   
331035                  1        0.041857            0.263158   
331036                  1        0.041830            0.263158   
331037                  1        0.041828            0.263158   
331038                  1        0.041836            0.315789   
...                   ...             ...                 ...   
692607                  1        0.096223            0.315789   
692614                  1        0.096233            0.157895   
692653                  1        0.096198            0.263158   
692682                  1        0.096197            0.315789   
692697                  1        0.096223            0.157895   

0        Total Backward Packets  Total Length of Fwd Packets  \
331034                 0.555556                     0.049349   
331035                 0.555556                     0.060347   
331036                 0.66

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Instructions for updating:
Use tf.cast instead.
Epoch:  0 	 rmse:  0.5949564377169654
Epoch:  50 	 rmse:  0.2907019194895397
Epoch:  100 	 rmse:  0.3097348321976864
Epoch:  150 	 rmse:  0.3328949202233434
Epoch:  200 	 rmse:  0.34817858224511683
Epoch:  250 	 rmse:  0.34426598586423485
Epoch:  300 	 rmse:  0.3471388873596346
Epoch:  350 	 rmse:  0.35154019128705044
Epoch:  400 	 rmse:  0.35223324943115697
Epoch:  450 	 rmse:  0.3442690209393051
Epoch:  500 	 rmse:  0.35397068720015623
Epoch:  550 	 rmse:  0.3358233079195125
Epoch:  600 	 rmse:  0.32193102745342095
Epoch:  650 	 rmse:  0.31607563876017075
Epoch:  700 	 rmse:  0.30810131608042135
Epoch:  750 	 rmse:  0.29419672865877766
Epoch:  800 	 rmse:  0.28384065862339136
Epoch:  850 	 rmse:  0.27137505802916856
Epoch:  900 	 rmse:  0.2214545120189769
Epoch:  950 	 rmse:  0.271942698499667
Epoch:  1000 	 rmse:  0.2286535426353061
Epoch:  1050 	 rmse:  0.3438656145616711
Epoch:  1100 	 rmse:  0.24968364325079118
Epoch:  1150 	 rmse: 

In [None]:
print(generator.summary())
print(generator.layers[0].get_weights())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 256)               17664     
_________________________________________________________________
activation_9 (Activation)    (None, 256)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 128)               32896     
_________________________________________________________________
activation_10 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 128)               16512     
_________________________________________________________________
activation_11 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 128)               16512     
__________