In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numpy as np
from scipy import interp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pylab as pl

from sklearn.ensemble import GradientBoostingClassifier as gboost
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (precision_recall_curve, average_precision_score, 
                             roc_curve, auc, confusion_matrix, mean_squared_error,
                             classification_report)
from sklearn import metrics
from sklearn.model_selection import cross_validate

from keras.models import Model, Sequential
from keras.layers import Input, Dense, LSTM, RepeatVector, Lambda
from keras.layers import Activation
from keras.optimizers import RMSprop
from keras.layers.merge import _Merge
from keras import backend as K
from functools import partial
from keras.metrics import binary_crossentropy

# In[0]: Functions Definitions

NB_EPOCH = 2000
BATCH_SIZE = 64 

#def printdf(dfa):
#    """ print the unique values of the dataframe """
#    lbla = dfa.columns
#    for n in range(len(lbla)):
#        print(lbla[n], np.unique(dfa[lbla[n]]))
#
# Last card of function printdf.
#
def convertstringtonumber(dfa, lst):
    """ convert string to number """
    for n in range(len(lst)):
        dfa = dfa.replace(lst[n], n)
    return dfa
#
# Last card of convertstringtonumber.
#
def scalex(X):
    """ normalize between 0 and 1 the values in X """
    nmin, nmax = 0.0, 1.0
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (nmax - nmin) + nmin
    return X_scaled
#
# Last card of scalex.
#
def calcrmse(X_train, gensamples):
    """ compute mse for training """
    max_column = X_train.shape[1]
    rmse_lst = []
    for col in range(max_column):
        rmse_lst.append(np.sqrt(mean_squared_error(X_train[:,col], gensamples[:,col])))
    return np.sum(rmse_lst) / max_column
#
# Last card of calcrmse.
#
def wasserstein_loss(y_true, y_pred):
    """ Wasserstein distance """
    return K.mean(y_true * y_pred)
#
# Last card of wasserstein_loss.
#
def gradient_penalty_loss(y_true, y_pred, averaged_samples, lamba_reg):
    """ compute gradient penalty loss for GP-WGAN """
    gradients = K.gradients(y_pred, averaged_samples)[0]
    gradients_sqr = K.square(gradients)
    gradients_sqr_sum = K.sum(gradients_sqr,
                              axis=np.arange(1, len(gradients_sqr.shape)))
    gradient_l2_norm = K.sqrt(gradients_sqr_sum)
    gradient_penalty = lamba_reg * K.square(1 - gradient_l2_norm)
    return K.mean(gradient_penalty)
#
# Last card of gradient_penalty_loss.
#
class RandomWeightedAverage(_Merge):
    def _merge_function(self, inputs):
        weights = K.random_uniform((BATCH_SIZE, 1))
        return (weights * inputs[0]) + ((1 - weights) * inputs[1])
#
# Last card of RandomWeightedAverage.
#
def generate_samples(generator_model, noise_dim, num_samples):
    """ generate samples to be used for futher analysis """
    return generator_model.predict(np.random.rand(num_samples, noise_dim))
#
# Last card of generate_samples.
#
# Last card of generate_images2D.
#
def writetocsv(mtrx, flnm):
    """Save the samples for TDA with R (2nd notebook). We do not differentiate frauds from normal transactions"""
    dtfrm = pd.DataFrame(mtrx)
    dtfrm.to_csv(flnm, sep=',', index=None, header=None)
#
# Last card of writetocsv.

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
df = pd.read_csv('./data/MachineLearningCVE/NoForwardHeader.csv', sep=",", header=None, low_memory=False)
print(df)

                       0               1                   2   \
0        Destination Port   Flow Duration   Total Fwd Packets   
1                      80           38308                   1   
2                     389             479                  11   
3                      88            1095                  10   
4                     389           15206                  17   
...                   ...             ...                 ...   
692699                 53           32215                   4   
692700                 53             324                   2   
692701              58030              82                   2   
692702                 53         1048635                   6   
692703                 53           94939                   4   

                             3                            4   \
0        Total Backward Packets  Total Length of Fwd Packets   
1                             1                            6   
2                          

In [3]:
print(df.iloc[0])


0                Destination Port
1                   Flow Duration
2               Total Fwd Packets
3          Total Backward Packets
4     Total Length of Fwd Packets
                 ...             
73                      Idle Mean
74                       Idle Std
75                       Idle Max
76                       Idle Min
77                          Label
Name: 0, Length: 78, dtype: object


In [4]:
print(df)
df.columns = df.iloc[0]
df = df[1:]
print(df)
df = df.apply(pd.to_numeric, errors='ignore')
print(df.dtypes)

                       0               1                   2   \
0        Destination Port   Flow Duration   Total Fwd Packets   
1                      80           38308                   1   
2                     389             479                  11   
3                      88            1095                  10   
4                     389           15206                  17   
...                   ...             ...                 ...   
692699                 53           32215                   4   
692700                 53             324                   2   
692701              58030              82                   2   
692702                 53         1048635                   6   
692703                 53           94939                   4   

                             3                            4   \
0        Total Backward Packets  Total Length of Fwd Packets   
1                             1                            6   
2                          

In [5]:
label_counts = df[' Label'].value_counts()
print(label_counts)

BENIGN              440031
DoS Hulk            231073
DoS GoldenEye        10293
DoS slowloris         5796
DoS Slowhttptest      5499
Heartbleed              11
Name:  Label, dtype: int64


In [6]:
labl = ['Destination_Port', 'Flow_Duration', 'Total_Fwd_Packets',
       'Total_Backward_Packets', 'Total_Length_of_Fwd_Packets',
       'Total_Length_of_Bwd_Packets', 'Fwd_Packet_Length_Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', ' Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count',
       'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count',
       'ECE Flag Count', 'Down/Up Ratio', 'Average Packet Size',
       'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Fwd Header Length',
       'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate',
       ' Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate',
       'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets',
       'Subflow Bwd Bytes', 'Init_Win_bytes_forward',
       'Init_Win_bytes_backward', 'act_data_pkt_fwd',
       'min_seg_size_forward', 'Active Mean', 'Active Std', 'Active Max',
       'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min',
       'Label']

In [7]:
lbl = [' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Bwd Header Length',
       'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length',
       ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std',
       ' Packet Length Variance', 'FIN Flag Count', ' SYN Flag Count',
       ' RST Flag Count', ' PSH Flag Count', ' ACK Flag Count',
       ' URG Flag Count', ' CWE Flag Count', ' ECE Flag Count',
       ' Down/Up Ratio', ' Average Packet Size', ' Avg Fwd Segment Size',
       ' Avg Bwd Segment Size', ' Fwd Header Length', 'Fwd Avg Bytes/Bulk',
       ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk',
       ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate', 'Subflow Fwd Packets',
       ' Subflow Fwd Bytes', ' Subflow Bwd Packets', ' Subflow Bwd Bytes',
       'Init_Win_bytes_forward', ' Init_Win_bytes_backward',
       ' act_data_pkt_fwd', ' min_seg_size_forward', 'Active Mean',
       ' Active Std', ' Active Max', ' Active Min', 'Idle Mean', ' Idle Std',
       ' Idle Max', ' Idle Min', ' Label']

In [8]:
def create_normalized_dataframe(df, attack_name):
    df_new = df.loc[df[" Label"] == attack_name]

    features = []
    for n in range(len(lbl)-1):
        m = lbl[n]
        tmp = np.unique(df_new[m])
        #print(m, tmp)
        if (len(tmp) > 1):
            features.append(m)
            #plt.figure(n)
            #sns.distplot(df_new[m])

    # normalize each field independently
    df_attack_norm = df_new
    for n in range(len(lbl)-1):
        m = lbl[n]
        if (np.max(df_attack_norm[m]) > 1): 
            if (len(np.unique(df_attack_norm[m])) > 1):
                df_attack_norm[m] = scalex(df_attack_norm[m])
            else:
                df_attack_norm[m] = np.int64(1)  

    #for n in range(len(lbl)-1):
        #print(lbl[n], np.unique(df_attack_norm[lbl[n]]))
    
    return df_attack_norm

attack_name = "DoS GoldenEye"
#attack_name = "DoS slowloris"
df_attack_norm = create_normalized_dataframe(df, attack_name)
#df_normal_norm = create_normalized_dataframe(df, "BENIGN")
print(df_attack_norm)


# In[3]: build GP-WGAN and generate adversarial samples
#
GRADIENT_PENALTY_WEIGHT = 0.01 #0.1
MAX_SIM = 10000 #10000
X_train = np.asarray(df_attack_norm.iloc[:MAX_SIM, :-1])
np.random.shuffle(X_train)
#
MAX_EPOCH = 15000 #15000
TRAINING_RATIO = 5 #5
#
NUM_SAMPLES = 2000 #5000
#
### Building the model
def make_generator(noise_dim):
    model = Sequential()
    model.add(Dense(256,  kernel_initializer='he_normal', input_dim=INPUT_DIM))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128,  kernel_initializer='he_normal'))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128,  kernel_initializer='he_normal'))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128,  kernel_initializer='he_normal'))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(78,  kernel_initializer='he_normal'))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(units=noise_dim, activation='linear'))
    return model
#
#    Last card of make_generator.
#    
def make_discriminator():
    model = Sequential()
    model.add(Dense(256, kernel_initializer='he_normal', input_dim=INPUT_DIM))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128, kernel_initializer='he_normal', input_dim=INPUT_DIM))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128, kernel_initializer='he_normal', input_dim=INPUT_DIM))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128, kernel_initializer='he_normal', input_dim=INPUT_DIM))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(78, kernel_initializer='he_normal', input_dim=INPUT_DIM))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(units=1, activation='linear'))
    return model
#
#    Last card of make_discriminator.
#
print("current_gradpenalty:", GRADIENT_PENALTY_WEIGHT)

INPUT_DIM = X_train.shape[1]
noise_dim = INPUT_DIM

generator = make_generator(noise_dim)
discriminator = make_discriminator()


#### for the generator it is mostly the same as WGAN std
for layer in discriminator.layers:
    layer.trainable = False
discriminator.trainable = False

generator_input = Input(shape=(noise_dim,))
generator_layers = generator(generator_input)
discriminator_layers_for_generator = discriminator(generator_layers)

generator_model = Model(inputs=[generator_input], outputs=[discriminator_layers_for_generator])
generator_model.compile(optimizer=RMSprop(lr=0.001, rho=0.9, epsilon=1e-6), loss=wasserstein_loss)


#### New discriminator model for GPWGAN
for layer in discriminator.layers:
    layer.trainable = True
for layer in generator.layers:
    layer.trainable = False
discriminator.trainable = True
generator.trainable = False 


real_samples = Input(shape=X_train.shape[1:])
generator_input_for_discriminator = Input(shape=(noise_dim,))
generated_samples_for_discriminator = generator(generator_input_for_discriminator)
discriminator_output_from_generator = discriminator(generated_samples_for_discriminator)
discriminator_output_from_real_samples = discriminator(real_samples)

averaged_samples = RandomWeightedAverage()([real_samples, generated_samples_for_discriminator])
averaged_samples_out = discriminator(averaged_samples)

discriminator_model = Model(inputs=[real_samples, generator_input_for_discriminator], 
                            outputs=[discriminator_output_from_real_samples, discriminator_output_from_generator, 
                                     averaged_samples_out])


### the loss function takes more inputs than the standard y_true and y_pred 
### values usually required for a loss function. Therefore, we will make it partial.
partial_gp_loss = partial(gradient_penalty_loss, averaged_samples=averaged_samples, lamba_reg=GRADIENT_PENALTY_WEIGHT)
partial_gp_loss.__name__ = 'gp_loss' 


# finally, we compile the model
discriminator_model.compile(optimizer=RMSprop(lr=0.001, rho=0.9, epsilon=1e-6), 
                            loss=[wasserstein_loss, wasserstein_loss, partial_gp_loss])


### Running the Full Model
def discriminator_clip(f,c):
    for l in f.layers:
        weights = l.get_weights()
        weights = [np.clip(w, -c, c) for w in weights]
        l.set_weights(weights)


positive_y = np.ones((BATCH_SIZE, 1), dtype=np.float32)
negative_y = -positive_y
dummy_y = np.zeros((BATCH_SIZE, 1), dtype=np.float32) # dummy vector mandatory for the train on batch function

for epoch in range(MAX_EPOCH + 1):
    np.random.shuffle(X_train)

    minibatches_size = BATCH_SIZE * TRAINING_RATIO
    for i in range(int(X_train.shape[0] // (BATCH_SIZE * TRAINING_RATIO))):
        discriminator_minibatches = X_train[i * minibatches_size:(i + 1) * minibatches_size]
        for j in range(TRAINING_RATIO):
            sample_batch = discriminator_minibatches[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
            noise = np.random.rand(BATCH_SIZE, noise_dim).astype(np.float32)

            discriminator_model.train_on_batch([sample_batch, noise], [positive_y, negative_y, dummy_y])

        generator_model.train_on_batch(np.random.rand(BATCH_SIZE, noise_dim), positive_y)


    #Visualization of intermediate results
    if (epoch % 1500 == 0):
        gensamples = generate_samples(generator, noise_dim, MAX_SIM)
        rmse_sofar = calcrmse(X_train, gensamples)
        print("Epoch: ", epoch, "\t", "rmse: ", rmse_sofar)
        print(gensamples)

0        Destination Port   Flow Duration   Total Fwd Packets  \
331034                  1        0.041863            0.105263   
331035                  1        0.041857            0.263158   
331036                  1        0.041830            0.263158   
331037                  1        0.041828            0.263158   
331038                  1        0.041836            0.315789   
...                   ...             ...                 ...   
692607                  1        0.096223            0.315789   
692614                  1        0.096233            0.157895   
692653                  1        0.096198            0.263158   
692682                  1        0.096197            0.315789   
692697                  1        0.096223            0.157895   

0        Total Backward Packets  Total Length of Fwd Packets  \
331034                 0.555556                     0.049349   
331035                 0.555556                     0.060347   
331036                 0.66

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Use tf.cast instead.
Epoch:  0 	 rmse:  0.20124451812687974
[[ 0.7055432   0.06748597  0.29665145 ...  0.0063615  -0.08035232
  -0.05401191]
 [ 0.7374386   0.13660115  0.17732543 ... -0.02476998  0.07468493
   0.0666346 ]
 [ 0.909714   -0.00995782  0.27798605 ...  0.02341675 -0.00377448
   0.04227001]
 ...
 [ 0.86584383  0.14999308  0.08704047 ...  0.11870323  0.02257396
   0.11967532]
 [ 0.91969883  0.0581191   0.25718465 ...  0.16194351 -0.04140561
   0.02856284]
 [ 0.7616162   0.271179    0.35892928 ... -0.09413546  0.11713497
   0.14246708]]
Epoch:  1500 	 rmse:  0.22191209561357356
[[ 0.98172677  0.6819733   0.01749049 ... -0.00393727  0.67903346
   0.6833975 ]
 [ 1.0014416   0.0660729   0.31600296 ... -0.00399042  0.0284747
   0.03249507]
 [ 1.0032463   0.8597181   0.01379952 ...  0.00373507  0.845089
   0.8598986 ]
 ...
 [ 1.0206844   0.03958408  0.2733135  ...  0.01687605 -0.0187

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').