In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numpy as np
from scipy import interp
import pandas as pd
import matplotlib.pyplot as plt
import pylab as pl

from sklearn.ensemble import GradientBoostingClassifier as gboost
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (precision_recall_curve, average_precision_score, 
                             roc_curve, auc, confusion_matrix, mean_squared_error,
                             classification_report)
from sklearn import metrics
from sklearn.model_selection import cross_validate

from keras.models import Model, Sequential
from keras.layers import Input, Dense, LSTM, RepeatVector, Lambda
from keras.layers import Activation
from keras.optimizers import RMSprop
from keras.layers.merge import _Merge
from keras import backend as K
from functools import partial
from keras.metrics import binary_crossentropy
from keras.layers import BatchNormalization
from keras import regularizers


pd.options.display.max_rows = 4000

# In[0]: Functions Definitions

def scalex(X):
    """ normalize between 0 and 1 the values in X """
    nmin, nmax = 0.0, 1.0
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (nmax - nmin) + nmin
    return X_scaled
#
# Last card of scalex.
#
def calcrmse(X_train, gensamples):
    """ compute mse for training """
    max_column = X_train.shape[1]
    rmse_lst = []
    for col in range(max_column):
        rmse_lst.append(np.sqrt(mean_squared_error(X_train[:,col], gensamples[:,col])))
    return np.sum(rmse_lst) / max_column
#
# Last card of calcrmse.
#
def wasserstein_loss(y_true, y_pred):
    """ Wasserstein distance """
    return K.mean(y_true * y_pred)
#
# Last card of wasserstein_loss.
#
def gradient_penalty_loss(y_true, y_pred, averaged_samples, lamba_reg):
    """ compute gradient penalty loss for GP-WGAN """
    gradients = K.gradients(y_pred, averaged_samples)[0]
    gradients_sqr = K.square(gradients)
    gradients_sqr_sum = K.sum(gradients_sqr,
                              axis=np.arange(1, len(gradients_sqr.shape)))
    gradient_l2_norm = K.sqrt(gradients_sqr_sum)
    gradient_penalty = lamba_reg * K.square(1 - gradient_l2_norm)
    return K.mean(gradient_penalty)
#
# Last card of gradient_penalty_loss.
#
class RandomWeightedAverage(_Merge):
    def _merge_function(self, inputs):
        weights = K.random_uniform((BATCH_SIZE, 1))
        return (weights * inputs[0]) + ((1 - weights) * inputs[1])
#
# Last card of RandomWeightedAverage.
#
def generate_samples(generator_model, noise_dim, num_samples):
    """ generate samples to be used for futher analysis """
    return generator_model.predict(np.random.rand(num_samples, noise_dim))
#
# Last card of generate_samples.
#
# Last card of generate_images2D.
#
#
# Last card of writetocsv.

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
df = pd.read_csv('./data/MachineLearningCVE/Tuesday-workingHours.pcap_ISCX.csv', sep=",", header=None, low_memory=False)
print(df)

                       0               1                   2   \
0        Destination Port   Flow Duration   Total Fwd Packets   
1                      88             640                   7   
2                      88             900                   9   
3                      88            1205                   7   
4                      88             511                   7   
...                   ...             ...                 ...   
445905                 53             155                   2   
445906              59317             110                   1   
445907                 53             166                   2   
445908              54726              81                   1   
445909                 53             202                   2   

                             3                            4   \
0        Total Backward Packets  Total Length of Fwd Packets   
1                             4                          440   
2                          

In [3]:
df = df.dropna()

In [4]:
df.columns = df.iloc[0]
df = df[1:]

In [5]:
df = df.apply(pd.to_numeric, errors='ignore')
print(df.dtypes)

0
 Destination Port                 int64
 Flow Duration                    int64
 Total Fwd Packets                int64
 Total Backward Packets           int64
Total Length of Fwd Packets       int64
 Total Length of Bwd Packets      int64
 Fwd Packet Length Max            int64
 Fwd Packet Length Min            int64
 Fwd Packet Length Mean         float64
 Fwd Packet Length Std          float64
Bwd Packet Length Max             int64
 Bwd Packet Length Min            int64
 Bwd Packet Length Mean         float64
 Bwd Packet Length Std          float64
Flow Bytes/s                    float64
 Flow Packets/s                 float64
 Flow IAT Mean                  float64
 Flow IAT Std                   float64
 Flow IAT Max                     int64
 Flow IAT Min                     int64
Fwd IAT Total                     int64
 Fwd IAT Mean                   float64
 Fwd IAT Std                    float64
 Fwd IAT Max                      int64
 Fwd IAT Min                      int6

In [6]:
print(df.columns)

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

In [7]:
for column in df.columns:
    unique_values = np.unique(df[column])
    if len(unique_values) > 1:
        print(f"Column '{column}' has different values.")
    else:
        print(f"Column '{column}' has the same value in all rows.")
        df.drop(column, axis=1, inplace=True)


Column ' Destination Port' has different values.
Column ' Flow Duration' has different values.
Column ' Total Fwd Packets' has different values.
Column ' Total Backward Packets' has different values.
Column 'Total Length of Fwd Packets' has different values.
Column ' Total Length of Bwd Packets' has different values.
Column ' Fwd Packet Length Max' has different values.
Column ' Fwd Packet Length Min' has different values.
Column ' Fwd Packet Length Mean' has different values.
Column ' Fwd Packet Length Std' has different values.
Column 'Bwd Packet Length Max' has different values.
Column ' Bwd Packet Length Min' has different values.
Column ' Bwd Packet Length Mean' has different values.
Column ' Bwd Packet Length Std' has different values.
Column 'Flow Bytes/s' has different values.
Column ' Flow Packets/s' has different values.
Column ' Flow IAT Mean' has different values.
Column ' Flow IAT Std' has different values.
Column ' Flow IAT Max' has different values.
Column ' Flow IAT Min

In [8]:
label_counts = df[' Label'].value_counts()
print(label_counts)

BENIGN         431873
FTP-Patator      7938
SSH-Patator      5897
Name:  Label, dtype: int64


In [9]:
print(df.columns)

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count',
       ' SYN Flag Count', ' RST Flag Count',

In [10]:
print(len(df.columns))

69


In [11]:
lbl = df.columns

In [12]:
print(df.isna().sum())

0
 Destination Port               0
 Flow Duration                  0
 Total Fwd Packets              0
 Total Backward Packets         0
Total Length of Fwd Packets     0
 Total Length of Bwd Packets    0
 Fwd Packet Length Max          0
 Fwd Packet Length Min          0
 Fwd Packet Length Mean         0
 Fwd Packet Length Std          0
Bwd Packet Length Max           0
 Bwd Packet Length Min          0
 Bwd Packet Length Mean         0
 Bwd Packet Length Std          0
Flow Bytes/s                    0
 Flow Packets/s                 0
 Flow IAT Mean                  0
 Flow IAT Std                   0
 Flow IAT Max                   0
 Flow IAT Min                   0
Fwd IAT Total                   0
 Fwd IAT Mean                   0
 Fwd IAT Std                    0
 Fwd IAT Max                    0
 Fwd IAT Min                    0
Bwd IAT Total                   0
 Bwd IAT Mean                   0
 Bwd IAT Std                    0
 Bwd IAT Max                    0
 Bwd IAT Min

In [13]:
warnings.filterwarnings("ignore", category=DeprecationWarning)


#def create_normalized_dataframe(df, attack_name):
#    df_new = df.loc[df[" Label"] == attack_name]

#   # normalize each field independently
#    df_attack_norm = df_new
#    for n in range(len(lbl)-1):
#        m = lbl[n] 
#        if (len(np.unique(df_attack_norm[m])) > 1):
#            df_attack_norm[m] = scalex(df_attack_norm[m])
#        else:
#            df_attack_norm[m] = np.int64(1)

#    return df_attack_norm

def create_normalized_dataframe(df, attack_name):
    df_new = df.loc[df[" Label"] == attack_name]

    features = []
    for n in range(len(lbl)-1):
        m = lbl[n]
        tmp = np.unique(df_new[m])
        if (len(tmp) > 1):
            features.append(m)

    # normalize each field independently
    df_attack_norm = df_new
    for n in range(len(lbl)-1):
        m = lbl[n]
        if (np.max(df_attack_norm[m]) > 1).any(): 
            if (len(np.unique(df_attack_norm[m])) > 1):
                df_attack_norm[m] = scalex(df_attack_norm[m])
            else:
                df_attack_norm[m] = np.int64(1)  

    return df_attack_norm

def kill_empty(df):
    for column in df.columns[:-1]:
        unique_values = np.unique(df[column])
        if len(unique_values) > 1:
            print(f"Column '{column}' has different values.")
        else:
            print(f"Column '{column}' has the same value in all rows.")
            df.drop(column, axis=1, inplace=True)

    return df


#attack_name = "DoS slowloris"
#attack_name = "DoS Hulk"
attack_name = "SSH-Patator"
#attack_name = "DoS GoldenEye"
df_attack_norm = create_normalized_dataframe(df, attack_name)
#df_normal_norm = create_normalized_dataframe(df, "BENIGN")
df_attack_norm = kill_empty(df_attack_norm)
print(df_attack_norm)


# In[3]: build GP-WGAN and generate adversarial samples
#
BATCH_SIZE = 256

GRADIENT_PENALTY_WEIGHT = 10 #0.1
MAX_SIM = 5897 #10000
X_train = np.asarray(df_attack_norm.iloc[:MAX_SIM, :-1])
np.random.shuffle(X_train)
#df_attack_norm.to_csv('output.csv', index=False)
#
MAX_EPOCH = 2400 #15000
TRAINING_RATIO = 5
#
NUM_SAMPLES = 2000 #5000
#
### Building the model
def make_generator(noise_dim):
    model = Sequential()
    model.add(Dense(256,  kernel_initializer='he_normal', input_dim=INPUT_DIM, kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128,  kernel_initializer='he_normal', kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128,  kernel_initializer='he_normal', kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128,  kernel_initializer='he_normal', kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(units=noise_dim, activation='linear', kernel_regularizer=regularizers.l1(0.01)))
    return model
#
#    Last card of make_generator.
#    
def make_discriminator():
    model = Sequential()
    model.add(Dense(256, kernel_initializer='he_normal', input_dim=INPUT_DIM, kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128, kernel_initializer='he_normal', input_dim=INPUT_DIM, kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128, kernel_initializer='he_normal', input_dim=INPUT_DIM, kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(128, kernel_initializer='he_normal', input_dim=INPUT_DIM, kernel_regularizer=regularizers.l1(0.01)))
    model.add(Activation('relu')) # model.add(Activation('relu'))
    model.add(Dense(units=1, activation='linear', kernel_regularizer=regularizers.l1(0.01)))
    return model
#
#    Last card of make_discriminator.
#
print("current_gradpenalty:", GRADIENT_PENALTY_WEIGHT)

INPUT_DIM = X_train.shape[1]
noise_dim = INPUT_DIM

generator = make_generator(noise_dim)
discriminator = make_discriminator()


#### for the generator it is mostly the same as WGAN std
for layer in discriminator.layers:
    layer.trainable = False
discriminator.trainable = False

generator_input = Input(shape=(noise_dim,))
generator_layers = generator(generator_input)
discriminator_layers_for_generator = discriminator(generator_layers)

generator_model = Model(inputs=[generator_input], outputs=[discriminator_layers_for_generator])
generator_model.compile(optimizer=RMSprop(lr = 0.005, rho = 0.9, epsilon=1e-6), loss = wasserstein_loss)


#### New discriminator model for GPWGAN
for layer in discriminator.layers:
    layer.trainable = True
for layer in generator.layers:
    layer.trainable = False
discriminator.trainable = True
generator.trainable = False 


real_samples = Input(shape=X_train.shape[1:])
generator_input_for_discriminator = Input(shape=(noise_dim,))
generated_samples_for_discriminator = generator(generator_input_for_discriminator)
discriminator_output_from_generator = discriminator(generated_samples_for_discriminator)
discriminator_output_from_real_samples = discriminator(real_samples)

averaged_samples = RandomWeightedAverage()([real_samples, generated_samples_for_discriminator])
averaged_samples_out = discriminator(averaged_samples)

discriminator_model = Model(inputs=[real_samples, generator_input_for_discriminator], 
                            outputs=[discriminator_output_from_real_samples, discriminator_output_from_generator, 
                                     averaged_samples_out])


### the loss function takes more inputs than the standard y_true and y_pred 
### values usually required for a loss function. Therefore, we will make it partial.
partial_gp_loss = partial(gradient_penalty_loss, averaged_samples=averaged_samples, lamba_reg=GRADIENT_PENALTY_WEIGHT)
partial_gp_loss.__name__ = 'gp_loss' 


# finally, we compile the model
discriminator_model.compile(optimizer=RMSprop(lr=0.005, rho=0.9, epsilon=1e-6), 
                            loss=[wasserstein_loss, wasserstein_loss, partial_gp_loss])



positive_y = np.ones((BATCH_SIZE, 1), dtype=np.float32)
negative_y = -positive_y
dummy_y = np.zeros((BATCH_SIZE, 1), dtype=np.float32) # dummy vector mandatory for the train on batch function

for epoch in range(MAX_EPOCH + 1):
    np.random.shuffle(X_train)

    minibatches_size = BATCH_SIZE * TRAINING_RATIO
    for i in range(int(X_train.shape[0] // (BATCH_SIZE * TRAINING_RATIO))):
        discriminator_minibatches = X_train[i * minibatches_size:(i + 1) * minibatches_size]
        for j in range(TRAINING_RATIO):
            sample_batch = discriminator_minibatches[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
            noise = np.random.rand(BATCH_SIZE, noise_dim).astype(np.float32)

            discriminator_model.train_on_batch([sample_batch, noise], [positive_y, negative_y, dummy_y])

        generator_model.train_on_batch(np.random.rand(BATCH_SIZE, noise_dim), positive_y)


    #Visualization of intermediate results
    if (epoch % 50 == 0):
        gensamples = generate_samples(generator, noise_dim, MAX_SIM)
        rmse_sofar = calcrmse(X_train, gensamples)
        print("Epoch: ", epoch, "\t", "rmse: ", rmse_sofar)

    #if (epoch % 1000 == 0):
        #try:
            #gensamples = generate_samples(generator, noise_dim, MAX_SIM)
            #rmse_sofar = calcrmse(X_train, gensamples)
            #print("Epoch: ", epoch, "\t", "rmse: ", rmse_sofar)
            #print(gensamples)
        #except ValueError:
            #print("ValueError encountered. Skipping this iteration.")
            #continue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Column ' Destination Port' has the same value in all rows.
Column ' Flow Duration' has different values.
Column ' Total Fwd Packets' has different values.
Column ' Total Backward Packets' has different values.
Column 'Total Length of Fwd Packets' has different values.
Column ' Total Length of Bwd Packets' has different values.
Column ' Fwd Packet Length Max' has different values.
Column ' Fwd Packet Length Min' has different values.
Column ' Fwd Packet Length Mean' has different values.
Column ' Fwd Packet Length Std' has different values.
Column 'Bwd Packet Length Max' has different values.
Column ' Bwd Packet Length Min' has the same value in all rows.
Column ' Bwd Packet Length Mean' has different values.
Column ' Bwd Packet Length Std' has different values.
Column 'Flow Bytes/s' has different values.
Column ' Flow Packets/s' has different values.
Column ' Flow IAT Mean' has different values.
Column ' Flow IAT Std' has different values.
Column ' Flow IAT Max' has different values.
C

KeyboardInterrupt: 

In [67]:
gensamples=generate_samples(generator, noise_dim, MAX_SIM)

df_gensamples = pd.DataFrame(gensamples)

df_original = df_attack_norm.iloc[:MAX_SIM, :-1]

df_gensamples.columns = df_original.columns

print(df_gensamples)

0      Flow Duration   Total Fwd Packets   Total Backward Packets  \
0           0.030598            0.311499                 0.377765   
1           0.030599            0.311499                 0.377765   
2           0.030599            0.311499                 0.377765   
3           0.030599            0.311499                 0.377765   
4           0.030598            0.311499                 0.377765   
...              ...                 ...                      ...   
5892        0.030599            0.311499                 0.377765   
5893        0.030598            0.311499                 0.377765   
5894        0.030599            0.311499                 0.377765   
5895        0.030599            0.311499                 0.377765   
5896        0.030599            0.311499                 0.377765   

0     Total Length of Fwd Packets   Total Length of Bwd Packets  \
0                        0.285909                      0.244919   
1                        0.285909    

In [68]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Get the column names of df_gensamples and df_original
column_names = df_gensamples.columns
count = 0

# Calculate RMSE for each column
rmse_values = []
for column in column_names:
    rmse = np.sqrt(mean_squared_error(df_gensamples[column], df_original[column]))
    rmse_values.append(rmse)

# Print the RMSE values
for column, rmse in zip(column_names, rmse_values):
    print(f"RMSE for column '{column}': {rmse}")
    if rmse > 0.30:
        count += 1

print(count)

RMSE for column ' Flow Duration': 0.058251356053798664
RMSE for column ' Total Fwd Packets': 0.31636374801146416
RMSE for column ' Total Backward Packets': 0.3611867758565065
RMSE for column 'Total Length of Fwd Packets': 0.2637289498831921
RMSE for column ' Total Length of Bwd Packets': 0.24392518722672515
RMSE for column ' Fwd Packet Length Max': 0.22405188513994362
RMSE for column ' Fwd Packet Length Min': 0.018485526382332754
RMSE for column ' Fwd Packet Length Mean': 0.2743465827838445
RMSE for column ' Fwd Packet Length Std': 0.18611272708800036
RMSE for column 'Bwd Packet Length Max': 0.49999624529277736
RMSE for column ' Bwd Packet Length Mean': 0.2175464280428086
RMSE for column ' Bwd Packet Length Std': 0.29438891064566525
RMSE for column 'Flow Bytes/s': 0.02055510259046375
RMSE for column ' Flow Packets/s': 0.02170334147729094
RMSE for column ' Flow IAT Mean': 0.03991908434018598
RMSE for column ' Flow IAT Std': 0.030393863024458487
RMSE for column ' Flow IAT Max': 0.0259025

In [69]:
import numpy as np

# Calculate RMSE for each column
rmse_values = []
for column in df_original.columns:
    rmse = np.sqrt(mean_squared_error(df_original[column], df_gensamples[column]))
    rmse_values.append(rmse)

# Remove columns with RMSE > 0.3
columns_to_remove = [column for column, rmse in zip(df_gensamples.columns, rmse_values) if rmse > 0.3]
df_filtered_generated = df_gensamples.drop(columns_to_remove, axis=1)
df_filtered_original = df_original.drop(columns_to_remove, axis=1)

rmse_values_after = []
for column in df_filtered_original.columns:
    rmse = np.sqrt(mean_squared_error(df_filtered_original[column], df_filtered_generated[column]))
    rmse_values_after.append(rmse)
# Calculate average RMSE
average_rmse = np.mean(rmse_values)
print(f"Average RMSE: {average_rmse}")

average_after_rmse = np.mean(rmse_values_after)
print(f"Average RMSE after filtering: {average_after_rmse}")

Average RMSE: 0.20752603576970421
Average RMSE after filtering: 0.13176199866387733


In [70]:
print(df_filtered_generated)
print(df_filtered_original)

0      Flow Duration  Total Length of Fwd Packets  \
0           0.030598                     0.285909   
1           0.030599                     0.285909   
2           0.030599                     0.285909   
3           0.030599                     0.285909   
4           0.030598                     0.285909   
...              ...                          ...   
5892        0.030599                     0.285909   
5893        0.030598                     0.285909   
5894        0.030599                     0.285909   
5895        0.030599                     0.285909   
5896        0.030599                     0.285909   

0      Total Length of Bwd Packets   Fwd Packet Length Max  \
0                         0.244919                0.215815   
1                         0.244919                0.215816   
2                         0.244919                0.215815   
3                         0.244919                0.215816   
4                         0.244919                0.2

In [71]:
df_filtered_generated[' Label'] = 'generated'
df_filtered_original[' Label'] = 'normal'

def convertstringtonumber(dfa, lst):
    """ convert string to number """
    for n in range(len(lst)):
        dfa = dfa.replace(lst[n], n)
    return dfa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [72]:
# aggregate generated data with true data
df_aggregated = pd.concat([df_filtered_original[:NUM_SAMPLES], df_filtered_generated[:NUM_SAMPLES]])
df_aggregated = df_aggregated.sample(frac=1)



# separate the data set into train and test data sets
y = df_aggregated.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(df_aggregated.iloc[:,:-1], \
                                                    y, test_size=.2, random_state=42)

In [73]:
# Create an instance of the GradientBoostingClassifier
classifier = gboost(random_state=10, loss='deviance', learning_rate=0.05, 
                    n_estimators=200, criterion='friedman_mse', max_depth=3)

# Fit the classifier on the training data
classifier.fit(X_train, y_train)

# Use the classifier to predict the labels for the training data
y_pred = classifier.predict(X_train)

# Print the classification report
print(classification_report(y_train, y_pred))



              precision    recall  f1-score   support

   generated       1.00      1.00      1.00      1604
      normal       1.00      1.00      1.00      1596

    accuracy                           1.00      3200
   macro avg       1.00      1.00      1.00      3200
weighted avg       1.00      1.00      1.00      3200

