In [1]:
import numpy as np
import pandas as pd 
import os, re, time, math, tqdm, itertools
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
import keras
from keras.layers import Conv2D, Conv1D, MaxPooling2D, MaxPooling1D, Flatten, BatchNormalization, Dense
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.callbacks import CSVLogger, ModelCheckpoint
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sklearn.utils import resample

def scatterplot(network_data):
    pyo.init_notebook_mode(connected=True)
    fig = px.scatter(x=network_data["Flow Bytes/s"][:100000],
                     y=network_data["Avg Bwd Segment Size"][:100000])
    fig.show()

def plot_number(network_data):
    sns.set(rc={'figure.figsize': (12, 6)})
    plt.xlabel('Attack Type')
    sns.set_theme()
    ax = sns.countplot(x='Label', data=network_data)
    ax.set(xlabel='Attack Type', ylabel='Number of Attacks')
    plt.show()

def loading_data(path):
    """
      https://stackoverflow.com/questions/45529507/unicodedecodeerror-utf-8-codec-cant-decode-byte-0x96-in-position-35-invalid
    """
    return pd.read_csv(path, encoding='cp1252')

def validate(network_data):
    """
        ptr
    """
    try:
        network_data['Label']
    except KeyError:
        columns = [sanitize(i) for i in network_data.columns]
        network_data.columns = columns

def sanitize(column):
    """
        de-recursion
    """
    while ' ' == column[0]:
        column = column[1:]
    return column

def plot_number(network_data):
    sns.set(rc={'figure.figsize': (12, 6)})
    plt.xlabel('Attack Type')
    sns.set_theme()
    ax = sns.countplot(x='Label', data=network_data)
    ax.set(xlabel='Attack Type', ylabel='Number of Attacks')
    plt.show()

def circle(network_data):
    cleaned_data = network_data.dropna()
    cleaned_data.isna().sum().to_numpy()
    label_encoder = LabelEncoder()
    cleaned_data['Label']= label_encoder.fit_transform(cleaned_data['Label'])
    cleaned_data['Label'].unique()
    cleaned_data['Label'].value_counts()
    data_1 = cleaned_data[cleaned_data['Label'] == 0]
    data_2 = cleaned_data[cleaned_data['Label'] == 1]
    data_3 = cleaned_data[cleaned_data['Label'] == 2]

    # make benign feature
    y_1 = np.zeros(data_1.shape[0])
    y_benign = pd.DataFrame(y_1)

    # make bruteforce feature
    y_2 = np.ones(data_2.shape[0])
    y_bf = pd.DataFrame(y_2)

    # make bruteforceSSH feature
    y_3 = np.full(data_3.shape[0], 2)
    y_ssh = pd.DataFrame(y_3)

    # merging the original dataframe
    X = pd.concat([data_1, data_2, data_3], sort=True)
    y = pd.concat([y_benign, y_bf, y_ssh], sort=True)
    data_1_resample = resample(data_1, n_samples=20000, 
                           random_state=123, replace=True)
    data_2_resample = resample(data_2, n_samples=20000, 
                            random_state=123, replace=True)
    data_3_resample = resample(data_3, n_samples=20000, 
                            random_state=123, replace=True)
    train_dataset = pd.concat([data_1_resample, data_2_resample, data_3_resample])
    train_dataset.head(2)
    plt.figure(figsize=(10, 8))
    circle = plt.Circle((0, 0), 0.7, color='white')
    plt.title('Intrusion Attack Type Distribution')
    plt.pie(train_dataset['Label'].value_counts(), labels=['Benign', 'BF', 'BF-SSH'], colors=['blue', 'magenta', 'cyan'])
    p = plt.gcf()
    p.gca().add_artist(circle)


    test_dataset = train_dataset.sample(frac=0.1)
    target_train = train_dataset['Label']
    target_test = test_dataset['Label']
    target_train.unique(), target_test.unique()

    y_train = to_categorical(target_train, num_classes=3)
    y_test = to_categorical(target_test, num_classes=3)

    train_dataset = train_dataset.drop(columns = ["Timestamp", "Protocol","PSH Flag Cnt","Init Fwd Win Byts","Flow Byts/s","Flow Pkts/s", "Label"], axis=1)
    test_dataset = test_dataset.drop(columns = ["Timestamp", "Protocol","PSH Flag Cnt","Init Fwd Win Byts","Flow Byts/s","Flow Pkts/s", "Label"], axis=1)

    X_train = train_dataset.iloc[:, :-1].values
    X_test = test_dataset.iloc[:, :-1].values
    X_test.show()

    print(X_train.shape)
    print(X_test.shape)
    print(y_train.shape)
    print(y_test.shape)

    # reshape the data for CNN
    X_train = X_train.reshape(len(X_train), X_train.shape[1], 1)
    X_test = X_test.reshape(len(X_test), X_test.shape[1], 1)
    X_train.shape, X_test.shape
    

def model():
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=6, activation='relu', 
                    padding='same', input_shape=(72, 1)))
    model.add(BatchNormalization())
    
    # adding a pooling layer
    model.add(MaxPooling1D(pool_size=(3), strides=2, padding='same'))
    
    model.add(Conv1D(filters=64, kernel_size=6, activation='relu', 
                    padding='same', input_shape=(72, 1)))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=(3), strides=2, padding='same'))
    
    model.add(Conv1D(filters=64, kernel_size=6, activation='relu', 
                    padding='same', input_shape=(72, 1)))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=(3), strides=2, padding='same'))
    
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


def main(path):
    network_data = loading_data(path)
    validate(network_data)

    print(network_data.shape)
    print(network_data.info())
    print(network_data['Label'].value_counts())

    # plot_number(network_data)
    # scatterplot(network_data)
    # circle(network_data)
    model = model()
    model.summary()


if __name__ == '__main__':
    path = "/Users/TrungLT/CNN/Input/"

    if os.path.isfile(path):
        main(path)
    elif os.path.isdir(path):
        for dirname, _, filenames in os.walk(path):
            for filename in filenames:
                file = os.path.join(dirname, filename)
                print(f"We will show chart from {file}")
                try:
                    main(file)
                except Exception as err:
                    print(err)
                print("---ENDING---")
    else:
        print("The path is not file or directory")




2022-10-30 14:49:58.843820: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


We will show chart from /Users/TrungLT/CNN/Input/TrafficLabelling/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
(288602, 85)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288602 entries, 0 to 288601
Data columns (total 85 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Flow ID                      288602 non-null  object 
 1   Source IP                    288602 non-null  object 
 2   Source Port                  288602 non-null  int64  
 3   Destination IP               288602 non-null  object 
 4   Destination Port             288602 non-null  int64  
 5   Protocol                     288602 non-null  int64  
 6   Timestamp                    288602 non-null  object 
 7   Flow Duration                288602 non-null  int64  
 8   Total Fwd Packets            288602 non-null  int64  
 9   Total Backward Packets       288602 non-null  int64  
 10  Total Length of Fwd Packets  288602 n


Columns (0,1,3,6,84) have mixed types. Specify dtype option on import or set low_memory=False.



(458968, 85)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458968 entries, 0 to 458967
Data columns (total 85 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Flow ID                      170366 non-null  object 
 1   Source IP                    170366 non-null  object 
 2   Source Port                  170366 non-null  float64
 3   Destination IP               170366 non-null  object 
 4   Destination Port             170366 non-null  float64
 5   Protocol                     170366 non-null  float64
 6   Timestamp                    170366 non-null  object 
 7   Flow Duration                170366 non-null  float64
 8   Total Fwd Packets            170366 non-null  float64
 9   Total Backward Packets       170366 non-null  float64
 10  Total Length of Fwd Packets  170366 non-null  float64
 11  Total Length of Bwd Packets  170366 non-null  float64
 12  Fwd Packet Length Max        170366 non-null 