In [246]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from keras.utils import to_categorical
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten
from keras.optimizers import SGD

df = pd.read_csv('/media/mengjie/Data/Downloads/02-16-2018.csv') #CSE-CIC-IDS 2018 (02-16-2018)

cols = df.columns
cols = cols.map(lambda x: x.replace(' ', '_') )
df.columns = cols
print(df.head())
#replace infinit number
df=df.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

#drop null
df.dropna(how = 'all')
print('Data type of each column of Dataframe :')
df.info(verbose=True)
df = df.sort_values('Timestamp')
df.drop(df.tail(1).index,inplace=True)
df = df.reset_index(drop=True)
df = df.drop(columns=['Timestamp', 'Dst_Port', 'Protocol'])
print(df['Label'].value_counts())

df.drop(df.loc[df['Label']=='DoS attacks-SlowHTTPTest'].index, inplace=True)
# df.to_csv('lstm_dataset.csv')

df.Label[df.Label=='Benign'] = 0
df.Label[df.Label =='DoS attacks-Hulk'] = 1
df["Label"].astype('Int64')
print(df['Label'].value_counts())

  df = pd.read_csv('/media/mengjie/Data/Downloads/02-16-2018.csv') #CSE-CIC-IDS 2018 (02-16-2018)


  Dst_Port Protocol            Timestamp Flow_Duration Tot_Fwd_Pkts   
0        0        0  16/02/2018 08:27:23     112640768            3  \
1        0        0  16/02/2018 08:30:12     112641773            3   
2    35605        6  16/02/2018 08:26:55      20784143           23   
3        0        0  16/02/2018 08:33:01     112640836            3   
4       23        6  16/02/2018 08:27:59            20            1   

  Tot_Bwd_Pkts TotLen_Fwd_Pkts TotLen_Bwd_Pkts Fwd_Pkt_Len_Max   
0            0               0               0               0  \
1            0               0               0               0   
2           44            2416            1344             240   
3            0               0               0               0   
4            1               0               0               0   

  Fwd_Pkt_Len_Min  ... Fwd_Seg_Size_Min Active_Mean Active_Std Active_Max   
0               0  ...                0           0        0.0          0  \
1               0  ...

In [247]:
train_df = df.copy() # Use .iloc to avoid a SettingWithCopyWarning

In [248]:
y = train_df.loc[0:52147,'Label'].astype('Int64')
X = train_df.drop(columns=['Label']).astype(np.float64)
X = X.loc[0:52147,:]

In [249]:
print(y.value_counts())

Label
1    33099
0    19049
Name: count, dtype: Int64


In [250]:
# Define sequence length and overlap
seq_length = 10 # Sequence length
seq_overlap = 5 # Overlap between sequences

def generate_sequences(X, y):
    X_seqs = []
    y_seqs = []
    for i in range(0, len(X) - seq_length + 1, seq_overlap):
        X_seqs.append(X.iloc[i:i+seq_length, :].to_numpy())
        y_seqs.append(y.iloc[i+seq_length+1])
    return np.array(X_seqs), np.array(y_seqs)

X_seqs, y_seqs = generate_sequences(X, y)


In [251]:
normal_indices = [i for i, x in enumerate(y_seqs) if x == 0]
abnormal_indices = [i for i, x in enumerate(y_seqs) if x == 1]

print('total normal counts: ', len(normal_indices))
print('total abnormal counts: ', len(abnormal_indices))

total normal counts:  3803
total abnormal counts:  6625


In [252]:
# normalize data
X_seqs = (X_seqs - X_seqs.mean()) / X_seqs.std()

In [253]:
# split dataset
l = len(X_seqs)
train_size = int(0.7 * l)
val_size = int(0.15 * l)
test_size = l - train_size - val_size
train_X = X_seqs[:train_size,:]
train_y = y_seqs[:train_size]
val_X = X_seqs[train_size:train_size+val_size,:]
val_y = y_seqs[train_size:train_size+val_size]
test_X = X_seqs[train_size+val_size:,:]
test_y = y_seqs[train_size+val_size:]

In [254]:
normal_indices = [i for i, x in enumerate(train_y) if x == 0]
abnormal_indices = [i for i, x in enumerate(train_y) if x == 1]

print('normal counts in training set: ', len(normal_indices))
print('abnormal counts in training set: ', len(abnormal_indices))

normal counts in training set:  2445
abnormal counts in training set:  4854


In [255]:
normal_indices = [i for i, x in enumerate(val_y) if x == 0]
abnormal_indices = [i for i, x in enumerate(val_y) if x == 1]

print('normal counts in validation set: ', len(normal_indices))
print('abnormal counts in validation set: ', len(abnormal_indices))

normal counts in validation set:  507
abnormal counts in validation set:  1057


In [256]:
normal_indices = [i for i, x in enumerate(test_y) if x == 0]
abnormal_indices = [i for i, x in enumerate(test_y) if x == 1]

print('normal counts in test set: ', len(normal_indices))
print('abnormal counts in test set: ', len(abnormal_indices))

normal counts in test set:  851
abnormal counts in test set:  714


In [257]:
import random

def sample_even_distribution(X_seqs, y_label):
    # Separate the list into two sublists
    normal_indices = [i for i, x in enumerate(y_label) if x == 0]
    abnormal_indices = [i for i, x in enumerate(y_label) if x == 1]
    
    min_samples = min(len(normal_indices), len(abnormal_indices))
    
    # Sample an equal number of elements from each sublist in order
    sampled_indices = []
    for i in range(min_samples):
        sampled_indices.append(normal_indices[i])
        sampled_indices.append(abnormal_indices[i])
    
    np.random.shuffle(sampled_indices)
    
    # Extract the sampled elements based on their indices
    X = [X_seqs[i] for i in sampled_indices]
    y = [y_label[i] for i in sampled_indices]
    
    return np.array(X), np.array(y)

In [258]:
train_X, train_y = sample_even_distribution(train_X, train_y)

In [259]:
normal_indices = [i for i, x in enumerate(train_y) if x == 0]
abnormal_indices = [i for i, x in enumerate(train_y) if x == 1]

print('normal counts in training set: ', len(normal_indices))
print('abnormal counts in training set: ', len(abnormal_indices))

normal counts in training set:  2445
abnormal counts in training set:  2445


In [260]:
val_X, val_y = sample_even_distribution(val_X, val_y)

In [261]:
normal_indices = [i for i, x in enumerate(val_y) if x == 0]
abnormal_indices = [i for i, x in enumerate(val_y) if x == 1]

print('normal counts in validation set: ', len(normal_indices))
print('abnormal counts in validation set: ', len(abnormal_indices))

normal counts in validation set:  507
abnormal counts in validation set:  507


In [262]:
test_X, test_y = sample_even_distribution(test_X, test_y)

In [263]:
normal_indices = [i for i, x in enumerate(test_y) if x == 0]
abnormal_indices = [i for i, x in enumerate(test_y) if x == 1]

print('normal counts in test set: ', len(normal_indices))
print('abnormal counts in test set: ', len(abnormal_indices))

normal counts in test set:  714
abnormal counts in test set:  714


In [264]:
print(type(train_X))

<class 'numpy.ndarray'>


In [265]:
print(train_X.shape)

(4890, 10, 76)


In [266]:
print(test_X.shape)

(1428, 10, 76)


In [267]:
print(train_y.shape)

(4890,)


In [268]:
seq_length = 10 # set the sequence length
n_features = 76 # number of features in the dataset

# Define model architecture
from keras.layers import Activation


# Define model architecture with ReLU activation function
model = Sequential()

model.add(LSTM(units=64, input_shape=(seq_length, n_features), return_sequences=True))
model.add(LSTM(units=32, return_sequences=False))
model.add(Dropout(0.2))

model.add(Activation('relu'))
model.add(Dense(units=1, activation='sigmoid'))


# Compile model with adam optimizer
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print(train_X.shape, train_y.shape, val_X.shape, val_y.shape)
model.summary()

(4890, 10, 76) (4890,) (1014, 10, 76) (1014,)
Model: "sequential_23"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_66 (LSTM)              (None, 10, 64)            36096     
                                                                 
 lstm_67 (LSTM)              (None, 32)                12416     
                                                                 
 dropout_23 (Dropout)        (None, 32)                0         
                                                                 
 activation_23 (Activation)  (None, 32)                0         
                                                                 
 dense_23 (Dense)            (None, 1)                 33        
                                                                 
Total params: 48,545
Trainable params: 48,545
Non-trainable params: 0
_________________________________________________________________


In [269]:
# Train model
early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
history = model.fit(train_X, train_y, epochs=100, batch_size=16, validation_data=(val_X, val_y), callbacks=[early_stop])
history


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 32: early stopping


<keras.callbacks.History at 0x7f62290a4070>

In [270]:
predictions = model.predict(test_X)
binary_predictions = (predictions > 0.5).astype(int)



In [271]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(test_y, binary_predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 84.59%
