In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Read UDP/UDP_level3.csv (path relative to the notebook's working directory) into a DataFrame.
df_udp = pd.read_csv("UDP/UDP_level3.csv")

In [3]:
# Show the column names. Note that many of them start with a space
# (e.g. ' Protocol'); later cells index columns using those exact strings.
df_udp.columns

Index([' Source Port', ' Destination Port', ' Protocol', ' Flow Duration',
       ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', ' Flow IAT Mean',
       ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total',
       ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min',
       'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max',
       ' Bwd IAT Min', 'Fwd PSH Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', ' SYN Flag Count',
       ' RST Flag Count', ' ACK Flag Count', ' UR

In [4]:
# Distinct protocol numbers in this file; the recorded output lists 17, 6 and 0.
df_udp[' Protocol'].unique()

array([17,  6,  0])

In [5]:
# Read Syn/Syn_level3.csv (path relative to the notebook's working directory) into a DataFrame.
df_syn = pd.read_csv("Syn/Syn_level3.csv")

In [6]:
# Distinct protocol numbers in this file; the recorded output lists 6, 17 and 0.
df_syn[' Protocol'].unique()

array([ 6, 17,  0])

In [7]:
# Read UDPLag/UDPLag_level3.csv (path relative to the notebook's working directory) into a DataFrame.
df_udplag = pd.read_csv("UDPLag/UDPLag_level3.csv")

In [8]:
# Distinct protocol numbers in this file; the recorded output lists 6, 17 and 0.
df_udplag[' Protocol'].unique()

array([ 6, 17,  0])

In [9]:
# Stack the three per-file frames row-wise into one dataset.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# source frame keeps its own row labels, so the combined index contains
# repeated labels and a label-based lookup would match several rows.
df_final = pd.concat([df_udp, df_syn, df_udplag], ignore_index=True)

In [10]:
# (rows, columns) of the combined frame; the recorded run shows (180000, 68).
df_final.shape

(180000, 68)

In [65]:
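# Optional export of the combined frame, left disabled. The execution count
# (65) shows this cell was last run out of order relative to its neighbours.
# df_final.to_csv('df_level3R.csv')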

In [11]:
# Number of rows per ' Label' value, as a two-column frame (label, "count").
df_final_counts = (
    df_final
    .groupby(' Label')[' Label']
    .count()
    .reset_index(name="count")
)

In [12]:
# Display the per-label counts; the recorded run shows Syn with twice as many rows as each other label.
df_final_counts

Unnamed: 0,Label,count
0,Syn,90000
1,UDP,45000
2,UDP-lag,45000


In [13]:
# Distinct label names, in order of first appearance in df_final.
# The position of a name in this list is later used as its integer code.
data_classes = [label for label in df_final[' Label'].unique()]

In [14]:
# Display the label names; list position is the integer code assigned in the next cell.
data_classes

['UDP', 'Syn', 'UDP-lag']

In [15]:
# Replace each label name with its position in data_classes (0, 1, 2, ...).
# A dict lookup with a pass-through default keeps this cell safe to re-run:
# calling data_classes.index on every value raised ValueError the second time,
# because the column then already holds integer codes instead of names.
label_codes = {name: code for code, name in enumerate(data_classes)}
df_final[' Label'] = df_final[' Label'].apply(lambda label: label_codes.get(label, label))

In [16]:
# Check the encoding: the column should now contain only the integer codes.
df_final[' Label'].unique()

array([0, 1, 2])

In [17]:
# Feature frame: every column of df_final except the target column.
X = df_final.drop(columns=[' Label'])

In [18]:
# Target codes as a NumPy array.
# Take an explicit copy: .values can return a view onto the DataFrame's own
# storage, and the following cell overwrites y element by element, which
# would otherwise silently rewrite df_final[' Label'] as well.
y = df_final[' Label'].values.copy()

In [19]:
# Collapse the three label codes into a binary target:
# 0 for the 'Syn' class, 1 for every other class.
# The target is rebuilt from df_final in one vectorised step rather than by
# overwriting y in place. An in-place rewrite flips the labels back if the
# cell is run twice and can write through to df_final when y is a view.
# The Syn code is looked up by name instead of hard-coding 1, so it does not
# depend on the order in which the source files were concatenated.
syn_code = data_classes.index('Syn')
y = np.where(df_final[' Label'].values == syn_code, 0, 1)

In [20]:
# Tally the binary target and print "<number of 0s> <number of non-0s>".
count0 = sum(1 for label in y if label == 0)
count1 = len(y) - count0

print(count0, count1)

90000 90000


In [21]:
from sklearn.feature_selection import SelectKBest, f_classif

# Univariate feature selector scored with f_classif (ANOVA F-value);
# the number of features to keep is left at the library default.
anova_features = SelectKBest(score_func=f_classif)

In [22]:
# Fit the selector on the full feature frame and the binary target, keeping
# the reduced matrix. The fitted scores_ are what the next cell inspects.
# NOTE(review): this is fitted on all rows, before the train/test split that
# happens further down, so the F-scores are computed with the eventual test
# rows included. Also, k_best_features_anova is not referenced again in the
# visible cells.
k_best_features_anova = anova_features.fit_transform(X, y)

In [23]:
# Print every feature whose F-score exceeds 50000, as "<column> : <score>".
for column, f_score in zip(X.columns, anova_features.scores_):
    if f_score > 50000:
        print(column, ":", f_score)

 Protocol : 116052.36318260546
 Fwd Packet Length Max : 90042.46102983289
 Fwd Packet Length Min : 85391.95664558111
 Fwd Packet Length Mean : 87899.27468915284
 Min Packet Length : 85911.5858123487
 Max Packet Length : 89750.01545182003
 Packet Length Mean : 87820.36865260963
 ACK Flag Count : 115585.85166546522
 Average Packet Size : 80522.64043882322
 Avg Fwd Segment Size : 87899.27468915284
Init_Win_bytes_forward : 110790.80725739933


In [24]:
# Keep only the columns whose F-score was printed above (score > 50000),
# in a fixed order, and convert them to a plain NumPy matrix for scikit-learn.
# NOTE(review): ' Fwd Packet Length Mean' and ' Avg Fwd Segment Size' printed
# identical F-scores, so they may be duplicate columns — confirm.
selected_columns = [
    ' Protocol',
    ' Fwd Packet Length Mean',
    ' Fwd Packet Length Max',
    ' Fwd Packet Length Min',
    ' Min Packet Length',
    ' Max Packet Length',
    ' Packet Length Mean',
    ' ACK Flag Count',
    ' Average Packet Size',
    ' Avg Fwd Segment Size',
    'Init_Win_bytes_forward',
]
X = df_final[selected_columns].values

In [25]:
# Sanity check: one row per sample and one column per selected feature (recorded run: 180000 x 11).
X.shape

(180000, 11)

In [26]:
# Only StandardScaler is used in the visible cells; the other scalers are
# imported but not referenced.
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
scaler = StandardScaler()

In [27]:
from sklearn.model_selection import train_test_split

# Hold out a shuffled 25% of the rows for evaluation; the fixed seed keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=21,
)

In [28]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so the test rows do not influence it.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 7 closest training rows,
# fitted on the scaled training split. The trailing fit(...) expression is
# what produces the estimator repr shown as this cell's output.
classifier_knn = KNeighborsClassifier(n_neighbors = 7)
classifier_knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [30]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the held-out rows with the fitted k-NN model and print the
# per-class precision / recall / F1 table.
pred_knn = classifier_knn.predict(X_test)
knn_report = classification_report(y_test, pred_knn)
print(knn_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22560
           1       1.00      1.00      1.00     22440

    accuracy                           1.00     45000
   macro avg       1.00      1.00      1.00     45000
weighted avg       1.00      1.00      1.00     45000



In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Decision tree using the entropy criterion and randomised split selection.
# splitter='random' draws its candidate splits at random, so without a fixed
# random_state every run grows a different tree and the reported metrics are
# not reproducible. The seed reuses the value passed to train_test_split so
# the whole evaluation is repeatable.
classifier_tree = DecisionTreeClassifier(criterion='entropy', splitter='random', random_state=21)
classifier_tree.fit(X_train, y_train)

# Predict the held-out rows and print the per-class metrics table.
pred_tree = classifier_tree.predict(X_test)

print(classification_report(y_test, pred_tree))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22560
           1       1.00      1.00      1.00     22440

    accuracy                           1.00     45000
   macro avg       1.00      1.00      1.00     45000
weighted avg       1.00      1.00      1.00     45000



In [32]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the k-NN and decision-tree estimators created in
# the earlier cells, fitted on the scaled training split. The trailing
# fit(...) expression produces the estimator repr shown as the cell output.
classifier_ensemble = VotingClassifier(
    estimators=[('knn', classifier_knn), ('dt', classifier_tree)],
    voting='soft',
)
classifier_ensemble.fit(X_train, y_train)

VotingClassifier(estimators=[('knn',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,
                                                   metric='minkowski',
                                                   metric_params=None,
                                                   n_jobs=None, n_neighbors=7,
                                                   p=2, weights='uniform')),
                             ('dt',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='entropy',
                                                     max_depth=None,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     m

In [None]:
# Predict the held-out rows with the voting ensemble.
# This cell has no execution count in the saved notebook (In [None]).
pred_ensemble = classifier_ensemble.predict(X_test)

In [None]:
# Print the per-class metrics for the ensemble predictions.
# classification_report is imported in the earlier k-NN / decision-tree cells.
print(classification_report(y_test, pred_ensemble))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the decision-tree predictions on the held-out rows.
cm = confusion_matrix(y_true=y_test, y_pred=pred_tree)

In [None]:
# Draw the confusion matrix with the raw count printed in each cell.
# fmt='d' prints the counts as integers; seaborn's default annotation format
# ('.2g') would show counts in the tens of thousands in scientific notation.
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the k-NN predictions on the held-out rows, drawn with
# the raw count printed in each cell.
# fmt='d' prints the counts as integers; seaborn's default annotation format
# ('.2g') would show counts in the tens of thousands in scientific notation.
cm = confusion_matrix(y_test, pred_knn)
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the ensemble predictions on the held-out rows, drawn
# with the raw count printed in each cell.
# fmt='d' prints the counts as integers; seaborn's default annotation format
# ('.2g') would show counts in the tens of thousands in scientific notation.
cm = confusion_matrix(y_test, pred_ensemble)
sns.heatmap(cm, annot=True, fmt='d')