# DATA PRE-PROCESSING

## Setup dataset folder

In [20]:
import os

# Absolute path of the dataset directory, located two levels above the working directory.
DATASET_FOLDER = os.path.abspath(os.path.join("..", "..", "dataset"))
print(DATASET_FOLDER)

d:\Master_2023\Semester_3\MLATTT\Final\Git\NT221-ML\dataset


## Import packages

In [21]:
import pandas as pd
from os import listdir, path

## Load Dataset

In [None]:
# Names applied to the 48 columns of train.csv, in file order.
column_names = [
    # identifier
    'ID',
    # flow characteristics
    'flow_duration', 'Header_Length', 'Protocol type', 'Duration', 'Rate', 'Srate', 'Drate',
    # TCP flag indicators
    'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number',
    'ack_flag_number', 'ece_flag_number', 'cwr_flag_number',
    # packet counts
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    # application-layer protocols
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC',
    # lower-layer protocols
    'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
    # packet-size / timing statistics
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
    # derived features ('Magnitue' is spelled this way on purpose: it is the column name used downstream)
    'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight',
    # target
    'Label',
]

# Read the training split and overwrite whatever header the file carries
# with the names above (raises ValueError if the column count differs).
train_path = os.path.join(DATASET_FOLDER, "train.csv")
df = pd.read_csv(train_path)
df.columns = column_names

# Drop the last row of the DataFrame
# df.drop(df.tail(1).index, inplace=True)

## Explain dataset:
**Basic Identifiers:**
- *ID:* Unique identifier for a network flow.

**Flow Characteristics:**
- *flow_duration:* Total time a network flow lasted.
- *Header_Length*: Size of the packet headers in bytes.
- *Protocol type*: Type of protocol used in the flow.
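- *Duration*: Packet time-to-live (TTL). Despite the name, it is not a time span like flow_duration (values cluster around 64 and never exceed 255).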
- *Rate*: Data transfer rate during the flow
- *Srate*: Data rate from the source host.
- *Drate*: Data rate toward the destination host.

**TCP Flag Counts:**
- *fin_flag_number*: Number of FIN (finish) flags in the flow.
- *syn_flag_number*: Number of SYN (synchronize) flags in the flow.
- *rst_flag_number*: Number of RST (reset) flags in the flow.
- *psh_flag_number*: Number of PSH (push) flags in the flow.
- *ack_flag_number*: Number of ACK (acknowledge) flags in the flow.
- *ece_flag_number*: Number of ECE (Explicit Congestion Notification Echo) flags in the flow.
- *cwr_flag_number*: Number of CWR (Congestion Window Reduced) flags in the flow.

**Packet Counts:**
- *ack_count*: Total number of acknowledgment packets.
- *syn_count*: Total number of synchronization packets.
- *fin_count*: Total number of finish packets.
- *urg_count*: Total number of urgent packets.
- *rst_count*: Total number of reset packets.

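**Protocol-Specific Indicators:**
- *HTTP, HTTPS, DNS, Telnet, SMTP, SSH, IRC*: Binary indicators (0/1) of whether the flow uses the given application-layer protocol.
- *TCP, UDP, DHCP, ARP, ICMP, IPv, LLC*: Binary indicators (0/1) of whether the flow uses the given transport-, network-, or link-layer protocol.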
  
**Statistical Features:**
- *Tot sum*: Total sum of packet sizes in the flow.
- *Min*: Minimum packet size in the flow.
- *Max*: Maximum packet size in the flow.
- *AVG*: Average packet size in the flow.
- *Std*: Standard deviation of packet sizes.
- *Tot size*: Total size of all packets in the flow.
- *IAT*: Time interval between consecutive packets.
- *Number*: Total number of packets in the flow.

**Advanced Features:**
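- *Magnitue*: Square root of (mean incoming packet length + mean outgoing packet length) in the flow. The misspelling is the dataset's own column name.
- *Radius*: Square root of (variance of incoming packet lengths + variance of outgoing packet lengths) in the flow.
- *Covariance*: Covariance between the lengths of incoming and outgoing packets.
- *Variance*: Ratio of the variance of incoming packet lengths to the variance of outgoing packet lengths.
- *Weight*: Number of incoming packets multiplied by the number of outgoing packets.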

**Target Variable:**
- *Label*: Traffic class: `BenignTraffic` or one of 33 attack types (34 classes in total).

In [None]:
# Impute missing values in the protocol indicator columns.
protocols = ['TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC']

# TCP: a missing value becomes 1 when at least one TCP-based service flag on the row is set.
tcp_services = ['HTTP', 'HTTPS', 'Telnet', 'SMTP', 'SSH', 'IRC']
tcp_inferable = df['TCP'].isnull() & df[tcp_services].sum(axis=1).gt(0)
df.loc[tcp_inferable, 'TCP'] = 1

# UDP: a missing value becomes 1 when the DNS flag on the row is set.
udp_inferable = df['UDP'].isnull() & df['DNS'].gt(0)
df.loc[udp_inferable, 'UDP'] = 1

# Remaining protocols: a missing value is treated as 0.
# Note: TCP/UDP gaps that could not be inferred above are left as NaN by this cell.
for protocol in protocols[2:]:
    df[protocol] = df[protocol].fillna(0)



In [57]:
# Number of missing values in each column.
print(df.isnull().sum())

# Dtypes and non-null counts. DataFrame.info() writes its report to stdout
# itself and returns None, so wrapping it in print() only adds a stray "None".
df.info()

ID                      0
flow_duration           0
Header_Length           0
Protocol type           0
Duration                0
Rate                    0
Srate                   0
Drate                   0
fin_flag_number         0
syn_flag_number         0
rst_flag_number         0
psh_flag_number         0
ack_flag_number         0
ece_flag_number         0
cwr_flag_number         0
ack_count               0
syn_count               0
fin_count               0
urg_count               0
rst_count               0
HTTP                94109
HTTPS               95192
DNS                     0
Telnet              93034
SMTP                93204
SSH                 93690
IRC                     0
TCP                161047
UDP                     0
DHCP                    0
ARP                     0
ICMP                    0
IPv                     0
LLC                     0
Tot sum            195013
Min                195013
Max                195013
AVG                     0
Std         

In [38]:
# Flow characteristics: repair implausible values first, then impute missing ones.

# Non-positive flow durations are replaced by the column median.
flow_duration_median = df['flow_duration'].median()
df.loc[df['flow_duration'] <= 0, 'flow_duration'] = flow_duration_median

# Header lengths below 20 are replaced by the column median
# (20 is presumably the minimum IP/TCP header size in bytes; confirm).
header_length_median = df['Header_Length'].median()
df.loc[df['Header_Length'] < 20, 'Header_Length'] = header_length_median

# Replacement value per column: the mode for the protocol number, the mean elsewhere.
# The Header_Length mean is taken after the repair above.
flow_fill_values = {
    'Header_Length': df['Header_Length'].mean(),
    'Protocol type': df['Protocol type'].mode()[0],
    'Duration': df['Duration'].mean(),
    'Rate': df['Rate'].mean(),
    'Srate': df['Srate'].mean(),
    'Drate': df['Drate'].mean(),
}
df.fillna(flow_fill_values, inplace=True)

In [40]:
# TCP flag columns with gaps: a missing flag is treated as "not set" (0).
flag_columns_with_gaps = ['rst_flag_number', 'psh_flag_number', 'ece_flag_number', 'cwr_flag_number']
df.fillna(dict.fromkeys(flag_columns_with_gaps, 0), inplace=True)

In [41]:
# Packet-count columns with gaps: a missing count is treated as 0.
count_columns_with_gaps = ['ack_count', 'syn_count']
df.fillna(dict.fromkeys(count_columns_with_gaps, 0), inplace=True)

In [36]:
# Value ranges: count, mean, std, quartiles and extremes of every numeric column.
numeric_summary = df.describe()
print(numeric_summary)

                 ID  flow_duration  Header_Length  Protocol type  \
count  1.950138e+06   1.950138e+06   1.950138e+06   1.755125e+06   
mean   1.393035e+06   5.997419e+01   2.562040e+05   1.147029e+01   
std    8.040343e+05   9.899781e+02   8.193064e+05   1.157109e+01   
min    0.000000e+00   9.536743e-09   2.000000e+01   0.000000e+00   
25%    6.966822e+05   7.493590e-02   1.066000e+02   6.000000e+00   
50%    1.393132e+06   7.493604e-02   7.450800e+02   6.000000e+00   
75%    2.089356e+06   4.271036e+00   1.758116e+05   1.645000e+01   
max    2.785911e+06   9.968566e+04   9.905298e+06   4.700000e+01   

           Duration          Rate         Srate         Drate  \
count  1.755125e+06  1.755125e+06  1.755125e+06  1.755125e+06   
mean   7.428192e+01  6.785824e+03  6.750602e+03  8.223632e-06   
std    2.871633e+01  8.137564e+04  8.048460e+04  8.870114e-03   
min    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%    6.400000e+01  3.076981e+00  3.073111e+00  0.000000e+00 

In [23]:
# (rows, columns) of the frame, followed by its first five rows.
print(df.shape, df.head(), sep='\n')

(1950138, 48)
        ID  flow_duration  Header_Length  Protocol type  Duration       Rate  \
0   769866       0.000000          54.00           6.00     64.00   2.890680   
1  1859874       0.000892          54.58            NaN     64.00  28.594443   
2   396092       0.000000           0.00           1.00     64.00  23.716061   
3   179708     109.146727        1992.80           8.20     50.10  41.102103   
4  1527289       0.025221          58.53           5.95     65.91  25.300629   

       Srate  Drate  fin_flag_number  syn_flag_number  ...         Std  \
0   2.890680    0.0              0.0              0.0  ...         NaN   
1  28.594443    0.0              0.0              1.0  ...    0.000000   
2  23.716061    0.0              0.0              0.0  ...         NaN   
3  41.102103    0.0              0.0              0.0  ...  305.527954   
4  25.300629    NaN              1.0              0.0  ...    1.158718   

   Tot size           IAT  Number   Magnitue      Radius    

In [24]:
# Number of samples per traffic class (benign plus each attack type), most frequent first.
label_counts = df['Label'].value_counts()
label_counts

Label
MITM-ArpSpoofing           84260
DDoS-TCP_Flood             84208
Recon-HostDiscovery        84139
DoS-UDP_Flood              84139
DDoS-ICMP_Fragmentation    84119
DoS-TCP_Flood              84086
BenignTraffic              84085
DDoS-SYN_Flood             84051
Mirai-greip_flood          84050
DDoS-PSHACK_Flood          84047
DDoS-SynonymousIP_Flood    84021
Mirai-greeth_flood         84018
DDoS-ACK_Fragmentation     84015
DoS-SYN_Flood              83971
DDoS-RSTFINFlood           83949
DDoS-UDP_Flood             83905
DDoS-UDP_Fragmentation     83895
DNS_Spoofing               83746
Mirai-udpplain             83653
DDoS-ICMP_Flood            83587
Recon-OSScan               69603
Recon-PortScan             58311
DoS-HTTP_Flood             50629
VulnerabilityScan          26378
DDoS-HTTP_Flood            20338
DDoS-SlowLoris             16573
DictionaryBruteForce        9213
BrowserHijacking            4134
CommandInjection            3821
SqlInjection                3748
XSS 

## Drop unnecessary columns

In [10]:
# 'ID' is a per-row identifier and carries no signal about the traffic class, so remove it.
df = df.drop('ID', axis=1)

## Inspect Dataset 
* Check the first few rows.
* Look for null values and data types.

In [11]:
# Preview the first five rows.
first_rows = df.head(5)
print(first_rows)

   flow_duration  Header_Length  Protocol type  Duration       Rate  \
0       0.000000          54.00           6.00     64.00   2.890680   
1       0.000892          54.58            NaN     64.00  28.594443   
2       0.000000           0.00           1.00     64.00  23.716061   
3     109.146727        1992.80           8.20     50.10  41.102103   
4       0.025221          58.53           5.95     65.91  25.300629   

       Srate  Drate  fin_flag_number  syn_flag_number  rst_flag_number  ...  \
0   2.890680    0.0              0.0              0.0              0.0  ...   
1  28.594443    0.0              0.0              1.0              0.0  ...   
2  23.716061    0.0              0.0              0.0              0.0  ...   
3  41.102103    0.0              0.0              0.0              0.0  ...   
4  25.300629    NaN              1.0              0.0              1.0  ...   

          Std  Tot size           IAT  Number   Magnitue      Radius  \
0         NaN     54.00  8

In [12]:
# Column dtypes and non-null counts. DataFrame.info() prints the report itself
# and returns None, so it must not be wrapped in print() (that appends "None").
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1950138 entries, 0 to 1950137
Data columns (total 47 columns):
 #   Column           Dtype  
---  ------           -----  
 0   flow_duration    float64
 1   Header_Length    float64
 2   Protocol type    float64
 3   Duration         float64
 4   Rate             float64
 5   Srate            float64
 6   Drate            float64
 7   fin_flag_number  float64
 8   syn_flag_number  float64
 9   rst_flag_number  float64
 10  psh_flag_number  float64
 11  ack_flag_number  float64
 12  ece_flag_number  float64
 13  cwr_flag_number  float64
 14  ack_count        float64
 15  syn_count        float64
 16  fin_count        float64
 17  urg_count        float64
 18  rst_count        float64
 19  HTTP             float64
 20  HTTPS            float64
 21  DNS              float64
 22  Telnet           float64
 23  SMTP             float64
 24  SSH              float64
 25  IRC              float64
 26  TCP              float64
 27  UDP         

In [13]:
# Descriptive statistics of the numeric columns.
describe_table = df.describe()
print(describe_table)

       flow_duration  Header_Length  Protocol type      Duration  \
count   1.950138e+06   1.755125e+06   1.755125e+06  1.755125e+06   
mean    5.994925e+01   2.561773e+05   1.147029e+01  7.428192e+01   
std     9.899796e+02   8.636325e+05   1.157109e+01  2.871633e+01   
min     0.000000e+00   0.000000e+00   0.000000e+00  0.000000e+00   
25%     0.000000e+00   5.400000e+01   6.000000e+00  6.400000e+01   
50%     7.493590e-02   1.790400e+02   6.000000e+00  6.400000e+01   
75%     4.271036e+00   3.354800e+04   1.645000e+01  6.670000e+01   
max     9.968566e+04   9.905298e+06   4.700000e+01  2.550000e+02   

               Rate         Srate         Drate  fin_flag_number  \
count  1.755125e+06  1.755125e+06  1.755125e+06     1.950138e+06   
mean   6.785824e+03  6.750602e+03  8.223632e-06     4.325284e-02   
std    8.137564e+04  8.048460e+04  8.870114e-03     2.034258e-01   
min    0.000000e+00  0.000000e+00  0.000000e+00     0.000000e+00   
25%    3.076981e+00  3.073111e+00  0.000000e+00

## Handle Missing Values
* Drop columns or rows with excessive missing data.
* Impute missing values with mean/median for numeric data or mode for categorical data.

In [14]:
# Row count, then the number of missing values in each column.
n_rows = len(df)
missing_per_column = df.isna().sum()
print("Total: ", n_rows)
print(missing_per_column)

Total:  1950138
flow_duration           0
Header_Length      195013
Protocol type      195013
Duration           195013
Rate               195013
Srate              195013
Drate              195013
fin_flag_number         0
syn_flag_number         0
rst_flag_number    195013
psh_flag_number    195013
ack_flag_number         0
ece_flag_number    195013
cwr_flag_number    195013
ack_count          195013
syn_count          195013
fin_count               0
urg_count               0
rst_count               0
HTTP               195013
HTTPS              195013
DNS                     0
Telnet             195013
SMTP               195013
SSH                195013
IRC                     0
TCP                195013
UDP                     0
DHCP                    0
ARP                195013
ICMP               195013
IPv                     0
LLC                     0
Tot sum            195013
Min                195013
Max                195013
AVG                     0
Std                195

In [58]:
# Columns whose remaining NaN values are replaced with 0.
columns_to_fill = [
    # flow characteristics
    'Header_Length', 'Protocol type', 'Duration', 'Rate', 'Srate', 'Drate',
    # TCP flags and packet counts
    'rst_flag_number', 'psh_flag_number', 'ece_flag_number', 'cwr_flag_number',
    'ack_count', 'syn_count',
    # protocol indicators
    'HTTP', 'HTTPS', 'SMTP', 'SSH', 'TCP', 'ARP', 'ICMP',
    # packet-size / timing statistics
    'Tot sum', 'Min', "Max", 'Std', 'Tot size', 'IAT',
    # derived features
    'Magnitue', 'Radius', 'Covariance', 'Variance',
]

# Zero-fill the selection and write it back; indexing with the list raises
# KeyError if any of these columns is absent from the frame.
zero_filled = df[columns_to_fill].fillna(0)
df[columns_to_fill] = zero_filled

## Check Unique Values Per Column
* `Telnet` has a single unique value (no record is Telnet traffic), so the column carries no information -> drop it.

In [59]:
# For every column, show "<distinct values>/<total rows>" to spot constant
# columns (1/N) and identifier-like columns (N/N).
total_counts = len(df)
unique_counts = df.nunique()

denominator = '/' + str(total_counts)
ratios = unique_counts.astype(str) + denominator

print(ratios)

ID                 1950138/1950138
flow_duration      1291511/1950138
Header_Length       656544/1950138
Protocol type         4668/1950138
Duration             12354/1950138
Rate               1574315/1950138
Srate              1574082/1950138
Drate                   63/1950138
fin_flag_number          2/1950138
syn_flag_number          2/1950138
rst_flag_number          2/1950138
psh_flag_number          2/1950138
ack_flag_number          2/1950138
ece_flag_number          2/1950138
cwr_flag_number          2/1950138
ack_count              587/1950138
syn_count             1356/1950138
fin_count              993/1950138
urg_count            32231/1950138
rst_count            64085/1950138
HTTP                     2/1950138
HTTPS                    2/1950138
DNS                      2/1950138
Telnet                   1/1950138
SMTP                     2/1950138
SSH                      2/1950138
IRC                      2/1950138
TCP                      2/1950138
UDP                 

In [60]:
# 'Telnet' holds a single value across all rows (no Telnet records), so remove it.
df = df.drop('Telnet', axis=1)

In [61]:
from sklearn.preprocessing import LabelEncoder

# Map each class name in 'Label' to an integer code and keep the codes
# alongside the original strings in a new 'Encoded_Label' column.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Class name -> integer code lookup, printed for reference when reading reports.
class_names = label_encoder.classes_
label_mapping = {
    name: code
    for name, code in zip(class_names, label_encoder.transform(class_names))
}
print("Label mapping:", label_mapping)

Label mapping: {'Backdoor_Malware': 0, 'BenignTraffic': 1, 'BrowserHijacking': 2, 'CommandInjection': 3, 'DDoS-ACK_Fragmentation': 4, 'DDoS-HTTP_Flood': 5, 'DDoS-ICMP_Flood': 6, 'DDoS-ICMP_Fragmentation': 7, 'DDoS-PSHACK_Flood': 8, 'DDoS-RSTFINFlood': 9, 'DDoS-SYN_Flood': 10, 'DDoS-SlowLoris': 11, 'DDoS-SynonymousIP_Flood': 12, 'DDoS-TCP_Flood': 13, 'DDoS-UDP_Flood': 14, 'DDoS-UDP_Fragmentation': 15, 'DNS_Spoofing': 16, 'DictionaryBruteForce': 17, 'DoS-HTTP_Flood': 18, 'DoS-SYN_Flood': 19, 'DoS-TCP_Flood': 20, 'DoS-UDP_Flood': 21, 'MITM-ArpSpoofing': 22, 'Mirai-greeth_flood': 23, 'Mirai-greip_flood': 24, 'Mirai-udpplain': 25, 'Recon-HostDiscovery': 26, 'Recon-OSScan': 27, 'Recon-PingSweep': 28, 'Recon-PortScan': 29, 'SqlInjection': 30, 'Uploading_Attack': 31, 'VulnerabilityScan': 32, 'XSS': 33}


# Feature Selection

In [None]:
# Feature matrix: every column except the target and anything derived from it.
# - 'Encoded_Label' is the integer-encoded copy of 'Label' created by the
#   label-encoding cell; leaving it in X leaks the target into the features
#   and makes any importance ranking meaningless.
# - 'ID' is a unique row identifier, not a traffic feature.
# errors='ignore' keeps the cell runnable whether or not those columns are
# present (e.g. 'ID' may already have been dropped by an earlier cell).
X = df.drop(columns=['Label', 'Encoded_Label', 'ID'], errors='ignore')  # Features
y = df['Label']  # Target

In [None]:
# Rank features by the impurity-based importance of a random forest.
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Fit a forest with default hyper-parameters on the full feature matrix.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# One row per feature, most important first.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_,
})
feature_importances = importance_table.sort_values(by='Importance', ascending=False)

# Names of the 30 highest-ranked features.
top_features = feature_importances['Feature'].head(30).tolist()
print("Top 30 Features:", top_features)


# Model: XGBoost

In [62]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import time
import numpy as np
#import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

In [63]:
# Initialize the XGBoost classifier.
# - random_state fixes the seed so runs are repeatable.
# - eval_metric='mlogloss' selects multi-class log loss as the evaluation metric.
# The former `use_label_encoder=False` argument is gone: the installed XGBoost
# does not recognise it and only emits
# 'Parameters: { "use_label_encoder" } are not used.' during fit.
xgb = XGBClassifier(random_state=10, eval_metric='mlogloss')

In [64]:
# Model inputs: every feature column except the identifier, the dropped
# 'Telnet' column and the label columns.
feature_columns = [
    'flow_duration', 'Header_Length', 'Protocol type', 'Duration', 'Rate', 'Srate', 'Drate',
    'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number',
    'ack_flag_number', 'ece_flag_number', 'cwr_flag_number',
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'SMTP', 'SSH', 'IRC',
    'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
    'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight',
]
X = df[feature_columns]
Y = df['Label']

# XGBoost needs integer class ids, so encode the string labels with a fresh encoder.
label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(Y)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y_encoded, test_size=0.2, random_state=10
)


#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=10, test_size=0.2)

In [None]:
# Time the whole train + predict round trip with wall-clock timestamps.
start = time.time()
print('program start...\n')

# Fit on the training split.
xgb.fit(X_train, Y_train)
print()

# Predict integer class ids for the held-out split.
print('prediction:')
Y_pred = xgb.predict(X_test)
print(Y_pred, end='\n\n')

# print('Score:')
# score = xgb.score(X_test,Y_test)
# print(score)

end = time.time()
print('program end...\n')
print('time cost: ')
elapsed_seconds = end - start
print(elapsed_seconds, 'seconds')


program start...



Parameters: { "use_label_encoder" } are not used.



In [None]:
# Score the held-out predictions: overall accuracy, per-class
# precision/recall/F1, and the confusion matrix (rows = true class id).
accuracy = accuracy_score(Y_test, Y_pred)
report_text = classification_report(Y_test, Y_pred)
conf_matrix = confusion_matrix(Y_test, Y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9365891679571723
Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.19      0.26       464
           1       0.77      0.88      0.82     16898
           2       0.85      0.52      0.64       859
           3       0.55      0.33      0.41       797
           4       1.00      1.00      1.00     16846
           5       0.98      0.98      0.98      4131
           6       1.00      1.00      1.00     16783
           7       1.00      1.00      1.00     16790
           8       1.00      1.00      1.00     16620
           9       1.00      1.00      1.00     16725
          10       0.94      0.99      0.96     16873
          11       0.98      0.99      0.98      3314
          12       0.99      0.97      0.98     16997
          13       0.96      0.99      0.98     16854
          14       0.98      0.98      0.98     16795
          15       1.00      1.00      1.00     16711
          16       0.76     