In [23]:
# !pip install ipaddress

Collecting ipaddress
  Downloading ipaddress-1.0.23-py2.py3-none-any.whl.metadata (923 bytes)
Downloading ipaddress-1.0.23-py2.py3-none-any.whl (18 kB)
Installing collected packages: ipaddress
Successfully installed ipaddress-1.0.23


In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
import ipaddress

In [3]:
training_df = pd.read_csv("Datasets/training_data.csv", low_memory=False)

In [4]:
testing_df = pd.read_csv("Datasets/testing_data.csv", low_memory=False)

# Data Preprocessing

## Checking Dataframe columns and their type:

In [5]:
training_df.shape

(48699876, 67)

In [6]:
training_df.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,SimillarHTTP,Inbound,Label,is_attack
0,172.16.0.5,58445,192.168.50.1,4463,17,2018-12-01 13:04:45.928673,1,2,0,766.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,True,UDP-lag,True
1,172.16.0.5,36908,192.168.50.1,9914,17,2018-12-01 13:04:45.928913,1,2,0,778.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,True,UDP-lag,True
2,172.16.0.5,41727,192.168.50.1,32361,17,2018-12-01 13:04:45.928915,2,2,0,750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,True,UDP-lag,True
3,172.16.0.5,55447,192.168.50.1,5691,17,2018-12-01 13:04:45.929024,2,2,0,738.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,True,UDP-lag,True
4,172.16.0.5,58794,192.168.50.1,56335,17,2018-12-01 13:04:45.929096,1,2,0,750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,True,UDP-lag,True


In [7]:
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48699876 entries, 0 to 48699875
Data columns (total 67 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Source IP                    object 
 1   Source Port                  int64  
 2   Destination IP               object 
 3   Destination Port             int64  
 4   Protocol                     int64  
 5   Timestamp                    object 
 6   Flow Duration                int64  
 7   Total Fwd Packets            int64  
 8   Total Backward Packets       int64  
 9   Total Length of Fwd Packets  float64
 10  Total Length of Bwd Packets  float64
 11  Fwd Packet Length Max        float64
 12  Fwd Packet Length Min        float64
 13  Fwd Packet Length Mean       float64
 14  Fwd Packet Length Std        float64
 15  Bwd Packet Length Max        float64
 16  Bwd Packet Length Min        float64
 17  Bwd Packet Length Mean       float64
 18  Bwd Packet Length Std        float64
 19

## Handling categorical data:

### Source IP, Destination IP:

ideas:

- Decode the class of the IP, e.g. Public/Private, or Class A, B, C or D.
- Decode a bool feature if the IP belongs to reserved IP addresses.
- Check for which region, geography the IP belongs to:
    - Ethical consequences.
- Check if the IP belongs to the network or not:
    - already satisfied by Inbound feature.
- Using hash encoders to encode IPs into 10 features:
    - Problem: potential collisions; in addition, the testing dataset doesn't have a wide range of IPs, leading to potential overfitting.

In [8]:
(training_df["Source IP"].nunique(), training_df["Destination IP"].nunique())

(550, 598)

In [9]:
training_df["Source IP"].value_counts().head(20)

Source IP
172.16.0.5        48627645
192.168.50.1         16161
192.168.50.6         15482
192.168.50.7         14971
192.168.50.8         14542
74.208.236.171        1352
192.168.50.253         653
192.168.50.254         633
8.6.0.1                332
72.21.91.29            302
172.217.10.226         188
172.217.7.2            160
172.217.10.130         158
0.0.0.0                154
172.217.6.194          150
172.217.10.98          145
172.217.10.2           137
23.194.142.15          136
172.217.11.34          134
23.194.109.223         133
Name: count, dtype: int64

In [10]:
training_df["Destination IP"].value_counts().head(20)

Destination IP
192.168.50.1      48627754
4.2.2.4              16757
172.16.0.5           15806
8.8.8.8               4319
74.208.236.171        4260
192.168.50.8          3320
192.168.50.6          3031
192.168.50.7          2814
224.0.0.5             1272
72.21.91.29           1215
172.217.9.226          524
172.217.10.226         416
172.217.10.130         383
172.217.7.2            380
172.217.7.14           353
172.217.3.98           349
192.168.10.50          340
23.194.142.213         340
8.0.6.4                332
172.217.6.194          332
Name: count, dtype: int64

#### Finding:

The majority of the samples belong to a few specific IPs, as a result of the test bed used to generate the data. Any feature that identifies the specific source or destination IP may therefore cause the model to overfit to these addresses.

#### Plan: 
a. Create a feature to highlight whether the IP belongs to the reserved (special-purpose) IPs registered by [IANA](<https://www.iana.org/assignments/iana-ipv4-special-registry/iana-ipv4-special-registry.xhtml>)
b. Create a feature that indicates the [class of the IP](<https://www.rfc-editor.org/rfc/rfc4632.html#section-3.1>)
                                                        

In [11]:
# Creating a subset of training_df for testing:

small_df = training_df.sample(n=1000, random_state=123)

In [12]:
# Function to classify IP address based on first octet
def classify_ip(ip):
    """Return the classful-addressing label for a dotted-quad IPv4 string.

    Only the first octet is inspected. A first octet that falls outside
    every range in the table below (for example 0 or 127) yields 'Unknown'.
    """
    leading_octet = int(ip.split('.')[0])

    # (lowest first octet, highest first octet, label) - both bounds inclusive.
    octet_ranges = (
        (1, 126, 'Class A'),
        (128, 191, 'Class B'),
        (192, 223, 'Class C'),
        (224, 239, 'Class D'),
        (240, 255, 'Class E'),
    )
    for lowest, highest, label in octet_ranges:
        if lowest <= leading_octet <= highest:
            return label
    return 'Unknown'


ip_classes = ['Class A', 'Class B', 'Class C', 'Class D', 'Class E', 'Unknown']

# Label the source and destination address of every row with its IP class.
# The labels are stored as a Categorical that declares the full list of
# possible values, so later encoding sees the same categories even when
# some of them do not occur in this sample.
for ip_col, class_col in (
    ('Source IP', 'Source IP Class'),
    ('Destination IP', 'Destination IP Class'),
):
    class_labels = small_df[ip_col].apply(classify_ip)
    small_df[class_col] = pd.Categorical(class_labels, categories=ip_classes)

# Display the updated DataFrame
small_df.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,...,Active Max,Active Min,Idle Mean,Idle Std,SimillarHTTP,Inbound,Label,is_attack,Source IP Class,Destination IP Class
21871719,172.16.0.5,60505,192.168.50.1,6884,17,2018-12-01 12:56:48.772547,1,2,0,802.0,...,0.0,0.0,0.0,0.0,0,True,DrDoS_UDP,True,Class B,Class C
7954378,172.16.0.5,23559,192.168.50.1,1945,17,2018-12-01 13:46:30.389793,1,2,0,1032.0,...,0.0,0.0,0.0,0.0,0,True,TFTP,True,Class B,Class C
18930869,172.16.0.5,62478,192.168.50.1,18793,17,2018-12-01 13:56:27.406852,6000036,6,0,3096.0,...,0.0,0.0,0.0,0.0,0,True,TFTP,True,Class B,Class C
14629146,172.16.0.5,62543,192.168.50.1,43286,17,2018-12-01 13:50:40.853321,1,2,0,1032.0,...,0.0,0.0,0.0,0.0,0,True,TFTP,True,Class B,Class C
762879,172.16.0.5,41869,192.168.50.1,55510,17,2018-12-01 13:40:17.126986,6016134,6,0,3096.0,...,0.0,0.0,0.0,0.0,0,True,TFTP,True,Class B,Class C


In [13]:
small_df[['Source IP Class', 'Destination IP Class']].dtypes

Source IP Class         category
Destination IP Class    category
dtype: object

In [14]:
small_df['Destination IP Class'].value_counts() 

Destination IP Class
Class C    998
Class A      1
Class B      1
Class D      0
Class E      0
Unknown      0
Name: count, dtype: int64

In [15]:
# Missing-label counts for both generated columns: (source, destination).
( small_df['Source IP Class'].isna().sum() , small_df['Destination IP Class'].isna().sum())

(0, 0)

In [16]:
# Function to check if the IP belongs to a reserved range
def is_reserved_ip(ip):
    """Return the special-purpose category of an IP address.

    Parameters
    ----------
    ip : str
        Address accepted by ``ipaddress.ip_address`` (e.g. '192.168.50.1').

    Returns
    -------
    str
        One of 'DHCP/Zeros', 'Broadcast', 'Loopback', 'Link-local',
        'Multicast', 'Reserved', 'Private' or 'Public'.

    Raises
    ------
    ValueError
        If ``ip`` is not a valid IP address.
    """
    ip_obj = ipaddress.ip_address(ip)
    canonical = str(ip_obj)

    # The specific checks must run before ``is_private``: the ipaddress module
    # reports 0.0.0.0, 127.0.0.0/8, 169.254.0.0/16, 240.0.0.0/4 and
    # 255.255.255.255 as private, so testing ``is_private`` first would make
    # every branch below it unreachable for those addresses.
    if canonical == '0.0.0.0':
        return 'DHCP/Zeros'
    if canonical == '255.255.255.255':
        # Checked before ``is_reserved`` because it lies inside 240.0.0.0/4.
        return 'Broadcast'
    if ip_obj.is_loopback:
        return 'Loopback'
    if ip_obj.is_link_local:
        return 'Link-local'
    if ip_obj.is_multicast:
        return 'Multicast'
    if ip_obj.is_reserved:
        return 'Reserved'
    if ip_obj.is_private:
        return 'Private'
    return 'Public'

# Every label is_reserved_ip can return, in the order used for the categories:
reserved_categories = [
    'Private', 'Multicast', 'Loopback', 'Link-local',
    'Reserved', 'Broadcast', 'DHCP/Zeros', 'Public',
]

# Label the source and destination address of every row, storing the result
# as a Categorical that declares the full list of possible values.
for ip_col, reserved_col in (
    ('Source IP', 'Source IP Reserved'),
    ('Destination IP', 'Destination IP Reserved'),
):
    reserved_labels = small_df[ip_col].apply(is_reserved_ip)
    small_df[reserved_col] = pd.Categorical(reserved_labels, categories=reserved_categories)

# Display the updated DataFrame
small_df.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,...,Idle Mean,Idle Std,SimillarHTTP,Inbound,Label,is_attack,Source IP Class,Destination IP Class,Source IP Reserved,Destination IP Reserved
21871719,172.16.0.5,60505,192.168.50.1,6884,17,2018-12-01 12:56:48.772547,1,2,0,802.0,...,0.0,0.0,0,True,DrDoS_UDP,True,Class B,Class C,Private,Private
7954378,172.16.0.5,23559,192.168.50.1,1945,17,2018-12-01 13:46:30.389793,1,2,0,1032.0,...,0.0,0.0,0,True,TFTP,True,Class B,Class C,Private,Private
18930869,172.16.0.5,62478,192.168.50.1,18793,17,2018-12-01 13:56:27.406852,6000036,6,0,3096.0,...,0.0,0.0,0,True,TFTP,True,Class B,Class C,Private,Private
14629146,172.16.0.5,62543,192.168.50.1,43286,17,2018-12-01 13:50:40.853321,1,2,0,1032.0,...,0.0,0.0,0,True,TFTP,True,Class B,Class C,Private,Private
762879,172.16.0.5,41869,192.168.50.1,55510,17,2018-12-01 13:40:17.126986,6016134,6,0,3096.0,...,0.0,0.0,0,True,TFTP,True,Class B,Class C,Private,Private


In [17]:
print( small_df["Source IP Reserved"].value_counts() )
print( small_df["Destination IP Reserved"].value_counts() )

Source IP Reserved
Private       1000
Multicast        0
Loopback         0
Link-local       0
Reserved         0
Broadcast        0
DHCP/Zeros       0
Public           0
Name: count, dtype: int64
Destination IP Reserved
Private       998
Public          2
Multicast       0
Loopback        0
Link-local      0
Reserved        0
Broadcast       0
DHCP/Zeros      0
Name: count, dtype: int64


In [18]:
( small_df["Source IP Reserved"].isna().sum() , small_df["Destination IP Reserved"].isna().sum() )

(0, 0)

### Dropping Source and Destination IP addresses and encoding the generated categorical data:

In [19]:
# Initialize OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, drop=None)
to_encode_cols = [
    'Source IP Class', 
    'Destination IP Class',
    'Source IP Reserved', 
    'Destination IP Reserved'
]
# Fit and transform the categorical features
encoded_features = encoder.fit_transform(small_df[to_encode_cols])

In [20]:
# Convert the encoded features to a DataFrame
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(to_encode_cols))

In [21]:
encoded_df.head()

Unnamed: 0,Source IP Class_Class B,Source IP Class_Class C,Destination IP Class_Class A,Destination IP Class_Class B,Destination IP Class_Class C,Source IP Reserved_Private,Destination IP Reserved_Private,Destination IP Reserved_Public
0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
3,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
4,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0


In [22]:
# Concatenate the encoded DataFrame with the original DataFrame
small_df_one_hot = pd.concat([
    small_df.reset_index(drop=True), 
    encoded_df.reset_index(drop=True)
], axis=1)

In [23]:
small_df_one_hot.isnull().sum()

Source IP                          0
Source Port                        0
Destination IP                     0
Destination Port                   0
Protocol                           0
                                  ..
Destination IP Class_Class B       0
Destination IP Class_Class C       0
Source IP Reserved_Private         0
Destination IP Reserved_Private    0
Destination IP Reserved_Public     0
Length: 79, dtype: int64

In [24]:
encoder.get_feature_names_out(to_encode_cols)

array(['Source IP Class_Class B', 'Source IP Class_Class C',
       'Destination IP Class_Class A', 'Destination IP Class_Class B',
       'Destination IP Class_Class C', 'Source IP Reserved_Private',
       'Destination IP Reserved_Private',
       'Destination IP Reserved_Public'], dtype=object)

In [25]:
to_encode_cols + list(encoder.get_feature_names_out(to_encode_cols))

['Source IP Class',
 'Destination IP Class',
 'Source IP Reserved',
 'Destination IP Reserved',
 'Source IP Class_Class B',
 'Source IP Class_Class C',
 'Destination IP Class_Class A',
 'Destination IP Class_Class B',
 'Destination IP Class_Class C',
 'Source IP Reserved_Private',
 'Destination IP Reserved_Private',
 'Destination IP Reserved_Public']

In [26]:
small_df_one_hot[to_encode_cols + list(encoder.get_feature_names_out(to_encode_cols))]

Unnamed: 0,Source IP Class,Destination IP Class,Source IP Reserved,Destination IP Reserved,Source IP Class_Class B,Source IP Class_Class C,Destination IP Class_Class A,Destination IP Class_Class B,Destination IP Class_Class C,Source IP Reserved_Private,Destination IP Reserved_Private,Destination IP Reserved_Public
0,Class B,Class C,Private,Private,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
1,Class B,Class C,Private,Private,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
2,Class B,Class C,Private,Private,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
3,Class B,Class C,Private,Private,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
4,Class B,Class C,Private,Private,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,Class B,Class C,Private,Private,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
996,Class B,Class C,Private,Private,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
997,Class B,Class C,Private,Private,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
998,Class B,Class C,Private,Private,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0


In [27]:
small_df_one_hot = pd.get_dummies(small_df, columns=['Source IP Class', 'Destination IP Class', 'Source IP Reserved', 'Destination IP Reserved'], drop_first=False)
small_df_one_hot.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,...,Source IP Reserved_DHCP/Zeros,Source IP Reserved_Public,Destination IP Reserved_Private,Destination IP Reserved_Multicast,Destination IP Reserved_Loopback,Destination IP Reserved_Link-local,Destination IP Reserved_Reserved,Destination IP Reserved_Broadcast,Destination IP Reserved_DHCP/Zeros,Destination IP Reserved_Public
21871719,172.16.0.5,60505,192.168.50.1,6884,17,2018-12-01 12:56:48.772547,1,2,0,802.0,...,False,False,True,False,False,False,False,False,False,False
7954378,172.16.0.5,23559,192.168.50.1,1945,17,2018-12-01 13:46:30.389793,1,2,0,1032.0,...,False,False,True,False,False,False,False,False,False,False
18930869,172.16.0.5,62478,192.168.50.1,18793,17,2018-12-01 13:56:27.406852,6000036,6,0,3096.0,...,False,False,True,False,False,False,False,False,False,False
14629146,172.16.0.5,62543,192.168.50.1,43286,17,2018-12-01 13:50:40.853321,1,2,0,1032.0,...,False,False,True,False,False,False,False,False,False,False
762879,172.16.0.5,41869,192.168.50.1,55510,17,2018-12-01 13:40:17.126986,6016134,6,0,3096.0,...,False,False,True,False,False,False,False,False,False,False


In [30]:


def process_ip_features(df):
    """Add one-hot IP-class and reserved-range features to ``df`` in place.

    For both 'Source IP' and 'Destination IP' the address is labelled with
    its class (from the first octet) and with its special-purpose category,
    and each label column is one-hot encoded over a fixed category list, so
    every frame passed through this function receives the same 28 columns
    ('<column>_<category>', float 0.0/1.0) regardless of which categories
    actually occur in it.

    Side effects on ``df``:
      * its index is reset to a default RangeIndex;
      * the 28 encoded columns are added (or overwritten if already present);
      * any 'Source IP Class', 'Destination IP Class', 'Source IP Reserved'
        and 'Destination IP Reserved' columns are removed.
    'Source IP' and 'Destination IP' themselves are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns 'Source IP' and 'Destination IP'.

    Returns
    -------
    None
    """
    ip_classes = ['Class A', 'Class B', 'Class C', 'Class D', 'Class E', 'Unknown']
    reserved_categories = [
        'Private', 'Multicast', 'Loopback', 'Link-local',
        'Reserved', 'Broadcast', 'DHCP/Zeros', 'Public'
    ]

    # Function to classify IP address based on first octet
    def classify_ip(ip):
        first_octet = int(ip.split('.')[0])
        if 1 <= first_octet <= 126:
            return 'Class A'
        elif 128 <= first_octet <= 191:
            return 'Class B'
        elif 192 <= first_octet <= 223:
            return 'Class C'
        elif 224 <= first_octet <= 239:
            return 'Class D'
        elif 240 <= first_octet <= 255:
            return 'Class E'
        else:
            return 'Unknown'

    # Function to check if the IP belongs to a reserved range
    def is_reserved_ip(ip):
        ip_obj = ipaddress.ip_address(ip)
        canonical = str(ip_obj)
        # The specific checks run before ``is_private`` because the ipaddress
        # module reports 0.0.0.0, loopback, link-local, 240.0.0.0/4 and
        # 255.255.255.255 as private; testing ``is_private`` first would
        # label all of them 'Private'.
        if canonical == '0.0.0.0':
            return 'DHCP/Zeros'
        if canonical == '255.255.255.255':
            return 'Broadcast'
        if ip_obj.is_loopback:
            return 'Loopback'
        if ip_obj.is_link_local:
            return 'Link-local'
        if ip_obj.is_multicast:
            return 'Multicast'
        if ip_obj.is_reserved:
            return 'Reserved'
        if ip_obj.is_private:
            return 'Private'
        return 'Public'

    def label_column(column, labeller, categories):
        # Label each distinct address once and broadcast the result with a
        # dictionary lookup, instead of calling the labeller once per row.
        lookup = {ip: labeller(ip) for ip in df[column].unique()}
        return pd.Categorical(df[column].map(lookup), categories=categories)

    # Compute every label before touching ``df`` so that an invalid address
    # raises without leaving the caller's frame half-modified.
    labels = {
        'Source IP Class': label_column('Source IP', classify_ip, ip_classes),
        'Destination IP Class': label_column('Destination IP', classify_ip, ip_classes),
        'Source IP Reserved': label_column('Source IP', is_reserved_ip, reserved_categories),
        'Destination IP Reserved': label_column('Destination IP', is_reserved_ip, reserved_categories),
    }
    to_encode_cols = list(labels)

    df.reset_index(drop=True, inplace=True)

    # get_dummies on Categorical columns emits one column per declared
    # category (including unobserved ones), named '<column>_<category>'.
    encoded_df = pd.get_dummies(pd.DataFrame(labels, index=df.index), dtype=float)

    for col in encoded_df.columns:
        df[col] = encoded_df[col]

    # Remove intermediate label columns left over from earlier exploration.
    df.drop(columns=to_encode_cols, inplace=True, errors='ignore')


In [32]:
process_ip_features(small_df)
small_df.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,...,Source IP Reserved_DHCP/Zeros,Source IP Reserved_Public,Destination IP Reserved_Private,Destination IP Reserved_Multicast,Destination IP Reserved_Loopback,Destination IP Reserved_Link-local,Destination IP Reserved_Reserved,Destination IP Reserved_Broadcast,Destination IP Reserved_DHCP/Zeros,Destination IP Reserved_Public
0,172.16.0.5,60505,192.168.50.1,6884,17,2018-12-01 12:56:48.772547,1,2,0,802.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,172.16.0.5,23559,192.168.50.1,1945,17,2018-12-01 13:46:30.389793,1,2,0,1032.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,172.16.0.5,62478,192.168.50.1,18793,17,2018-12-01 13:56:27.406852,6000036,6,0,3096.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,172.16.0.5,62543,192.168.50.1,43286,17,2018-12-01 13:50:40.853321,1,2,0,1032.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,172.16.0.5,41869,192.168.50.1,55510,17,2018-12-01 13:40:17.126986,6016134,6,0,3096.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# applying this to Training and Testing Dataframes:

process_ip_features(testing_df)

In [34]:
testing_df.columns

Index(['Source IP', 'Source Port', 'Destination IP', 'Destination Port',
       'Protocol', 'Timestamp', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'SYN Flag Count',
       'RST Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count',
       'Down/Up Ratio', 'Average Packet Size'

In [35]:
testing_df.drop(columns=['Source IP', 'Destination IP'], inplace=True)

In [36]:
process_ip_features(training_df)
training_df.columns

Index(['Source IP', 'Source Port', 'Destination IP', 'Destination Port',
       'Protocol', 'Timestamp', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'SYN Flag Count',
       'RST Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count',
       'Down/Up Ratio', 'Average Packet Size'

In [37]:
# Renaming columns to remove the duplicated "Class" from the one-hot names:

source_renames = {
    "Source IP Class_Class A": "Source IP_Class A",
    "Source IP Class_Class B": "Source IP_Class B",
    "Source IP Class_Class C": "Source IP_Class C",
    "Source IP Class_Class D": "Source IP_Class D",
    "Source IP Class_Class E": "Source IP_Class E",
    "Source IP Class_Unknown": "Source IP_Class Unknown",
}
destination_renames = {
    "Destination IP Class_Class A": "Destination IP_Class A",
    "Destination IP Class_Class B": "Destination IP_Class B",
    "Destination IP Class_Class C": "Destination IP_Class C",
    "Destination IP Class_Class D": "Destination IP_Class D",
    "Destination IP Class_Class E": "Destination IP_Class E",
    "Destination IP Class_Unknown": "Destination IP_Class Unknown",
}
rename = {**source_renames, **destination_renames}

# Apply the same mapping to both frames so their column names stay aligned.
for frame in (testing_df, training_df):
    frame.rename(columns=rename, inplace=True)

In [38]:
training_df.columns

Index(['Source IP', 'Source Port', 'Destination IP', 'Destination Port',
       'Protocol', 'Timestamp', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'SYN Flag Count',
       'RST Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count',
       'Down/Up Ratio', 'Average Packet Size'

In [39]:
# Removing Source IP and Destination IP from training dataframes:

training_df.drop(columns=['Source IP', 'Destination IP'], inplace=True)

In [43]:
# Exporting a snapshot of training_df and testing_df after IP encoding:
training_df.to_csv('Datasets/Snapshots/training_df_after_ip_encoding.csv', index=False)

OSError: [Errno 28] No space left on device

In [None]:
testing_df.to_csv('Datasets/Snapshots/testing_df_after_ip_encoding.csv', index=False)

### Source and Destination Port

In [40]:
training_df['Source Port'].value_counts()[1:20]

Source Port
564      675772
62468    664768
61850    595669
648      584543
62466    478426
62467    383683
62470    351813
900      350556
580      305635
634      225504
62474    208258
62472    176300
62480    162884
62476    152784
62478    143105
62469    143073
63493    134765
62482    132231
62484    122874
Name: count, dtype: int64

In [41]:
# Checking distribution of defined ports 

training_df[training_df['Source Port'] < 100]['Source Port'].value_counts()


Source Port
80    5861
0     2534
22    1913
68     155
53     128
69      20
81       1
25       1
Name: count, dtype: int64

**Approach:** Combining both binary encoding and one-hot encoding to avoid an excessive feature list, while ensuring that the important information in the port number is preserved and properly represented for ML models. 

- Using Binary Encoding to encode the port numbers.
- In addition, creating new features that represent whether a port belongs to the ports assigned by IANA *(0-1023)* or is a random port, and whether it belongs to a common application such as SSH, HTTP, or HTTPS.

In [100]:
training_df['Source Port'].dtype

dtype('int64')

In [None]:
# Port value mapping
port_value_mapping = {
    0: "Zero",
    20: "FTP",
    21: "FTP",
    22: "SSH",
    80: "HTTP",
    53: "DNS",
    68: "DHCP",
    69: "TFTP",
    123: "NTP",
    443: "HTTPS",
    8080: "HTTP Alternate",
    8443: "HTTPS Alternate",
    1433: "MS SQL",
    3306: "MySQL",
    5432: "PostgreSQL",
    3389: "RDP",
}


# Main function to apply all port-encoding steps
def encode_ports_inplace(df):
    """
    Encode 'Source Port' and 'Destination Port' and return the result as a NEW DataFrame.

    Despite the historical name, the input frame is not modified; callers
    must use the return value (``df = encode_ports_inplace(df)``).

    For each of the two port columns the result contains:
      * 16 integer bit columns '<column> Binary_0' (most significant bit)
        through '<column> Binary_15' (least significant bit);
      * one-hot float columns '<column> Category_<name>' for every service
        name in ``port_value_mapping`` plus 'Others';
      * one-hot float columns '<column> IANA Range_<range>' for 'Well-Known'
        (0-1023), 'Registered' (1024-49151), 'Dynamic/Private' (49152-65535)
        and 'Unknown' (kept for a stable schema; never set, because
        out-of-range ports are rejected).
    The original 'Source Port' and 'Destination Port' columns are removed and
    the result carries a default RangeIndex.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain integer-valued columns 'Source Port' and 'Destination Port'.

    Returns
    -------
    pandas.DataFrame
        All other columns of ``df`` followed by the binary columns (source,
        then destination) and the one-hot columns (port categories for source
        and destination, then IANA ranges for source and destination).

    Raises
    ------
    ValueError
        If a port value is missing or lies outside 0-65535.
    """
    port_cols = ['Source Port', 'Destination Port']
    num_bits = 16
    highest_port = 2 ** num_bits - 1

    iana_bounds = [-1, 1023, 49151, highest_port]  # right-inclusive bin edges
    iana_labels = ['Well-Known', 'Registered', 'Dynamic/Private']
    iana_categories = iana_labels + ['Unknown']
    # Derived from the mapping (first-seen order) so the two cannot drift apart.
    port_categories = list(dict.fromkeys(port_value_mapping.values())) + ['Others']

    # Validate up front: the bit shifts below would otherwise silently
    # truncate anything that does not fit in 16 bits.
    ports = {}
    for col in port_cols:
        if not df[col].between(0, highest_port).all():
            raise ValueError(f"'{col}' contains values outside 0-{highest_port}")
        ports[col] = df[col].astype('int64').to_numpy()

    # Every piece below is built on this frame's index, so the final
    # column-wise concat cannot misalign rows whatever index ``df`` had.
    base = df.drop(columns=port_cols).reset_index(drop=True)

    # Binary encoding for ports: one vectorised shift-and-mask per bit.
    binary_frames = [
        pd.DataFrame(
            {
                f'{col} Binary_{i}': (values >> (num_bits - 1 - i)) & 1
                for i in range(num_bits)
            },
            index=base.index,
        )
        for col, values in ports.items()
    ]

    def port_category(values):
        """Map ports to their well-defined category or mark them as 'Others'."""
        lookup = {
            port: port_value_mapping.get(port, 'Others')
            for port in pd.unique(values).tolist()
        }
        return pd.Categorical(pd.Series(values).map(lookup), categories=port_categories)

    def port_range_category(values):
        """Classify ports into the IANA ranges listed in ``iana_labels``."""
        ranges = pd.cut(values, bins=iana_bounds, labels=iana_labels, ordered=False)
        return ranges.set_categories(iana_categories)

    labelled = pd.DataFrame(
        {
            'Source Port Category': port_category(ports['Source Port']),
            'Destination Port Category': port_category(ports['Destination Port']),
            'Source Port IANA Range': port_range_category(ports['Source Port']),
            'Destination Port IANA Range': port_range_category(ports['Destination Port']),
        },
        index=base.index,
    )

    # One-hot encoding over the predefined categories: Categorical columns
    # make get_dummies emit a column for every declared category.
    encoded_df = pd.get_dummies(labelled, dtype=float)

    return pd.concat([base, *binary_frames, encoded_df], axis=1)

In [None]:
t = encode_ports_inplace(small_df)

In [None]:
t.columns[-80:]

In [1]:
small_df.head()

NameError: name 'small_df' is not defined

In [None]:
# After confirming results, applying it to training and testing Dataset:

training_df = encode_ports_inplace(training_df)

testing_df = encode_ports_inplace(testing_df)


### Protocol:


In [None]:
training_df["Protocol"].value_counts()

In [None]:
small_df['Protocol'] = small_df['Protocol'].replace({17: 'UDP', 6: 'TCP', 0: 'HOPOPT'})


### SimilarHTTP:

### Label:

In [9]:
training_df["SimillarHTTP"].value_counts()

SimillarHTTP
0                                                                                         48695469
205.174.165.72/c.php                                                                          1906
ocsp.pki.goog/GTSGIAG3                                                                         325
ocsp.digicert.com/                                                                             291
205.174.165.72null                                                                             102
                                                                                            ...   
talebi.ca/wp-content/plugins/spin360/css/spin-style.css?ver=4.9.8                                1
img.stb.s-msn.com/usappex/tenant/amp/entityid/BBQfAQh.jpg?h=150&w=310&m=6&x=2629&y=505           1
205.174.165.72/c.php?ip=192.168.100.10                                                           1
205.174.165.72/c.php?ip=192.168.1.10                                                            