# Feature Analysis for GDLC

This notebook focuses on analyzing various features extracted from network traffic data to identify potential intrusions. Our goal is to understand the significance of each feature in the context of intrusion detection and to select the most relevant features for building a predictive model.

## Data Loading

In [258]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import json
import seaborn as sns
import os
from src.data.dataset_info import datasets
from sklearn.feature_selection import VarianceThreshold

# Filtering thresholds consumed by the variance and correlation steps further down
var_threshold = 0.00
corr_threshold = 0.75

# Choose the dataset definition and the client file to analyse
dataset = datasets[0]
name = dataset.name
name_pca = "client_0_pca"
print("dataset: {}".format(name_pca))
path = "./datasets_pca_federated_new/{}.parquet".format(name_pca)
# graph_path = "./datasets/preprocessed/graph_{}.gexf".format(name)

# Read the parquet file and keep only the features used in training:
# the dataset's drop columns and its label column are removed.
df = (
    pd.read_parquet(path)
    .drop(columns=dataset.drop_columns)
    .drop(columns=dataset.label_col)
)

dataset: client_0_pca


In [259]:
# Zero-row slice: displays the column headers only, without any data rows
df.iloc[:0]

Unnamed: 0,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,...,src_eigenvector,dst_eigenvector,src_closeness,dst_closeness,src_k_core,dst_k_core,src_k_truss,dst_k_truss,pca_1,pca_2


## Variance Thresholding

In [260]:
# Fit a variance filter on the training features. Only fitting is required:
# the surviving columns are picked from `df` by name below, so the transformed
# array that `fit_transform` would build is never needed.
selector = VarianceThreshold(threshold=var_threshold)
selector.fit(df)

# Boolean mask aligned with df.columns: True for features the selector keeps
support_mask = selector.get_support()

# Names of the kept / removed features, both in original column order
selected_features = df.columns[support_mask]
dropped_features = list(df.columns[~support_mask])

# New DataFrame restricted to the selected features
data_filtered = pd.DataFrame(df, columns=selected_features)

print(f"==>> dropped_features by VarianceThreshold: {dropped_features}")

==>> dropped_features by VarianceThreshold: ['Bwd PSH Flags', 'Bwd URG Flags', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg']


## Correlation Thresholding

In [261]:
# Pairwise Pearson correlation between the columns of the variance-filtered frame
corr_matrix = data_filtered.corr(method="pearson")

# Optional heatmap of the matrix (left disabled):
# plt.figure(figsize=(25, 20))
# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm',
#             fmt='.2f', linewidths=.5)  # type: ignore
# plt.title("Correlation Matrix Heatmap in dataset {}".format(dataset.name))
# plt.savefig(fname="visualization/{}/correlation_matrix".format(dataset.name))

In [262]:
# Keep only the strict upper triangle of the correlation matrix so that every
# feature pair (row, column) is examined exactly once and the diagonal is ignored.
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# True where a pair's correlation exceeds the threshold (masked cells are NaN -> False).
# NOTE(review): only positive correlations are compared; strongly negative
# pairs are not flagged — confirm this is intended.
above_threshold = upper > corr_threshold

# Columns that are highly correlated with at least one earlier column
correlated_features = [
    column for column in upper.columns if above_threshold[column].any()]

# For every flagged pair the row feature (the earlier column) is dropped and the
# column feature is kept. The names are collected in column order so that the
# result is reproducible: iterating a set of strings can change order between
# kernel sessions.
removed_in_order = list(upper.index[above_threshold.any(axis=1).to_numpy()])
features_to_remove = set(removed_in_order)  # kept as a set for downstream cells

# Drop the highly correlated features
data_filtered = data_filtered.drop(columns=removed_in_order)

print(f"==>> dropped_features by correlation: {removed_in_order}")

==>> dropped_features by correlation: {'Pkt Size Avg', 'Flow IAT Mean', 'Fwd IAT Tot', 'src_k_truss', 'Fwd Pkt Len Max', 'src_degree', 'Fwd URG Flags', 'Pkt Len Var', 'Bwd IAT Tot', 'Active Mean', 'Bwd Pkt Len Max', 'Fwd Header Len', 'Pkt Len Std', 'src_closeness', 'Fwd Pkt Len Std', 'dst_betweenness', 'Flow IAT Max', 'Active Std', 'PSH Flag Cnt', 'Fwd Pkt Len Mean', 'Bwd Pkt Len Mean', 'ACK Flag Cnt', 'TotLen Bwd Pkts', 'Idle Mean', 'Flow IAT Min', 'Subflow Bwd Byts', 'Tot Fwd Pkts', 'dst_multidigraph_betweenness', 'src_eigenvector', 'Pkt Len Max', 'Fwd IAT Mean', 'Bwd Byts/b Avg', 'src_pagerank', 'Tot Bwd Pkts', 'Flow Pkts/s', 'dst_degree', 'dst_eigenvector', 'Flow Duration', 'Bwd IAT Mean', 'Subflow Bwd Pkts', 'src_betweenness', 'Subflow Fwd Pkts', 'Bwd Pkt Len Std', 'src_multidigraph_betweenness', 'Bwd Header Len', 'Subflow Fwd Byts', 'Idle Max', 'Pkt Len Mean'}


In [263]:
# Report the column names currently held in the filtered frame
final_feature_names = data_filtered.columns.tolist()
print(f"==>> final_features_names: {final_feature_names}")

==>> final_features_names: ['Protocol', 'TotLen Fwd Pkts', 'Fwd Pkt Len Min', 'Bwd Pkt Len Min', 'Flow Byts/s', 'Flow IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Act Data Pkts', 'Fwd Seg Size Min', 'Active Max', 'Active Min', 'Idle Std', 'Idle Min', 'Class', 'dst_pagerank', 'src_multidigraph_degree', 'dst_multidigraph_degree', 'src_multidigraph_pagerank', 'dst_multidigraph_pagerank', 'dst_closeness', 'src_k_core', 'dst_k_core', 'dst_k_truss', 'pca_1', 'pca_2']


In [264]:
# Merge the correlation-based removals into the running list of dropped features.
# Names that are already in the list are skipped, so re-running this cell does
# not append duplicates and inflate the count reported afterwards.
already_dropped = set(dropped_features)
dropped_features.extend(
    feature for feature in features_to_remove if feature not in already_dropped)

In [265]:
# Report how many features have been discarded so far
dropped_count = len(dropped_features)
print(f"==>> length of dropped_features: {dropped_count}")

==>> length of dropped_features: 53


In [266]:
# Show the complete list of discarded feature names
dropped_summary = f"==>> dropped_features: {dropped_features}"
print(dropped_summary)

==>> dropped_features: ['Bwd PSH Flags', 'Bwd URG Flags', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Pkt Size Avg', 'Flow IAT Mean', 'Fwd IAT Tot', 'src_k_truss', 'Fwd Pkt Len Max', 'src_degree', 'Fwd URG Flags', 'Pkt Len Var', 'Bwd IAT Tot', 'Active Mean', 'Bwd Pkt Len Max', 'Fwd Header Len', 'Pkt Len Std', 'src_closeness', 'Fwd Pkt Len Std', 'dst_betweenness', 'Flow IAT Max', 'Active Std', 'PSH Flag Cnt', 'Fwd Pkt Len Mean', 'Bwd Pkt Len Mean', 'ACK Flag Cnt', 'TotLen Bwd Pkts', 'Idle Mean', 'Flow IAT Min', 'Subflow Bwd Byts', 'Tot Fwd Pkts', 'dst_multidigraph_betweenness', 'src_eigenvector', 'Pkt Len Max', 'Fwd IAT Mean', 'Bwd Byts/b Avg', 'src_pagerank', 'Tot Bwd Pkts', 'Flow Pkts/s', 'dst_degree', 'dst_eigenvector', 'Flow Duration', 'Bwd IAT Mean', 'Subflow Bwd Pkts', 'src_betweenness', 'Subflow Fwd Pkts', 'Bwd Pkt Len Std', 'src_multidigraph_betweenness', 'Bwd Header Len', 'Subflow Fwd Byts', 'Idle Max', 'Pkt Len Mean']


In [267]:
# Column-name lists for two families of `cn_measures` columns.

# Type 2: the "global" metrics plus the "mv" columns
cn_measures_type_2 = [
    "dst_global_betweenness", "src_global_degree",
    "dst_global_degree", "src_mv",
    "src_global_pagerank", "dst_global_pagerank",
    "src_global_betweenness", "dst_mv",
]

# Type 1: the "local" metrics plus the "Comm" columns
cn_measures_type_1 = [
    "dst_local_pagerank", "src_local_betweenness",
    "src_Comm", "src_local_degree",
    "dst_local_betweenness", "dst_Comm",
    "dst_local_degree", "src_local_pagerank",
]

In [269]:
# Remove both families of cn-measure columns in a single call. Dropping them in
# two separate assignments that each start from `df` would keep only the result
# of the second drop, leaving the type-2 columns in place.
# Columns that are absent from `df` are skipped (errors='ignore').
# NOTE(review): this starts again from `df`, so the variance/correlation
# filtering performed above is not reflected in the saved file — confirm intended.
cn_measures_to_drop = cn_measures_type_2 + cn_measures_type_1
data_filtered = df.drop(columns=cn_measures_to_drop, errors='ignore')

print(f"==>> final_features_names: {list(data_filtered.columns)}")
print(f"==>> len_final_features_names: {len(data_filtered.columns)}")

# Persist the resulting frame, creating the output folder on first use.
# NOTE(review): `name_pca` already begins with "client_", so the file name
# becomes "client_client_0_pca.parquet" — confirm downstream readers expect that.
output_folder = 'datasets_modified'
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, f'client_{name_pca}.parquet')
data_filtered.to_parquet(output_path)

==>> final_features_names: ['Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'B