In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import log_loss, accuracy_score, classification_report, f1_score
import numpy as np

In [2]:
df  = pd.read_csv(r'F:\Projects\Cap_github\Capstone\Data\DoS_Mirai_Recon.csv')

In [3]:
df.columns = df.columns.str.replace(' ', '_')

In [5]:
label_mapping = {
    'DoS': 0,
    'Mirai': 1,
    'Recon': 2
}

df['label'] = df['label'].apply(lambda x: label_mapping.get(str(x), -1))  # Use -1 for unmatched labels if needed

In [6]:
df.label.unique()

array([0, 1, 2], dtype=int64)

In [7]:
X = df.drop(['label'],axis=1)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.8, test_size = 0.2, shuffle=True,random_state = 0)

In [8]:
# Remove irrelevant features and select important features
def Feature_Importance_LGBM(data):
    features = data.drop(['label'],axis=1).values  # "label" should be changed to the target class variable name if different
    labels = data['label'].values

    # Extract feature names
    feature_names = list(data.drop(['label'],axis=1).columns)

    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    model = lgb.LGBMRegressor(verbose = -1)
    model.fit(features, labels)
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': model.feature_importances_})

    # Sort features according to importance
    feature_importances = feature_importances.sort_values('importance', ascending = False).reset_index(drop = True)

    # Normalize the feature importances to add up to one
    feature_importances['normalized_importance'] = feature_importances['importance'] / feature_importances['importance'].sum()
    feature_importances['cumulative_importance'] = np.cumsum(feature_importances['normalized_importance'])

    cumulative_importance=0.90 # Only keep the important features with cumulative importance scores>=90%. It can be changed.

    # Make sure most important features are on top
    feature_importances = feature_importances.sort_values('cumulative_importance')

    # Identify the features not needed to reach the cumulative_importance
    record_low_importance = feature_importances[feature_importances['cumulative_importance'] > cumulative_importance]

    to_drop = list(record_low_importance['feature'])
    print(feature_importances.drop(['importance'],axis=1))
    return to_drop

In [9]:
# Remove redundant features
def Feature_Redundancy_Pearson(data):
    correlation_threshold=0.95 # Only remove features with the redundancy>90%. It can be changed
    features = data.drop(['label'],axis=1)
    corr_matrix = features.corr()

    # Extract the upper triangle of the correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool))

    # Select the features with correlations above the threshold
    # Need to use the absolute value
    to_drop = [column for column in upper.columns if any(upper[column].abs() > correlation_threshold)]

    # Dataframe to hold correlated pairs
    record_collinear = pd.DataFrame(columns = ['drop_feature', 'corr_feature', 'corr_value'])

    # Iterate through the columns to drop
    for column in to_drop:

        # Find the correlated features
        corr_features = list(upper.index[upper[column].abs() > correlation_threshold])

        # Find the correlated values
        corr_values = list(upper[column][upper[column].abs() > correlation_threshold])
        drop_features = [column for _ in range(len(corr_features))]

        # Record the information (need a temp df for now)
        temp_df = pd.DataFrame.from_dict({'drop_feature': drop_features,
                                         'corr_feature': corr_features,
                                         'corr_value': corr_values})
        record_collinear = pd.concat([record_collinear, temp_df], ignore_index = True)
#     print(record_collinear)
    return to_drop

In [10]:
def Auto_Feature_Engineering(s1):
    drop1 = Feature_Importance_LGBM(s1)
    df1 = s1.drop(columns = drop1)

    drop4 = Feature_Redundancy_Pearson(df1)
    df4 = df1.drop(columns = drop4)
    return df4

In [11]:
df1 = Auto_Feature_Engineering(df)

            feature  normalized_importance  cumulative_importance
0               IAT               0.162000               0.162000
1          Variance               0.085000               0.247000
2               Min               0.082333               0.329333
3         fin_count               0.072667               0.402000
4         syn_count               0.066000               0.468000
5            Number               0.064667               0.532667
6     Protocol_Type               0.059333               0.592000
7              Rate               0.042333               0.634333
8     flow_duration               0.038000               0.672333
9              HTTP               0.035667               0.708000
10         Duration               0.032000               0.740000
11          Tot_sum               0.028667               0.768667
12        rst_count               0.025333               0.794000
13              Max               0.025000               0.819000
14  ack_fl

  record_collinear = pd.concat([record_collinear, temp_df], ignore_index = True)
