In [1]:
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split

In [3]:
# Location of the raw CSV. The default is the path this notebook was written
# against; set the DOS_WEB_CSV environment variable to point at the file on
# another machine instead of editing the cell.
DATA_PATH = os.environ.get('DOS_WEB_CSV', r'F:\Projects\Cap_github\Capstone\Data\Dos_Web.csv')
df = pd.read_csv(DATA_PATH)

In [4]:
# Column names with spaces are awkward to reference; swap every space for an
# underscore (plain literal replacement, no regular expression involved).
df.columns = df.columns.str.replace(' ', '_', regex=False)

In [6]:
# Binary target: 0 for any label whose text contains 'DoS', 1 for everything else.
# This is a substring match, so a label such as 'DDoS-...' also maps to 0.
# The dtype guard keeps the cell safe to re-run: once the column is numeric the
# substring test can no longer match, and re-applying it would set every row to 1.
if not pd.api.types.is_numeric_dtype(df['label']):
    is_dos = df['label'].astype(str).str.contains('DoS', regex=False)
    df['label'] = (~is_dos).astype('int64')


In [7]:
# Sanity check: list the distinct values now present in the target column.
df['label'].unique()

array([0, 1], dtype=int64)

In [8]:
# Separate the target column from the predictor columns.
y = df['label']
X = df.drop(columns=['label'])

# Shuffled 80/20 hold-out split; the fixed random_state makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [9]:
# Remove irrelevant features and select important features
def Feature_Importance_LGBM(data, cumulative_importance=0.90, label='label', model=None):
    """Rank features by model importance and list the ones that can be dropped.

    A model is fitted on every column of `data` except `label`, its
    `feature_importances_` are normalised to sum to one, and the features are
    walked from most to least important. Features are kept until their
    cumulative importance reaches `cumulative_importance`; the feature that
    crosses the threshold is kept, so the retained set always covers at least
    the requested share and at least one feature survives whenever any
    importance is non-zero.

    The ranking table (feature, normalized_importance, cumulative_importance)
    is printed as a side effect.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and the target column.
    cumulative_importance : float, default 0.90
        Share of total importance the kept features must reach.
    label : str, default 'label'
        Name of the target column in `data`.
    model : object, optional
        Estimator exposing `fit(features, labels)` and, after fitting,
        `feature_importances_`. Defaults to `lgb.LGBMRegressor(verbose=-1)`.

    Returns
    -------
    list
        Names of the features that are not needed to reach the threshold,
        ordered from more to less important. Empty when every importance is 0.
    """
    feature_frame = data.drop(columns=[label])
    feature_names = list(feature_frame.columns)
    features = feature_frame.values
    labels = data[label].values

    if model is None:
        model = lgb.LGBMRegressor(verbose = -1)
    model.fit(features, labels)

    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': model.feature_importances_})

    # Most important first; a stable sort keeps tied features in column order
    # so the result does not depend on the sort algorithm's tie handling.
    feature_importances = feature_importances.sort_values(
        'importance', ascending=False, kind='stable'
    ).reset_index(drop=True)

    # Normalize the feature importances to add up to one. If the model assigned
    # no importance at all there is nothing to rank, so use zeros rather than
    # dividing by zero and propagating NaN.
    total_importance = feature_importances['importance'].sum()
    if total_importance > 0:
        normalized = feature_importances['importance'] / total_importance
    else:
        normalized = feature_importances['importance'] * 0.0
    feature_importances['normalized_importance'] = normalized
    feature_importances['cumulative_importance'] = normalized.cumsum()

    # A feature is unnecessary only if the features ranked above it already
    # reach the threshold, i.e. the cumulative importance *before* adding it.
    importance_before = feature_importances['cumulative_importance'].shift(1, fill_value=0.0)
    not_needed = importance_before >= cumulative_importance

    to_drop = list(feature_importances.loc[not_needed, 'feature'])
    print(feature_importances.drop(['importance'],axis=1))
    return to_drop

In [10]:
# Remove redundant features
def Feature_Redundancy_Pearson(data, correlation_threshold=0.95, label='label', verbose=False):
    """List features that are near-duplicates of an earlier column.

    The pairwise correlation matrix of every column except `label` is
    computed with `DataFrame.corr()`. Only the strict upper triangle is
    inspected, so each pair is considered once and a column is flagged when
    its absolute correlation with any column positioned before it exceeds
    `correlation_threshold`. The earlier column of a correlated pair is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and the target column.
    correlation_threshold : float, default 0.95
        A feature is redundant when |correlation| is strictly greater than this.
    label : str, default 'label'
        Name of the target column in `data`.
    verbose : bool, default False
        When True, print one row per offending pair
        (drop_feature, corr_feature, corr_value).

    Returns
    -------
    list
        Names of the redundant features, in column order.
    """
    features = data.drop(columns=[label])
    corr_matrix = features.corr()

    # Keep only the strict upper triangle (k=1 excludes the diagonal); every
    # other cell becomes NaN and therefore never passes the comparison below.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Absolute value: strong negative correlation is just as redundant.
    exceeds = upper.abs() > correlation_threshold
    to_drop = [column for column in upper.columns if exceeds[column].any()]

    if verbose:
        # Build the diagnostic table in one go rather than concatenating
        # frames inside a loop.
        records = [
            {'drop_feature': column, 'corr_feature': other, 'corr_value': upper.at[other, column]}
            for column in to_drop
            for other in upper.index[exceeds[column].to_numpy()]
        ]
        print(pd.DataFrame(records, columns=['drop_feature', 'corr_feature', 'corr_value']))

    return to_drop

In [11]:
def Auto_Feature_Engineering(s1):
    """Apply both feature filters in sequence and return the reduced frame.

    First the columns reported by Feature_Importance_LGBM are removed from
    `s1`; then Feature_Redundancy_Pearson is run on what is left and the
    columns it reports are removed as well. `s1` itself is not modified.
    """
    # Stage 1: prune by importance.
    low_importance = Feature_Importance_LGBM(s1)
    important_only = s1.drop(columns = low_importance)

    # Stage 2: prune by pairwise correlation among the survivors.
    redundant = Feature_Redundancy_Pearson(important_only)
    return important_only.drop(columns = redundant)

In [12]:
# Run both selection stages on the full frame. Neither stage ever reports the
# 'label' column, so df1 holds 'label' plus the surviving feature columns.
# NOTE(review): the train/test split in the earlier cell was taken from df
# before this selection, so X_train/X_test still contain every column, and the
# selection here is fitted on rows that also end up in the test split —
# confirm this ordering is intended.
df1 = Auto_Feature_Engineering(df)

            feature  normalized_importance  cumulative_importance
0          Variance               0.106333               0.106333
1               IAT               0.099333               0.205667
2         fin_count               0.086667               0.292333
3     Header_Length               0.065000               0.357333
4          Duration               0.064333               0.421667
5         urg_count               0.059000               0.480667
6     flow_duration               0.055000               0.535667
7         rst_count               0.052000               0.587667
8     Protocol_Type               0.047667               0.635333
9              Rate               0.047667               0.683000
10           Number               0.044667               0.727667
11          Tot_sum               0.041000               0.768667
12        ack_count               0.036000               0.804667
13              Min               0.025333               0.830000
14        

  record_collinear = pd.concat([record_collinear, temp_df], ignore_index = True)
