In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import log_loss, accuracy_score, classification_report, f1_score
import numpy as np

In [2]:
# NOTE: absolute, machine-specific location of the input CSV -- adjust when
# running the notebook on another machine.
DATA_PATH = r'F:\Projects\Cap_github\Capstone\Data\Spoofing_Brute.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Column names containing spaces are awkward to reference; swap each literal
# space for an underscore (plain-text replacement, no regex involved).
df.columns = df.columns.str.replace(' ', '_', regex=False)

In [4]:
# Binary target: 0 when the raw label text contains 'Brute', 1 for every other value.
# Caution: re-running this cell on the already-encoded column maps every row to 1.
df['label'] = df['label'].map(lambda raw: 0 if 'Brute' in str(raw) else 1)


In [5]:
# Sanity check: list the distinct values present in the encoded target column.
df['label'].unique()

array([0, 1], dtype=int64)

In [6]:
# Separate the predictors from the binary target column.
y = df['label']
X = df.drop(columns=['label'])

# Shuffled 80/20 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [7]:
# Remove irrelevant features and select important features
def Feature_Importance_LGBM(data, cumulative_importance=0.90, label_col='label'):
    """Rank features with a LightGBM model and list the low-importance ones.

    A default ``lgb.LGBMRegressor`` is fitted on every column except
    ``label_col``. Features are ordered by the model's ``feature_importances_``
    (descending), the importances are normalized to sum to one, and a running
    total is computed. Every feature whose running total is strictly greater
    than ``cumulative_importance`` is reported for removal. Note that the
    feature that pushes the running total past the threshold is itself
    reported, so the retained features account for at most
    ``cumulative_importance`` of the total importance.

    The ranking table (without the raw ``importance`` column) is printed as a
    side effect.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors and the target column.
    cumulative_importance : float, default 0.90
        Threshold on the running normalized importance, in the range (0, 1].
    label_col : str, default 'label'
        Name of the target column in ``data``.

    Returns
    -------
    list
        Names of the features to drop, most important first.

    Raises
    ------
    ValueError
        If ``cumulative_importance`` is outside (0, 1].
    """
    if not 0 < cumulative_importance <= 1:
        raise ValueError(
            "cumulative_importance must be in the range (0, 1], got %r" % (cumulative_importance,)
        )

    predictors = data.drop(columns=[label_col])
    feature_names = list(predictors.columns)

    # Fit on plain arrays, exactly as before, so the model sees positional features.
    model = lgb.LGBMRegressor(verbose=-1)
    model.fit(predictors.values, data[label_col].values)

    # Most important features first.
    feature_importances = (
        pd.DataFrame({'feature': feature_names, 'importance': model.feature_importances_})
        .sort_values('importance', ascending=False)
        .reset_index(drop=True)
    )

    # Normalize the importances to add up to one and accumulate them. Because
    # the rows are already in descending order of importance, the running total
    # is non-decreasing and no further sorting is required.
    total_importance = feature_importances['importance'].sum()
    feature_importances['normalized_importance'] = feature_importances['importance'] / total_importance
    feature_importances['cumulative_importance'] = feature_importances['normalized_importance'].cumsum()

    # Features whose running total lies beyond the threshold are not needed.
    beyond_threshold = feature_importances['cumulative_importance'] > cumulative_importance
    to_drop = list(feature_importances.loc[beyond_threshold, 'feature'])

    print(feature_importances.drop(columns=['importance']))
    return to_drop

In [8]:
# Remove redundant features
def Feature_Redundancy_Pearson(data, correlation_threshold=0.95, label_col='label'):
    """List features that are highly correlated with an earlier feature.

    The pairwise correlation matrix of every column except ``label_col`` is
    computed with ``DataFrame.corr()``. Only the strict upper triangle is
    inspected, so each column is compared against the columns that precede it;
    a column is reported when the absolute correlation with at least one
    preceding column is strictly greater than ``correlation_threshold``. The
    first column of a correlated group is therefore never reported.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors and the target column.
    correlation_threshold : float, default 0.95
        Absolute correlation above which a feature counts as redundant.
    label_col : str, default 'label'
        Name of the target column in ``data``.

    Returns
    -------
    list
        Names of the redundant features, in column order.
    """
    features = data.drop(columns=[label_col])
    corr_matrix = features.corr()

    # Mask out the diagonal and the lower triangle so each pair is seen once.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Use the absolute value: strong negative correlation is redundancy too.
    # Masked (NaN) cells compare as False and are ignored.
    is_redundant = (upper.abs() > correlation_threshold).any(axis=0)
    to_drop = list(upper.columns[is_redundant.to_numpy()])
    return to_drop

In [9]:
def Auto_Feature_Engineering(s1):
    """Run the two feature-selection stages in sequence and return the reduced frame.

    Stage one drops the columns reported by ``Feature_Importance_LGBM``; stage
    two runs ``Feature_Redundancy_Pearson`` on that reduced frame and drops the
    columns it reports. The input frame ``s1`` is not modified.
    """
    # Stage 1: discard the features the importance ranking marks as unneeded.
    low_importance = Feature_Importance_LGBM(s1)
    important_only = s1.drop(columns=low_importance)

    # Stage 2: among the survivors, discard the highly correlated ones.
    redundant = Feature_Redundancy_Pearson(important_only)
    return important_only.drop(columns=redundant)

In [10]:
# Run both selection stages on the labelled frame; `df1` holds the columns that survive.
# NOTE(review): `df` still contains the rows that the earlier train_test_split call
# assigned to X_test, so the selection sees test data -- confirm whether it should be
# fit on the training rows only.
df1 = Auto_Feature_Engineering(df)

            feature  normalized_importance  cumulative_importance
0               IAT               0.138000               0.138000
1     flow_duration               0.137000               0.275000
2     Header_Length               0.109667               0.384667
3              Rate               0.076667               0.461333
4         syn_count               0.066000               0.527333
5         urg_count               0.058667               0.586000
6         rst_count               0.055333               0.641333
7          Duration               0.052333               0.693667
8           Tot_sum               0.048333               0.742000
9               Min               0.046333               0.788333
10    Protocol_Type               0.032333               0.820667
11         Tot_size               0.029333               0.850000
12              Max               0.018667               0.868667
13              AVG               0.018000               0.886667
14        

[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009581 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4611
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2401
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857


[32m [ 2024-12-11 15:53:25,719 ] [0mFinished iteration #0 with objective value -0.9415320728570526. Current best value is -0.9415320728570526 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002242 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2192
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3443
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 23
[LightGBM] [Info

[32m [ 2024-12-11 15:53:37,316 ] [0mFinished iteration #1 with objective value -0.9437118732281391. Current best value is -0.9437118732281391 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004280 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3302
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002662 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2933
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857


[32m [ 2024-12-11 15:53:48,730 ] [0mFinished iteration #2 with objective value -0.943354357806441. Current best value is -0.9437118732281391 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1913
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3019
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 22
[LightGBM] [Info

[32m [ 2024-12-11 15:54:00,800 ] [0mFinished iteration #3 with objective value -0.9436418676677557. Current best value is -0.9437118732281391 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003128 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3257
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003863 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2919
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857


[32m [ 2024-12-11 15:54:13,220 ] [0mFinished iteration #4 with objective value -0.944174752771106. Current best value is -0.944174752771106 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004645 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3180
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003699 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3011
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM]

[32m [ 2024-12-11 15:54:25,143 ] [0mFinished iteration #5 with objective value -0.9459282378814184. Current best value is -0.9459282378814184 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2908
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002702 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2890
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 20
[LightGBM] [Info

[32m [ 2024-12-11 15:54:38,411 ] [0mFinished iteration #6 with objective value -0.9457111290057933. Current best value is -0.9459282378814184 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005637 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3308
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006010 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3443
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM]

[32m [ 2024-12-11 15:54:51,330 ] [0mFinished iteration #7 with objective value -0.9473555860065481. Current best value is -0.9473555860065481 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2783
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002818 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3194
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 25
[LightGBM] [Info

[32m [ 2024-12-11 15:55:03,408 ] [0mFinished iteration #8 with objective value -0.9455336771750908. Current best value is -0.9473555860065481 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002771 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3445
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2425
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 20
[LightGBM] [Info

[32m [ 2024-12-11 15:55:15,429 ] [0mFinished iteration #9 with objective value -0.9473551991907411. Current best value is -0.9473555860065481 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001985 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2595
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002627 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2673
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 19
[LightGBM] [Info

[32m [ 2024-12-11 15:55:27,304 ] [0mFinished iteration #10 with objective value -0.9449634931962574. Current best value is -0.9473555860065481 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004842 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3455
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003086 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3687
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857


[32m [ 2024-12-11 15:55:39,599 ] [0mFinished iteration #11 with objective value -0.9472835084057298. Current best value is -0.9473555860065481 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004593 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3266
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004767 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3296
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM]

[32m [ 2024-12-11 15:55:51,671 ] [0mFinished iteration #12 with objective value -0.9468548105710078. Current best value is -0.9473555860065481 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002860 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2935
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2939
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 24
[LightGBM] [Info

[32m [ 2024-12-11 15:56:03,810 ] [0mFinished iteration #13 with objective value -0.9474985831942099. Current best value is -0.9474985831942099 [0m


[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002963 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2984
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857
[LightGBM] [Info] Start training from score -0.002857
[LightGBM] [Info] Number of positive: 55920, number of negative: 56080
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004501 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3051
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499286 -> initscore=-0.002857


[32m [ 2024-12-11 15:56:16,445 ] [0mFinished iteration #14 with objective value -0.946462072437772. Current best value is -0.9474985831942099 [0m


['flow_duration',
 'Header_Length',
 'Protocol_Type',
 'Duration',
 'Rate',
 'Drate',
 'syn_flag_number',
 'rst_flag_number',
 'psh_flag_number',
 'ack_flag_number',
 'ece_flag_number',
 'cwr_flag_number',
 'ack_count',
 'syn_count',
 'fin_count',
 'urg_count',
 'rst_count',
 'HTTP',
 'HTTPS',
 'SSH',
 'UDP',
 'DHCP',
 'ARP',
 'ICMP',
 'LLC',
 'Min',
 'AVG',
 'Std',
 'Tot_size',
 'IAT',
 'Number',
 'Magnitue']