In [41]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [42]:
file_paths = [
    'file_1.csv',
    'file_2.csv',
    'file_3.csv',
    'file_4.csv',
    'file_5.csv',
    'file_6.csv',
    'file_7.csv',
    'file_8.csv'
]

# Load all the CSV files into a list of DataFrames
dataframes = [pd.read_csv(file_path) for file_path in file_paths]

# Concatenate all the DataFrames into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Check the structure of the combined dataset
combined_df.info(), combined_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830743 entries, 0 to 2830742
Data columns (total 79 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0    Destination Port             int64  
 1    Flow Duration                int64  
 2    Total Fwd Packets            int64  
 3    Total Backward Packets       int64  
 4   Total Length of Fwd Packets   int64  
 5    Total Length of Bwd Packets  int64  
 6    Fwd Packet Length Max        int64  
 7    Fwd Packet Length Min        int64  
 8    Fwd Packet Length Mean       float64
 9    Fwd Packet Length Std        float64
 10  Bwd Packet Length Max         int64  
 11   Bwd Packet Length Min        int64  
 12   Bwd Packet Length Mean       float64
 13   Bwd Packet Length Std        float64
 14  Flow Bytes/s                  float64
 15   Flow Packets/s               float64
 16   Flow IAT Mean                float64
 17   Flow IAT Std                 float64
 18   Flow IAT Max         

(None,
     Destination Port   Flow Duration   Total Fwd Packets  \
 0              54865               3                   2   
 1              55054             109                   1   
 2              55055              52                   1   
 3              46236              34                   1   
 4              54863               3                   2   
 
     Total Backward Packets  Total Length of Fwd Packets  \
 0                        0                           12   
 1                        1                            6   
 2                        1                            6   
 3                        1                            6   
 4                        0                           12   
 
     Total Length of Bwd Packets   Fwd Packet Length Max  \
 0                             0                       6   
 1                             6                       6   
 2                             6                       6   
 3                     

In [43]:
combined_df.columns = combined_df.columns.str.strip()

selected_columns = [
    'Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
    'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max',
    'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std',
    'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean',
    'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean',
    'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Mean', 'Bwd IAT Mean',
    'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'ACK Flag Count', 
    'Fwd Header Length', 'Bwd Header Length', 'Average Packet Size', 'Avg Fwd Segment Size',
    'Avg Bwd Segment Size', 'Subflow Fwd Packets', 'Subflow Bwd Packets', 'Label'
]

selected_features_df = combined_df[selected_columns]

In [44]:
# Feature engineering: Add Forward-to-Backward Packet Ratio
selected_features_df['Fwd_Bwd_Packet_Ratio'] = selected_features_df['Total Fwd Packets'] / (selected_features_df['Total Backward Packets'] + 1e-9)

# Feature engineering: Add Forward-to-Backward Length Ratio
selected_features_df['Fwd_Bwd_Length_Ratio'] = selected_features_df['Total Length of Fwd Packets'] / (selected_features_df['Total Length of Bwd Packets'] + 1e-9)

# Drop the original columns used to create ratios
selected_df = selected_features_df.drop(columns=['Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_features_df['Fwd_Bwd_Packet_Ratio'] = selected_features_df['Total Fwd Packets'] / (selected_features_df['Total Backward Packets'] + 1e-9)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_features_df['Fwd_Bwd_Length_Ratio'] = selected_features_df['Total Length of Fwd Packets'] / (selected_features_df['Total Length of Bwd Packets'] + 1e-9)


In [45]:
from sklearn.utils.class_weight import compute_class_weight
# Define features and target variable
X = selected_df.drop(columns=['Label'])  # Features
y = selected_df['Label']  # Target

# Split data into train (70%), validation (15%), and test sets (15%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=42)  # 70% train
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)  # 15% validation, 15% test

# Create Pool objects for train and validation sets
train_pool = Pool(X_train, y_train)
val_pool = Pool(X_val, y_val)

# Compute class weights to address imbalanced data
class_weights = compute_class_weight('balanced', classes=y_train.unique(), y=y_train)


In [46]:
from catboost import CatBoostClassifier, Pool, cv
# Set up cross-validation for hyperparameter tuning
params = {
    'iterations': 500,                      # Train for 500 iterations
    'depth': 4,                             # Tree depth
    'learning_rate': 0.07,                   # Learning rate
    'loss_function': 'MultiClass',
    'class_weights': class_weights,         # Class weights for handling imbalance
    'verbose': 100                          # Show progress after every 100 iterations
}

# Perform cross-validation (with 5 folds)
cv_data = cv(
    params=params,
    pool=train_pool,
    fold_count=5,
    early_stopping_rounds=100,  # Stop after 100 rounds of no improvement
    plot=True                   # Optionally plot the cross-validation curve
)

# Get the best parameters from cross-validation
best_iterations = cv_data['test-MultiClass-mean'].idxmin()

# Train the final model on full training set with the best iteration
catboost_model = CatBoostClassifier(
    iterations=best_iterations,
    depth=6,
    learning_rate=0.1,
    loss_function='MultiClass',
    class_weights=class_weights,
    early_stopping_rounds=1000,  # Early stopping
    verbose=100
)

# Train the model
catboost_model.fit(train_pool, eval_set=val_pool)

# Make predictions on the test set
y_pred = catboost_model.predict(X_test)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: 2.1084675	test: 2.1083210	best: 2.1083210 (0)	total: 718ms	remaining: 5m 58s
100:	learn: 0.3053545	test: 0.3109983	best: 0.3109983 (100)	total: 47.1s	remaining: 3m 6s
200:	learn: 0.2746652	test: 0.2835129	best: 0.2835129 (200)	total: 1m 31s	remaining: 2m 16s
300:	learn: 0.2471377	test: 0.2738487	best: 0.2730324 (290)	total: 2m 15s	remaining: 1m 29s
400:	learn: 0.2342842	test: 0.2754127	best: 0.2716219 (324)	total: 2m 59s	remaining: 44.3s

bestTest = 0.271621902
bestIteration = 324

Training on fold [1/5]
0:	learn: 2.1089611	test: 2.1094355	best: 2.1094355 (0)	total: 709ms	remaining: 5m 53s
100:	learn: 0.3103746	test: 0.3004788	best: 0.3004788 (100)	total: 47.2s	remaining: 3m 6s
200:	learn: 0.2774436	test: 0.2719131	best: 0.2719131 (200)	total: 1m 31s	remaining: 2m 16s
300:	learn: 0.2505135	test: 0.2655485	best: 0.2631478 (274)	total: 2m 16s	remaining: 1m 30s

bestTest = 0.2631477806
bestIteration = 274

Training on fold [2/5]
0:	learn: 2.1099159	test: 2

In [47]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {accuracy * 100:.2f}%")

# Detailed classification report
print(classification_report(y_test, y_pred))


Test Set Accuracy: 90.77%
                            precision    recall  f1-score   support

                    BENIGN       1.00      0.89      0.94    340952
                       Bot       0.61      0.63      0.62       289
                      DDoS       0.97      0.99      0.98     19235
             DoS GoldenEye       0.48      0.97      0.64      1549
                  DoS Hulk       0.93      0.94      0.93     34573
          DoS Slowhttptest       0.54      0.99      0.70       796
             DoS slowloris       0.69      0.99      0.82       884
               FTP-Patator       0.94      1.00      0.97      1186
                Heartbleed       1.00      1.00      1.00         1
              Infiltration       0.02      1.00      0.04         1
                  PortScan       0.67      1.00      0.80     23947
               SSH-Patator       0.55      0.99      0.71       855
  Web Attack � Brute Force       0.01      0.42      0.02       231
Web Attack � Sql Inje