In [13]:
# Mount Google Drive when running inside Google Colab; skip gracefully elsewhere.
# The import lives inside the try block so that a local kernel (where the
# google.colab package does not exist) does not crash before reaching the
# fallback message.
try:
    from google.colab import drive
except ImportError:
    # Not a Colab runtime: there is nothing to mount.
    drive = None
    print("Drive might already be mounted or running locally.")
else:
    try:
        drive.mount('/content/drive')
    except Exception as mount_error:  # best-effort: keep the notebook running
        print("Drive might already be mounted or running locally.")
        # Surface the real cause instead of silently discarding it.
        print(f"Drive mount skipped: {mount_error!r}")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import joblib
import os
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, classification_report


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1. SETUP & CONFIGURATION

In [14]:
# Drive locations -- edit PROJECT_PATH so it points at your own copy of the project.
PROJECT_PATH = '/content/drive/MyDrive/MLProject'
MODEL_NAME = 'xgboost_classifier'
MODEL_PATH = PROJECT_PATH + '/models/with'
DATASETS_PATH = PROJECT_PATH + '/data/recommended'
TRAINING_PATH = DATASETS_PATH + '/training/training.csv'
TESTING_PATH = DATASETS_PATH + '/test/test.csv'

# Column used as the DataFrame index when the CSV files are read.
INDEX_COL = 'pkSeqID'

# Columns selected from the CSV files, in the order used by the notebook.
COLUMNS_ORDERED = [
    'min',
    'max',
    'mean',
    'stddev',
    'saddr',
    'sport',
    'daddr',
    'dport',
    'srate',
    'drate',
    'N_IN_Conn_P_SrcIP',
    'N_IN_Conn_P_DstIP',
    'state_number',
    'proto',
    'seq',
    'attack',
    'category',
    'subcategory',
]


# 2. HELPER FUNCTIONS

In [15]:
def save_model(model, name=MODEL_NAME):
    """Serialize ``model`` with joblib to ``{MODEL_PATH}/{name}.joblib``.

    Parameters
    ----------
    model : object
        Any object joblib can dump (here: the fitted pipeline).
    name : str, default MODEL_NAME
        File stem of the artifact; ``.joblib`` is appended.

    The target directory is created if it does not exist yet, and the final
    location is printed once the dump has completed.
    """
    # exist_ok=True makes directory creation idempotent and removes the
    # check-then-create race of os.path.exists() followed by os.makedirs().
    os.makedirs(MODEL_PATH, exist_ok=True)

    # Build the destination once so the dump and the message cannot diverge.
    destination = f'{MODEL_PATH}/{name}.joblib'
    joblib.dump(model, destination)
    print(f"Model saved to {destination}")

def process_categories(cat):
    """Collapse a combined category label into the class name used for training.

    Rules, applied in order:
      * labels whose text starts with "theft" (any letter case) become 'Theft';
      * labels whose text starts with "normal" (any letter case) become 'Normal';
      * the exact labels 'DoS HTTP' / 'DDoS HTTP' are folded into
        'DoS TCP' / 'DDoS TCP';
      * every other value is returned untouched.
    """
    lowered = str(cat).lower()

    # Prefix rules are case-insensitive.
    for prefix, merged_name in (('theft', 'Theft'), ('normal', 'Normal')):
        if lowered.startswith(prefix):
            return merged_name

    # The HTTP -> TCP folding only applies to exact, case-sensitive matches.
    for http_label, tcp_label in (('DoS HTTP', 'DoS TCP'), ('DDoS HTTP', 'DDoS TCP')):
        if cat == http_label:
            return tcp_label

    return cat

def process_port(p):
    """Convert a single port value to ``int``.

    Values whose text form starts with a hexadecimal prefix are parsed in
    base 16; everything else goes through plain ``int(p)``.

    The prefix check ignores letter case and surrounding whitespace, so
    '0X1F' and ' 0x1f ' are parsed as hex instead of raising ``ValueError``.

    Raises
    ------
    ValueError
        If the value cannot be interpreted as an integer.
    """
    text = str(p).strip()
    if text.lower().startswith('0x'):
        return int(text, 16)
    # Keep int(p) on the original object so numeric inputs (e.g. 80.0) still work.
    return int(p)

def process_ports(ports: pd.DataFrame):
    """Apply ``process_port`` to every cell of ``ports`` and return a new frame.

    ``DataFrame.applymap`` is deprecated since pandas 2.1 in favour of
    ``DataFrame.map``. The newer method is used when the installed pandas
    provides it, with ``applymap`` kept as a fallback for older versions.
    """
    # Check on the class, not the instance: on old pandas an instance lookup
    # of ``.map`` could resolve to a column that happens to be named 'map'.
    if hasattr(pd.DataFrame, 'map'):
        return ports.map(process_port)
    return ports.applymap(process_port)

# Offset added to a value before taking its base-10 logarithm.
DEFAULT_RATE_SHIFT = 1.1


def shift_and_log(data, shift=DEFAULT_RATE_SHIFT):
    """Return ``log10(data + shift)``.

    ``data`` may be anything that supports ``+`` with a number and can be
    passed to ``np.log10`` (scalar, array, Series, DataFrame).
    """
    shifted = data + shift
    return np.log10(shifted)


# 3. CUSTOM TRANSFORMER

In [16]:
class CombinedFeatureAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends an ``srate_to_drate`` column to a DataFrame.

    The new column is the ratio ``log10(srate + DEFAULT_RATE_SHIFT) /
    log10(drate + DEFAULT_RATE_SHIFT)``. When ``normalize`` is true the ratio
    is additionally passed through ``np.log1p``.
    """

    def __init__(self, normalize=True) -> None:
        super().__init__()
        self.normalize = normalize

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` with the ``srate_to_drate`` column set."""
        # Work on a copy so the caller's frame is never modified.
        augmented = X.copy()

        log_srate = np.log10(augmented.loc[:, 'srate'] + DEFAULT_RATE_SHIFT)
        log_drate = np.log10(augmented.loc[:, 'drate'] + DEFAULT_RATE_SHIFT)

        # No explicit zero guard: the denominator is zero only when
        # drate + DEFAULT_RATE_SHIFT equals exactly 1.
        ratio = log_srate / log_drate

        augmented['srate_to_drate'] = np.log1p(ratio) if self.normalize else ratio
        return augmented


# 4. DATA LOADING & PROCESSING

In [17]:
# Label columns: removed from the feature matrix; category + subcategory
# together form the classification target.
TARGET_COLUMNS = ['attack', 'category', 'subcategory']


def _load_split(path):
    """Read one CSV split, keeping only COLUMNS_ORDERED indexed by INDEX_COL."""
    # usecols skips parsing of columns that would be discarded right away.
    frame = pd.read_csv(
        path,
        index_col=INDEX_COL,
        usecols=[INDEX_COL, *COLUMNS_ORDERED],
    )
    # usecols does not guarantee column order, so reorder explicitly.
    return frame[COLUMNS_ORDERED]


def _split_features_and_target(frame):
    """Return ``(features, merged_category)`` for one loaded split."""
    features = frame.drop(TARGET_COLUMNS, axis=1)
    # "<category> <subcategory>" collapsed by process_categories.
    merged_category = (frame['category'] + ' ' + frame['subcategory']).map(process_categories)
    return features, merged_category


print("Loading Data...")
training = _load_split(TRAINING_PATH)
testing = _load_split(TESTING_PATH)

print("Processing Targets...")
# Encode Targets
cat_encoder_label = LabelEncoder()

# Prepare Train (the encoder is fitted on the training labels only)
X_train, y_train_cat_raw = _split_features_and_target(training)
y_train_label = cat_encoder_label.fit_transform(y_train_cat_raw)

# Prepare Test (reuses the mapping learned from the training labels)
X_test, y_test_cat_raw = _split_features_and_target(testing)
y_test_label = cat_encoder_label.transform(y_test_cat_raw)

# Clean up memory; the trailing semicolon keeps gc.collect()'s return value
# out of the cell output.
del training, testing
gc.collect();


Loading Data...
Processing Targets...


44

# 5. PREPROCESSING PIPELINE DEFINITION

In [18]:
# Columns removed by the cleaning step.
TO_DROP = ['saddr', 'daddr', 'seq']

# Column-wise cleaning. Columns not listed in any transformer are passed
# through unchanged (remainder='passthrough').
data_cleaner = ColumnTransformer(
    transformers=[
        ('drop', 'drop', TO_DROP),
        # handle_unknown='ignore' added for safety
        ('encode', OneHotEncoder(handle_unknown='ignore'), ['proto']),
        # Element-wise conversion of port values via process_ports
        ('port', FunctionTransformer(func=process_ports), ['sport', 'dport']),
        # shift_and_log applied to both rate columns
        ('rate', FunctionTransformer(func=shift_and_log), ['srate', 'drate']),
    ],
    remainder='passthrough',
)

# Full preprocessing chain: feature engineering -> cleaning -> scaling.
preprocessing = Pipeline(
    steps=[
        ('augment', CombinedFeatureAdder()),
        ('clean', data_cleaner),
        ('std', StandardScaler()),
    ]
)


In [19]:
# ==========================================
# 6. FULL PIPELINE WITH XGBOOST
# ==========================================
print("Training XGBoost Pipeline...")

# Preprocessing and classifier are chained so that one fit()/predict() call
# (and one saved artifact) covers both stages.
pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        (
            'model',
            XGBClassifier(
                # Multi-class setup: one class per label known to the encoder
                objective='multi:softmax',
                num_class=len(cat_encoder_label.classes_),
                # Boosting hyper-parameters
                n_estimators=300,
                learning_rate=0.2,
                max_depth=20,
                # Execution settings
                tree_method='hist',  # Optimized for speed
                device='cuda',       # Use GPU (remove if on CPU)
                n_jobs=-1,
                random_state=42,
            ),
        ),
    ]
)

# Fit preprocessing and model together on the training split
pipeline.fit(X_train, y_train_label)


Training XGBoost Pipeline...


  return ports.applymap(process_port) # applymap for DataFrame element-wise
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [20]:
# ==========================================
# 7. EVALUATION
# ==========================================
print("Evaluating...")
predictions = pipeline.predict(X_test)

score = accuracy_score(y_test_label, predictions)
print(f'\nModel accuracy: {score:.4f}')

# Detailed Report
print("\nClassification Report:")
class_names = cat_encoder_label.classes_
# Pass `labels` explicitly so that target_names always lines up with the
# encoded class ids. Without it the report derives the label set from the
# values present in y_true/y_pred, and a test split that lacks one of the
# trained classes no longer matches the length of target_names.
print(classification_report(
    y_test_label,
    predictions,
    labels=np.arange(len(class_names)),
    target_names=class_names,
))

# Save
save_model(pipeline, MODEL_NAME)


  return ports.applymap(process_port) # applymap for DataFrame element-wise


Evaluating...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)



Model accuracy: 0.9990

Classification Report:
                               precision    recall  f1-score   support

                     DDoS TCP       1.00      1.00      1.00    195355
                     DDoS UDP       1.00      1.00      1.00    189954
                      DoS TCP       1.00      1.00      1.00    123486
                      DoS UDP       1.00      1.00      1.00    206626
                       Normal       0.99      0.99      0.99       107
Reconnaissance OS_Fingerprint       0.97      0.97      0.97      3621
  Reconnaissance Service_Scan       0.99      0.99      0.99     14542
                        Theft       1.00      0.93      0.96        14

                     accuracy                           1.00    733705
                    macro avg       0.99      0.98      0.99    733705
                 weighted avg       1.00      1.00      1.00    733705

Model saved to /content/drive/MyDrive/MLProject/models/with/xgboost_classifier.joblib
