In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings

# Import scikit-learn modules for preprocessing, model selection, and metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report




In [2]:
import pandas as pd

df = pd.read_csv(r"E:\\Datasets\\UNSW-NB15\\Training and Testing Sets\\UNSW_NB15_concatenated_dropped.csv")


# Identify feature and label columns
label_columns = [col for col in df.columns if col.startswith('attack_cat')]
feature_columns = [col for col in df.columns if not col.startswith('attack_cat')]

# Separate features (X) and labels (y)
X = df[feature_columns]
y = df[label_columns]

# Split the data into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape (X_train, y_train):", X_train.shape, y_train.shape)
print("Testing set shape (X_test, y_test):", X_test.shape, y_test.shape)

Training set shape (X_train, y_train): (206138, 41) (206138, 1)
Testing set shape (X_test, y_test): (51535, 41) (51535, 1)


In [3]:
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore') # 'ignore' handles cases where test set has categories not in train set

# Create a column transformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough' # Keep other columns (if any)
)

classifiers = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNeighborsClassifier": KNeighborsClassifier(),
}

# Loop through each classifier
for name, classifier in classifiers.items():
    print(f"\n--- Evaluating: {name} ---")

    # Create the full pipeline: Preprocessing -> Classifier
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('classifier', classifier)])

    # Train the model
    print("Training model...")
    model_pipeline.fit(X_train, y_train)
    print("Model training complete.")

    # Make predictions on the test set
    print("Making predictions...")
    y_pred = model_pipeline.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    # Get all unique labels from training and testing sets for the report
    labels = sorted(y_train['attack_cat'].unique())
    report = classification_report(y_test, y_pred, labels=labels, zero_division=0)
    # cm = confusion_matrix(y_test, y_pred, labels=labels)

    print(f"\nAccuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)
    print("\nConfusion Matrix:")
    #print(cm)
    print(f"----------------------------------------")


--- Evaluating: Decision Tree ---
Training model...
Model training complete.
Making predictions...

Accuracy: 0.8666

Classification Report:
                precision    recall  f1-score   support

      Analysis       0.22      0.22      0.22       538
      Backdoor       0.18      0.21      0.19       471
           DoS       0.38      0.40      0.39      3293
      Exploits       0.74      0.72      0.73      8853
       Fuzzers       0.86      0.86      0.86      4742
       Generic       0.98      0.98      0.98     11849
        Normal       1.00      1.00      1.00     18675
Reconnaissance       0.77      0.77      0.77      2770
     Shellcode       0.59      0.54      0.56       318
         Worms       0.26      0.46      0.33        26

      accuracy                           0.87     51535
     macro avg       0.60      0.62      0.60     51535
  weighted avg       0.87      0.87      0.87     51535


Confusion Matrix:
----------------------------------------

--- Evalua

  return fit_method(estimator, *args, **kwargs)


Model training complete.
Making predictions...

Accuracy: 0.8782

Classification Report:
                precision    recall  f1-score   support

      Analysis       0.26      0.23      0.25       538
      Backdoor       0.13      0.14      0.14       471
           DoS       0.40      0.38      0.39      3293
      Exploits       0.75      0.79      0.77      8853
       Fuzzers       0.88      0.89      0.88      4742
       Generic       1.00      0.98      0.99     11849
        Normal       1.00      1.00      1.00     18675
Reconnaissance       0.80      0.77      0.79      2770
     Shellcode       0.65      0.59      0.62       318
         Worms       0.50      0.31      0.38        26

      accuracy                           0.88     51535
     macro avg       0.64      0.61      0.62     51535
  weighted avg       0.88      0.88      0.88     51535


Confusion Matrix:
----------------------------------------

--- Evaluating: KNeighborsClassifier ---
Training model...


  return self._fit(X, y)


Model training complete.
Making predictions...

Accuracy: 0.8302

Classification Report:
                precision    recall  f1-score   support

      Analysis       0.19      0.12      0.15       538
      Backdoor       0.15      0.05      0.08       471
           DoS       0.31      0.35      0.33      3293
      Exploits       0.62      0.69      0.66      8853
       Fuzzers       0.74      0.77      0.76      4742
       Generic       0.99      0.97      0.98     11849
        Normal       1.00      1.00      1.00     18675
Reconnaissance       0.68      0.55      0.61      2770
     Shellcode       0.47      0.15      0.23       318
         Worms       0.50      0.12      0.19        26

      accuracy                           0.83     51535
     macro avg       0.57      0.48      0.50     51535
  weighted avg       0.83      0.83      0.83     51535


Confusion Matrix:
----------------------------------------
