In [None]:
# Notebook shell commands: install/upgrade the third-party packages used below.
# jupyter + ipywidgets are upgraded first (presumably for the custom widget
# manager enabled in the next cell -- confirm).
!pip install --upgrade jupyter ipywidgets
# shap: model explanation plots at the end of the notebook.
!pip install shap
# catboost: provides CatBoostClassifier, one of the stacked base learners.
!pip install catboost
# imbalanced-learn: provides SMOTE (imported as `imblearn`).
!pip install imbalanced-learn
# joblib: used to persist feature names, test data and the trained model.
!pip install --upgrade joblib

In [None]:
# Enable Colab's custom widget manager for better visualization support.
# NOTE: `google.colab` is only importable inside a Google Colab runtime, so
# this cell will fail elsewhere.
from google.colab import output
output.enable_custom_widget_manager()

# Import necessary libraries
import joblib
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import shap

# ---------------------------------------------------------------------------
# Load the per-attack CSV files and stack them into one DataFrame
# ---------------------------------------------------------------------------
folder_path = r'/content/'  # Adjust to your local path
csv_files = ['at1.csv', 'at16.csv', 'at2.csv', 'at4.csv', 'at8.csv']
frames = []
for csv_name in csv_files:
    frames.append(pd.read_csv(os.path.join(folder_path, csv_name)))
df = pd.concat(frames, ignore_index=True)

# ---------------------------------------------------------------------------
# Clean: trim whitespace from column names, then drop every row that contains
# an infinite or missing value (infinities are first converted to NaN).
# ---------------------------------------------------------------------------
df.columns = df.columns.str.strip()
df = df.replace([np.inf, -np.inf], np.nan)
df.dropna(inplace=True)

# ---------------------------------------------------------------------------
# Replace the numeric attacker codes with human-readable class names
# ---------------------------------------------------------------------------
attacker_type_names = {
    0: "BENIGN",
    1: "Constant Attack",
    2: "Constant Offset Attack",
    4: "Random Attack",
    8: "Random Offset Attack",
    16: "Eventual stop Attack",
}
df['AttackerType'] = df['AttackerType'].replace(attacker_type_names)

# ---------------------------------------------------------------------------
# Separate the features (every column except the label) from the label
# ---------------------------------------------------------------------------
X = df.drop(columns=['AttackerType'])
y = df['AttackerType']

# Keep the full list of feature column names; it is persisted so the SHAP
# section can label the selected features later on.
feature_names = X.columns.tolist()
joblib.dump(feature_names, '/content/feature_names.joblib')
print("Feature names saved")

# Hold out 20% of the rows for testing; stratify so both splits keep the
# label proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Min-max normalisation: the scaler learns its ranges from the training split
# only and the same ranges are then applied to the test split.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Integer-encode the class names (encoder is fitted on the training labels).
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Keep the 10 features with the highest chi-squared score w.r.t. the label.
# The selector is fitted on the (scaled) training data and reused for test.
sel_top_cols1 = SelectKBest(score_func=chi2, k=10)
X_train_selected = sel_top_cols1.fit_transform(X_train, y_train)
X_test_selected = sel_top_cols1.transform(X_test)

# Persist the positions of the selected columns for the SHAP analysis.
selected_feature_indices = sel_top_cols1.get_support(indices=True)
joblib.dump(selected_feature_indices, '/content/selected_feature_indices.joblib')
print("Selected feature indices saved")

# Oversample the training data with SMOTE so the classes are balanced.
# The test split is intentionally left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_selected, y_train)

# Define base learners.
# random_state is pinned on the random forest so repeated runs produce the
# same model, consistent with the fixed seeds already used for the train/test
# split and for SMOTE.
estimators = [
    ('rf', RandomForestClassifier(
        n_estimators=50, max_depth=20, min_samples_split=10,
        min_samples_leaf=2, criterion='entropy', n_jobs=-1,
        random_state=42)),
    ('catboost', CatBoostClassifier(
        iterations=100, learning_rate=0.1, depth=6, verbose=0))
]

# Define the stacking classifier: the base learners' out-of-fold predictions
# (3 internal folds) are the only inputs of the logistic-regression
# meta-learner (passthrough=False).
clf = StackingClassifier(
    cv=3,
    estimators=estimators,
    final_estimator=LogisticRegression(solver='lbfgs', max_iter=1000),
    stack_method='auto',
    passthrough=False,
    n_jobs=-1
)

# Perform cross-validation.
# SMOTE must run *inside* each CV fold: cross-validating on data that was
# oversampled beforehand lets synthetic points interpolated from a fold's
# validation rows end up in its training part, which inflates the score.
# The imblearn Pipeline resamples only the training part of every fold and
# scores on untouched validation rows.
# NOTE: the scaler and feature selector were still fitted on the whole
# training split upstream, so this estimate remains slightly optimistic.
from imblearn.pipeline import Pipeline as ImbPipeline

cv_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('stack', clf),
])
cv_scores = cross_val_score(cv_pipeline, X_train_selected, y_train, cv=3, scoring='accuracy')
print(f"Cross-Validation Accuracy: {cv_scores.mean() * 100:.2f}% ± {cv_scores.std() * 100:.2f}%")

# Train the final model on the full SMOTE-balanced training data and predict
# on the held-out (never resampled) test split.
# cross_val_score works on clones, so `clf` is still unfitted at this point.
clf.fit(X_train_smote, y_train_smote)
y_pred = clf.predict(X_test_selected)

# Overall accuracy on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1, labelled with the original class names
report = classification_report(y_test, y_pred, target_names=le.classes_)
print("\nClassification Report:\n", report)

# Confusion matrix rendered as an annotated heatmap
# (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

# Detection accuracy per class: correctly predicted samples of a class
# (diagonal entry) divided by all actual samples of that class (row sum).
class_accuracy = np.diag(cm) / cm.sum(axis=1)
for class_label, class_rate in zip(le.classes_, class_accuracy):
    print(f"Detection Accuracy for {class_label}: {class_rate * 100:.2f}%")

# --- Persist artifacts needed by the SHAP section --------------------------

# Selected-feature test matrix
test_data_filename = '/content/X_test_selected.joblib'
joblib.dump(X_test_selected, test_data_filename)
print("Test data saved for SHAP analysis.")

# Fitted stacking model
model_filename = '/content/stacking_model.joblib'
joblib.dump(clf, model_filename)
print(f"Model saved to {model_filename}")

# --- Reload everything from disk -------------------------------------------
# The SHAP analysis below works on the reloaded objects: the model, the
# selected test data, the selected column positions and the full feature
# name list (the latter two were written earlier in the notebook).
clf = joblib.load(model_filename)
X_test_selected = joblib.load(test_data_filename)
selected_feature_indices = joblib.load('/content/selected_feature_indices.joblib')
feature_names = joblib.load('/content/feature_names.joblib')

# Names of the features kept by the selector, in the same column order as
# X_test_selected (the full name list indexed by the selected positions).
selected_feature_names = np.array(feature_names)[selected_feature_indices]

# SHAP analysis for CatBoostClassifier (the fitted base learner taken from
# the stacking model).
catboost_model = clf.named_estimators_['catboost']
explainer = shap.Explainer(catboost_model)
shap_values = explainer(X_test_selected)
# show=False: summary_plot otherwise calls plt.show() itself, so a title set
# afterwards would land on a new, empty figure instead of the SHAP plot.
shap.summary_plot(shap_values, X_test_selected,
                  feature_names=selected_feature_names, show=False)

# Set a title on the SHAP figure, then display it
plt.title("SHAP analysis for CatBoostClassifier")
plt.show()

# SHAP analysis for RandomForestClassifier (same procedure as above)
rf_model = clf.named_estimators_['rf']
rf_explainer = shap.Explainer(rf_model)
rf_shap_values = rf_explainer(X_test_selected)
shap.summary_plot(rf_shap_values, X_test_selected,
                  feature_names=selected_feature_names, show=False)

# Set a title on the SHAP figure, then display it
plt.title("SHAP analysis for RandomForestClassifier")
plt.show()
