# Train Model on 5 C2s + Normal, test on remaining C2

This uses random forest

In [1]:
import os
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, roc_curve, auc
)


## Define Training/Test CSVs

In [2]:
TRAIN_CSV_PATHS = [
    "../../data/normal_only/normal_1.csv",
    "../../data/c2_only/covenant_only.csv",
    "../../data/c2_only/empire_only.csv",
    "../../data/c2_only/sliver_only.csv",
    "../../data/c2_only/merlin_only.csv",
    "../../data/c2_only/metasploit_only.csv"
]



TEST_CSV_PATHS = [
   "../../data/c2_only/posh_only.csv",  # Held-out C2
]

DATASET_NAME = "LOGO_posh"
FRAMEWORKS = ["Covenant", "Empire", "Sliver", "Merlin", "Metasploit"]

NOTES = (
    f"Leave-One-Group-Out (LOGO) evaluation: the test set includes C2 traffic from the held-out {TEST_CSV_PATHS[0].split('/')[-1].replace('_only.csv', '')} framework, "
    "combined with sampled normal traffic (80:20 ratio). The training set includes C2 traffic from the remaining frameworks and sampled normal traffic, also at 80:20. "
    "This setup tests how well the model generalizes to unseen C2 traffic under realistic class distributions."
)




In [3]:
# === Load C2 training files ===
c2_train_paths = TRAIN_CSV_PATHS[1:]  # Skip the first (normal)
c2_train_df = pd.concat([pd.read_csv(path) for path in c2_train_paths], ignore_index=True)

# === Load normal traffic ===
normal_path = TRAIN_CSV_PATHS[0]
normal_df = pd.read_csv(normal_path)

# === Subsample training normal to match 80:20 ratio ===
desired_train_ratio = 0.8
normal_train_needed = int((len(c2_train_df) / (1 - desired_train_ratio)) * desired_train_ratio)
normal_train_sample = normal_df.sample(n=normal_train_needed, random_state=42)

train_df = pd.concat([c2_train_df, normal_train_sample], ignore_index=True)

# === Load and balance test set ===
c2_test_df = pd.read_csv(TEST_CSV_PATHS[0])
desired_test_ratio = 0.8
normal_test_needed = int((len(c2_test_df) / (1 - desired_test_ratio)) * desired_test_ratio)
normal_test_sample = normal_df.sample(n=normal_test_needed, random_state=42)

test_df = pd.concat([c2_test_df, normal_test_sample], ignore_index=True)

# === Sanity Check ===
print(f"Training samples: {train_df.shape[0]}")
print("Train class counts:\n", train_df["c2_label"].value_counts())
print(f"Test samples: {test_df.shape[0]}")
print("Test class counts:\n", test_df["c2_label"].value_counts())


Training samples: 253395
Train class counts:
 c2_label
0    202716
1     50679
Name: count, dtype: int64
Test samples: 51210
Test class counts:
 c2_label
0    40968
1    10242
Name: count, dtype: int64


## Preprocessing

In [4]:
# Split predictors and target
X_train = train_df.drop(columns=["c2_label"])
y_train = train_df["c2_label"]

X_test = test_df.drop(columns=["c2_label"])
y_test = test_df["c2_label"]

# Drop frame len field
X_train = X_train.drop(columns=["frame.len"])
X_test = X_test.drop(columns=["frame.len"])

# Normalize using the training set stats
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


## Train

In [5]:
# Train the Model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)


In [6]:
# Evaluate Model Performance
y_pred = model.predict(X_test_scaled)
y_probs = model.predict_proba(X_test_scaled)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9172
Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     40968
           1       1.00      0.59      0.74     10242

    accuracy                           0.92     51210
   macro avg       0.95      0.79      0.85     51210
weighted avg       0.92      0.92      0.91     51210



## Reporting

Don't change anything here. Only adjust the second code cell.

In [7]:
# === LOGO Report Generation ===
import os

# === Config ===
REPORT_DIR = "leave_one_out_reports_rf_no_frame_len"
IMG_DIR = os.path.join(REPORT_DIR, "images")
os.makedirs(REPORT_DIR, exist_ok=True)
os.makedirs(IMG_DIR, exist_ok=True)

EXPERIMENT_ID = f"{DATASET_NAME}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

# === Class Counts ===
class_counts = test_df["c2_label"].value_counts()

# === Plot 1: Pie Chart of Class Distribution ===
plt.figure(figsize=(5, 5))
labels = ["Normal", "C2"]
sizes = [class_counts.get(0, 0), class_counts.get(1, 0)]
colors = ["green", "red"]
explode = (0, 0.1)

def autopct_format(pct, all_vals):
    count = int(round(pct/100. * sum(all_vals)))
    return f"{pct:.1f}%\n({count})"

plt.pie(
    sizes,
    labels=labels,
    autopct=lambda pct: autopct_format(pct, sizes),
    startangle=90,
    colors=colors,
    explode=explode,
    textprops={"fontsize": 10}
)
plt.title("Class Distribution in Test Set")
pie_path = os.path.join(IMG_DIR, f"{EXPERIMENT_ID}_pie.png")
plt.savefig(pie_path)
plt.close()

# === Plot 2: Confusion Matrix ===
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Normal", "C2"], yticklabels=["Normal", "C2"])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
conf_path = os.path.join(IMG_DIR, f"{EXPERIMENT_ID}_confusion.png")
plt.savefig(conf_path)
plt.close()

# === Plot 3: Feature Importance ===
feature_importance = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=False)

plt.figure(figsize=(10, 5))
sns.barplot(x="Importance", y="Feature", data=feature_importance)
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.title("Feature Importance in Random Forest")
feat_path = os.path.join(IMG_DIR, f"{EXPERIMENT_ID}_feature_importance.png")
plt.tight_layout()
plt.savefig(feat_path)
plt.close()

# === Plot 4: ROC Curve ===
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, color="blue", lw=2, label=f"AUC = {roc_auc:.2f}")
plt.plot([0, 1], [0, 1], color="gray", linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
roc_path = os.path.join(IMG_DIR, f"{EXPERIMENT_ID}_roc.png")
plt.savefig(roc_path)
plt.close()

# === Plot 5: Prediction Probabilities by Class ===
plt.figure(figsize=(8, 5))
sns.histplot(y_probs[y_test == 0], bins=20, kde=True, color="green", label="Normal (Label 0)", stat="count", alpha=0.5)
sns.histplot(y_probs[y_test == 1], bins=20, kde=True, color="red", label="C2 (Label 1)", stat="count", alpha=0.5)
plt.xlabel("Predicted Probability of C2")
plt.ylabel("Count")
plt.title("Prediction Probability Distribution by True Class")
plt.legend()
plt.grid(True)
plt.tight_layout()
hist_path = os.path.join(IMG_DIR, f"{EXPERIMENT_ID}_hist.png")
plt.savefig(hist_path)
plt.close()

# === Write Markdown Report ===
report_path = os.path.join(REPORT_DIR, f"{EXPERIMENT_ID}.md")
with open(report_path, "w") as f:
    f.write(f"# LOGO Report: {DATASET_NAME}\n\n")
    f.write(f"- **Experiment ID:** {EXPERIMENT_ID}\n")
    f.write(f"- **Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"- **Frameworks (Train):** {', '.join(FRAMEWORKS)}\n")
    f.write(f"- **Framework (Test):** {TEST_CSV_PATHS[0].split('/')[-1].replace('_only.csv', '')}\n\n")

    f.write("## Notes\n")
    f.write(f"{NOTES}\n\n")

    f.write("## Test Set Class Distribution\n")
    f.write(f"![Pie Chart](images/{os.path.basename(pie_path)})\n\n")

    f.write("## Confusion Matrix\n")
    f.write(f"![Confusion Matrix](images/{os.path.basename(conf_path)})\n\n")

    f.write("## Classification Report\n")
    f.write("```\n")
    f.write(classification_report(y_test, y_pred))
    f.write("```\n\n")

    f.write("## ROC Curve\n")
    f.write(f"![ROC Curve](images/{os.path.basename(roc_path)})\n\n")

    f.write("## Feature Importance\n")
    f.write(f"![Feature Importance](images/{os.path.basename(feat_path)})\n\n")

    f.write("## Prediction Probability Distribution by True Class\n")
    f.write(f"![Prediction Histogram](images/{os.path.basename(hist_path)})\n")

print(f"Markdown LOGO report saved: {report_path}")


Markdown LOGO report saved: leave_one_out_reports_rf_no_frame_len\LOGO_posh_2025-04-11_15-49-53.md
