# Inference on Mixed Traffic

The completed model was trained on normal and attack/beaconing traffic from six C2 frameworks. A 20:80 C2-to-normal traffic ratio was enforced in the training data. C2 traffic was isolated and then merged. This notebook evaluates how well the trained model can detect C2 traffic mixed into normal activity. The mixed PCAPs used for testing contain an approximately 10:90 C2-to-normal traffic ratio, reflecting more realistic conditions.

## Imports and Setup

In [57]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import os


## Load Saved Model

In [58]:
# Restore the trained model, its scaler, and the expected feature column order.
# NOTE: joblib.load unpickles its input, so only point MODEL_DIR at trusted artifacts.
MODEL_DIR = "./experiment_reports_20_80_ratio_enforced/models"


def _load_artifact(filename):
    """Deserialize a single joblib artifact stored under MODEL_DIR."""
    return joblib.load(os.path.join(MODEL_DIR, filename))


model = _load_artifact("combined_6_rf_model.joblib")
scaler = _load_artifact("combined_6_scaler.joblib")
feature_columns = _load_artifact("combined_6_feature_columns.joblib")


## Evaluation

In [59]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

def evaluate_on_mixed_csv(csv_path, model, scaler, feature_columns, thresholds=(0.5,)):
    """Score one labelled CSV with a trained classifier and print its metrics.

    Parameters
    ----------
    csv_path : str
        Path to a CSV that contains every column named in ``feature_columns``
        plus a ``c2_label`` column with the ground-truth labels.
    model : object
        Fitted classifier exposing ``predict_proba``. Column 1 of its output
        is used as the positive-class score.
        NOTE(review): assumes the model's class order is [0, 1] - confirm.
    scaler : object
        Fitted transformer exposing ``transform``; applied to the selected
        features before prediction.
    feature_columns : sequence of str
        Feature names, in the order the scaler and model expect.
    thresholds : iterable of float, default ``(0.5,)``
        Decision thresholds. A row is predicted positive when its score is
        strictly greater than the threshold.

    Returns
    -------
    dict
        ``name`` (CSV basename), ``auc``, ``y_true``, ``y_probs`` and
        ``reports``: a dict keyed by threshold whose values hold
        ``threshold``, ``accuracy`` and the text ``report``.
        ``auc`` is NaN when ``c2_label`` holds a single class, because
        ROC AUC is undefined in that case.

    Raises
    ------
    KeyError
        If the CSV lacks any of ``feature_columns`` or the ``c2_label`` column.
    """
    csv_name = os.path.basename(csv_path)
    print(f"== Evaluating {csv_name} ==")

    # Load and align features
    df = pd.read_csv(csv_path)
    feature_list = list(feature_columns)

    # Fail early with one message naming the file and every absent column,
    # instead of a generic pandas indexing error.
    missing = [col for col in feature_list + ["c2_label"] if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_name} is missing required column(s): {missing}")

    X = df[feature_list]
    y = df["c2_label"]

    # Scale features
    X_scaled = scaler.transform(X)

    # Predict probabilities
    y_probs = model.predict_proba(X_scaled)[:, 1]

    # AUC is always based on probability. It is undefined when only one class
    # is present, so report NaN rather than aborting the whole evaluation.
    if y.nunique() > 1:
        auc = roc_auc_score(y, y_probs)
    else:
        auc = float("nan")
        print("AUC undefined: c2_label contains a single class")
    print(f"AUC: {auc:.4f}")

    all_reports = {}

    # Evaluate across all requested thresholds
    for thresh in thresholds:
        y_pred = (y_probs > thresh).astype(int)
        acc = accuracy_score(y, y_pred)
        report = classification_report(y, y_pred, digits=4)

        print(f"\n-- Threshold: {thresh} --")
        print(f"Accuracy: {acc:.4f}")
        print(report)

        all_reports[thresh] = {
            "threshold": thresh,
            "accuracy": acc,
            "report": report
        }

    return {
        "name": csv_name,
        "auc": auc,
        "y_true": y,
        "y_probs": y_probs,
        "reports": all_reports  # stores report for each threshold
    }


In [60]:
# Mixed-traffic CSV files to score with the loaded model
MIXED_CSV_PATHS = [
    "../../data/mixed/merlin_mixed_0.csv",
    "../../data/mixed/sliver_mixed_0.csv",
    "../../data/mixed/sliver_mixed_1.csv"
]

# Decision thresholds to compare, named so they can be tuned in one place
EVAL_THRESHOLDS = [0.3, 0.4, 0.5]

# Evaluate: one result dict per CSV, in the same order as MIXED_CSV_PATHS.
# A comprehension keeps loop temporaries out of the kernel namespace.
results = [
    evaluate_on_mixed_csv(path, model, scaler, feature_columns, thresholds=EVAL_THRESHOLDS)
    for path in MIXED_CSV_PATHS
]



== Evaluating merlin_mixed_0.csv ==
AUC: 0.9703

-- Threshold: 0.3 --
Accuracy: 0.9745
              precision    recall  f1-score   support

           0     0.9953    0.9786    0.9869     51746
           1     0.4084    0.7620    0.5318      1004

    accuracy                         0.9745     52750
   macro avg     0.7019    0.8703    0.7593     52750
weighted avg     0.9841    0.9745    0.9782     52750


-- Threshold: 0.4 --
Accuracy: 0.9815
              precision    recall  f1-score   support

           0     0.9945    0.9867    0.9906     51746
           1     0.5106    0.7171    0.5965      1004

    accuracy                         0.9815     52750
   macro avg     0.7526    0.8519    0.7935     52750
weighted avg     0.9853    0.9815    0.9831     52750


-- Threshold: 0.5 --
Accuracy: 0.9842
              precision    recall  f1-score   support

           0     0.9932    0.9907    0.9919     51746
           1     0.5744    0.6494    0.6096      1004

    accuracy     

## Summary

In [61]:
import pandas as pd
from sklearn.metrics import classification_report

def _c2_summary_row(result, cutoff):
    """Build one summary record: C2-class F1/recall of `result` at `cutoff`, plus its AUC."""
    predicted = (result["y_probs"] > cutoff).astype(int)
    per_class = classification_report(result["y_true"], predicted, output_dict=True)
    c2_metrics = per_class["1"]  # metrics for the class labelled 1 in c2_label
    return {
        "Framework": result["name"].replace("_mixed", "").replace(".csv", ""),
        "Threshold": cutoff,
        "F1 (C2)": round(c2_metrics["f1-score"], 4),
        "Recall (C2)": round(c2_metrics["recall"], 4),
        "AUC": round(result["auc"], 4),
    }


# One row per (evaluated file, threshold) pair
summary_rows = [
    _c2_summary_row(result, cutoff)
    for result in results
    for cutoff in result["reports"]
]

summary_df = (
    pd.DataFrame(summary_rows)
    .sort_values(["Framework", "Threshold"])
    .reset_index(drop=True)
)

def color_grade(val):
    """Return the CSS cell style for a metric value.

    Values of at least 0.90 are green (good), values of at least 0.70 are
    yellow (fair), and everything else is red (poor).
    """
    bands = (
        (0.90, "background-color: #c6f6d5; color: black"),  # green, good
        (0.70, "background-color: #fefcbf; color: black"),  # yellow, fair
    )
    for floor, css in bands:
        if val >= floor:
            return css
    return "background-color: #feb2b2; color: black"  # red, poor

# Styler.applymap is deprecated since pandas 2.1 in favour of Styler.map and
# emits a FutureWarning. Prefer the new name; fall back on older pandas.
_summary_styler = summary_df.style
_style_elementwise = (
    _summary_styler.map if hasattr(_summary_styler, "map") else _summary_styler.applymap
)
styled_summary = _style_elementwise(color_grade, subset=["F1 (C2)", "Recall (C2)", "AUC"])

styled_summary


  styled_summary = summary_df.style.applymap(color_grade, subset=["F1 (C2)", "Recall (C2)", "AUC"])


Unnamed: 0,Framework,Threshold,F1 (C2),Recall (C2),AUC
0,merlin_0,0.3,0.5318,0.762,0.9703
1,merlin_0,0.4,0.5965,0.7171,0.9703
2,merlin_0,0.5,0.6096,0.6494,0.9703
3,sliver_0,0.3,0.7306,0.7109,0.9555
4,sliver_0,0.4,0.7147,0.6261,0.9555
5,sliver_0,0.5,0.6667,0.5452,0.9555
6,sliver_1,0.3,0.5109,0.6492,0.9137
7,sliver_1,0.4,0.5181,0.5723,0.9137
8,sliver_1,0.5,0.5175,0.5015,0.9137


## Reporting

In [62]:
from datetime import datetime
import os

# Ensure directory exists
REPORT_DIR = "inference_reports"
os.makedirs(REPORT_DIR, exist_ok=True)

MODEL_USED = "combined_6_rf"

notes = "Used the `combined_6_rf` model to evaluate mixed traffic from various frameworks.\n\n"

# Read the clock once so the filename stamp and the "Generated" line always agree
generated_at = datetime.now()

# Define export path
timestamp = generated_at.strftime("%Y-%m-%d_%H-%M-%S")
report_path = os.path.join(REPORT_DIR, f"{MODEL_USED}_mixed_inference_report_{timestamp}.md")

# Explicit UTF-8 so the report does not depend on the platform's default encoding
with open(report_path, "w", encoding="utf-8") as f:
    f.write("# Mixed Traffic Inference Summary\n\n")
    f.write(f"**Generated:** {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n\n")

    # Notes Section
    f.write("## Notes\n")
    f.write(notes)

    # Results Section: one heading per evaluated CSV, one sub-heading per threshold
    for r in results:
        f.write(f"## {r['name']}\n\n")
        f.write(f"- AUC (threshold-independent): `{r['auc']:.4f}`\n\n")

        for thresh, metrics in r["reports"].items():
            # ":g" keeps "0.3" as-is but does not collapse e.g. 0.35 and 0.4
            # into the same label the way ":.1f" would.
            f.write(f"### Threshold {thresh:g}\n")
            f.write(f"- Accuracy: `{metrics['accuracy']:.4f}`\n")
            f.write("```\n")
            f.write(metrics["report"])
            f.write("```\n\n")

    # Summary
    # NOTE(review): the table lists every framework/threshold row of summary_df,
    # not only the best one - confirm the heading wording is intended.
    f.write("---\n")
    f.write("## Summary Table (Best per Threshold)\n\n")
    f.write("Note: Conditional formatting is only visible in the Jupyter Notebook.\n\n")
    # DataFrame.to_markdown needs the optional `tabulate` package
    f.write(summary_df.to_markdown(index=False))
    f.write("\n")

print(f"[+] Report saved to: {report_path}")


[+] Report saved to: inference_reports\combined_6_rf_mixed_inference_report_2025-04-14_19-39-04.md
