# imports

### Bootstrap

In [21]:
# bootstrap.py
import os
import warnings
import logging

# --- TensorFlow env vars ---
# Set in this bootstrap cell so the values exist before the tf_keras
# imports in the next cell are executed.
os.environ.update({
    "TF_CPP_MIN_LOG_LEVEL": "3",
    "TF_ENABLE_ONEDNN_OPTS": "0",
})

# --- Python warnings ---
# Filters are registered in the same order as before: category filters
# first, then the message-pattern filter.
for _ignored_category in (FutureWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)
del _ignored_category
warnings.filterwarnings("ignore", message=".*np.object.*")

# --- Logging ---
# Only ERROR and above are emitted by these two loggers.
for _noisy_logger in ("tensorflow", "keras"):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)
del _noisy_logger


### Remaining imports

In [22]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from abc import ABC, abstractmethod
from sklearn.ensemble import IsolationForest
from tf_keras.models import Model
from tf_keras.layers import Input, Dense
from tf_keras.optimizers import Adam
from tf_keras.callbacks import EarlyStopping
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
)


# Preprocessing

### Prep

In [23]:

# -----------------------------
# Encode labels
# -----------------------------
def encode_labels(y, label_map=None):
    """
    Encode string labels to numeric values.

    y         : pandas Series of labels.
    label_map : dict mapping each label to its numeric code. When omitted
                (or None), {'benign': 0, 'suspicious': 1} is used.

    Returns: Series produced by ``y.map(label_map)``; labels that are not
    keys of the mapping become NaN.
    """
    # A None sentinel replaces the former dict default so that no mutable
    # object is shared between calls.
    if label_map is None:
        label_map = {'benign': 0, 'suspicious': 1}
    return y.map(label_map)

# -----------------------------
# Timestamp processing (fully cyclic)
# -----------------------------
def process_timestamp(X, timestamp_col):
    """
    Convert a timestamp column to datetime and derive calendar features.

    The input frame is not modified; a copy is returned in which:
    - 'year' and 'is_weekend' (1 for Saturday/Sunday) are added,
    - sine/cosine pairs are added for hour, minute, day of month,
      day of week and month,
    - the raw timestamp column and the intermediate 'hour', 'minute',
      'day', 'dayofweek' and 'month' columns are dropped.

    Values that cannot be parsed become NaT, so their derived numeric
    features are NaN. Parsing uses dayfirst=True.
    """
    X = X.copy()

    # -----------------------------
    # Convert to datetime
    # -----------------------------
    X[timestamp_col] = pd.to_datetime(X[timestamp_col], dayfirst=True, errors='coerce')
    ts = X[timestamp_col].dt

    # -----------------------------
    # Basic numeric features
    # -----------------------------
    X['year'] = ts.year
    X['month'] = ts.month
    X['day'] = ts.day
    X['hour'] = ts.hour
    X['minute'] = ts.minute
    # pandas uses Monday=0 ... Sunday=6; shift so that Sunday=0 ... Saturday=6
    X['dayofweek'] = (ts.dayofweek + 1) % 7
    X['is_weekend'] = X['dayofweek'].isin([0, 6]).astype(int)

    # -----------------------------
    # Cyclic encoding
    # -----------------------------
    # Hour
    X['hour_sin'] = np.sin(2 * np.pi * X['hour'] / 24)
    X['hour_cos'] = np.cos(2 * np.pi * X['hour'] / 24)
    # Minute
    X['minute_sin'] = np.sin(2 * np.pi * X['minute'] / 60)
    X['minute_cos'] = np.cos(2 * np.pi * X['minute'] / 60)
    # Day of month: take the actual month length from the timestamp itself,
    # so February has 29 days in leap years (a fixed 28-day table would push
    # 29 Feb past the end of the cycle).
    days_in_month = ts.days_in_month
    X['day_sin'] = np.sin(2 * np.pi * X['day'] / days_in_month)
    X['day_cos'] = np.cos(2 * np.pi * X['day'] / days_in_month)
    # Day of week
    X['dow_sin'] = np.sin(2 * np.pi * X['dayofweek'] / 7)
    X['dow_cos'] = np.cos(2 * np.pi * X['dayofweek'] / 7)
    # Month
    X['month_sin'] = np.sin(2 * np.pi * X['month'] / 12)
    X['month_cos'] = np.cos(2 * np.pi * X['month'] / 12)

    # -----------------------------
    # Drop raw columns
    # -----------------------------
    drop_cols = [timestamp_col, 'hour', 'minute', 'day', 'dayofweek', 'month']
    X = X.drop(columns=drop_cols, errors='ignore')  # tolerate already-missing columns

    return X


# -----------------------------
# Handle missing values
# -----------------------------
def handle_missing_values(X, numeric_cols, categorical_cols):
    """
    Impute missing values on a copy of X.

    Numeric columns receive their own column median; categorical columns
    receive the literal string 'Unknown'. The input frame is left untouched.
    """
    filled = X.copy()

    numeric_part = filled[numeric_cols]
    column_medians = numeric_part.median()
    filled[numeric_cols] = numeric_part.fillna(column_medians)

    categorical_part = filled[categorical_cols]
    filled[categorical_cols] = categorical_part.fillna('Unknown')

    return filled

# -----------------------------
# Handle outliers
# -----------------------------
def handle_outliers(X, numeric_cols):
    """
    Clip each numeric column to its IQR fences on a copy of X.

    For every column the fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR; values
    outside are replaced by the nearest fence. Rows are never removed.
    """
    clipped = X.copy()
    numeric_part = clipped[numeric_cols]

    first_quartile = numeric_part.quantile(0.25)
    third_quartile = numeric_part.quantile(0.75)
    spread = third_quartile - first_quartile

    clipped[numeric_cols] = numeric_part.clip(
        lower=first_quartile - 1.5 * spread,
        upper=third_quartile + 1.5 * spread,
        axis=1,
    )
    return clipped

# -----------------------------
# Remove highly correlated features
# -----------------------------
def remove_highly_correlated(X, numeric_cols, threshold=0.9):
    """
    Drop numeric features that are strongly correlated with an earlier one.

    Only the strict upper triangle of the absolute correlation matrix is
    inspected, so of each correlated pair the later column is the one removed.

    Returns: reduced X, updated numeric_cols, list of removed features
    """
    abs_corr = X[numeric_cols].corr().abs()

    # True strictly above the diagonal, False on and below it
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper_only = abs_corr.where(above_diagonal)

    exceeds = (upper_only > threshold).any(axis=0)
    to_drop = [col for col, flagged in exceeds.items() if flagged]

    reduced = X.copy().drop(columns=to_drop)
    kept_numeric = [col for col in numeric_cols if col not in to_drop]
    return reduced, kept_numeric, to_drop

# -----------------------------
# Embed command_text semantically
# -----------------------------
def embed_command_text(X, text_col='command_text', model_name='all-MiniLM-L6-v2'):
    """
    Convert a text column into semantic embeddings using a pretrained
    sentence transformer (loaded on CPU).

    X          : DataFrame containing text_col.
    text_col   : name of the column to embed.
    model_name : identifier passed to SentenceTransformer.

    Returns a DataFrame of embeddings with the same index as X and columns
    named '<text_col>_embed_<i>'.
    """
    # Missing entries would otherwise reach the encoder as float NaN;
    # encode them as empty strings and coerce everything else to str.
    texts = X[text_col].fillna('').astype(str).tolist()

    model = SentenceTransformer(model_name, device="cpu")
    embeddings = model.encode(texts, show_progress_bar=True)

    embed_cols = [f"{text_col}_embed_{i}" for i in range(embeddings.shape[1])]
    return pd.DataFrame(embeddings, columns=embed_cols, index=X.index)

# -----------------------------
# Full preprocessing pipeline
# -----------------------------
"""
X_structured, numeric_cols, y, X_text_emb = preprocess_data(
    df,
    timestamp_col="timestamp",
    label_col="is_anomaly",
    text_col="command_text",
    numeric_cols=None,       # auto-detect
    categorical_cols=None    # auto-detect
)
"""
def preprocess_data(
    X,
    timestamp_col=None,
    text_col=None,
    label_col=None,
    numeric_cols=None,
    categorical_cols=None
):
    """
    Run the full preprocessing pipeline on a copy of X.

    Steps, in order: auto-detect column groups (when not supplied), expand
    the timestamp into calendar features, impute missing values, clip
    outliers, drop highly correlated numeric features, encode the label,
    and embed the text column.

    timestamp_col, text_col, label_col : optional column names; a step is
        skipped when its column is None or absent from X.
    numeric_cols, categorical_cols : optional explicit column lists; when
        None they are detected from dtypes ('number' / 'object'), excluding
        the timestamp, text and label columns.

    Returns: (X_processed, numeric_cols, y_encoded, X_text_emb)
        X_processed still contains the text column and any non-numeric
        columns; y_encoded and X_text_emb are None when the corresponding
        column was not given or not found.
    """
    X = X.copy()

    # Columns with a dedicated role must not be treated as ordinary features.
    special_cols = {label_col, text_col, timestamp_col}

    # --------------------------
    # Auto-detect numeric/categorical if not provided
    # --------------------------
    if numeric_cols is None:
        # Excluding the special columns keeps a numeric label (or timestamp)
        # out of the feature list; otherwise it would be clipped, fed to the
        # models, and then missing from X once it is dropped below.
        detected = X.select_dtypes(include="number").columns.tolist()
        numeric_cols = [col for col in detected if col not in special_cols]
    else:
        # Work on a copy so the caller's list is never mutated
        numeric_cols = list(numeric_cols)

    if categorical_cols is None:
        detected = X.select_dtypes(include="object").columns.tolist()
        categorical_cols = [col for col in detected if col not in special_cols]
    else:
        categorical_cols = list(categorical_cols)

    # --------------------------
    # Process timestamp
    # --------------------------
    if timestamp_col and timestamp_col in X.columns:
        X = process_timestamp(X, timestamp_col)
        # Only the cyclic encodings are registered as numeric features;
        # 'year' and 'is_weekend' stay in X but are not part of numeric_cols.
        new_time_features = [
            'hour_sin', 'hour_cos',
            'minute_sin', 'minute_cos',
            'day_sin', 'day_cos',
            'dow_sin', 'dow_cos',
            'month_sin', 'month_cos'
        ]
        numeric_cols = numeric_cols + new_time_features

    # --------------------------
    # Handle missing values
    # --------------------------
    X = handle_missing_values(X, numeric_cols, categorical_cols)

    # --------------------------
    # Handle outliers
    # --------------------------
    X = handle_outliers(X, numeric_cols)

    # --------------------------
    # Remove highly correlated features
    # --------------------------
    X, numeric_cols, _ = remove_highly_correlated(X, numeric_cols)

    # --------------------------
    # Encode labels
    # --------------------------
    y_encoded = None
    if label_col and label_col in X.columns:
        y_encoded = encode_labels(X[label_col])
        X = X.drop(columns=[label_col])

    # --------------------------
    # Embed command_text
    # --------------------------
    X_text_emb = None
    if text_col and text_col in X.columns:
        X_text_emb = embed_command_text(X, text_col=text_col)

    return X, numeric_cols, y_encoded, X_text_emb


### Feature Extraction

In [24]:
# -----------------------------
# Isolation Forest
# -----------------------------
def features_for_isolation_forest(X_structured, numeric_cols):
    """
    Feature view for the Isolation Forest agent.

    Selects the numeric columns of X_structured as-is; no scaling or other
    transformation is applied here.
    """
    numeric_view = X_structured.loc[:, numeric_cols]
    return numeric_view


# -----------------------------
# SVM
# -----------------------------
def features_for_svm(X_structured, numeric_cols):
    """
    Feature view for the One-Class SVM agent.

    Standardizes the numeric columns with a StandardScaler fitted on the
    data passed in, and returns them as a DataFrame that keeps the original
    index and column names. The fitted scaler is not returned.
    """
    numeric_part = X_structured[numeric_cols]
    standardized = StandardScaler().fit_transform(numeric_part)
    return pd.DataFrame(
        data=standardized,
        index=X_structured.index,
        columns=numeric_cols,
    )


# -----------------------------
# Autoencoder
# -----------------------------
def features_for_autoencoder(X_structured, numeric_cols, X_text_emb=None, use_text=True):
    """
    Feature view for the Autoencoder agent.

    Returns the numeric columns of X_structured; when use_text is set and
    text embeddings are supplied, the embeddings are appended column-wise.
    """
    numeric_part = X_structured[numeric_cols]

    # Guard clause: nothing to append without embeddings or with text disabled
    if not use_text or X_text_emb is None:
        return numeric_part

    combined = pd.concat([numeric_part, X_text_emb], axis=1)
    return combined


# -----------------------------
# Meta-Agent Input
# -----------------------------
def features_for_meta_agent(model_outputs):
    """
    Build the meta-agent input table.

    The meta-agent never receives raw data; it receives model-level
    outputs (e.g. anomaly scores, reconstruction errors, decision margins),
    which are wrapped here in a DataFrame.
    """
    outputs_frame = pd.DataFrame(data=model_outputs)
    return outputs_frame


# Agents

### Base Agent

In [25]:
class BaseAgent(ABC):
    """
    Common interface shared by every anomaly-detection agent.

    Subclasses must implement fit() and score(); predict() and get_name()
    are provided here.
    """

    def __init__(self, name):
        # The underlying estimator is attached by the concrete subclass.
        self.model = None
        self.name = name

    @abstractmethod
    def fit(self, X):
        """Train the underlying model on X."""

    @abstractmethod
    def score(self, X):
        """
        Compute anomaly scores for X.
        Convention: a higher score means a more anomalous sample.
        """

    def predict(self, X, threshold=None):
        """
        Score X and, when a threshold is given, binarize the result.

        Without a threshold the raw scores are returned; otherwise 1 marks
        samples whose score is strictly above the threshold and 0 the rest.
        """
        anomaly_scores = self.score(X)

        if threshold is not None:
            return (anomaly_scores > threshold).astype(int)

        return anomaly_scores

    def get_name(self):
        """Return the name this agent was created with."""
        return self.name


### IF Agent

In [26]:
class IsolationForestAgent(BaseAgent):
    """
    Anomaly-detection agent backed by sklearn's IsolationForest.
    Scores follow the shared convention: higher = more anomalous.
    """

    def __init__(
        self,
        name="IsolationForest",
        n_estimators=100,
        max_samples="auto",
        contamination=0.05,
        random_state=42
    ):
        super().__init__(name)

        forest_params = {
            "n_estimators": n_estimators,
            "max_samples": max_samples,
            "contamination": contamination,
            "random_state": random_state,
        }
        self.model = IsolationForest(**forest_params)

    def fit(self, X):
        """
        Fit the forest on X (expected to hold numeric features only).
        """
        self.model.fit(X)

    def score(self, X):
        """
        Return anomaly scores for X (higher = more anomalous).
        """
        # decision_function is higher for normal samples and lower for
        # anomalous ones, so it is negated to match the agent convention.
        normality = self.model.decision_function(X)
        return -normality


### AE Agent

In [27]:
class AutoencoderAgent(BaseAgent):
    """
    Autoencoder-based anomaly detection agent.
    Anomaly score = per-sample mean squared reconstruction error.
    """

    def __init__(
        self,
        name="Autoencoder",
        input_dim=None,
        hidden_dims=(64, 32),
        latent_dim=16,
        learning_rate=1e-3,
        epochs=50,
        batch_size=32
    ):
        """
        :param input_dim: number of input features; inferred from the
                          training data in fit() when None
        :param hidden_dims: encoder layer widths (mirrored in the decoder)
        :param latent_dim: width of the bottleneck layer
        :param learning_rate: Adam learning rate
        :param epochs: maximum number of training epochs
        :param batch_size: training batch size
        """
        super().__init__(name)

        self.input_dim = input_dim
        self.hidden_dims = hidden_dims
        self.latent_dim = latent_dim
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size

        # Built in fit(), once the input width is known
        self.model = None

    def _build_model(self):
        """
        Build and compile a symmetric dense autoencoder (MSE loss, Adam).
        """
        inputs = Input(shape=(self.input_dim,))

        # Encoder
        x = inputs
        for dim in self.hidden_dims:
            x = Dense(dim, activation="relu")(x)

        latent = Dense(self.latent_dim, activation="relu")(x)

        # Decoder mirrors the encoder widths
        x = latent
        for dim in reversed(self.hidden_dims):
            x = Dense(dim, activation="relu")(x)

        # Linear output so reconstructions are not range-limited
        outputs = Dense(self.input_dim, activation="linear")(x)

        model = Model(inputs, outputs)
        model.compile(
            optimizer=Adam(learning_rate=self.learning_rate),
            loss="mse"
        )

        return model

    def fit(self, X):
        """
        Train the autoencoder to reconstruct X (intended to be mostly
        normal data). A fresh model is built on every call.
        """
        if self.input_dim is None:
            self.input_dim = X.shape[1]

        self.model = self._build_model()

        # No validation split is used, so early stopping watches the
        # training loss.
        early_stop = EarlyStopping(
            monitor="loss",
            patience=5,
            restore_best_weights=True
        )

        self.model.fit(
            X,
            X,
            epochs=self.epochs,
            batch_size=self.batch_size,
            shuffle=True,
            callbacks=[early_stop],
            verbose=0
        )

    def score(self, X):
        """
        Return anomaly scores as a 1-D numpy array, one per row of X.
        Higher = more anomalous.

        :raises RuntimeError: if called before fit()
        """
        if self.model is None:
            raise RuntimeError("AutoencoderAgent must be fitted before scoring")

        reconstructions = self.model.predict(X, verbose=0)

        # Work on a plain array so the result is an ndarray even when X is
        # a DataFrame, matching the other agents' score() output.
        squared_error = np.square(np.asarray(X) - reconstructions)

        # Mean Squared Error per sample
        return np.mean(squared_error, axis=1)


### SVM Agent

In [28]:
class SVMAgent(BaseAgent):
    """
    Anomaly-detection agent backed by sklearn's OneClassSVM.
    Scores follow the shared convention: higher = more anomalous.
    """

    def __init__(
        self,
        name="OneClassSVM",
        kernel="rbf",
        nu=0.05,
        gamma="scale"
    ):
        super().__init__(name)

        svm_params = {"kernel": kernel, "nu": nu, "gamma": gamma}
        self.model = OneClassSVM(**svm_params)

    def fit(self, X):
        """
        Fit the One-Class SVM on X (expected to hold numeric features only).
        """
        self.model.fit(X)

    def score(self, X):
        """
        Return anomaly scores for X (higher = more anomalous).
        """
        # decision_function is positive for inliers and negative for
        # outliers; negating it makes larger values mean "more anomalous".
        margins = self.model.decision_function(X)
        return -margins


### META Agent

In [29]:
class MetaAgent(BaseAgent):
    """
    Meta-agent that aggregates multiple anomaly detection agents.
    The final anomaly score is a weighted sum of the agents' scores,
    with weights normalized to sum to 1.
    """

    def __init__(self, agents, name="MetaAgent", weights=None):
        """
        :param agents: list of BaseAgent instances
        :param weights: optional list of weights for each agent;
                        equal weights are used when omitted
        :raises ValueError: if the number of weights differs from the
                            number of agents, or the weights sum to zero
        """
        super().__init__(name)
        self.agents = agents

        if weights is None:
            # Equal weight by default
            self.weights = np.ones(len(agents)) / len(agents)
        else:
            # Explicit raise instead of assert: asserts are stripped when
            # Python runs with -O, which would let bad weights through.
            if len(weights) != len(agents):
                raise ValueError("weights length must match number of agents")
            weight_array = np.asarray(weights, dtype=float)
            total = weight_array.sum()
            if total == 0:
                raise ValueError("weights must not sum to zero")
            self.weights = weight_array / total  # normalize to sum=1

    def fit(self, X_dict):
        """
        Fit all agents.
        :param X_dict: dict of {agent_name: X_features_for_agent}
        """
        for agent in self.agents:
            agent.fit(X_dict[agent.get_name()])

    def score(self, X_dict):
        """
        Return ensemble anomaly scores.
        :param X_dict: dict of {agent_name: X_features_for_agent}
        :return: np.array of final anomaly scores (higher = more anomalous)
        """
        # NOTE(review): per-agent scores are combined on their raw scales;
        # confirm whether they should be normalized before weighting.
        all_scores = np.array(
            [agent.score(X_dict[agent.get_name()]) for agent in self.agents]
        )  # shape: (num_agents, num_samples)

        # Weighted sum across agents
        return np.dot(self.weights, all_scores)

    def predict(self, X_dict, threshold=None):
        """
        Optional binary prediction based on ensemble score.
        Returns the raw scores when no threshold is given, otherwise
        1 where the score is strictly above the threshold and 0 elsewhere.
        """
        final_scores = self.score(X_dict)

        if threshold is None:
            return final_scores

        return (final_scores > threshold).astype(int)


# Main

In [30]:
def evaluate_model(y_true, y_pred, y_score=None):
    """
    Evaluate anomaly detection predictions and print a short report.

    y_true  : ground-truth binary labels.
    y_pred  : predicted binary labels.
    y_score : optional continuous anomaly scores. When given, ROC AUC is
              computed from these scores; otherwise it is computed from
              the binary predictions (the previous behavior).

    Returns a dict with keys 'accuracy', 'precision', 'recall', 'f1',
    'roc_auc' (None when it cannot be computed) and 'confusion_matrix'.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Hard labels give only a single ROC operating point, so continuous
    # scores are used for the AUC whenever the caller supplies them.
    auc_input = y_pred if y_score is None else y_score
    try:
        auc = roc_auc_score(y_true, auc_input)
    except ValueError:
        auc = None  # in case there are not enough classes

    cm = confusion_matrix(y_true, y_pred)

    print("=== Evaluation Metrics ===")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")
    if auc is not None:
        print(f"ROC AUC:   {auc:.4f}")
    print("Confusion Matrix:")
    print(cm)

    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "roc_auc": auc, "confusion_matrix": cm}


def main(
    data_path="data/enterprise_A.csv",
    output_path="anomaly_results.csv",
    anomaly_quantile=0.95,
):
    """
    Run the full pipeline: load, preprocess, fit the ensemble, score,
    threshold, evaluate (when labels exist) and save the results.

    data_path        : CSV file to read.
    output_path      : CSV file the scored rows are written to.
    anomaly_quantile : quantile of the ensemble score used as threshold;
                       rows strictly above it are flagged as anomalies.
    """
    # --------------------------
    # 1. Load your dataset
    # --------------------------
    df = pd.read_csv(data_path)

    # --------------------------
    # 2. Preprocess data
    # --------------------------
    # Preprocess numeric, categorical, timestamp, label, and embed text
    X_structured, numeric_cols, y, X_text_emb = preprocess_data(
        df,
        timestamp_col="timestamp",
        label_col="is_anomaly",
        text_col="command_text",
        numeric_cols=None,       # auto-detect
        categorical_cols=None    # auto-detect
    )

    # --------------------------
    # 3. Standardize numeric features
    # --------------------------
    print("Numeric columns used:", numeric_cols)
    scaler = StandardScaler()
    X_structured[numeric_cols] = scaler.fit_transform(X_structured[numeric_cols])

    # --------------------------
    # 4. Prepare features for each agent
    # --------------------------
    X_if = features_for_isolation_forest(X_structured, numeric_cols)
    # features_for_svm standardizes again; the input is already standardized
    X_svm = features_for_svm(X_structured, numeric_cols)
    X_ae = features_for_autoencoder(X_structured, numeric_cols, X_text_emb=X_text_emb)

    # Build feature dict for meta-agent (keys must equal the agents' names)
    X_dict = {
        "IsolationForest": X_if,
        "OneClassSVM": X_svm,
        "Autoencoder": X_ae
    }

    # --------------------------
    # 5. Initialize agents
    # --------------------------
    if_agent = IsolationForestAgent(contamination=0.05)
    svm_agent = SVMAgent(nu=0.05)
    ae_agent = AutoencoderAgent(input_dim=X_ae.shape[1], epochs=50, latent_dim=16)

    agents = [if_agent, svm_agent, ae_agent]

    # --------------------------
    # 6. Initialize meta-agent
    # --------------------------
    meta_agent = MetaAgent(agents, weights=[0.3, 0.3, 0.4])

    # --------------------------
    # 7. Fit all agents
    # --------------------------
    meta_agent.fit(X_dict)

    # --------------------------
    # 8. Compute final anomaly scores
    # --------------------------
    final_scores = meta_agent.score(X_dict)

    # --------------------------
    # 9. Threshold for binary anomalies
    # --------------------------
    # With the default 0.95, the top 5% of scores are flagged
    threshold = pd.Series(final_scores).quantile(anomaly_quantile)
    preds = (final_scores > threshold).astype(int)

    # Evaluation needs ground truth; skip it when the label column is absent
    if y is not None:
        evaluate_model(y, preds)

    # --------------------------
    # 10. Save results
    # --------------------------
    results = df.copy()
    results["anomaly_score"] = final_scores
    results["predicted_anomaly"] = preds

    results.to_csv(output_path, index=False)
    print(f"Anomaly detection complete. Results saved to {output_path}")


# Run the pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Batches: 100%|██████████| 32/32 [00:01<00:00, 19.06it/s]


Numeric columns used: ['command_length', 'num_arguments', 'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'day_sin', 'day_cos', 'dow_sin', 'dow_cos', 'month_sin', 'month_cos']
=== Evaluation Metrics ===
Accuracy:  0.9650
Precision: 0.6600
Recall:    0.6471
F1-score:  0.6535
ROC AUC:   0.8146
Confusion Matrix:
[[932  17]
 [ 18  33]]
Anomaly detection complete. Results saved to anomaly_results.csv
