In [None]:
import pandas as pd
import numpy as np

# Synthetic SOC alert dataset; the global NumPy RNG is seeded so every run
# draws exactly the same rows.
np.random.seed(42)
NUM_ALERTS = 1000

# Half-open [low, high) sampling range for each feature. The dict order fixes
# both the column order and the order in which the RNG is consumed.
feature_ranges = {
    "alert_count": (1, 10),
    "max_severity": (1, 6),
    "asset_criticality": (1, 6),
    "lateral_movement": (0, 2),
    "privilege_escalation": (0, 2),
    "malware_detected": (0, 2),
}
df = pd.DataFrame({
    column: np.random.randint(low, high, NUM_ALERTS)
    for column, (low, high) in feature_ranges.items()
})

# Target: weighted sum of the risk signals plus Gaussian noise (sigma = 5),
# clamped to [0, 100]. alert_count carries no weight in the score.
# NOTE(review): the weights allow an un-noised sum of up to 230, so the upper
# clamp at 100 is hit for every row whose sum exceeds it.
score_weights = {
    "max_severity": 20,
    "asset_criticality": 15,
    "lateral_movement": 20,
    "privilege_escalation": 20,
    "malware_detected": 15,
}
weighted_signal = sum(df[column] * weight for column, weight in score_weights.items())
noise = np.random.normal(0, 5, NUM_ALERTS)
df["priority_score"] = (weighted_signal + noise).clip(0, 100)

# Generate recommended actions (multi-label)
def actions_from_features(row):
    """Return the list of recommended response actions for one alert.

    Parameters
    ----------
    row : mapping
        Must provide "lateral_movement", "privilege_escalation",
        "malware_detected" and "asset_criticality".

    Returns
    -------
    list of str
        Action names in a fixed order: isolate_host, reset_credentials,
        run_malware_scan, escalate_tier2. Empty when no rule fires.
    """
    # (action, condition) pairs; tuple order defines the output order
    rules = (
        ("isolate_host", row["lateral_movement"]),
        ("reset_credentials", row["privilege_escalation"]),
        ("run_malware_scan", row["malware_detected"]),
        ("escalate_tier2", row["asset_criticality"] >= 4),
    )
    return [name for name, fired in rules if fired]

# Attach the rule-derived action list to every alert (row-wise apply)
df["actions"] = df.apply(actions_from_features, axis=1)

# Expand the list column into one 0/1 indicator column per known action
possible_actions = [
    "isolate_host",
    "reset_credentials",
    "run_malware_scan",
    "escalate_tier2",
]
for action in possible_actions:
    df[action] = df["actions"].map(lambda acts, wanted=action: int(wanted in acts))

df.head()


Unnamed: 0,alert_count,max_severity,asset_criticality,lateral_movement,privilege_escalation,malware_detected,priority_score,actions,isolate_host,reset_credentials,run_malware_scan,escalate_tier2
0,7,3,2,1,0,0,100.0,[isolate_host],1,0,0,0
1,4,5,3,0,0,1,100.0,[run_malware_scan],0,0,1,0
2,8,1,3,1,1,0,100.0,"[isolate_host, reset_credentials]",1,1,0,0
3,5,5,4,0,1,0,100.0,"[reset_credentials, escalate_tier2]",0,1,0,1
4,7,3,5,1,1,1,100.0,"[isolate_host, reset_credentials, run_malware_...",1,1,1,1


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np # Import numpy for sqrt

# Model inputs; priority_score is the regression target
features = [
    "alert_count",
    "max_severity",
    "asset_criticality",
    "lateral_movement",
    "privilege_escalation",
    "malware_detected",
]

X, y = df[features], df["priority_score"]

# Hold out 20% of the alerts for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

priority_model = XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.1)
priority_model.fit(X_train, y_train)

# Score on the held-out rows; RMSE is the square root of the mean squared error
y_pred = priority_model.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R2:", test_r2)
print("RMSE:", test_rmse)

R2: 0.870507472989941
RMSE: 3.2004200085932992


In [None]:
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Multi-label targets: one 0/1 column per recommended action
y_actions_full = df[possible_actions]

# Reuse the row split already made for the priority model instead of calling
# train_test_split a second time: selecting the labels by the index of
# X_train / X_test keeps features and labels aligned by construction.
y_actions_train = y_actions_full.loc[X_train.index]
y_actions_test = y_actions_full.loc[X_test.index]

# MultiOutputClassifier fits one independent XGBoost classifier per action column
action_model = MultiOutputClassifier(XGBClassifier(n_estimators=200, max_depth=5, eval_metric='logloss'))
action_model.fit(X_train, y_actions_train)

y_actions_pred = action_model.predict(X_test)
from sklearn.metrics import accuracy_score

# With multi-label targets accuracy_score is subset accuracy: a row counts as
# correct only when every action label matches.
exact_match_accuracy = accuracy_score(y_actions_test, y_actions_pred)
print("Exact Match Accuracy:", exact_match_accuracy)


Exact Match Accuracy: 1.0


In [None]:
import joblib

# Persist both fitted estimators to the working directory with the same
# compression level.
for artifact_path, fitted_model in (
    ("priority_model.joblib", priority_model),
    ("action_model.joblib", action_model),
):
    joblib.dump(fitted_model, artifact_path, compress=3)

print("Models saved successfully!")


Models saved successfully!


In [None]:
import xgboost as xgb
import numpy as np
# Record the library versions used for this run: xgboost first, then numpy
print(xgb.__version__, np.__version__, sep="\n")

3.1.2
2.0.2


In [None]:
# Example new alert. Values stay inside the ranges the models were trained on:
# max_severity and asset_criticality are 1-5, and lateral_movement,
# privilege_escalation and malware_detected are 0/1 indicators.
new_alert = pd.DataFrame({
    "alert_count": [3],
    "max_severity": [5],
    "asset_criticality": [5],
    "lateral_movement": [1],
    "privilege_escalation": [1],
    "malware_detected": [0]
})

# Predict priority (predict returns one value per input row; take the single row)
priority_score = priority_model.predict(new_alert)[0]
print("Predicted Priority Score:", round(priority_score, 0))

# Predict actions: row 0 of the prediction holds one 0/1 flag per entry of
# possible_actions, in the same order.
actions_pred = action_model.predict(new_alert)
recommended_actions = [action for action, val in zip(possible_actions, actions_pred[0]) if val==1]
print("Recommended Actions:", recommended_actions)


Predicted Priority Score: 100.0
Recommended Actions: ['isolate_host', 'reset_credentials', 'escalate_tier2']
