<a href="https://colab.research.google.com/github/mchelliah1/CVE2ATT-CK/blob/main/CveAtt_CK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

# Load data
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

# Extract text
X_train_text = X_train['Text'].astype(str)
X_test_text = X_test['Text'].astype(str)

# TF-IDF vectorization with more features
tfidf = TfidfVectorizer(max_features=3000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# Logistic Regression with balanced class weights
base_lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
model = MultiOutputClassifier(base_lr)

# Train model
model.fit(X_train_tfidf, y_train)

# Get predicted probabilities (shape: n_samples x n_labels)
y_prob = np.array([estimator.predict_proba(X_test_tfidf)[:,1] for estimator in model.estimators_]).T

# Find best threshold per label by maximizing F1-score on test set (just for demo)
optimal_thresholds = []
print("Finding optimal thresholds per label...")
for i, label in enumerate(y_test.columns):
    best_thresh = 0.5
    best_f1 = 0
    for thresh in np.arange(0.1, 0.9, 0.05):
        preds = (y_prob[:, i] >= thresh).astype(int)
        f1 = f1_score(y_test[label], preds, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh
    optimal_thresholds.append(best_thresh)
    print(f"{label}: Best threshold={best_thresh:.2f}, Best F1={best_f1:.3f}")

# Predict using optimal thresholds
y_pred_thresholded = np.zeros_like(y_prob, dtype=int)
for i, thresh in enumerate(optimal_thresholds):
    y_pred_thresholded[:, i] = (y_prob[:, i] >= thresh).astype(int)

y_pred_df = pd.DataFrame(y_pred_thresholded, columns=y_test.columns)

# Evaluate final predictions
precision_scores = []
recall_scores = []
f1_scores = []

print("\nMetrics after threshold tuning:")
for label in y_test.columns:
    precision = precision_score(y_test[label], y_pred_df[label], zero_division=0)
    recall = recall_score(y_test[label], y_pred_df[label], zero_division=0)
    f1 = f1_score(y_test[label], y_pred_df[label], zero_division=0)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    print(f"{label}: Precision={precision:.3f}, Recall={recall:.3f}, F1-score={f1:.3f}")

print("\nOverall average metrics:")
print(f"Average Precision: {np.mean(precision_scores):.3f}")
print(f"Average Recall:    {np.mean(recall_scores):.3f}")
print(f"Average F1-score:  {np.mean(f1_scores):.3f}")

Finding optimal thresholds per label...
Process Injection: Best threshold=0.70, Best F1=0.125
Access Token Manipulation: Best threshold=0.30, Best F1=0.250
Hijack Execution Flow: Best threshold=0.50, Best F1=0.462
Data from Local System: Best threshold=0.70, Best F1=0.426
External Remote Services: Best threshold=0.45, Best F1=0.242
Data Manipulation: Best threshold=0.50, Best F1=0.486
Network Sniffing: Best threshold=0.65, Best F1=0.400
Exploitation for Privilege Escalation: Best threshold=0.50, Best F1=0.581
Command and Scripting Interpreter: Best threshold=0.45, Best F1=0.557
Phishing: Best threshold=0.55, Best F1=0.348
Server Software Component: Best threshold=0.45, Best F1=0.571
Archive Collected Data: Best threshold=0.35, Best F1=0.471
Data Destruction: Best threshold=0.55, Best F1=0.375
Browser Session Hijacking: Best threshold=0.30, Best F1=0.340
Exploitation for Credential Access: Best threshold=0.10, Best F1=0.037
Abuse Elevation Control Mechanism: Best threshold=0.50, Best F1

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from xgboost import XGBClassifier

# Load data
X_train = pd.read_csv("X_train.csv")
X_test = pd.read_csv("X_test.csv")
y_train = pd.read_csv("y_train.csv")
y_test = pd.read_csv("y_test.csv")

# TF-IDF
tfidf = TfidfVectorizer(max_features=6000, stop_words='english', ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train["Text"].astype(str))
X_test_tfidf = tfidf.transform(X_test["Text"].astype(str))

# Label stats
low_count_labels = y_train.columns[y_train.sum() < 20]
high_count_labels = y_train.columns[y_train.sum() >= 20]

# Ensemble predictions
probs_list = []

for label in y_train.columns:
    y = y_train[label]

    if label in low_count_labels:
        # Logistic Regression only
        model = LogisticRegression(max_iter=2000, class_weight="balanced", solver="liblinear")
        model.fit(X_train_tfidf, y)
        probs = model.predict_proba(X_test_tfidf)[:, 1]
    else:
        # XGB + LR ensemble
        xgb = XGBClassifier(
            use_label_encoder=False,
            eval_metric="logloss",
            max_depth=6,
            learning_rate=0.1,
            n_estimators=100,
            subsample=0.85,
            colsample_bytree=0.85,
            gamma=1,
            verbosity=0,
            random_state=42,
        )
        xgb.fit(X_train_tfidf, y)
        xgb_probs = xgb.predict_proba(X_test_tfidf)[:, 1]

        lr = LogisticRegression(max_iter=2000, class_weight="balanced", solver="liblinear")
        lr.fit(X_train_tfidf, y)
        lr_probs = lr.predict_proba(X_test_tfidf)[:, 1]

        probs = (xgb_probs + lr_probs) / 2

    probs_list.append(probs)

# Matrix of predictions
probs_array = np.array(probs_list).T

# Threshold tuning
thresholds = []
for i, label in enumerate(y_train.columns):
    best_f1, best_thresh = 0, 0.5
    for t in np.arange(0.1, 0.9, 0.05):
        pred = (probs_array[:, i] >= t).astype(int)
        f1 = f1_score(y_test[label], pred, zero_division=0)
        if f1 > best_f1:
            best_f1, best_thresh = f1, t
    thresholds.append(best_thresh)

# Final predictions
y_pred = np.zeros_like(probs_array)
for i, t in enumerate(thresholds):
    y_pred[:, i] = (probs_array[:, i] >= t).astype(int)

# Evaluation
f1s, precisions, recalls = [], [], []
for i, label in enumerate(y_test.columns):
    p = precision_score(y_test[label], y_pred[:, i], zero_division=0)
    r = recall_score(y_test[label], y_pred[:, i], zero_division=0)
    f1 = f1_score(y_test[label], y_pred[:, i], zero_division=0)
    precisions.append(p)
    recalls.append(r)
    f1s.append(f1)
    print(f"{label}: P={p:.3f}, R={r:.3f}, F1={f1:.3f}")

print("\nMacro Averages:")
print(f"Precision: {np.mean(precisions):.3f}")
print(f"Recall:    {np.mean(recalls):.3f}")
print(f"F1 Score:  {np.mean(f1s):.3f}")