In [None]:
import os, sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
DATASET_ROOT = os.path.join(PROJECT_ROOT, "data", "EIRSAT-1 Dataset")
SRC_DIR      = os.path.join(PROJECT_ROOT, "src")

if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

import preprocessing

FLIGHT_PATH = os.path.join(DATASET_ROOT, "Flight Test Data", "flighttest_dataset.csv")
TVAC_PATH   = os.path.join(DATASET_ROOT, "TVAC Test Data", "tvac_dataset.csv")

flight_df = pd.read_csv(FLIGHT_PATH, low_memory=False)
tvac_df   = pd.read_csv(TVAC_PATH, low_memory=False)

# Add labels
flight_df = preprocessing.add_anomaly_bin(flight_df)
tvac_df   = preprocessing.add_anomaly_bin(tvac_df)

assert "Anomaly_Bin" in tvac_df.columns, "Anomaly_Bin not created in TVAC. Check anomaly_* columns."

# -----------------------------
# TRUE TVAC dataset (X, y)
# -----------------------------
y = tvac_df["Anomaly_Bin"].astype(int).values

# Split TVAC first (prevents preprocessing leakage)
tvac_train_df, tvac_test_df = train_test_split(
    tvac_df, test_size=0.30, random_state=42, stratify=y
)

# Fit preprocessing ONLY on TVAC train
# (using your preprocessing function signature)
prep = preprocessing.fit_preprocessing(tvac_train_df, tvac_train_df, pca_variance=0.95)

# Transform train/test TVAC into PCA space
X_train = preprocessing.transform_dataset(tvac_train_df, prep)
X_test  = preprocessing.transform_dataset(tvac_test_df, prep)

y_train = tvac_train_df["Anomaly_Bin"].astype(int).values
y_test  = tvac_test_df["Anomaly_Bin"].astype(int).values

print("TVAC train X:", X_train.shape, "Anom%:", y_train.mean())
print("TVAC test  X:", X_test.shape,  "Anom%:", y_test.mean())
print("PCA components:", prep["pca"].n_components_)

X shape: (72390, 46)
Anomaly %: 0.2394667771791684
PCA components: 46


In [12]:
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

scaler2 = StandardScaler()
X_train_s = scaler2.fit_transform(X_train)
X_test_s  = scaler2.transform(X_test)

print("Train anomaly %:", y_train.mean(), "Test anomaly %:", y_test.mean())

Train anomaly %: 0.23947664436682256 Test anomaly %: 0.23944375374130866


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier 

models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, class_weight="balanced"),
    "LinearSVC": LinearSVC(class_weight="balanced"),  # no predict_proba
    "SVC_RBF": SVC(kernel="rbf", probability=True, class_weight="balanced"),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", min_samples_leaf=10, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=15),
    
    "RandomForest": RandomForestClassifier(n_estimators=400, class_weight="balanced", random_state=42, n_jobs=-1),
    
}

In [14]:
results = []

for name, clf in models.items():
    # Some models need scaled data, some don't; weâ€™ll use scaled for everything for fairness
    clf.fit(X_train_s, y_train)
    pred = clf.predict(X_test_s)

    # Score for AUC: probability if available, else decision function if available
    score = None
    if hasattr(clf, "predict_proba"):
        score = clf.predict_proba(X_test_s)[:, 1]
    elif hasattr(clf, "decision_function"):
        score = clf.decision_function(X_test_s)

    print("\n==============================")
    print(name)
    print(classification_report(y_test, pred, digits=3))

    roc = None
    ap = None
    if score is not None:
        roc = roc_auc_score(y_test, score)
        ap  = average_precision_score(y_test, score)
        print("ROC AUC:", round(roc, 4))
        print("PR AUC :", round(ap, 4))

    # store summary
    results.append((name, roc, ap))

# summary table
results_df = pd.DataFrame(results, columns=["Model", "ROC_AUC", "PR_AUC"]).sort_values(
    by=["PR_AUC", "ROC_AUC"], ascending=False
)
print("\n===== Summary (sorted by PR_AUC) =====")
display(results_df)


LogisticRegression
              precision    recall  f1-score   support

           0      0.969     0.947     0.958     16517
           1      0.843     0.904     0.873      5200

    accuracy                          0.937     21717
   macro avg      0.906     0.926     0.915     21717
weighted avg      0.939     0.937     0.938     21717

ROC AUC: 0.9776
PR AUC : 0.9416

LinearSVC
              precision    recall  f1-score   support

           0      0.965     0.952     0.958     16517
           1      0.853     0.891     0.872      5200

    accuracy                          0.937     21717
   macro avg      0.909     0.921     0.915     21717
weighted avg      0.938     0.937     0.938     21717

ROC AUC: 0.972
PR AUC : 0.9269

SVC_RBF
              precision    recall  f1-score   support

           0      0.982     0.987     0.984     16517
           1      0.959     0.941     0.950      5200

    accuracy                          0.976     21717
   macro avg      0.970  

Unnamed: 0,Model,ROC_AUC,PR_AUC
5,RandomForest,0.999521,0.998772
4,KNN,0.99712,0.993371
2,SVC_RBF,0.995161,0.989632
3,DecisionTree,0.990933,0.982128
0,LogisticRegression,0.977554,0.941644
1,LinearSVC,0.972035,0.926943


FROM ALL THE MODELS EVALUATED WE CAN COME INTO A CONCLUSION THAT RANDOM FOREST GIVE BETTER RESULTS.