In [1]:
import os, sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

# Project layout is resolved relative to the kernel's working directory:
# the project root is taken to be the parent of wherever the notebook runs.
PROJECT_ROOT = os.path.normpath(os.path.join(os.getcwd(), os.pardir))
SRC_DIR      = os.path.join(PROJECT_ROOT, "src")
DATASET_ROOT = os.path.join(PROJECT_ROOT, "data", "EIRSAT-1 Dataset")

# Expose the project's src/ directory so its modules can be imported below.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

import preprocessing

FLIGHT_PATH = os.path.join(DATASET_ROOT, "Flight Test Data", "flighttest_dataset.csv")

# Read the flight-test CSV and pass it through the project helper
# (presumably the step that adds the "Anomaly_Bin" column read below — confirm in src/preprocessing).
flight_df = preprocessing.add_anomaly_bin(
    pd.read_csv(FLIGHT_PATH, low_memory=False)
)

print("Flight shape:", flight_df.shape)
# NOTE(review): this prints the column mean, i.e. a fraction rather than a percentage
# if Anomaly_Bin is a 0/1 indicator.
print("Anomaly %:", flight_df["Anomaly_Bin"].mean())

Flight shape: (72390, 259)
Anomaly %: 0.2394667771791684


In [2]:
def build_features(df, max_missing_frac=0.8, metadata_cols=("OBT", "Row_ID"), fill_value=0):
    """Build a numeric feature matrix from a raw telemetry frame.

    Steps, in order:
      1. keep only numeric columns (on a copy, so ``df`` is never modified);
      2. drop every column whose name contains "anomaly" (case-insensitive),
         which includes the ``Anomaly_Bin`` label, to avoid target leakage;
      3. drop the metadata columns listed in ``metadata_cols`` when present;
      4. drop columns whose fraction of missing values is not strictly below
         ``max_missing_frac``;
      5. replace the remaining missing values with ``fill_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; rows are preserved one-to-one in the result.
    max_missing_frac : float, default 0.8
        A column is kept only if its share of NaN values is < this threshold.
    metadata_cols : iterable of column labels, default ("OBT", "Row_ID")
        Non-feature columns to remove; labels absent from ``df`` are ignored.
    fill_value : scalar, default 0
        Value substituted for missing entries in the kept columns.

    Returns
    -------
    pandas.DataFrame
        Numeric features with the same index as ``df`` and no missing values.
    """
    X = df.select_dtypes(include=[np.number]).copy()

    # Remove label + anomaly columns. str() keeps this safe for non-string
    # column labels (e.g. integer headers), which have no .lower().
    leak_cols = [c for c in X.columns if "anomaly" in str(c).lower()]
    X = X.drop(columns=leak_cols)

    # Remove metadata (very important): identifiers/timestamps are not features.
    present_meta = [c for c in metadata_cols if c in X.columns]
    X = X.drop(columns=present_meta)

    # Drop columns that are mostly missing (instead of drop-any-NaN).
    missing_frac = X.isna().mean()
    X = X.loc[:, missing_frac < max_missing_frac]

    # Fill remaining missing values so downstream estimators receive no NaN.
    return X.fillna(fill_value)

# Label vector and feature matrix both come from flight_df; build_features only
# removes columns, so the two stay aligned row-for-row.
y = flight_df["Anomaly_Bin"].values
X = build_features(flight_df)

print("Final feature count:", len(X.columns))

Final feature count: 152


In [3]:
# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# The scaler learns its statistics from the training rows ONLY, and those
# statistics are then applied unchanged to both splits (no test-set leakage).
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s  = scaler.transform(X_test)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

# Candidate classifiers, keyed by the display name used in the benchmark output.
# Each key appears exactly once: a repeated key in a dict literal silently
# overwrites the earlier entry, so a duplicate would only waste an estimator.
# class_weight="balanced" is used wherever the estimator supports it; KNN has
# no such option.
models = {
    "LogisticRegression": LogisticRegression(max_iter=3000, class_weight="balanced"),
    # probability=True fits an internal cross-validated calibration that shuffles
    # the data, so the seed is pinned to keep predict_proba reproducible.
    "SVM_RBF": SVC(kernel="rbf", probability=True, class_weight="balanced", random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=400, class_weight="balanced", random_state=42, n_jobs=-1),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", min_samples_leaf=10, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=15),
}


In [None]:
# Benchmark loop: fit each candidate on the scaled training split, report its
# test-set classification metrics, and collect ROC AUC / PR AUC for the summary table.
results = []

for name, model in models.items():
    model.fit(X_train_s, y_train)
    pred = model.predict(X_test_s)

    # The ranking metrics need a continuous score: use the second predict_proba
    # column when the estimator offers probabilities, else the decision-function output.
    score = (
        model.predict_proba(X_test_s)[:, 1]
        if hasattr(model, "predict_proba")
        else model.decision_function(X_test_s)
    )

    print("\n==============================")
    print(name)
    print(classification_report(y_test, pred, digits=3))

    roc = roc_auc_score(y_test, score)
    pr  = average_precision_score(y_test, score)

    print("ROC AUC:", round(roc, 4))
    print("PR AUC :", round(pr, 4))

    results.append((name, roc, pr))

# Summary table, best model first: ordered by PR AUC, with ROC AUC as the tie-breaker.
results_df = (
    pd.DataFrame(results, columns=["Model", "ROC_AUC", "PR_AUC"])
    .sort_values(by=["PR_AUC", "ROC_AUC"], ascending=False)
)

print("\n===== CLEAN GLOBAL BENCHMARK =====")
display(results_df)


LogisticRegression
              precision    recall  f1-score   support

           0      0.990     0.881     0.932     16517
           1      0.719     0.970     0.826      5200

    accuracy                          0.902     21717
   macro avg      0.854     0.926     0.879     21717
weighted avg      0.925     0.902     0.907     21717

ROC AUC: 0.9838
PR AUC : 0.9517


From all the models evaluated, we can conclude that Random Forest gives better results than the other models.