In [1]:
import os
import sys
import pandas as pd

# The notebook is expected to run from notebooks/, so the project root is
# the parent of the current working directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

SRC_DIR = os.path.join(PROJECT_ROOT, "src")
DATASET_ROOT = os.path.join(PROJECT_ROOT, "data", "EIRSAT-1 Dataset")

# Make the modules under src/ importable; the membership check keeps
# sys.path free of duplicates when this cell is re-run.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

import preprocessing

# CSV files for the two test campaigns.
FLIGHT_PATH = os.path.join(DATASET_ROOT, "Flight Test Data", "flighttest_dataset.csv")
TVAC_PATH = os.path.join(DATASET_ROOT, "TVAC Test Data", "tvac_dataset.csv")

# Quick sanity check that both datasets are where we expect them.
for label, csv_path in (("Flight exists:", FLIGHT_PATH), ("TVAC exists  :", TVAC_PATH)):
    print(label, os.path.exists(csv_path))

Flight exists: True
TVAC exists  : True


In [6]:
# Load the flight-test telemetry. low_memory=False makes pandas parse the
# file in one pass instead of internal chunks, which avoids mixed-dtype
# inference on wide, sparse columns.
flight_df = pd.read_csv(filepath_or_buffer=FLIGHT_PATH, low_memory=False)

# Report the size, then preview the first five rows.
print("Flight shape:", flight_df.shape)
display(flight_df.head(5))

Flight shape: (72390, 258)


Unnamed: 0,OBT,Channel,Row_ID,core.OBT.uptime,platform.obc.OBC.currBootImage,platform.BAT.batteryCurrent[2],platform.BAT.batteryVoltage[2],platform.BAT.batteryCurrentDir,platform.EPS.actualSwitchStatesBitmap,platform.ADCS.rawGyroRate_2,...,obsw.platform.obc.TelemetryADCB.channelOutput[5],obsw.platform.obc.TelemetryADCC.channelOutput[4],obsw.platform.obc.TelemetryADCB.channelOutput[6],obsw.platform.obc.TelemetryADCA.channelOutput[5],obsw.platform.obc.TelemetryADCC.channelOutput[3],Anomaly_4,Anomaly_1,Anomaly_3,Anomaly_2,Anomaly_5
0,4542903,15,,13.0,1.0,21.0,913.0,0.0,640.0,0.0,...,,,,,,,,,,
1,4542932,15,,42.0,1.0,21.0,913.0,0.0,640.0,0.0,...,,,,,,,,,,
2,4542952,17,0.0,,,,,,,,...,,,,,,,,,,
3,4542953,19,0.0,,,,,0.0,640.0,,...,5.0,2304.0,34.0,2437.0,2301.0,,,,,
4,4542962,15,,73.0,1.0,21.0,913.0,0.0,640.0,0.0,...,,,,,,,,,,


In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Build the binary target column with the project's preprocessing helper.
flight_df = preprocessing.add_anomaly_bin(flight_df)

assert "Anomaly_Bin" in flight_df.columns, "Anomaly_Bin not created."

# Target vector. The value printed below is the mean of the 0/1 labels,
# i.e. a fraction in [0, 1], not a percentage.
y = flight_df["Anomaly_Bin"].astype(int).values
print("Anomaly %:", y.mean())

# Numeric features only, with the target removed.
X = flight_df.select_dtypes(include=[np.number]).drop(columns=["Anomaly_Bin"], errors="ignore")

# Remove every anomaly_* label column so the labels cannot leak into X.
X = X.drop(columns=[c for c in X.columns if "anomaly" in c.lower()], errors="ignore")

# Drop columns with missing values (paper-style behavior)
X = X.dropna(axis=1)
# NOTE(review): in the recorded run this left only 2 feature columns.
# Check which ones survive (list(X.columns)); if they are identifier- or
# time-like columns, a random split can make the models look far better
# than they would on unseen time ranges -- TODO confirm.
assert X.shape[1] > 0, "No feature columns left after dropping columns with missing values."

print("Feature matrix shape:", X.shape)

# Split BEFORE scaling. The same random_state/stratify as before yields the
# same row partition; only the scaling statistics change.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, so that test-set mean/variance
# never influence preprocessing (avoids train/test leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with code that expects the full scaled
# matrix; it uses the train-fitted scaler.
X_scaled = scaler.transform(X)

Anomaly %: 0.2394667771791684
Feature matrix shape: (72390, 2)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

In [9]:
# Candidate classifiers, keyed by the display name used in the result table.
# Models that accept class_weight use "balanced" to compensate for the
# minority anomaly class. Every estimator that draws random numbers is
# seeded so that re-running the notebook reproduces the same ranking.
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, class_weight="balanced"),
    # LinearSVC may shuffle data in its coordinate-descent solver -> seed it.
    "LinearSVM": LinearSVC(class_weight="balanced", random_state=42),
    # probability=True fits an internal cross-validated calibration, which
    # shuffles the data; without a seed predict_proba varies between runs.
    "SVM_RBF": SVC(kernel="rbf", probability=True, class_weight="balanced", random_state=42),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", min_samples_leaf=10, random_state=42),
    # KNeighborsClassifier has no class_weight or random_state parameter.
    "KNN": KNeighborsClassifier(n_neighbors=15),
    "RandomForest": RandomForestClassifier(n_estimators=400, class_weight="balanced", random_state=42, n_jobs=-1)
}

In [10]:
def _ranking_scores(clf, X):
    """Return continuous scores for threshold-free metrics, or None.

    Prefers ``predict_proba`` (second column, i.e. the second entry of
    ``clf.classes_`` -- the anomaly class for 0/1 labels) and falls back to
    ``decision_function`` for estimators without probability output.
    Returns None when the estimator offers neither.
    """
    if hasattr(clf, "predict_proba"):
        return clf.predict_proba(X)[:, 1]
    if hasattr(clf, "decision_function"):
        return clf.decision_function(X)
    return None


# One (model name, ROC AUC, PR AUC) tuple per fitted model.
results = []

for name, clf in models.items():
    # Fit on the training split; evaluate on the held-out split.
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = _ranking_scores(clf, X_test)

    print("\n==============================")
    print(name)
    print(classification_report(y_test, pred, digits=3))

    # Ranking metrics need continuous scores; they stay None otherwise.
    roc = None
    pr = None
    if score is not None:
        roc = roc_auc_score(y_test, score)
        # "PR AUC" is reported as average precision.
        pr = average_precision_score(y_test, score)
        print("ROC AUC:", round(roc, 4))
        print("PR AUC :", round(pr, 4))

    results.append((name, roc, pr))

# Rank models by PR AUC first, with ROC AUC as the tie-breaker.
results_df = (
    pd.DataFrame(results, columns=["Model", "ROC_AUC", "PR_AUC"])
    .sort_values(by=["PR_AUC", "ROC_AUC"], ascending=False)
)

print("\n===== MODEL RANKING =====")
display(results_df)


LogisticRegression
              precision    recall  f1-score   support

           0      0.909     0.715     0.801     16517
           1      0.461     0.773     0.578      5200

    accuracy                          0.729     21717
   macro avg      0.685     0.744     0.689     21717
weighted avg      0.802     0.729     0.747     21717

ROC AUC: 0.788
PR AUC : 0.3968

LinearSVM
              precision    recall  f1-score   support

           0      0.913     0.714     0.801     16517
           1      0.463     0.785     0.583      5200

    accuracy                          0.731     21717
   macro avg      0.688     0.749     0.692     21717
weighted avg      0.806     0.731     0.749     21717

ROC AUC: 0.7885
PR AUC : 0.3968

SVM_RBF
              precision    recall  f1-score   support

           0      0.973     0.944     0.958     16517
           1      0.837     0.917     0.875      5200

    accuracy                          0.937     21717
   macro avg      0.905  

Unnamed: 0,Model,ROC_AUC,PR_AUC
5,RandomForest,1.0,0.999999
4,KNN,0.999994,0.999977
3,DecisionTree,0.999939,0.999616
2,SVM_RBF,0.973244,0.956464
0,LogisticRegression,0.787952,0.39681
1,LinearSVM,0.788509,0.396798


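Of all the models evaluated, Random Forest gives the best results (ROC AUC 1.0, PR AUC ≈ 1.0), with KNN and Decision Tree close behind. Caveat: these scores were obtained from a feature matrix with only two columns and a random train/test split, so they should be confirmed on an independent split before drawing firm conclusions.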