# Fusing results from both Isolation Forest and Autoencoder #

Importing libraries and defining input/output file paths

In [10]:
import warnings

import numpy as np
import pandas as pd


# Inputs: the Autoencoder (AE) and Isolation Forest (IF) score tables, and the
# base dataset that the fused results are merged back into.
AE_PATH  = "AE_final_dataset_with_scores.parquet"
IF_PATH  = "IF_final_dataset_with_scores.parquet"
BASE_IN  = "final_dataset.parquet"
# Outputs: the base dataset with the fusion columns appended, written once as
# parquet and once as CSV.
BASE_OUT_PARQUET = "final_dataset_with_fusion.parquet"
BASE_OUT_CSV     = "final_dataset_with_fusion.csv"

In [11]:
KEY_COL = None          # column name to merge on; None (or any falsy value) means align by row order
AE_SCORE_COL = "ae_reconstruction_error_zscore" # name of the score column in the AE file
IF_SCORE_COL = "anomaly_score" # name of the score column in the IF file
# NOTE(review): the fusion and the ">=" threshold in later cells treat a higher
# score as more anomalous for BOTH models -- confirm the orientation of both columns.

NORMALIZE = True        # min-max scale each model's scores before fusion
FUSION    = "mean"      # fusion strategy: "mean" | "max" | "rank_mean"
PERCENTILE = 95.0       # percentile of the fused score used as threshold (95 -> flags roughly the top 5%)
FIXED_THRESHOLD = None  # if not None, used as the threshold instead of the percentile

`minmax_0_1` scales values to [0, 1]. NaNs are ignored when computing the min and max and remain NaN in the output; a constant (or all-NaN) column is mapped to 0.0 for every row.

`to_rank_0_1` converts scores to percentile ranks in (0, 1] (ties share their average rank). Useful if the score scales of the two models are not comparable.


In [12]:
def minmax_0_1(s: pd.Series) -> pd.Series:
    """Min-max scale a series to the [0, 1] range.

    Values are first coerced to numeric (unparseable entries become NaN).
    NaNs are skipped when finding the extremes and stay NaN in the result.

    If the series has no usable range (every value is NaN, or the minimum
    equals the maximum), a series of 0.0 with the same index is returned.
    """
    values = pd.to_numeric(s, errors="coerce")
    lo = values.min(skipna=True)
    hi = values.max(skipna=True)

    # A range exists only when both extremes are real numbers and differ.
    has_range = not (pd.isna(lo) or pd.isna(hi)) and lo != hi
    if has_range:
        return (values - lo) / (hi - lo)
    return pd.Series(0.0, index=values.index)

def to_rank_0_1(s: pd.Series) -> pd.Series:
    """Convert scores to percentile ranks.

    Values are coerced to numeric first (unparseable entries become NaN),
    the same way ``minmax_0_1`` does, so numeric strings are ranked by value
    rather than lexicographically.

    Ties share their average rank. NaNs stay NaN and are not counted.
    Results lie in (0, 1]: the largest value maps to 1.0 and, without ties,
    the smallest maps to 1/n -- never to 0.
    """
    s = pd.to_numeric(s, errors="coerce")
    return s.rank(method="average", pct=True)

In [13]:
# Load the AE score table, the IF score table and the base dataset (in that
# order) from their parquet files.
AE, IF, BASE = (pd.read_parquet(path) for path in (AE_PATH, IF_PATH, BASE_IN))

Extracting columns and renaming for clarity

In [14]:
# From each model's table keep only its score column (plus the merge key when
# one is configured) and give the score a short, model-specific name.
ae_df = (
    AE[[KEY_COL, AE_SCORE_COL] if KEY_COL else [AE_SCORE_COL]]
    .copy()
    .rename(columns={AE_SCORE_COL: "ae_score"})
)

if_df = (
    IF[[KEY_COL, IF_SCORE_COL] if KEY_COL else [IF_SCORE_COL]]
    .copy()
    .rename(columns={IF_SCORE_COL: "if_score"})
)


Aligning IF and AE results

In [15]:
if KEY_COL:
    # Key-based alignment: keep only keys present in both score tables;
    # validate="one_to_one" raises if the key is duplicated on either side.
    fused = pd.merge(if_df, ae_df, on=KEY_COL, how="inner", validate="one_to_one")
else:
    # Row-order alignment: row i of the IF table is paired with row i of the
    # AE table. A length mismatch means the pairing is probably wrong, so it
    # is reported instead of being truncated silently.
    if len(ae_df) != len(if_df):
        warnings.warn(
            f"Row-order alignment with unequal lengths (IF={len(if_df)}, AE={len(ae_df)}): "
            "truncating to the shorter table; scores may be paired with the wrong rows."
        )
    n = min(len(ae_df), len(if_df))
    fused = pd.concat([if_df.iloc[:n].reset_index(drop=True),
                       ae_df.iloc[:n].reset_index(drop=True)], axis=1)

If `NORMALIZE` is `True`, min-max normalise each model's scores to [0, 1]; otherwise the scores are only coerced to numeric.

In [16]:
# Prepare both score columns the same way: min-max scaled when NORMALIZE is
# set, otherwise only coerced to numeric (unparseable entries become NaN).
s_if, s_ae = (
    minmax_0_1(fused[col]) if NORMALIZE else pd.to_numeric(fused[col], errors="coerce")
    for col in ("if_score", "ae_score")
)

Fuse scores based on FUSION mode

In [17]:
# Each supported FUSION mode maps to the function that combines the IF and AE
# score series into a single fused score.
_fusers = {
    "mean": lambda a, b: (a + b) / 2.0,
    "max": lambda a, b: pd.concat([a, b], axis=1).max(axis=1),
    "rank_mean": lambda a, b: (to_rank_0_1(a) + to_rank_0_1(b)) / 2.0,
}
if FUSION not in _fusers:
    raise ValueError(f"Unknown fusion method: {FUSION}")

fusion = _fusers[FUSION](s_if, s_ae)
fused["fusion_anomaly_score"] = fusion

Comparing the fused score against the threshold to determine the anomaly flag

In [18]:
# Pick the cut-off: the configured percentile of the non-missing fused scores,
# unless a fixed threshold has been set.
if FIXED_THRESHOLD is None:
    valid_scores = fused["fusion_anomaly_score"].dropna().to_numpy()
    th = float(np.percentile(valid_scores, PERCENTILE))
else:
    th = float(FIXED_THRESHOLD)

# Rows at or above the cut-off are flagged 1; everything else, including rows
# with a missing fused score, is flagged 0.
fused["is_anomaly"] = fused["fusion_anomaly_score"].ge(th).astype(int)

print(f"Fusion threshold: {th:.6f}")
print(f"Flagged anomalies: {int(fused['is_anomaly'].sum())}/{len(fused)}")


Fusion threshold: 0.478526
Flagged anomalies: 1332/26622


Merging back to base dataset

In [19]:
# Columns produced by the fusion step that get attached to the base dataset.
fusion_cols = ["if_score", "ae_score", "fusion_anomaly_score", "is_anomaly"]

if KEY_COL and KEY_COL in BASE.columns and KEY_COL in fused.columns:
    # Key-based merge: every base row is kept; base rows whose key has no
    # fused result get NaN in the fusion columns.
    out = pd.merge(
        BASE,
        fused[[KEY_COL, *fusion_cols]],
        on=KEY_COL, how="left", validate="one_to_one"
    )
else:
    # Positional fallback: row i of BASE receives row i of `fused`.
    if KEY_COL:
        # A key was configured but cannot be used here; say so rather than
        # silently switching to positional alignment.
        warnings.warn(
            f"KEY_COL={KEY_COL!r} is not present in both BASE and fused; "
            "falling back to row-order alignment."
        )
    if len(BASE) != len(fused):
        warnings.warn(
            f"Row-order merge with unequal lengths (BASE={len(BASE)}, fused={len(fused)}): "
            "output is truncated to the shorter table; rows may be misaligned."
        )
    m = min(len(BASE), len(fused))
    out = BASE.iloc[:m].copy()
    for c in fusion_cols:
        # Positional slice: correct regardless of what index `fused` carries.
        out[c] = fused[c].iloc[:m].values

In [20]:
# Persist the merged dataset in both formats; the row index is not written.
out.to_parquet(BASE_OUT_PARQUET, index=False)
out.to_csv(BASE_OUT_CSV, index=False)

# Report where the files were written, one line per item.
print("Saved:", f"- Parquet: {BASE_OUT_PARQUET}", f"- CSV    : {BASE_OUT_CSV}", sep="\n")

Saved:
- Parquet: final_dataset_with_fusion.parquet
- CSV    : final_dataset_with_fusion.csv


In [22]:
# Quick look at the final flag column of the merged output (pandas abbreviates
# the printed Series, showing only the first and last rows).
print(out["is_anomaly"])

0        0
1        0
2        0
3        0
4        0
        ..
26617    0
26618    0
26619    0
26620    1
26621    0
Name: is_anomaly, Length: 26622, dtype: int64
