## Install Dependencies Required 

In [17]:

import sys
!{sys.executable} -m pip install -r requirements.txt



## Define Environment Variables

In [31]:
import os, json
DATA_PATH = 'data/creditcard.csv'  
Y_COL_LABEL = 'Class'                   
OUTDIR = 'outputs'
os.makedirs(OUTDIR, exist_ok=True)

## Imports needed libraries

In [32]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, average_precision_score, confusion_matrix,
    classification_report, roc_curve, auc, precision_recall_curve
)
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 120

from pyod.models.auto_encoder import AutoEncoder
from typing import Optional, Dict


## Utilities Functions

In [33]:

def build_preprocessor(
    df: pd.DataFrame,
    label_col: str,
) -> ColumnTransformer:
    """
    Build a ColumnTransformer that standardizes numeric columns and
    one-hot encodes categorical columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe containing features + label.
    label_col : str
        Name of the target/label column to exclude from preprocessing.

    Returns
    -------
    ColumnTransformer
        A transformer that applies StandardScaler to numeric cols and
        OneHotEncoder to categorical cols.

    Raises
    ------
    KeyError
        If `label_col` is not present in `df`.
    ValueError
        If no usable feature columns remain after excluding the label.
    """
    if label_col not in df.columns:
        raise KeyError(f"label_col '{label_col}' not in dataframe columns.")

    # Exclude the label from features
    X = df.drop(columns=[label_col])

    # Identify numeric vs categorical (preserve original column order)
    num_mask = X.dtypes.apply(lambda dt: np.issubdtype(dt, np.number))
    num_cols = X.columns[num_mask].tolist()
    cat_cols = X.columns[~num_mask].tolist()

    transformers = []
    if num_cols:
        transformers.append((
            "num",
            StandardScaler(with_mean=True, with_std=True),
            num_cols
        ))
    if cat_cols:
        # Note: sparse_output=False returns a dense array, which is convenient
        # for many downstream estimators and plotting. Change if memory is tight.
        transformers.append((
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            cat_cols
        ))

    if not transformers:
        raise ValueError(
            "No usable feature columns found after excluding the label. "
            "Ensure the dataframe has at least one numeric or categorical feature."
        )

    return ColumnTransformer(transformers=transformers, remainder="drop")


def plot_hist(
    scores: np.ndarray,
    labels: np.ndarray,
    path: Optional[str] = None,
    *,
    bins: int = 50,
    title: str = "Reconstruction Error / Outlier Scores",
    ax: Optional[plt.Axes] = None,
) -> plt.Axes:
    """
    Plot overlapping histograms of anomaly scores for inliers (0) vs outliers (1).

    Parameters
    ----------
    scores : np.ndarray
        1D array of anomaly/reconstruction scores.
    labels : np.ndarray
        1D array of true labels (0=inlier, 1=outlier), same length as `scores`.
    path : Optional[str], default=None
        If provided, save the figure to this path; otherwise display.
    bins : int, default=50
        Number of histogram bins.
    title : str, default="Reconstruction Error / Outlier Scores"
        Plot title.
    ax : Optional[plt.Axes], default=None
        Axes to draw on. If None, a new figure/axes is created.

    Returns
    -------
    matplotlib.axes.Axes
        The axes containing the plot.

    Raises
    ------
    ValueError
        If shapes mismatch or labels are not binary {0,1}.
    """
    scores = np.asarray(scores).ravel()
    labels = np.asarray(labels).ravel()

    if scores.shape[0] != labels.shape[0]:
        raise ValueError("`scores` and `labels` must have the same length.")
    uniq = np.unique(labels)
    if not np.all(np.isin(uniq, [0, 1])):
        raise ValueError("`labels` must be binary with values in {0, 1}.")

    if ax is None:
        fig, ax = plt.subplots(figsize=(7, 4))

    ax.hist(scores[labels == 0], bins=bins, alpha=0.7, label="Inliers")
    ax.hist(scores[labels == 1], bins=bins, alpha=0.7, label="Outliers")
    ax.set_title(title)
    ax.set_xlabel("Score")
    ax.set_ylabel("Frequency")
    ax.legend()
    fig = ax.get_figure()
    fig.tight_layout()

    if path:
        fig.savefig(path, dpi=150)
        plt.close(fig)
    else:
        plt.show()

    return ax


def plot_roc_pr(
    y_true: np.ndarray,
    scores: np.ndarray,
    outdir: Optional[str] = None,
    *,
    roc_filename: str = "roc_curve.png",
    pr_filename: str = "pr_curve.png",
) -> Dict[str, float]:
    """
    Plot ROC and Precision-Recall curves given ground-truth labels and anomaly scores.

    Parameters
    ----------
    y_true : np.ndarray
        1D array of true labels (0=inlier, 1=outlier).
    scores : np.ndarray
        1D array of anomaly scores (higher = more anomalous).
    outdir : Optional[str], default=None
        Directory to save plots. If None, plots are shown instead.
    roc_filename : str, default="roc_curve.png"
        Filename for the saved ROC figure (used if outdir is not None).
    pr_filename : str, default="pr_curve.png"
        Filename for the saved PR figure (used if outdir is not None).

    Returns
    -------
    Dict[str, float]
        Dictionary containing {"roc_auc": float, "average_precision": float}.

    Raises
    ------
    ValueError
        If shapes mismatch or labels are not binary {0,1}.
    """
    y_true = np.asarray(y_true).ravel()
    scores = np.asarray(scores).ravel()

    if y_true.shape[0] != scores.shape[0]:
        raise ValueError("`y_true` and `scores` must have the same length.")
    uniq = np.unique(y_true)
    if not np.all(np.isin(uniq, [0, 1])):
        raise ValueError("`y_true` must be binary with values in {0, 1}.")

    # --- ROC ---
    fpr, tpr, _ = roc_curve(y_true, scores)
    roc_auc = auc(fpr, tpr)

    fig1, ax1 = plt.subplots(figsize=(5, 5))
    ax1.plot(fpr, tpr, lw=2, label=f"ROC AUC = {roc_auc:.4f}")
    ax1.plot([0, 1], [0, 1], lw=1, linestyle="--")
    ax1.set_xlabel("False Positive Rate")
    ax1.set_ylabel("True Positive Rate")
    ax1.set_title("ROC Curve")
    ax1.legend(loc="lower right")
    fig1.tight_layout()

    if outdir:
        fig1.savefig(f"{outdir.rstrip('/')}/{roc_filename}", dpi=150)
        plt.close(fig1)
    else:
        plt.show()

    # --- Precision-Recall ---
    precision, recall, _ = precision_recall_curve(y_true, scores)
    ap = average_precision_score(y_true, scores)

    fig2, ax2 = plt.subplots(figsize=(5, 5))
    ax2.plot(recall, precision, lw=2, label=f"AP = {ap:.4f}")
    ax2.set_xlabel("Recall")
    ax2.set_ylabel("Precision")
    ax2.set_title("Precision-Recall Curve")
    ax2.legend(loc="lower left")
    fig2.tight_layout()

    if outdir:
        fig2.savefig(f"{outdir.rstrip('/')}/{pr_filename}", dpi=150)
        plt.close(fig2)
    else:
        plt.show()

    return {"roc_auc": float(roc_auc), "average_precision": float(ap)}


## Load data

In [34]:
import pandas as pd
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


## Data Partitioning, Preprocessing and Training

In [54]:

# --- Labels: ensure binary {0,1} for 'Class'
y_raw = df[Y_COL_LABEL].to_numpy()
uniq = pd.unique(y_raw)

if set(uniq) <= {0, 1}:
    y = y_raw.astype(int)
else:
    vc = pd.Series(y_raw).value_counts()
    mapping = {vc.idxmax(): 0, vc.idxmin(): 1}
    y = pd.Series(y_raw).map(mapping).astype(int).to_numpy()

# --- Features & preprocessing
X = df.drop(columns=[Y_COL_LABEL])
preprocessor = build_preprocessor(df, Y_COL_LABEL)

# --- Splits: train/val/test (stratified for class balance)
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, stratify=y_trainval, random_state=42
)

# --- Sanity checks on dataset splits ---
print("Train+Validation set shape : X =", X_trainval.shape, ", y =", y_trainval.shape,"  # Expect ~227,845 samples (80% of total)")
print("Train set shape            : X =", X_train.shape, ", y =", y_train.shape,"  # Expect ~182,276 samples (80% of 227,845)")
print("Validation set shape       : X =", X_val.shape, ", y =", y_val.shape,"  # Expect ~45,569 samples (20% of 227,845)")
print("Test set shape             : X =", X_test.shape, ", y =", y_test.shape,"  # Expect ~56,962 samples (20% of total)")


Train+Validation set shape : X = (227845, 30) , y = (227845,)   # Expect ~227,845 samples (80% of total)
Train set shape            : X = (182276, 30) , y = (182276,)   # Expect ~182,276 samples (80% of 227,845)
Validation set shape       : X = (45569, 30) , y = (45569,)   # Expect ~45,569 samples (20% of 227,845)
Test set shape             : X = (56962, 30) , y = (56962,)   # Expect ~56,962 samples (20% of total)


## Train PyOD AutoEncoder

In [76]:
from pyod.models.auto_encoder import AutoEncoder
from sklearn.pipeline import Pipeline

encoder = AutoEncoder(
    contamination=0.3,                   # fraud ratio in dataset
    preprocessing=False,                 # we already handle scaling via sklearn pipeline
    lr=0.001,                            # standard learning rate 
    epoch_num=40,                        # ~50 passes through data 
    batch_size=128,                      # efficient batch size for stability & GPU/CPU use
    optimizer_name='adam',               # Adam usually works well for deep networks
    optimizer_params={'weight_decay': 1e-5},  # small L2 regularization to prevent overfitting
    device=None,                         # None = auto (CPU)
    random_state=42,                     # reproducibility
    use_compile=False,                   # keep default unless profiling speed
    compile_mode='default',              # default computation mode
    verbose=1,                           # show training progress

    # --- Network architecture ---
    hidden_neuron_list=[64, 30, 30, 64], 
    hidden_activation_name='relu',       # ReLU is stable for anomaly detection
    batch_norm=True,                     # normalize activations for faster convergence
    dropout_rate=0.1                     # small dropout to reduce overfitting (0.1 = 10%)
)

pipe = Pipeline([
    ("pre", pre),                        # your ColumnTransformer
    ("ae", encoder)
])

pipe.fit(X_train, y=None)                # unsupervised; y is ignored
print("Model trained with PyOD 2.0.5 AutoEncoder.")


Training: 100%|██████████| 40/40 [12:41<00:00, 19.04s/it]


Model trained with PyOD 2.0.5 AutoEncoder.


## Evaluate & visualize

In [77]:
# --- Grab steps once
pre, ae = pipe.named_steps['pre'], pipe.named_steps['ae']

# --- Helper: preprocess + score
def score(X):
    return ae.decision_function(pre.transform(X))

# --- Scores
train_scores = score(X_train)
val_scores   = score(X_val)
test_scores  = score(X_test)

# --- Metrics on test (use scores as anomaly likelihood)
roc = roc_auc_score(y_test, test_scores)
ap  = average_precision_score(y_test, test_scores)

# --- Threshold (use model's if present; else percentile by contamination)
threshold = getattr(ae, "threshold_", None)
if threshold is None:
    threshold = np.percentile(train_scores, 100 * (1 - CONTAMINATION))

# --- Hard predictions from scores
y_pred = (test_scores > threshold).astype(int)

# --- Confusion matrix & report
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
report = classification_report(y_test, y_pred, digits=4)

# --- Print summary
print("=== Evaluation ===")
print(f"ROC AUC: {roc:.6f}")
print(f"Average Precision (AP): {ap:.6f}")
print(f"Threshold: {threshold:.6f}")
print("Classification report:", report)


# --- Plots: score hist + ROC/PR
plot_hist(test_scores, y_test, path=f"{OUTDIR}/reconstruction_error_hist.png")
plot_roc_pr(y_test, test_scores, outdir=OUTDIR)

# --- Confusion matrix plot (compact)
from sklearn.metrics import ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Inlier", "Fraud"])
fig, ax = plt.subplots(figsize=(4.8, 4.2))
disp.plot(ax=ax, colorbar=True, values_format="d")
ax.set_title("Confusion Matrix")
plt.tight_layout()
plt.savefig(f"{OUTDIR}/confusion_matrix.png", dpi=150)
plt.close(fig)

print(f"Output results have been written to following directory path: {os.path.abspath(OUTDIR)}")

=== Evaluation ===
ROC AUC: 0.938422
Average Precision (AP): 0.289595
Threshold: 3.091191
Classification report:               precision    recall  f1-score   support

           0     0.9998    0.6983    0.8223     56864
           1     0.0053    0.9388    0.0106        98

    accuracy                         0.6987     56962
   macro avg     0.5026    0.8186    0.4165     56962
weighted avg     0.9981    0.6987    0.8209     56962

Output results have been written to following directory path: /home/studio-lab-user/sagemaker-studiolab-notebooks/outputs
