In [1]:
"""
Random‑Forest Classification & Feature‑Ablation Study – Jupyter Edition
======================================================================

Trains a tuned **RandomForestClassifier** on a labelled dataset and
systematically measures performance drops when individual features or
feature‑pairs are removed (ablation study).

What this notebook does
-----------------------
1. **Load & inspect** the dataset specified in `DATA_PATH`.
2. **Pre‑process** numerical & categorical columns (imputation, scaling,
   one‑hot) via `ColumnTransformer`.
3. **Hyper‑parameter search** with `GridSearchCV` (balanced classes).
4. **Baseline evaluation** with all features.
5. **Per‑feature ablation** – drops one feature at a time and records metrics.
6. **Pairwise ablation** – drops every pair of features (⚠️ O(N²) loops).
7. **Save results** to CSVs (`ablation_per_feature_results.csv`,
   `ablation_pairs_results.csv`).

> TIP 💡 For large feature sets (>150) the pairwise loop can be extremely slow.
> Adjust `MAX_PAIR_FEATURES` below or comment that section if needed.
"""



In [2]:
from __future__ import annotations

import logging
import os
from pathlib import Path
from itertools import combinations

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# -----------------------------------------------------------------------------
# Configuration – edit paths & flags here
# -----------------------------------------------------------------------------
BASE_PATH = Path("../data/processed_data")
USECASE = "case_1.csv"
DATA_PATH = BASE_PATH / USECASE 
TARGET_COL = "class"          # label column name

# Hyper‑parameter grid ---------------------------------------------------------
RF_PARAM_GRID = {
    "n_estimators": [100, 200],
    "max_depth": [None, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "max_features": ["sqrt"],
    "criterion": ["gini"],
}

TEST_SIZE = 0.2
RANDOM_STATE = 42

# Pairwise ablation guard ------------------------------------------------------
MAX_PAIR_FEATURES = 50   # If len(feature_names) exceeds this, skip pair ablation

# Output paths ----------------------------------------------------------------
OUT_PER_FEATURE = Path("ablation_per_feature_results.csv")
OUT_PAIRWISE    = Path("ablation_pairs_results.csv")

# Logging ---------------------------------------------------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
log = logging.getLogger("rf‑ablation")

In [6]:
# -----------------------------------------------------------------------------
# 1. Load data
# -----------------------------------------------------------------------------
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Dataset not found: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
log.info("Loaded %s rows × %s columns from %s", *df.shape, DATA_PATH)

if TARGET_COL not in df.columns:
    raise KeyError(f"Target column '{TARGET_COL}' not found")

X = df.drop(columns=TARGET_COL)
y = df[TARGET_COL]
log.info("Target distribution: %s", y.value_counts().to_dict())

2025-06-04 09:30:10,968 [INFO] Loaded 15176 rows × 22 columns from ../data/processed_data/case_1.csv
2025-06-04 09:30:10,970 [INFO] Target distribution: {'licit': 8833, 'fraud': 6343}


In [7]:
# -----------------------------------------------------------------------------
# 2. Pre‑processing pipeline
# -----------------------------------------------------------------------------
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
log.info("Numeric(%s): %s", len(num_cols), num_cols)
log.info("Categorical(%s): %s", len(cat_cols), cat_cols)

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols),
])

log.info("Fitting pre‑processor…")
X_prep = preprocessor.fit_transform(X)
log.info("Data shape after preprocessing: %s", X_prep.shape)

onehot_features = (
    preprocessor.named_transformers_["cat"].named_steps["onehot"].get_feature_names_out(cat_cols)
    if cat_cols else []
)
feature_names = np.concatenate([num_cols, onehot_features])
log.info("Total features after encoding: %s", len(feature_names))

# Encode labels
le = LabelEncoder().fit(y)
y_enc = le.transform(y)
log.info("Label classes: %s", list(le.classes_))

2025-06-04 09:30:14,154 [INFO] Numeric(21): ['total_in_degree', 'total_out_degree', 'total_unique_in', 'total_unique_out', 'avg_in_transaction', 'avg_out_transaction', 'total_received', 'total_sent', 'net_balance', 'avg_in_time_interval', 'avg_out_time_interval', 'avg_active_duration', 'in_out_ratio', 'unique_in_ratio', 'unique_out_ratio', 'volume_ratio', 'net_balance_ratio', 'activity_index', 'time_interval_ratio', 'weighted_avg_tx', 'wallet_lifetime_sec']
2025-06-04 09:30:14,155 [INFO] Categorical(0): []
2025-06-04 09:30:14,155 [INFO] Fitting pre‑processor…
2025-06-04 09:30:14,191 [INFO] Data shape after preprocessing: (15176, 21)
2025-06-04 09:30:14,192 [INFO] Total features after encoding: 21
2025-06-04 09:30:14,193 [INFO] Label classes: ['fraud', 'licit']


In [8]:
# -----------------------------------------------------------------------------
# 3. Train‑test split & hyper‑parameter search
# -----------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_prep,
    y_enc,
    test_size=TEST_SIZE,
    stratify=y_enc,
    random_state=RANDOM_STATE,
)
log.info("Split: %s train / %s test", X_train.shape[0], X_test.shape[0])

rf = RandomForestClassifier(random_state=RANDOM_STATE, class_weight="balanced")
log.info("Running GridSearchCV …")

grid = GridSearchCV(rf, RF_PARAM_GRID, cv=5, scoring="accuracy", n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)
best_params = grid.best_params_
log.info("Best RF params: %s", best_params)

2025-06-04 09:30:17,067 [INFO] Split: 12140 train / 3036 test
2025-06-04 09:30:17,067 [INFO] Running GridSearchCV …


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.2s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimato

2025-06-04 09:30:42,234 [INFO] Best RF params: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


In [9]:
# -----------------------------------------------------------------------------
# 4. Baseline model with all features
# -----------------------------------------------------------------------------
best_rf = RandomForestClassifier(**best_params, random_state=RANDOM_STATE, class_weight="balanced")
best_rf.fit(X_train, y_train)
base_pred = best_rf.predict(X_test)

print("=== Baseline classification report ===")
print(classification_report(y_test, base_pred, target_names=list(le.classes_)))
print("Confusion matrix:\n", confusion_matrix(y_test, base_pred))

base_acc = accuracy_score(y_test, base_pred)
log.info("Baseline accuracy: %.4f", base_acc)

2025-06-04 09:30:49,926 [INFO] Baseline accuracy: 0.8300


=== Baseline classification report ===
              precision    recall  f1-score   support

       fraud       0.80      0.79      0.80      1269
       licit       0.85      0.86      0.85      1767

    accuracy                           0.83      3036
   macro avg       0.83      0.82      0.83      3036
weighted avg       0.83      0.83      0.83      3036

Confusion matrix:
 [[1005  264]
 [ 252 1515]]


In [10]:
# -----------------------------------------------------------------------------
# 5. Per‑feature ablation
# -----------------------------------------------------------------------------
log.info("Per‑feature ablation …")
per_feat_records = []
for idx, feat in enumerate(tqdm(feature_names, desc="Ablating single features")):
    keep_idx = [i for i in range(len(feature_names)) if i != idx]
    X_tr = X_train[:, keep_idx]
    X_te = X_test[:, keep_idx]

    clf = RandomForestClassifier(**best_params, random_state=RANDOM_STATE, class_weight="balanced")
    clf.fit(X_tr, y_train)
    pred = clf.predict(X_te)
    report = classification_report(y_test, pred, output_dict=True)

    per_feat_records.append({
        "feature_removed": feat,
        "accuracy": accuracy_score(y_test, pred),
        "precision_macro": report["macro avg"]["precision"],
        "recall_macro": report["macro avg"]["recall"],
        "f1_macro": report["macro avg"]["f1-score"],
    })

per_feat_df = pd.DataFrame(per_feat_records)
per_feat_df.to_csv(OUT_PER_FEATURE, index=False)
log.info("Saved per‑feature ablation results → %s", OUT_PER_FEATURE)

2025-06-04 09:30:49,931 [INFO] Per‑feature ablation …
Ablating single features: 100%|██████████| 21/21 [01:36<00:00,  4.60s/it]
2025-06-04 09:32:26,534 [INFO] Saved per‑feature ablation results → ablation_per_feature_results.csv


In [11]:
# -----------------------------------------------------------------------------
# 6. Pairwise ablation (optional)
# -----------------------------------------------------------------------------
if len(feature_names) <= MAX_PAIR_FEATURES:
    log.info("Pairwise ablation on %s features …", len(feature_names))
    pair_records = []
    for i, j in tqdm(list(combinations(range(len(feature_names)), 2)), desc="Ablating pairs"):
        keep_idx = [k for k in range(len(feature_names)) if k not in (i, j)]
        X_tr = X_train[:, keep_idx]
        X_te = X_test[:, keep_idx]
        clf = RandomForestClassifier(**best_params, random_state=RANDOM_STATE, class_weight="balanced")
        clf.fit(X_tr, y_train)
        pred = clf.predict(X_te)
        f1 = classification_report(y_test, pred, output_dict=True)["macro avg"]["f1-score"]
        pair_records.append({
            "features_removed": f"{feature_names[i]},{feature_names[j]}",
            "accuracy": accuracy_score(y_test, pred),
            "f1_macro": f1,
        })
    pair_df = pd.DataFrame(pair_records)
    pair_df.to_csv(OUT_PAIRWISE, index=False)
    log.info("Saved pairwise ablation results → %s", OUT_PAIRWISE)
else:
    log.warning("Skipped pairwise ablation: %s features > MAX_PAIR_FEATURES=%s", len(feature_names), MAX_PAIR_FEATURES)

2025-06-04 09:32:26,538 [INFO] Pairwise ablation on 21 features …
Ablating pairs: 100%|██████████| 210/210 [16:07<00:00,  4.61s/it]
2025-06-04 09:48:33,855 [INFO] Saved pairwise ablation results → ablation_pairs_results.csv


In [12]:
# -----------------------------------------------------------------------------
# 7. Quick summary of worst degradations
# -----------------------------------------------------------------------------
print("\nTop 5 single‑feature removals (lowest f1_macro):")
print(per_feat_df.sort_values("f1_macro").head(5)[["feature_removed", "f1_macro"]])

if OUT_PAIRWISE.exists():
    pair_df = pd.read_csv(OUT_PAIRWISE)
    print("\nTop 5 feature‑pair removals (lowest f1_macro):")
    print(pair_df.sort_values("f1_macro").head(5)[["features_removed", "f1_macro"]])


Top 5 single‑feature removals (lowest f1_macro):
          feature_removed  f1_macro
20    wallet_lifetime_sec  0.809513
13        unique_in_ratio  0.817162
15           volume_ratio  0.817647
10  avg_out_time_interval  0.818570
11    avg_active_duration  0.818610

Top 5 feature‑pair removals (lowest f1_macro):
                            features_removed  f1_macro
208  time_interval_ratio,wallet_lifetime_sec  0.803229
131           total_sent,wallet_lifetime_sec  0.804266
194     unique_out_ratio,wallet_lifetime_sec  0.804583
143          net_balance,wallet_lifetime_sec  0.806025
19       total_in_degree,wallet_lifetime_sec  0.806025
