In [None]:
"""
Bitcoin Fraud Detection using Multi-Layer Perceptron (MLP)

This notebook trains a neural network classifier for wallet fraud detection.
We perform a simple hyperparameter search over network architectures and
evaluate the best configuration on a held-out test set.
"""

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Seed every random number generator the notebook relies on in one call:
# Python's built-in `random`, NumPy, and TensorFlow. Seeding only NumPy and
# TensorFlow leaves Python's `random` module unseeded, which can make weight
# initialisation and dropout masks differ between runs.
SEED = 42
tf.keras.utils.set_random_seed(SEED)


In [None]:
# Read the preprocessed dataset from disk, then separate the target column
# ("class") from the remaining feature columns.
DATA_PATH = Path("../../data/processed_data/case_1.csv")
TARGET_COL = "class"

df = pd.read_csv(DATA_PATH)
y_raw = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])


In [None]:
# Partition the feature columns by dtype so each group gets suitable preprocessing.
# "number" selects every numeric dtype; the narrower "int"/"float" aliases do not
# cover all integer/float widths, so such columns would be left out.
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# ColumnTransformer drops any column that is not listed (remainder="drop" is the
# default), so report columns that fall into neither group instead of losing
# them silently.
assigned_cols = set(num_cols) | set(cat_cols)
unassigned_cols = [col for col in X.columns if col not in assigned_cols]
if unassigned_cols:
    print(f"Warning: columns excluded from preprocessing: {unassigned_cols}")

# Numeric features: fill gaps with the column median, then standardise.
numeric_pipeline = SkPipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical features: fill gaps with a "missing" category, then one-hot encode.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros
# rather than raising at transform time.
categorical_pipeline = SkPipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Route each column group through its own pipeline and concatenate the results.
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols),
])


In [None]:
# Encode the class labels as consecutive integers 0..K-1.
le = LabelEncoder()
y = le.fit_transform(y_raw)

# Split the RAW features first (stratified: 64% train / 16% val / 20% test).
# Splitting before preprocessing keeps validation and test rows out of the
# imputer medians, scaler statistics and one-hot vocabularies (no data leakage).
X_trainval_raw, X_test_raw, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_trainval_raw, y_trainval, test_size=0.2, stratify=y_trainval, random_state=42
)

# Fit the preprocessing on the training split only, then apply the fitted
# transformation unchanged to the validation and test splits.
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)
X_test = preprocessor.transform(X_test_raw)

# Balanced class weights for the imbalanced labels, derived from the training
# labels only. Keys are the encoded class indices produced by LabelEncoder.
class_weight = dict(enumerate(
    compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
))

input_dim = X_train.shape[1]
num_classes = len(le.classes_)


In [None]:
# Candidate configurations for a small manual grid search: hidden-layer widths,
# dropout rate applied after each hidden layer, and Adam learning rate.
param_grid = [
    dict(layers=[64, 32], dropout=0.3, lr=1e-3),
    dict(layers=[128, 64, 32], dropout=0.5, lr=5e-4),
    dict(layers=[128, 64], dropout=0.2, lr=1e-3),
]


def build_mlp(params):
    """Build and compile a softmax MLP classifier for one configuration.

    Args:
        params: dict with keys "layers" (hidden-layer widths), "dropout"
            (rate applied after each hidden layer; skipped when not > 0)
            and "lr" (Adam learning rate).

    Returns:
        A compiled ``tf.keras.Sequential`` model taking ``input_dim`` features
        and producing ``num_classes`` class probabilities.
    """
    net = tf.keras.Sequential()
    net.add(tf.keras.Input(shape=(input_dim,)))

    dropout_rate = params["dropout"]
    for width in params["layers"]:
        net.add(tf.keras.layers.Dense(width, activation="relu"))
        if dropout_rate > 0:
            net.add(tf.keras.layers.Dropout(dropout_rate))

    net.add(tf.keras.layers.Dense(num_classes, activation="softmax"))

    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=params["lr"]),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return net


# Training settings shared by every candidate.
fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    class_weight=class_weight,
    verbose=0,
)

# Train each candidate and record its validation accuracy after the final epoch.
# NOTE(review): last-epoch accuracy is a noisy selection criterion and accuracy
# can be misleading on imbalanced data; consider a class-sensitive metric.
results = []
for params in param_grid:
    model = build_mlp(params)
    history = model.fit(X_train, y_train, **fit_options)

    val_acc = history.history["val_accuracy"][-1]
    results.append((params, val_acc))
    print(f"Params: {params} -> Val acc: {val_acc:.4f}")


In [None]:
# Pick the configuration with the highest recorded validation accuracy.
# `results` holds (params, val_acc) tuples.
best_params = max(results, key=lambda x: x[1])[0]
print("Best params:", best_params)

# Model selection is finished, so fold the validation split back into the
# training data: the final model is retrained on the full training set
# (train + val) and the test set remains the only held-out data.
X_train_full = np.concatenate([X_train, X_val], axis=0)
y_train_full = np.concatenate([y_train, y_val], axis=0)

# Rebuild the winning architecture from scratch so no weights carry over
# from the search.
model = tf.keras.Sequential()
model.add(tf.keras.Input(shape=(input_dim,)))

for units in best_params["layers"]:
    model.add(tf.keras.layers.Dense(units, activation="relu"))
    if best_params["dropout"] > 0:
        model.add(tf.keras.layers.Dropout(best_params["dropout"]))

model.add(tf.keras.layers.Dense(num_classes, activation="softmax"))

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=best_params["lr"]),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.fit(
    X_train_full, y_train_full,
    epochs=50,
    batch_size=32,
    class_weight=class_weight,
    verbose=0
)

# Evaluate once on the held-out test set: predicted class = highest probability.
y_pred = np.argmax(model.predict(X_test), axis=1)

# classification_report measures len() of every target name, so numeric class
# labels must be converted to strings first.
class_names = [str(label) for label in le.classes_]
print(classification_report(y_test, y_pred, target_names=class_names))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=class_names)
plt.tight_layout()
plt.show()
