
# WIRE Wallet — Scaling Pipelines (MinMax vs Standard vs Hybrid)

This notebook loads your engineered dataset and produces three scaled versions:

1. **All-MinMax** → `_minmax.csv` (baseline / current)
2. **All-Standard** → `_standard.csv`
3. **Hybrid (log+Robust + Standard + Passthrough + cyclical)** → `_hybrid.csv` (**recommended**)

It also persists the fitted preprocessors with joblib so you can reuse them in your training and serving pipelines.


In [None]:

# ==== Configuration ====
INPUT_CSV = "../../data/combined_wallets_with_transactions_and_balances_2.csv"  # change if needed
OUTPUT_MINMAX = "../../data/combined_wallets_with_transactions_and_balances_minmax.csv"
OUTPUT_STANDARD = "../../data/combined_wallets_with_transactions_and_balances_standard.csv"
OUTPUT_HYBRID = "../../data/combined_wallets_with_transactions_and_balances_hybrid.csv"

SCALER_MINMAX_PATH = "../../models/preprocessor_minmax.pkl"
SCALER_STANDARD_PATH = "../../models/preprocessor_standard.pkl"
SCALER_HYBRID_PATH = "../../models/preprocessor_hybrid.pkl"

RANDOM_STATE = 42


In [5]:

import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
import joblib

# For optional quick plots (Fig-7-like feature sanity checks)
import matplotlib.pyplot as plt


In [None]:
# --- PATCH: safe numeric functions for FunctionTransformer ---

import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer, RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Force numeric matrix, clip negatives (log1p domain), handle NaNs/inf
def safe_log1p(X):
    X = pd.DataFrame(X)  # ensure 2D
    X = X.apply(pd.to_numeric, errors='coerce')  # to float
    X = X.fillna(0.0).to_numpy(dtype=float, copy=False)
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    X = np.clip(X, 0.0, None)
    return np.log1p(X)

def hour_to_sin_cos(X):
    X = pd.DataFrame(X).apply(pd.to_numeric, errors='coerce').fillna(0.0).to_numpy(dtype=float)
    X = np.nan_to_num(X, nan=0.0)
    X = np.clip(X, 0, 23)
    sin = np.sin(2*np.pi*X/24.0)
    cos = np.cos(2*np.pi*X/24.0)
    # return two columns per original hour column
    return np.concatenate([sin, cos], axis=1)

# 1) Choose columns for each transform (only those that exist)
log_robust_cols = [c for c in ["Balance","noOfTrx.1","avg_txn_value_eth","value_std_dev","average_txn_interval"] if c in features]
std_cols        = [c for c in ["active_days","wallet_age_days","unique_counterparties","avg_gas_used","avg_gas_price"] if c in features]
cyc_cols        = [c for c in ["first_txn_time_of_day"] if c in features]

cyc_and_scaled  = set(log_robust_cols + std_cols + cyc_cols)
passthrough_cols = [c for c in features if c not in cyc_and_scaled]

log_then_robust = Pipeline([
    ("log", FunctionTransformer(safe_log1p, validate=False)),
    ("robust", RobustScaler())
])

cyclical = FunctionTransformer(hour_to_sin_cos, validate=False)

pre_hybrid = ColumnTransformer(
    transformers=[
        ("log_robust", log_then_robust, log_robust_cols),
        ("standard", StandardScaler(), std_cols),
        ("cyclical", cyclical, cyc_cols),
        ("passthrough", "passthrough", passthrough_cols),
    ],
    remainder="drop"
)

print("Hybrid groups:")
print("  log+robust:", log_robust_cols)
print("  standard  :", std_cols)
print("  cyclical  :", cyc_cols, "-> (sin, cos)")
print("  passthrough:", passthrough_cols)


In [6]:

# ==== Load ====
df = pd.read_csv(INPUT_CSV)

print("Loaded:", INPUT_CSV)
print("Rows, Cols:", df.shape)
print("\nColumns:", list(df.columns))

# Ensure isSafe exists if present, but it's not a feature to scale
label_col = "isSafe" if "isSafe" in df.columns else None

# Keep a copy of raw for passthrough of non-feature columns at the end
raw_df = df.copy()


Loaded: ../../data/combined_wallets_with_transactions_and_balances_2.csv
Rows, Cols: (33749, 34)

Columns: ['Address', 'Name', 'isSafe', 'Tags', 'Transactions', 'noOfTrx.1', 'Balance', 'total_transactions', 'self_transfer_ratio', 'circular_txn_count', 'circular_txn_ratio', 'avg_txn_value_eth', 'txn_spike_score', 'value_std_dev', 'avg_gas_used', 'avg_gas_price', 'active_days', 'wallet_age_days', 'unique_counterparties', 'failed_txn_ratio', 'eth_inflow_outflow_ratio', 'erc20_txn_count', 'nft_txn_count', 'first_txn_time_of_day', 'erc20_token_diversity', 'tx_direction_ratio', 'contract_interaction_ratio', 'value_entropy', 'tx_burst_count', 'average_txn_interval', 'new_token_interaction_count', 'token_approval_count', 'sbt_poap_event_count', 'approved_token_list']


In [None]:

# ==== Feature Columns (from your message) ====
all_features = [
    'noOfTrx.1', 'Balance', 'self_transfer_ratio', 'circular_txn_count',
    'circular_txn_ratio', 'avg_txn_value_eth', 'txn_spike_score',
    'value_std_dev', 'avg_gas_used', 'avg_gas_price', 'active_days',
    'wallet_age_days', 'unique_counterparties', 'failed_txn_ratio',
    'eth_inflow_outflow_ratio', 'erc20_txn_count', 'nft_txn_count',
    'first_txn_time_of_day', 'erc20_token_diversity', 'tx_direction_ratio',
    'contract_interaction_ratio', 'value_entropy', 'average_txn_interval',
    'new_token_interaction_count', 'token_approval_count',
    'sbt_poap_event_count'
]

# Keep only columns that exist in the file (avoid KeyErrors if some aren't present)
features = [c for c in all_features if c in df.columns]

missing = [c for c in all_features if c not in df.columns]
if missing:
    print("Warning: missing columns (will be skipped):", missing)

# Separate features from non-features
non_feature_cols = [c for c in df.columns if c not in features]
if label_col and label_col in non_feature_cols:
    print("Label column detected:", label_col)

X = df[features].apply(pd.to_numeric, errors='coerce').fillna(0.0)


Label column detected: isSafe


In [8]:

# ==== Preprocessors ====

# A) All-MinMax on features
pre_minmax = ColumnTransformer([
    ("mm", MinMaxScaler(), features)
], remainder="drop")

# B) All-Standard on features
pre_standard = ColumnTransformer([
    ("std", StandardScaler(), features)
], remainder="drop")

# C) Hybrid pipeline (recommended)
# - Log + RobustScaler: heavy-tailed numerics
log_robust_cols = [c for c in ["Balance","noOfTrx.1","avg_txn_value_eth","value_std_dev","average_txn_interval"] if c in features]
# - StandardScaler: mid-skew continuous
std_cols = [c for c in ["active_days","wallet_age_days","unique_counterparties","avg_gas_used","avg_gas_price"] if c in features]
# - Cyclical: first_txn_time_of_day (if present)
cyc_cols = [c for c in ["first_txn_time_of_day"] if c in features]
# - Passthrough (ratios & sparse counts)
cyc_and_scaled = set(log_robust_cols + std_cols + cyc_cols)
passthrough_cols = [c for c in features if c not in cyc_and_scaled]

log_then_robust = Pipeline([
    ("log", FunctionTransformer(np.log1p, validate=False)),
    ("robust", RobustScaler())
])

# Encode hour-of-day as sin/cos pair
def hour_to_sin_cos(x):
    arr = np.asarray(x).astype(float)
    # Protect against out-of-range or negative hours
    arr = np.clip(arr, 0, 23)
    sin = np.sin(2*np.pi*arr/24)
    cos = np.cos(2*np.pi*arr/24)
    return np.c_[sin, cos]

cyclical = FunctionTransformer(hour_to_sin_cos, validate=False)

pre_hybrid = ColumnTransformer(
    transformers=[
        ("log_robust", log_then_robust, log_robust_cols),
        ("standard", StandardScaler(), std_cols),
        ("cyclical", cyclical, cyc_cols),
        ("passthrough", "passthrough", passthrough_cols)
    ],
    remainder="drop"
)

print("Hybrid groups:")
print("  log+robust:", log_robust_cols)
print("  standard  :", std_cols)
print("  cyclical  :", cyc_cols, "-> will expand to 2 cols (sin, cos) if present")
print("  passthrough:", passthrough_cols)


Hybrid groups:
  log+robust: ['Balance', 'noOfTrx.1', 'avg_txn_value_eth', 'value_std_dev', 'average_txn_interval']
  standard  : ['active_days', 'wallet_age_days', 'unique_counterparties', 'avg_gas_used', 'avg_gas_price']
  cyclical  : ['first_txn_time_of_day'] -> will expand to 2 cols (sin, cos) if present
  passthrough: ['self_transfer_ratio', 'circular_txn_count', 'circular_txn_ratio', 'txn_spike_score', 'failed_txn_ratio', 'eth_inflow_outflow_ratio', 'erc20_txn_count', 'nft_txn_count', 'erc20_token_diversity', 'tx_direction_ratio', 'contract_interaction_ratio', 'value_entropy', 'new_token_interaction_count', 'token_approval_count', 'sbt_poap_event_count']


In [None]:
def fit_transform_and_export(preprocessor, X, base_df, out_csv, scaler_path):
    # Ensure the DataFrame columns passed to the preprocessor are numeric (objects -> numbers)
    X_num = pd.DataFrame(X.copy())
    for col in X_num.columns:
        X_num[col] = pd.to_numeric(X_num[col], errors='coerce')
    X_num = X_num.fillna(0.0)

    Xt = preprocessor.fit_transform(X_num)

    # Build output column names
    try:
        out_cols = preprocessor.get_feature_names_out()
        # clean names like 'cyclical__first_txn_time_of_day0'/'1' -> '_sin'/'_cos'
        out_cols = [
            name.replace("cyclical__first_txn_time_of_day0", "first_txn_time_of_day_sin")
                .replace("cyclical__first_txn_time_of_day1", "first_txn_time_of_day_cos")
            for name in out_cols
        ]
    except Exception:
        # Fallback: make generic names
        out_cols = [f"f_{i}" for i in range(np.asarray(Xt).shape[1])]

    Xt_df = pd.DataFrame(Xt, index=base_df.index, columns=out_cols)

    # Re-attach non-feature columns
    out = pd.concat([base_df.drop(columns=[c for c in features if c in base_df.columns], errors="ignore"),
                     Xt_df], axis=1)

    out.to_csv(out_csv, index=False)
    import joblib
    joblib.dump(preprocessor, scaler_path)

    print(f"Saved scaled CSV -> {out_csv}")
    print(f"Saved fitted preprocessor -> {scaler_path}")
    return out


In [10]:

print("\n=== Fitting All-MinMax ===")
out_minmax = fit_transform_and_export(pre_minmax, X, raw_df, OUTPUT_MINMAX, SCALER_MINMAX_PATH)

print("\n=== Fitting All-Standard ===")
out_standard = fit_transform_and_export(pre_standard, X, raw_df, OUTPUT_STANDARD, SCALER_STANDARD_PATH)

print("\n=== Fitting HYBRID (recommended) ===")
out_hybrid = fit_transform_and_export(pre_hybrid, X, raw_df, OUTPUT_HYBRID, SCALER_HYBRID_PATH)



=== Fitting All-MinMax ===
Saved scaled CSV -> ../../data/combined_wallets_with_transactions_and_balances_minmax.csv
Saved fitted preprocessor -> preprocessor_minmax.pkl

=== Fitting All-Standard ===
Saved scaled CSV -> ../../data/combined_wallets_with_transactions_and_balances_standard.csv
Saved fitted preprocessor -> preprocessor_standard.pkl

=== Fitting HYBRID (recommended) ===


TypeError: loop of ufunc does not support argument 0 of type str which has no callable log1p method

In [None]:

# Optional: quick Fig-7-like checks on one or two heavy-tailed features
plt.figure(figsize=(10,4))
plt.subplot(1,3,1)
plt.hist(out_minmax.get('Balance', pd.Series(dtype=float)), bins=30)
plt.title('Balance (MinMax)')
plt.subplot(1,3,2)
plt.hist(out_standard.get('Balance', pd.Series(dtype=float)), bins=30)
plt.title('Balance (Standard)')
plt.subplot(1,3,3)
plt.hist(out_hybrid.get('Balance', pd.Series(dtype=float)), bins=30)
plt.title('Balance (Hybrid)')
plt.tight_layout()
plt.show()
