In [1]:
import sys
sys.path.append('./utils')  # make sure Python knows where to look

import os
# Blank out DYLD_INSERT_LIBRARIES and drop MallocStackLogging from this process's environment.
# NOTE(review): both look like macOS loader / malloc-debugging variables; presumably this silences
# noise from native libraries loaded below — confirm it is still needed.
os.environ['DYLD_INSERT_LIBRARIES'] = ''
if 'MallocStackLogging' in os.environ:
    del os.environ['MallocStackLogging']

import numpy as np
# Compatibility shim: the aliases np.int, np.float, np.complex, np.object, np.str, np.long and
# np.unicode were deprecated in NumPy 1.20 and removed in NumPy 1.24. When np.int is missing,
# re-create all seven so code that still references them keeps working.
# Only np.int is probed; the other six names are then assigned unconditionally.
# NOTE(review): the original note says the mlens library needs these aliases, but mlens is not
# imported in this notebook — confirm the shim is still required.
if not hasattr(np, 'int'):
    np.int = np.int64
    np.float = np.float64
    np.complex = np.complex128
    np.object = np.object_
    np.str = np.str_
    np.long = np.int64
    np.unicode = np.str_

# Display full output in Jupyter: None removes the corresponding pandas display limit, so every
# row and column of a displayed DataFrame is rendered (avoid displaying very large frames directly).
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

import datetime
import pandas as pd
from sklearn.base import clone
from numpy import hstack, vstack
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score, average_precision_score

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import ml_training_functions as utils_training
import file_handler_functions as utils_file


In [2]:
# Load transactions from BEGIN_DATE (2018-06-11) to END_DATE (2018-08-14).
# The loaded range starts before START_DATE (2018-07-25); presumably the earlier rows are needed
# by the prequential folds — TODO confirm against utils_training.prequentialSplit.
DIR_INPUT='./data/simulated-data-transformed/' 

BEGIN_DATE = "2018-06-11"
END_DATE = "2018-08-14"

print("Load  files")
%time transactions_df = utils_file.read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE)
print("{0} transactions loaded, containing {1} fraudulent transactions".format(len(transactions_df),transactions_df.TX_FRAUD.sum()))

# Reference date; parsed below into start_date_training, which is later passed to prequentialSplit.
START_DATE = "2018-07-25"
# All window lengths are 7. delta_train, delta_delay, delta_test and delta_valid are used as day
# counts in the timedelta arithmetic below; delta_assessment is only passed to prequentialSplit.
delta_train = delta_delay = delta_test = delta_valid = delta_assessment = 7

# Number of folds for the prequential validation
n_folds = 4

start_date_training = datetime.datetime.strptime(START_DATE, "%Y-%m-%d")
# Test period starts once the training window and the delay window have elapsed.
start_date_test = start_date_training+datetime.timedelta(days=delta_train+delta_delay)

# Training start shifted back by (delay + validation) days, and forward by (n_folds-1) test windows.
# NOTE(review): start_date_test and the two dates below are not referenced again in the cells shown here.
start_date_training_for_valid = start_date_training+datetime.timedelta(days=-(delta_delay+delta_valid))
start_date_training_for_test = start_date_training+datetime.timedelta(days=(n_folds-1)*delta_test)

Load  files
CPU times: user 99.2 ms, sys: 121 ms, total: 220 ms
Wall time: 277 ms
622892 transactions loaded, containing 5515 fraudulent transactions


In [3]:
# Name of the label column (also summed as transactions_df.TX_FRAUD when the data is loaded).
output_feature = "TX_FRAUD"

# The 15 model input columns: the transaction amount, two TX_DURING_* columns, and
# CUSTOMER_ID_* / TERMINAL_ID_* columns suffixed with 1-, 7- and 30-day windows.
input_features = ['TX_AMOUNT','TX_DURING_WEEKEND', 'TX_DURING_NIGHT', 'CUSTOMER_ID_NB_TX_1DAY_WINDOW',
       'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW', 'CUSTOMER_ID_NB_TX_7DAY_WINDOW',
       'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW', 'CUSTOMER_ID_NB_TX_30DAY_WINDOW',
       'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW', 'TERMINAL_ID_NB_TX_1DAY_WINDOW',
       'TERMINAL_ID_RISK_1DAY_WINDOW', 'TERMINAL_ID_NB_TX_7DAY_WINDOW',
       'TERMINAL_ID_RISK_7DAY_WINDOW', 'TERMINAL_ID_NB_TX_30DAY_WINDOW',
       'TERMINAL_ID_RISK_30DAY_WINDOW']

# Only keep columns that are needed as argument to the custom scoring function
# (in order to reduce the serialization time of transaction dataset)
# The slice keeps the index of transactions_df, which the scoring wrapper relies on for .loc lookups.
transactions_df_scorer = transactions_df[['CUSTOMER_ID', 'TX_FRAUD','TX_TIME_DAYS']]

# Split dataset for Cross validation
# The result is consumed below as a sequence of (train_ix, test_ix) pairs that are applied
# positionally with transactions_df.iloc[...].
prequential_split_indices=utils_training.prequentialSplit(transactions_df,
                                               start_date_training=start_date_training, 
                                               n_folds=n_folds, 
                                               delta_train=delta_train, 
                                               delta_delay=delta_delay, 
                                               delta_assessment=delta_assessment)


In [None]:
def get_base_models_tuning():
    """Build the "tuning" configuration of the base learners.

    Returns:
        dict: estimator name -> new, unfitted estimator. Entries are inserted in
        the order XGBoost, LightGBM, CatBoost, HistGradientBoosting,
        BalancedRandomForest, BalancedBagging, LogisticRegression; the stacking
        helpers iterate the dict in insertion order, so this order also fixes
        the column order of the meta features.
    """
    models = {}

    models['XGBClassifier'] = XGBClassifier(
        learning_rate=0.3,
        max_depth=3,
        n_estimators=100,
        n_jobs=-1,
        random_state=0,
        scale_pos_weight=10,
    )

    models['LGBMClassifier'] = LGBMClassifier(
        learning_rate=0.1,
        max_depth=3,
        n_estimators=100,
        n_jobs=-1,
        random_state=0,
        scale_pos_weight=1,
        verbosity=-1,
    )

    # The positive class is re-weighted through scale_pos_weight in this configuration;
    # class_weights (e.g. class_weights=[1, 99]) would be the alternative knob.
    models['CatBoostClassifier'] = CatBoostClassifier(
        depth=4,
        iterations=500,
        learning_rate=0.05,
        random_state=0,
        scale_pos_weight=5,
        verbose=0,
    )

    models['HistGradientBoostingClassifier'] = HistGradientBoostingClassifier(
        max_iter=100,
        learning_rate=0.05,
        max_depth=5,
        random_state=0,
        class_weight='balanced',
    )

    models['BalancedRandomForestClassifier'] = BalancedRandomForestClassifier(
        max_depth=20,
        n_estimators=100,
        n_jobs=-1,
        random_state=0,
        sampling_strategy=0.05,
    )

    # Bagging of depth-limited trees, each trained on an under-sampled bootstrap.
    bagged_tree = DecisionTreeClassifier(max_depth=20, random_state=0)
    undersampler = RandomUnderSampler()
    models['BalancedBaggingClassifier'] = BalancedBaggingClassifier(
        bootstrap=True,
        estimator=bagged_tree,
        n_estimators=100,
        n_jobs=-1,
        random_state=0,
        sampler=undersampler,
        sampling_strategy=0.1,
    )

    # This entry is also the prototype fit_meta_model uses for the meta learner.
    models['LogisticRegression'] = LogisticRegression(
        C=0.1,
        random_state=0,
        class_weight='balanced',
    )

    return models
    
    
def get_base_models():
    """Build the first (baseline) configuration of the base learners.

    Returns:
        dict: estimator name -> new, unfitted estimator. Entries are inserted in
        the order XGBoost, LightGBM, CatBoost, HistGradientBoosting,
        BalancedRandomForest, BalancedBagging, LogisticRegression; the stacking
        helpers iterate the dict in insertion order, so this order also fixes
        the column order of the meta features.
    """
    models = {}

    models['XGBClassifier'] = XGBClassifier(
        learning_rate=0.1,
        max_depth=3,
        n_estimators=50,
        n_jobs=-1,
        random_state=0,
        scale_pos_weight=10,
    )

    models['LGBMClassifier'] = LGBMClassifier(
        learning_rate=0.1,
        max_depth=3,
        n_estimators=50,
        n_jobs=-1,
        random_state=0,
        scale_pos_weight=1,
    )

    # Class imbalance is handled with explicit per-class weights in this configuration.
    models['CatBoostClassifier'] = CatBoostClassifier(
        depth=4,
        iterations=500,
        learning_rate=0.05,
        random_state=0,
        class_weights=[1, 99],
        verbose=0,
    )

    models['HistGradientBoostingClassifier'] = HistGradientBoostingClassifier(
        max_iter=100,
        learning_rate=0.05,
        max_depth=5,
        random_state=0,
        class_weight='balanced',
    )

    models['BalancedRandomForestClassifier'] = BalancedRandomForestClassifier(
        max_depth=50,
        n_estimators=100,
        n_jobs=-1,
        random_state=0,
        sampling_strategy=0.1,
    )

    # Bagging of depth-limited trees, each trained on an under-sampled bootstrap.
    bagged_tree = DecisionTreeClassifier(max_depth=20, random_state=0)
    undersampler = RandomUnderSampler()
    models['BalancedBaggingClassifier'] = BalancedBaggingClassifier(
        bootstrap=True,
        estimator=bagged_tree,
        n_estimators=100,
        n_jobs=-1,
        random_state=0,
        sampler=undersampler,
        sampling_strategy=0.1,
    )

    # This entry is also the prototype fit_meta_model uses for the meta learner.
    models['LogisticRegression'] = LogisticRegression(
        class_weight='balanced',
        C=0.1,
        random_state=0,
        solver='liblinear',
    )

    return models

# transactions_df_scorer must cover the full dataset (CUSTOMER_ID, TX_TIME_DAYS, ...) and share its
# index labels with the frame being scored; X_index holds the labels of the rows of the current fold.
def card_precision_top_k_wrapper(probs, X_index, transactions_df_scorer, k=100):
    """Score one set of predictions with utils_training.card_precision_top_k.

    Args:
        probs: predicted scores, one per label in X_index (same order).
        X_index: index labels used to select rows of transactions_df_scorer via .loc.
        transactions_df_scorer: frame with the columns the scoring function reads.
        k: forwarded unchanged to card_precision_top_k.

    Returns:
        The third element of the tuple returned by card_precision_top_k
        (presumably the mean card precision over days — confirm in utils_training).
    """
    fold_rows = transactions_df_scorer.loc[X_index]
    # Work on a copy so the shared scorer frame never gains a 'predictions' column.
    scored = fold_rows.copy()
    scored['predictions'] = probs

    _, _, mean_card_precision = utils_training.card_precision_top_k(scored, k)
    return mean_card_precision

# out-of-fold builder (prequential-friendly)
def get_out_of_fold_predictions_no_sampling(transactions_df, prequential_split_indices, base_models_dict,
                                           input_features, output_feature, transactions_df_scorer=None):
    """Collect base-model predictions on the test part of every fold.

    For each (train_ix, test_ix) pair the rows are selected positionally with
    .iloc, a StandardScaler is fitted on the training rows only, and a fresh
    clone of every base model is fitted on the scaled training rows and used to
    predict the scaled test rows.

    Args:
        transactions_df: full transactions frame.
        prequential_split_indices: iterable of (train_ix, test_ix) positional indices.
        base_models_dict: name -> unfitted estimator prototype (never fitted in place here).
        input_features: list of feature column names.
        output_feature: label column name.
        transactions_df_scorer: accepted for signature compatibility; not used by this function.

    Returns:
        tuple: (meta_X, meta_y, meta_index, model_names) where meta_X has one row per
        test row (folds stacked in iteration order) and one column per base model,
        meta_y holds the matching labels, meta_index the matching index labels of
        transactions_df, and model_names the column order of meta_X.
    """
    model_names = list(base_models_dict)
    fold_blocks, fold_targets, fold_row_labels = [], [], []

    for train_ix, test_ix in prequential_split_indices:
        # Plain positional slices, no re-sampling.
        train_df = transactions_df.iloc[train_ix]
        test_df = transactions_df.iloc[test_ix]
        X_train, y_train, X_test, y_test = get_train_test_features(
            train_df, test_df, input_features, output_feature
        )

        # The scaler is fitted on the training rows of this fold only.
        fold_scaler = StandardScaler()
        X_train_scaled = fold_scaler.fit_transform(X_train)
        X_test_scaled = fold_scaler.transform(X_test)

        # One column of test-set probabilities per base model.
        columns = []
        for name in model_names:
            # A new clone per fold so nothing learnt on another fold leaks in.
            estimator = clone(base_models_dict[name])
            estimator.fit(X_train_scaled, y_train)
            columns.append(get_predict_proba(estimator, X_test_scaled).reshape(-1, 1))

        # Shape (n_test_rows, n_models).
        fold_blocks.append(np.hstack(columns))
        fold_targets.append(y_test)
        # Original index labels are kept so predictions can be joined back (card precision).
        fold_row_labels.append(test_df.index)

    # Stack the folds on top of each other.
    meta_X = np.vstack(fold_blocks)
    meta_y = np.concatenate(fold_targets)
    meta_index = np.concatenate([np.array(labels) for labels in fold_row_labels])

    return meta_X, meta_y, meta_index, model_names

# Fit base models on full training data (no sampling)
def fit_base_models_full(X_train_df, y_train_series, base_models_dict):
    """Fit a clone of every base model on one scaled training set.

    Args:
        X_train_df: feature frame; its .values are scaled with a new StandardScaler.
        y_train_series: label series; its .values are used as the fit target.
        base_models_dict: name -> estimator prototype (cloned, never fitted in place).

    Returns:
        tuple: (fitted_models, scaler) — a dict name -> fitted clone in the same
        order as base_models_dict, and the fitted scaler so callers can transform
        later data the same way.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train_df.values)

    def _fit_clone(prototype):
        # Clone so the caller's prototype stays unfitted.
        fresh = clone(prototype)
        fresh.fit(X_scaled, y_train_series.values)
        return fresh

    fitted_models = {name: _fit_clone(prototype) for name, prototype in base_models_dict.items()}
    return fitted_models, scaler

# Fit meta model (LogisticRegression) on meta features
def fit_meta_model(base_models, meta_X, meta_y):
    """Fit the stacking meta model on out-of-fold base-model predictions.

    The meta model is an independent copy of the 'LogisticRegression' entry of
    base_models, so it shares that entry's hyperparameters.

    Args:
        base_models: dict of estimator prototypes; must contain 'LogisticRegression'.
        meta_X: 2-D array of meta features (one column per base model).
        meta_y: labels aligned with the rows of meta_X.

    Returns:
        The fitted meta model (a new estimator, not the object stored in base_models).

    Raises:
        KeyError: if base_models has no 'LogisticRegression' entry.
    """
    # Clone before fitting: fitting the dict entry in place would turn the shared
    # base-model prototype into an estimator trained on meta features, which is
    # inconsistent with every other helper here (they all clone first).
    meta_model = clone(base_models['LogisticRegression'])
    meta_model.fit(meta_X, meta_y)
    return meta_model

# Super-learner prediction helper for producing probability predictions on test set
def super_learner_predict_proba(models_fitted_dict, meta_model, X_test_df, scaler=None):
    """Predict with the stacked model: base models first, then the meta model.

    Args:
        models_fitted_dict: name -> fitted base model. Iteration order must match the
            column order the meta model was trained with.
        meta_model: fitted meta estimator; predict_proba is preferred, predict is the fallback.
        X_test_df: feature frame; its .values are fed to the base models.
        scaler: optional fitted scaler applied to the features before the base models.

    Returns:
        1-D array with one score per row of X_test_df: the positive-class probability
        when meta_model has predict_proba, otherwise the output of meta_model.predict.

    Raises:
        ValueError: if models_fitted_dict is empty.
    """
    # Scale the raw features only when a scaler is supplied.
    X_test = X_test_df.values
    if scaler is not None:
        X_test = scaler.transform(X_test)

    # Each base model contributes ONE COLUMN. The vectors must be reshaped to (n_rows, 1)
    # before stacking: np.hstack on 1-D vectors would concatenate them end-to-end into a
    # single vector of length n_models * n_rows instead of an (n_rows, n_models) matrix.
    base_columns = [
        np.asarray(get_predict_proba(model, X_test)).reshape(-1, 1)
        for model in models_fitted_dict.values()
    ]
    if not base_columns:
        raise ValueError("models_fitted_dict is empty: at least one fitted base model is required")
    meta_X_test = np.hstack(base_columns)

    # meta_model should support predict_proba
    if hasattr(meta_model, "predict_proba"):
        return meta_model.predict_proba(meta_X_test)[:, 1]
    # fallback
    return meta_model.predict(meta_X_test)

def get_train_test_features(train_df, test_df, input_features, output_feature):
    """Split two frames into plain arrays of features and labels.

    Args:
        train_df: training rows.
        test_df: test rows.
        input_features: list of feature column names.
        output_feature: label column name.

    Returns:
        tuple: (X_train, y_train, X_test, y_test), each taken from the frames via .values.
    """
    X_train = train_df[input_features].values
    y_train = train_df[output_feature].values
    X_test = test_df[input_features].values
    y_test = test_df[output_feature].values
    return X_train, y_train, X_test, y_test

def get_predict_proba(model, X_scaled):
    """Return one positive-class score per row, whatever prediction API the model offers.

    Preference order:
      1. predict_proba   -> column 1 of the returned matrix;
      2. decision_function -> squashed through a sigmoid into a probability-like value;
      3. predict         -> hard predictions cast to float (last resort, not recommended).
    """
    if hasattr(model, "predict_proba"):
        class_probs = model.predict_proba(X_scaled)
        return class_probs[:, 1]

    if hasattr(model, "decision_function"):
        margins = model.decision_function(X_scaled)
        # Logistic sigmoid: maps a raw margin to the (0, 1) range.
        return 1.0 / (1.0 + np.exp(-margins))

    hard_labels = model.predict(X_scaled)
    return hard_labels.astype(float)

def get_performance_metrics(df, y, probs, transactions_df_scorer):
    """Compute ROC AUC, average precision and (optionally) card precision@100.

    Args:
        df: frame whose index labels identify the scored rows.
        y: true labels.
        probs: predicted scores aligned with y.
        transactions_df_scorer: frame used for card precision; when None that metric is skipped.

    Returns:
        tuple: (auc, ap, cp) where cp is None if transactions_df_scorer is None.
    """
    auc = roc_auc_score(y, probs)
    ap = average_precision_score(y, probs)

    if transactions_df_scorer is None:
        return auc, ap, None

    cp = card_precision_top_k_wrapper(probs, df.index, transactions_df_scorer, k=100)
    return auc, ap, cp
    
    
def evaluate_base_models_per_fold(base_models_dict, train_df, test_df,
                                  input_features, output_feature,
                                  transactions_df_scorer=None):
    """Fit a fresh clone of every base model on one fold and score it on both splits.

    Args:
        base_models_dict: name -> estimator prototype (cloned, never fitted in place).
        train_df: training rows of the fold.
        test_df: test rows of the fold.
        input_features: list of feature column names.
        output_feature: label column name.
        transactions_df_scorer: optional frame for card precision; None skips that metric.

    Returns:
        dict: name -> {'train': {...}, 'test': {...}} where each inner dict has the keys
        'n_train' (row count of that split), 'auc', 'ap' and 'cp100'.
    """
    X_train, y_train, X_test, y_test = get_train_test_features(
        train_df, test_df, input_features, output_feature
    )

    # The scaler only ever sees the training rows of this fold.
    fold_scaler = StandardScaler()
    X_train_scaled = fold_scaler.fit_transform(X_train)
    X_test_scaled = fold_scaler.transform(X_test)

    def _score_split(estimator, split_df, X_scaled, y_true):
        # NOTE: the row count is stored under 'n_train' for BOTH splits (the test
        # split too); the key is kept as-is for existing consumers.
        split_probs = get_predict_proba(estimator, X_scaled)
        auc, ap, cp = get_performance_metrics(split_df, y_true, split_probs, transactions_df_scorer)
        return {'n_train': len(split_df), 'auc': auc, 'ap': ap, 'cp100': cp}

    fold_results = {}
    for name, prototype in base_models_dict.items():
        estimator = clone(prototype)
        estimator.fit(X_train_scaled, y_train)
        fold_results[name] = {
            'train': _score_split(estimator, train_df, X_train_scaled, y_train),
            'test': _score_split(estimator, test_df, X_test_scaled, y_test),
        }

    return fold_results


def evaluate_super_learner_per_fold(fitted_base_models, meta_model,
                                    train_df, test_df,
                                    input_features, output_feature,
                                    scaler, transactions_df_scorer=None):
    """Score an already-fitted super learner on the train and test rows of one fold.

    Nothing is fitted here: the given scaler, base models and meta model are only applied.

    Args:
        fitted_base_models: name -> fitted base model; iteration order defines the
            meta-feature column order.
        meta_model: fitted meta estimator exposing predict_proba.
        train_df: training rows of the fold.
        test_df: test rows of the fold.
        input_features: list of feature column names.
        output_feature: label column name.
        scaler: fitted scaler applied to both splits before the base models.
        transactions_df_scorer: optional frame for card precision; None skips that metric.

    Returns:
        dict: {'train': {...}, 'test': {...}} where each inner dict has the keys
        'n_train' (row count of that split), 'auc', 'ap' and 'cp100'.
    """
    X_train, y_train, X_test, y_test = get_train_test_features(
        train_df, test_df, input_features, output_feature
    )

    def _stacked_probs(X_raw):
        # Scale with the shared scaler, collect one probability column per base model,
        # then let the meta model combine the columns.
        X_scaled = scaler.transform(X_raw)
        base_columns = [
            get_predict_proba(base_model, X_scaled).reshape(-1, 1)
            for base_model in fitted_base_models.values()
        ]
        return meta_model.predict_proba(np.hstack(base_columns))[:, 1]

    def _score_split(split_df, X_raw, y_true):
        # NOTE: the row count is stored under 'n_train' for BOTH splits (the test
        # split too); the key is kept as-is for existing consumers.
        auc, ap, cp = get_performance_metrics(
            split_df, y_true, _stacked_probs(X_raw), transactions_df_scorer
        )
        return {'n_train': len(split_df), 'auc': auc, 'ap': ap, 'cp100': cp}

    return {
        'train': _score_split(train_df, X_train, y_train),
        'test': _score_split(test_df, X_test, y_test),
    }

def summarize_performance_across_folds(per_fold_results):
    """Average every metric over the folds and format the means as strings.

    Args:
        per_fold_results: list of {'fold': ..., 'results': {'train': {...}, 'test': {...}}}
            items, where each inner dict has the keys 'auc', 'ap' and 'cp100'.
            'cp100' may be None (card precision is skipped when no scorer frame is given).

    Returns:
        dict with the keys train_auc_mean, train_ap_mean, train_cp100_mean,
        test_auc_mean, test_ap_mean, test_cp100_mean (in that order). Each value is the
        mean formatted with four decimals, or 'nan' when no fold has a value for it.
    """
    def _mean_of_available(values):
        # None marks a metric that was not computed; np.mean would raise TypeError
        # on it, so such entries are ignored. No usable value at all -> NaN.
        present = [value for value in values if value is not None]
        return np.mean(present) if present else float('nan')

    summary = {}
    for split in ('train', 'test'):
        for metric in ('auc', 'ap', 'cp100'):
            per_fold_values = [item['results'][split][metric] for item in per_fold_results]
            summary[f'{split}_{metric}_mean'] = f'{_mean_of_available(per_fold_values):.4f}'
    return summary

# Full pipeline runner (ties everything together)
def run_super_learner(base_models, transactions_df, prequential_split_indices,
                              input_features, output_feature,
                              transactions_df_scorer=None, verbose=True):
    """Train the stacked "super learner" and score it, and every base model, on each fold.

    Steps, in order:
      1. Build out-of-fold meta features (base-model predictions on the test rows of every
         fold) and fit the meta model on them. The meta model is built from the
         'LogisticRegression' entry of base_models (see fit_meta_model).
      2. Fit one copy of every base model, plus a StandardScaler, on the training rows of
         prequential_split_indices[0] only.
      3. For every fold, re-fit and score clones of the base models on that fold, and score
         the super learner (step-2 base models + step-1 meta model) on the same fold.

    Args:
        base_models: dict name -> estimator prototype; must contain 'LogisticRegression'.
        transactions_df: full transactions frame.
        prequential_split_indices: sequence of (train_ix, test_ix) positional indices.
        input_features: list of feature column names.
        output_feature: label column name.
        transactions_df_scorer: optional frame for the card-precision metric.
        verbose: accepted but not used anywhere in this function.

    Returns:
        dict with the keys 'meta_model', 'fitted_base_models', 'global_scaler',
        'base_model_performance' (name -> list of {'fold', 'results'}) and
        'super_learner_performance' (list of {'fold', 'results'}).
    """

    # -------------------- OOF meta features --------------------
    # meta_index and model_names are returned by the helper but not used below.
    meta_X, meta_y, meta_index, model_names = get_out_of_fold_predictions_no_sampling(
        transactions_df, prequential_split_indices, base_models,
        input_features, output_feature, transactions_df_scorer
    )
    meta_model = fit_meta_model(base_models, meta_X, meta_y)

    # -------------------- fit base models on first window --------------------
    # Despite the "_full" names, only the training rows of the fold at position 0 are used.
    # NOTE(review): whether position 0 is the earliest or the latest window depends on
    # utils_training.prequentialSplit — confirm.
    first_train_ix, _ = prequential_split_indices[0]
    train_df_full = transactions_df.iloc[first_train_ix]
    X_train_full = train_df_full[input_features]
    y_train_full = train_df_full[output_feature]

    fitted_base_models, global_scaler = fit_base_models_full(
        X_train_full, y_train_full, base_models
    )

    # -------------------- Evaluate per fold --------------------
    # NOTE(review): the meta model was fitted on predictions for the test rows of every fold
    # (step 1) and is scored below on those same test rows, so the super learner's "test"
    # metrics are not out-of-sample for the meta model. The base models used by the super
    # learner were fitted on fold 0 only, whereas the stand-alone base models are re-fitted
    # on each fold, so the two sets of numbers are not directly comparable.
    super_learner_performance = []
    base_model_performance = {name: [] for name in base_models.keys()}

    for fold_i, (train_ix, test_ix) in enumerate(prequential_split_indices):
        train_df = transactions_df.iloc[train_ix]
        test_df  = transactions_df.iloc[test_ix]

        # ---- base models per fold ----
        base_fold_result = evaluate_base_models_per_fold(
            base_models, train_df, test_df,
            input_features, output_feature,
            transactions_df_scorer
        )
        for model_name, metrics in base_fold_result.items():
            base_model_performance[model_name].append({'fold': fold_i, 'results': metrics})

        # ---- super learner per fold ----
        super_learner_fold_result = evaluate_super_learner_per_fold(
            fitted_base_models, meta_model,
            train_df, test_df,
            input_features, output_feature,
            global_scaler, transactions_df_scorer
        )
        super_learner_performance.append({'fold': fold_i, 'results': super_learner_fold_result })

    return {
        'meta_model': meta_model,
        'fitted_base_models': fitted_base_models,
        'global_scaler': global_scaler,
        'base_model_performance': base_model_performance,
        'super_learner_performance': super_learner_performance
    }


In [None]:
# Run the whole pipeline with the first configuration of base learners (get_base_models).
# The result is stored as 'preformance_results' (sic); the summary cell below reads that name.
# verbose=True is accepted by run_super_learner but has no effect there.
base_models = get_base_models()

preformance_results = run_super_learner(base_models, transactions_df, prequential_split_indices,
                            input_features, output_feature,
                            transactions_df_scorer=transactions_df_scorer,
                            verbose=True)

In [7]:
# One summary row per model: the super learner first, then every base model, each holding the
# fold-averaged train/test AUC, average precision and card precision@100 (formatted strings).
res = {}
res["SuperLearner"] = summarize_performance_across_folds(preformance_results['super_learner_performance'])
for name, performance_result in preformance_results['base_model_performance'].items():
    res[name] = summarize_performance_across_folds(performance_result)
    
# Rows = models, columns = metrics.
pd.DataFrame.from_dict(res, orient='index')

Unnamed: 0,train_auc_mean,train_ap_mean,train_cp100_mean,test_auc_mean,test_ap_mean,test_cp100_mean
SuperLearner,0.9256,0.8018,0.4521,0.9078,0.7593,0.3229
XGBClassifier,0.9343,0.7378,0.4264,0.8719,0.6318,0.2921
LGBMClassifier,0.9469,0.7774,0.4311,0.8794,0.6488,0.2896
CatBoostClassifier,0.9945,0.795,0.4636,0.8816,0.6065,0.2921
HistGradientBoostingClassifier,0.9724,0.766,0.4354,0.8541,0.6004,0.2754
BalancedRandomForestClassifier,1.0,1.0,0.5464,0.8811,0.6549,0.2914
BalancedBaggingClassifier,0.9994,0.9489,0.535,0.8804,0.6913,0.2929
LogisticRegression,0.8996,0.5941,0.3982,0.8704,0.5419,0.2825


In [7]:
# Same pipeline, second ("tuning") configuration of base learners.
# NOTE: this rebinds base_models, so the dict from the earlier run is no longer reachable by name.
# verbose=True is accepted by run_super_learner but has no effect there.
base_models = get_base_models_tuning()

preformance_results_tuning = run_super_learner(base_models, transactions_df, prequential_split_indices,
                            input_features, output_feature,
                            transactions_df_scorer=transactions_df_scorer,
                            verbose=True)

In [8]:
# Summary table for the "tuning" run; same layout as the previous summary (rows = models).
# NOTE: 'res' is rebuilt from scratch here and replaces the dict of the earlier summary cell.
res = {}
res["SuperLearner"] = summarize_performance_across_folds(preformance_results_tuning['super_learner_performance'])
for name, performance_result in preformance_results_tuning['base_model_performance'].items():
    res[name] = summarize_performance_across_folds(performance_result)
    
pd.DataFrame.from_dict(res, orient='index')

Unnamed: 0,train_auc_mean,train_ap_mean,train_cp100_mean,test_auc_mean,test_ap_mean,test_cp100_mean
SuperLearner,0.9244,0.7989,0.4475,0.9082,0.7538,0.3214
XGBClassifier,0.9934,0.8701,0.4746,0.8798,0.6445,0.2864
LGBMClassifier,0.966,0.81,0.4379,0.8856,0.6535,0.2893
CatBoostClassifier,0.971,0.8322,0.4454,0.886,0.6721,0.2943
HistGradientBoostingClassifier,0.9724,0.766,0.4354,0.8541,0.6004,0.2754
BalancedRandomForestClassifier,0.9991,0.946,0.5189,0.8844,0.6673,0.2904
BalancedBaggingClassifier,0.9994,0.9489,0.535,0.8804,0.6913,0.2929
LogisticRegression,0.8996,0.594,0.3982,0.8704,0.5418,0.2825


## Export Model

In [9]:
# NOTE: joblib is imported here rather than in the import cell at the top; os is already imported there.
import joblib
import os

FOLDER_PATH = "./models"
FILE_PATH = FOLDER_PATH+"/super_learner_model.pkl"

# Create the output folder if it does not exist yet.
os.makedirs(FOLDER_PATH, exist_ok=True)

# Persist everything needed to score new data with the super learner from the "tuning" run:
#   - 'base_models': the base models fitted by run_super_learner on the training rows of
#     prequential_split_indices[0];
#   - 'meta_model': the meta model fitted on the out-of-fold predictions;
#   - 'scaler': the StandardScaler fitted together with those base models;
#   - 'input_features': the feature column names, in the order the models were trained on.
# The file is a pickle-based joblib dump: only load it from a trusted location.
joblib.dump({
    'base_models': preformance_results_tuning['fitted_base_models'],
    'meta_model': preformance_results_tuning['meta_model'],
    'scaler': preformance_results_tuning['global_scaler'],
    'input_features': input_features,
}, FILE_PATH)

['./models/super_learner_model.pkl']