```
Copyright 2024 IBM Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
```

## Training a LightGBM model for AML using graph features

In [1]:
from snapml import GraphFeaturePreprocessor

import numpy as np
import pandas as pd
import time

import lightgbm as lgb
from sklearn.metrics import f1_score

In [2]:
# Directory (relative to the working directory) that holds the formatted AML transaction data.
formatted_data_path = "./aml-demo-data/out_dir_small_li/"

In [3]:
# Count the transactions so that the last 20% of the rows can be held out as the test set.
# Only the first column is parsed: the row count is all that is needed here, and
# reading every column of the full file just to call .shape is wasted time and memory.
total_size = pd.read_csv(
    f"{formatted_data_path}formatted_transactions.csv", usecols=[0]
).shape[0]
n_test = round(total_size * 0.2)
print(total_size, n_test)

6924049 1384810


## Input parameters

In [4]:
# Path to the formatted transaction file that is loaded below and then passed
# to the Graph Feature Preprocessor.
transactions_path = f"{formatted_data_path}formatted_transactions.csv"

# Indices of the columns dropped before training:
# Transaction ID, Source Account ID, Target Account ID, Source Bank ID, Target Bank ID
remove_cols = [0, 1, 2, 10, 11]

## Preparing data for training

In [5]:
print("Loading test data")
# The header row is skipped; every remaining field is parsed as float64.
X_all = np.loadtxt(
    transactions_path,
    delimiter=",",
    skiprows=1,
    comments="#",
    dtype=np.float64,
)

# The last column holds the labels; everything before it is the feature matrix.
X_all, Y_all = X_all[:, :-1], X_all[:, -1]

print("Data loaded succesfully.")
print("Data shape is ", X_all.shape)

Loading test data
Data loaded succesfully.
Data shape is  (6924049, 18)


In [6]:
print("Creating a graph feature preprocessor ")
gp = GraphFeaturePreprocessor()

print("Setting the parameters of the graph feature preprocessor ")
# Length of the time windows below, in days (converted to seconds via days * 24 * 3600).
tw_days = 1
gf_params = {
    # Number of software threads to be used
    "num_threads": 12,

    # Enable account statistics
    # NOTE(review): [3,6] are presumably indices of the input columns the statistics
    # are computed over -- confirm against the snapml documentation.
    "vertex_stats": True,
    "vertex_stats_cols": [3,6],

    # Enable graph-pattern-based features
    # NOTE(review): "lc-cycle_len" presumably bounds the cycle length -- confirm.
    "fan": True,
    "degree": True,
    "scatter-gather": True,
    "temp-cycle": True,
    "lc-cycle": True,
    "lc-cycle_len": 10,

    # Set time window parameters: one day (tw_days) for every entry except
    # scatter-gather, which uses 6 * 3600 (six hours).
    "time_window": tw_days*24*3600,
    "vertex_stats_tw": tw_days*24*3600,
    "scatter-gather_tw": 6*3600,
    "temp-cycle_tw": tw_days*24*3600,
    "lc-cycle_tw": tw_days*24*3600,
}
gp.set_params(gf_params)

Creating a graph feature preprocessor 
Setting the parameters of the graph feature preprocessor 


In [7]:
# Number of top-level parameters, then the total when each list-valued
# parameter contributes one count per element.
print("params", len(gp.get_params()))
print(
    "sub_params",
    sum(len(value) if isinstance(value, list) else 1 for value in gp.get_params().values()),
)

params 23
sub_params 151


### Function for determining the number of graph-based features produced by Graph Feature Preprocessor

In [8]:
def get_num_gf_feats(gf):
    """Count the graph-based feature columns implied by a preprocessor's configuration.

    Args:
        gf: Object whose ``get_params()`` returns the preprocessor configuration
            as a dict (e.g. a ``GraphFeaturePreprocessor``).

    Returns:
        int: Number of features contributed by the enabled graph patterns plus,
        when vertex statistics are enabled, the enabled vertex statistics.
    """
    params = gf.get_params()
    feat_num = 0

    # Graph patterns: one feature per bin, except fan and degree which count
    # two per bin (presumably one per direction -- confirm against snapml docs).
    for pattern in ("fan", "degree", "scatter-gather", "temp-cycle", "lc-cycle"):
        if params.get(pattern):
            bins = len(params[pattern + "_bins"])
            feat_num += 2 * bins if pattern in ("fan", "degree") else bins

    # Vertex statistics contribute nothing when they are explicitly disabled.
    # A missing "vertex_stats" key is treated as enabled, which keeps the
    # result unchanged for configurations that do not carry the flag.
    if not params.get("vertex_stats", True):
        return feat_num

    enabled_stats = params["vertex_stats_feats"]
    for k in range(11):
        if k not in enabled_stats:
            continue
        if k < 3:
            # fan, deg, and ratio features: 4 features each
            feat_num += 4
        else:
            # avg, sum, min, max, median, var, skew, and kurtosis features:
            # 4 features per selected column
            feat_num += 4 * len(params["vertex_stats_cols"])

    return feat_num

### Generate graph-based features using Graph Feature Preprocessor

In [9]:
%%time

# Run the configured Graph Feature Preprocessor over all transactions and keep
# the result as float32.
# NOTE(review): X_all was loaded as float64 and is a column-sliced view of the
# loaded array, so the inner astype only produces a fresh copy -- presumably to
# hand transform() a contiguous array; confirm before removing it.
X_all_gf = gp.transform(X_all.astype(np.float64)).astype(np.float32, copy=False)

CPU times: user 3h 24min 47s, sys: 37.6 s, total: 3h 25min 24s
Wall time: 21min 26s


### Split the data into train and test sets

In [10]:
# Drop the identifier columns listed in remove_cols.
X_all_gf = np.delete(X_all_gf, remove_cols, axis=1)

# The final n_test rows form the test set; all rows before them are for training.
X_train_gf, X_test_gf = X_all_gf[:-n_test], X_all_gf[-n_test:]
del X_all_gf

# Labels, split at the same row boundary as the features
y_train, y_test = Y_all[:-n_test], Y_all[-n_test:]

In [11]:
# Display the (rows, columns) shape of the train and test feature matrices.
X_train_gf.shape, X_test_gf.shape

((5539239, 248), (1384810, 248))

## LGBM Model Training

### Function for training a LightGBM model

In [12]:
def lgbm_train_evaluate(X_train, y_train, X_test, y_test, params):
    """Train a LightGBM binary classifier and score it on the test set.

    Args:
        X_train (np.ndarray): Training feature matrix
        y_train (np.ndarray): Training labels
        X_test (np.ndarray): Test feature matrix
        y_test (np.ndarray): Test labels
        params (dict): Model configuration. ``num_round`` is used as the
            maximum number of boosting rounds (100 when absent); ``alpha`` and
            ``gamma`` are dropped when present; ``objective`` is always set to
            ``"binary"``. The dict passed in is not modified.

    Returns:
        score (float): F1 score of the thresholded test-set predictions

    Note:
        The test set is also used as the early-stopping validation set, so the
        returned score is measured on data that influenced when training stopped.
    """
    lgb_params = params.copy()
    num_round = lgb_params.pop("num_round", 100)

    # These keys are not passed on to LightGBM; tolerate configurations that
    # do not contain them instead of failing with a KeyError.
    for unused_key in ("alpha", "gamma"):
        lgb_params.pop(unused_key, None)

    lgb_params["objective"] = "binary"

    early_stopping_rounds = 20
    dtrain = lgb.Dataset(X_train, y_train, weight=None)
    dtest = lgb.Dataset(X_test, y_test, weight=None)

    bst = lgb.train(
        lgb_params,
        dtrain,
        num_boost_round=num_round,
        valid_sets=[dtest],
        callbacks=[
            lgb.early_stopping(stopping_rounds=early_stopping_rounds),
            lgb.log_evaluation(50),
        ],
    )

    # Round the predicted scores to hard 0/1 labels
    z_test = bst.predict(X_test)
    preds = np.round(z_test)  # 1: illicit, 0: licit

    return f1_score(y_test, preds)

### Training the model with graph-based features

In [13]:
# Set the training parameters. These parameters can be found using a Hyperparameter Tuning method such as Successive Halving.
params = {
    "boosting": "gbdt",
    "metric": "auc",
    "num_round": 185,
    "num_leaves": 21,
    "max_bin": 256,
    "learning_rate": 0.08995441299910924,
    "lambda_l1": 0.4902016501409548,
    "lambda_l2": 81.93169246795033,
    "scale_pos_weight": 4.495921090533586,
    "alpha": 0.8028096762102561,
    "gamma": 2.1902844884226473,
    "seed": 5935727,
    "max_depth": 10
}

print("Training using graph-based features.")
print("=" * 50)
res_score = lgbm_train_evaluate(X_train_gf, y_train, X_test_gf, y_test, params)
print("=" * 50)
print("Test minority-class F1 score: ", res_score)

Training using graph-based features.
[LightGBM] [Info] Number of positive: 2640, number of negative: 5536599
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.531293 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14935
[LightGBM] [Info] Number of data points in the train set: 5539239, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000477 -> initscore=-7.648357
[LightGBM] [Info] Start training from score -7.648357
Training until validation scores don't improve for 20 rounds
[50]	valid_0's auc: 0.975395
[100]	valid_0's auc: 0.977004
[150]	valid_0's auc: 0.977188
Early stopping, best iteration is:
[132]	valid_0's auc: 0.977331
Test minority-class F1 score:  0.2573940847322142


In [14]:
# F1 score expressed as a percentage, rounded to two decimals.
print(round(res_score * 100, 2))

25.74
