In [1]:
import pandas as pd
import numpy as np

import os
import sys

# Project root = two directory levels above the notebook's working directory.
# os.path is used instead of splitting on '/' so this also works on Windows
# (where getcwd() uses backslashes) and never collapses to an empty string.
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
# Make the project's `src` package importable; the guard keeps re-running this
# cell from appending duplicate entries to sys.path.
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

from src.loaders.featuring import CumTransactionCustomerinStep, AggAmountTransactionofCustomer, CountFrequency
from src.helpers.grouptimeseriessplit import GroupTimeSeriesSplit

import joblib 

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import PowerTransformer

from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split, KFold, GroupKFold, StratifiedKFold, TimeSeriesSplit

from sklearn.metrics import roc_auc_score, average_precision_score, make_scorer, f1_score, classification_report, confusion_matrix, roc_curve, mean_squared_error, recall_score

import category_encoders as ce

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier



In [2]:
# Location of the raw training transactions, relative to the notebook's
# working directory.
TRAIN_CSV = '../../resources/transactions_train.csv'

# Load the full training set into memory.
df = pd.read_csv(TRAIN_CSV)

In [6]:
# Preview the first rows to confirm the CSV parsed as expected.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0


In [7]:
# Normalise dtypes: numeric columns go through pd.to_numeric, text columns
# become pandas categoricals.
numeric_cols = df.select_dtypes(['int64','int16','float32','float64','int8']).columns
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

# Object/category columns are (re)cast to the 'category' dtype column by column.
categorical_cols = df.select_dtypes(['object','category']).columns
df[categorical_cols] = df[categorical_cols].apply(lambda col: col.astype('category'))

In [8]:
# The last column of the frame is the target; every column before it is a feature.
feature_columns = df.columns[:-1]
target_column = df.columns[-1]

X = df[feature_columns]
y = df[target_column]

In [9]:
# Dtypes treated as numeric when partitioning the feature columns.
numeric_dtypes = ['int64','int16','float32','float64','int8']

# Feature columns whose dtype is one of the numeric dtypes above.
num_columns = X.select_dtypes(include=numeric_dtypes).columns

# Every remaining feature column is treated as categorical.
cat_columns = X.select_dtypes(exclude=numeric_dtypes).columns

In [10]:
# Full feature list: numeric columns first, categorical columns after.
all_columns = num_columns.append(cat_columns)

# Print the three column indexes (categorical, numeric, combined) for a visual check.
for column_index in (cat_columns, num_columns, all_columns):
    print(column_index)

Index(['type', 'nameOrig', 'nameDest'], dtype='object')
Index(['step', 'amount', 'oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest'],
      dtype='object')
Index(['step', 'amount', 'oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'type', 'nameOrig', 'nameDest'],
      dtype='object')


### Feature Engineering

In [11]:
# Feature-engineering building blocks. Each one is a small sklearn Pipeline so it
# can be attached to a specific set of columns in a ColumnTransformer.
# The custom transformers come from src.loaders.featuring.

# Custom step configured with the `step` column and `nameOrig` as customer id.
cum_transaction_customer_in_step_pipeline = Pipeline([
    (
        "Cumulate Transaction Customer in Step",
        CumTransactionCustomerinStep(step='step', customer_ids='nameOrig'),
    ),
])

# Custom amount aggregation keyed on `nameOrig`, followed by a default PowerTransformer.
agg_amount_transaction_of_customer_pipeline = Pipeline([
    (
        "Aggregate Amount Transaction of Customer",
        AggAmountTransactionofCustomer(amount='amount', customer_ids='nameOrig'),
    ),
    ("Scaler", PowerTransformer()),
])

# Custom frequency step applied to the origin and destination account ids.
count_frequence_pipeline = Pipeline([
    (
        "Count frequence of Customer",
        CountFrequency(variables=['nameOrig', 'nameDest']),
    ),
])

# Default PowerTransformer for the raw numeric columns.
scale_numerical_pipeline = Pipeline([
    ("Scale Numerical variables", PowerTransformer()),
])

# CatBoost encoder (category_encoders) for categorical inputs.
category_to_numerical_pipeline = Pipeline([
    (
        "Categorical variables to Numerical variables",
        ce.cat_boost.CatBoostEncoder(),
    ),
])

In [12]:
# Wire each feature pipeline to the input columns it consumes. Every entry is
# (name, transformer, columns); no `remainder` is given, so ColumnTransformer's
# default handling applies to columns that are not listed.
data_pipeline = ColumnTransformer(
    transformers=[
        (
            'cum_transaction',
            cum_transaction_customer_in_step_pipeline,
            ['step','nameOrig'],
        ),
        (
            'mean_amount',
            agg_amount_transaction_of_customer_pipeline,
            ['amount','nameOrig'],
        ),
        (
            'freq_customer',
            count_frequence_pipeline,
            ['nameOrig','nameDest'],
        ),
        (
            'category_encoder',
            category_to_numerical_pipeline,
            ['type'],
        ),
        (
            'numeric_scaler',
            scale_numerical_pipeline,
            ['amount', 'oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest'],
        ),
    ]
)

Because there are only a few features, we skip feature selection.

In [13]:
# Display the assembled ColumnTransformer to check its steps and column wiring.
data_pipeline

ColumnTransformer(transformers=[('cum_transaction',
                                 Pipeline(steps=[('Cumulate Transaction '
                                                  'Customer in Step',
                                                  CumTransactionCustomerinStep(customer_ids='nameOrig',
                                                                               step='step'))]),
                                 ['step', 'nameOrig']),
                                ('mean_amount',
                                 Pipeline(steps=[('Aggregate Amount '
                                                  'Transaction of Customer',
                                                  AggAmountTransactionofCustomer(amount='amount',
                                                                                 customer_ids='nameOrig')),
                                                 ('Scaler...
                                                  CountFrequency(variables=['nameOrig'

In [14]:
# Fit the preprocessing on the full dataset and keep the transformed matrix for
# inspection. `y` is forwarded to the transformers' fit.
# NOTE(review): presumably needed by the CatBoost target encoder — confirm.
# NOTE(review): this fits the same `data_pipeline` object that the model
# pipelines below embed as their 'loader' step; they refit it when they are fitted.
features = data_pipeline.fit_transform(X, y)

In [15]:
# (number of rows, number of engineered feature columns)
features.shape

(6351193, 10)

In [16]:
# Show the transformed matrix (the array repr is abbreviated for large arrays).
features

array([[ 1.        , -0.95496699,  1.        , ...,  1.11360603,
        -1.13436084, -1.22433564],
       [ 1.        , -1.66980028,  1.        , ...,  0.88964748,
        -1.13436084, -1.22433564],
       [ 1.        , -2.47038514,  1.        , ..., -0.86366377,
        -1.13436084, -1.22433564],
       ...,
       [ 1.        ,  2.75202657,  1.        , ..., -0.86366377,
        -1.13436084, -1.22433564],
       [ 1.        ,  2.75202657,  1.        , ..., -0.86366377,
         0.69996378,  1.08094922],
       [ 1.        , -2.09823975,  1.        , ...,  0.81109865,
         0.52377021,  0.37492219]])

Because the data is both grouped and time-ordered, we need a splitter that combines group-based and time-series splitting.

In [17]:
# Sanity-check the splitter: print which `step` groups land in the train and
# test side of the first two folds, then stop.
demo_splitter = GroupTimeSeriesSplit()
separator = '-' * 100
for fold_no, (fit_rows, holdout_rows) in enumerate(demo_splitter.split(X, groups=X['step'])):
    print(separator)
    print('Fold: ', fold_no)
    print(X.loc[fit_rows, 'step'].unique())
    print(X.loc[holdout_rows, 'step'].unique())
    print(separator)
    # Only folds 0 and 1 are needed for the visual check.
    if fold_no >= 1:
        break

----------------------------------------------------------------------------------------------------
Fold:  0
[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119]
[120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
 210 211 212 213 214 215 216 217 218 219 

### Choose the right model

In [18]:
# Hyper-parameters for the LightGBM classifier (scikit-learn API).
params_lightgbm = {
    'metric': 'auc',
    'device_type' : 'gpu',
    'boosting_type': 'gbdt',
    # L1 / L2 regularisation. These were previously given twice under two names
    # (lambda_l1=1e-08 / lambda_l2=1e-05 and their aliases reg_alpha=0.2 /
    # reg_lambda=0.2). LightGBM keeps the canonical lambda_* value and warns that
    # the alias is ignored, so the effective values are kept here under the
    # scikit-learn names only.
    'reg_alpha': 1.0e-08,
    'reg_lambda': 1.0e-05,
    'num_leaves': 220,
    'n_estimators': 10000,
    'colsample_bytree': 1.0,
    # NOTE(review): LightGBM only applies row subsampling when subsample_freq
    # (bagging_freq) is non-zero; it is not set here, so confirm whether
    # subsampling is actually intended.
    'subsample': 0.8,
    'learning_rate': 0.02,
    'max_depth': 100,
    'min_child_samples': 128
}
# Build the LightGBM classifier from the parameter dict, with a fixed seed.
model_lightgbm = LGBMClassifier(random_state=123, **params_lightgbm)

In [19]:
# End-to-end LightGBM pipeline: preprocessing ('loader') then the classifier.
# NOTE(review): `data_pipeline` is passed by reference, so any other pipeline
# built from the same object shares (and refits) the same preprocessing instance.
pipeline_lgbm = Pipeline([
    ('loader', data_pipeline),
    ('estimator', model_lightgbm),
])

In [20]:
# Display the LightGBM pipeline to verify its steps.
pipeline_lgbm

Pipeline(steps=[('loader',
                 ColumnTransformer(transformers=[('cum_transaction',
                                                  Pipeline(steps=[('Cumulate '
                                                                   'Transaction '
                                                                   'Customer '
                                                                   'in Step',
                                                                   CumTransactionCustomerinStep(customer_ids='nameOrig',
                                                                                                step='step'))]),
                                                  ['step', 'nameOrig']),
                                                 ('mean_amount',
                                                  Pipeline(steps=[('Aggregate '
                                                                   'Amount '
                                                         

In [21]:
# Hyper-parameters for the XGBoost classifier (GPU histogram tree method on
# device 0, AUC as evaluation metric, -1 treated as the missing-value marker).
params_xgboost = dict(
    eval_metric='auc',
    max_depth=6,
    colsample_bytree=0.4,
    subsample=0.8,
    n_estimators=2000,
    learning_rate=0.02,
    missing=-1,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    gpu_id=0,
)
# Build the XGBoost classifier from the parameter dict defined above.
model_xgboost = XGBClassifier(**params_xgboost)

In [22]:
# End-to-end XGBoost pipeline: preprocessing ('loader') then the classifier.
# NOTE(review): `data_pipeline` is passed by reference, so any other pipeline
# built from the same object shares (and refits) the same preprocessing instance.
pipeline_xgboost = Pipeline([
    ('loader', data_pipeline),
    ('estimator', model_xgboost),
])

In [23]:
# Display the XGBoost pipeline to verify its steps.
pipeline_xgboost

Pipeline(steps=[('loader',
                 ColumnTransformer(transformers=[('cum_transaction',
                                                  Pipeline(steps=[('Cumulate '
                                                                   'Transaction '
                                                                   'Customer '
                                                                   'in Step',
                                                                   CumTransactionCustomerinStep(customer_ids='nameOrig',
                                                                                                step='step'))]),
                                                  ['step', 'nameOrig']),
                                                 ('mean_amount',
                                                  Pipeline(steps=[('Aggregate '
                                                                   'Amount '
                                                         

In [24]:
# Bookkeeping for the cross-validation run in the next cell.
results = []

# Cross-validation settings. FOLDS is passed as n_splits to GroupTimeSeriesSplit.
# NOTE(review): SEED is not referenced by the cells visible here — confirm it is
# still needed.
SEED = 213
FOLDS = 10

# Per-fold metric histories (one entry appended per fold).
roc_auc = []
F1 = []
average_precision = []

# Out-of-fold prediction buffers, one slot per row of X. np.empty leaves the
# contents uninitialised, so they hold arbitrary values until written.
n_rows = X.shape[0]
oof = np.empty((n_rows,))
oof_bin = np.empty((n_rows,))

predictions = []
mean_auc = 0
best_iteration = []



In [25]:
# Cross-validation of the XGBoost pipeline over GroupTimeSeriesSplit folds.
#
# For every (train, test) split over the `step` groups, the whole pipeline
# (preprocessing + classifier) is refit on the training rows and scored on the
# held-out rows. Per-fold AUC, F1 and average precision are appended to the
# `roc_auc`, `F1` and `average_precision` lists created in the previous cell.
for fold, (train_idx, test_idx) in enumerate(GroupTimeSeriesSplit(n_splits=FOLDS).split(X, y, groups=X['step'])):
    # The split indices are consumed positionally (.iloc), so slice once and
    # report from the slices instead of mixing label-based .loc with .iloc.
    X_train, y_train = X.iloc[list(train_idx), :], y.iloc[list(train_idx)]
    X_test, y_test = X.iloc[list(test_idx), :], y.iloc[list(test_idx)]

    print('-' * 100)
    print('Fold: ', fold)
    print('Groups for training:')
    print(X_train['step'].unique())
    print(X_train.shape)
    print(y_train.shape)
    print('Group for test:')
    print(X_test['step'].unique())
    print('-' * 80)

    # Refit preprocessing and classifier on this fold's training rows only.
    pipeline_xgboost.fit(X_train, y_train)

    # Scores for the second class column feed the threshold-free metrics.
    preds = pipeline_xgboost.predict_proba(X_test)[:,1]
    auc_score = roc_auc_score(y_true=y_test, y_score=preds)
    average_precesion = average_precision_score(y_true=y_test, y_score=preds)

    # Hard labels feed the threshold-dependent metrics.
    y_predicted = pipeline_xgboost.predict(X_test)
    recall = recall_score(y_test, y_predicted)
    f1 = f1_score(y_test, y_predicted)

    print('Classification report:\n',classification_report(y_test, y_predicted))
    print('Confusion_matrix:\n',confusion_matrix(y_test, y_predicted))
    print(f"Fold {fold} | AUC: {auc_score}")
    print(f"Fold {fold} | Avergae_precesion: {average_precesion}")
    print(f"Fold {fold} | recall: {recall}")
    print(f"Fold {fold} | F1: {f1}")

    roc_auc.append(auc_score)
    F1.append(f1)
    average_precision.append(average_precesion)

# Summary across folds. Means are taken over the per-fold lists: `F1` (the list),
# not `f1` (the last fold's scalar), and the AUC mean no longer assumes that
# exactly FOLDS folds were produced.
print('-' * 80)
mean_auc = np.mean(roc_auc)
print(f"\nOverall mean F1 score : {np.mean(F1)}")
print(f"\nOverall mean AUC score : {mean_auc}")
print(f"\nOverall mean average precision score : {np.mean(average_precision)}")

----------------------------------------------------------------------------------------------------
Fold:  0
Groups for training:
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69]
(1030523, 9)
(1030523,)
Group for test:
[ 70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87
  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105
 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
 124 125 126 127 128 129 130 131 132]
--------------------------------------------------------------------------------
Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    166170
           1       0.85      0.85      0.85       668

    accuracy                           1.00    166838
   macro avg       0.93      0.93     

In [26]:
# Persist the fitted XGBoost pipeline (preprocessing + classifier) to disk.
# NOTE(review): the object is saved in whatever state its most recent fit left
# it — after the CV loop that is the last fold's training subset, not the full
# dataset. Confirm that this is the intended artifact.
MODEL_ARTIFACT_PATH = 'pipeline_model_xgboost.joblib'
joblib.dump(pipeline_xgboost, MODEL_ARTIFACT_PATH)

CPU times: user 81.4 ms, sys: 3.96 ms, total: 85.4 ms
Wall time: 53.7 ms


['pipeline_model_xgboost.joblib']

In [50]:
# ##### %%time 
# # evaluate each strategy on the dataset
# results = list()
# # Cross-validation settings; the splits below come from GroupTimeSeriesSplit(n_splits=FOLDS)
# SEED = 213
# FOLDS = 10

# # CV interations
# # Create arrays for the features and the response variable
# roc_auc = list()
# F1 = list()
# average_precision = list()
# oof     = np.empty((X.shape[0],))
# oof_bin = np.empty((X.shape[0],))
# predictions=[]
# mean_auc = 0
# best_iteration = list()

In [None]:
# for fold, (train_idx, test_idx) in enumerate(GroupTimeSeriesSplit(n_splits=FOLDS).split(X, y, groups=X['step'])):
#     print('-' * 100)
#     print('Fold: ', fold)
#     print('Groups for training:')
#     print(X.loc[train_idx, 'step'].unique())
#     print(X.loc[train_idx, :].shape)
#     print(y.loc[train_idx].shape)
#     print('Group for test:')
#     print(X.loc[test_idx, 'step'].unique())
#     print('-' * 100)  
#     X_train, y_train = X.iloc[list(train_idx), :], y.iloc[list(train_idx)]
#     X_test, y_test = X.iloc[list(test_idx), :], y.iloc[list(test_idx)]
#     pipeline_lgbm.fit( X_train, y_train,
#                              #estimator__eval_metric="auc",
#                              #estimator__eval_set = [(X_test, y_test)],
#                              #estimator__verbose=1000,
#                              #estimator__early_stopping_rounds =150
#                             # lgbm__sample_weight=X_train_weight
#                            )
#     #pipeline_model_lgbm.fit(X_train,y_train)
#     preds = pipeline_lgbm.predict_proba(X_test)[:,1]
#     #oof[test_idx] = preds
#     auc_score = roc_auc_score(y_true=y_test, y_score= preds)
#     average_precesion = average_precision_score(y_true= y_test, y_score= preds)
#     y_predicted = pipeline_lgbm.predict(X_test)
#     recall = recall_score(y_test, y_predicted)
#     f1= f1_score(y_test, y_predicted)
#     print('Classification report:\n',classification_report(y_test,y_predicted))
#     print('Confusion_matrix:\n',confusion_matrix(y_test,y_predicted))
#     print(f"Fold {fold} | AUC: {auc_score}")
#     print(f"Fold {fold} | Avergae_precesion: {average_precesion}")
#     print(f"Fold {fold} | recall: {recall}")
#     print(f"Fold {fold} | F1: {f1}")
#     roc_auc.append(auc_score)
#     F1.append(f1)
#     average_precision.append(average_precesion)
#     mean_auc += auc_score / FOLDS
#     #predictions.append(pipeline_model_HGBC.predict_proba(x_test_final)[:,1]) 
# #Mean of the predictions
# print('-' * 80)  
# print(f"\nOverall mean F1 score : {np.mean(F1)}")
# print(f"\nOverall mean AUC score : {mean_auc}")
# print(f"\nOverall mean average precision score : {np.mean(average_precision)}")

----------------------------------------------------------------------------------------------------
Fold:  0
Groups for training:
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69]
(1030523, 9)
(1030523,)
Group for test:
[ 70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87
  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105
 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
 124 125 126 127 128 129 130 131 132]
----------------------------------------------------------------------------------------------------
Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    166170
           1       0.61      0.93      0.74       668

    accuracy                           1.00    166838
   macro avg      