In [1]:
import pandas as pd

In [2]:
from src import util

In [3]:
from src.models import PARAMS_SKOPT, BayesSearchCV2



In [4]:
from tqdm import tqdm
from skopt import BayesSearchCV
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_predict

In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
util.set_context("ieee_fraud_detection")

Files in data directory:
______

ieee_fraud_detection/
    comb.pkl
    raw/
        train_transaction.csv.zip
        train_identity.csv.zip
        test_transaction.csv.zip
        test_identity.csv.zip
        sample_submission.csv.zip
        train_transaction.csv
        train_identity.csv
        test_transaction.csv
        test_identity.csv
        sample_submission.csv
______



In [7]:
df = pd.read_pickle(util.get_dpath("comb.pkl"))
df.shape

(1097231, 435)

In [8]:
useful_features = ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1',
                   'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13',
                   'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M2', 'M3',
                   'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V17',
                   'V19', 'V20', 'V29', 'V30', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V40', 'V44', 'V45', 'V46', 'V47', 'V48',
                   'V49', 'V51', 'V52', 'V53', 'V54', 'V56', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V69', 'V70', 'V71',
                   'V72', 'V73', 'V74', 'V75', 'V76', 'V78', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V87', 'V90', 'V91', 'V92',
                   'V93', 'V94', 'V95', 'V96', 'V97', 'V99', 'V100', 'V126', 'V127', 'V128', 'V130', 'V131', 'V138', 'V139', 'V140',
                   'V143', 'V145', 'V146', 'V147', 'V149', 'V150', 'V151', 'V152', 'V154', 'V156', 'V158', 'V159', 'V160', 'V161',
                   'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V169', 'V170', 'V171', 'V172', 'V173', 'V175', 'V176', 'V177',
                   'V178', 'V180', 'V182', 'V184', 'V187', 'V188', 'V189', 'V195', 'V197', 'V200', 'V201', 'V202', 'V203', 'V204',
                   'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V219', 'V220',
                   'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V231', 'V233', 'V234', 'V238', 'V239',
                   'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V249', 'V251', 'V253', 'V256', 'V257', 'V258', 'V259', 'V261',
                   'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276',
                   'V277', 'V278', 'V279', 'V280', 'V282', 'V283', 'V285', 'V287', 'V288', 'V289', 'V291', 'V292', 'V294', 'V303',
                   'V304', 'V306', 'V307', 'V308', 'V310', 'V312', 'V313', 'V314', 'V315', 'V317', 'V322', 'V323', 'V324', 'V326',
                   'V329', 'V331', 'V332', 'V333', 'V335', 'V336', 'V338', 'id_01', 'id_02', 'id_03', 'id_05', 'id_06', 'id_09',
                   'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_17', 'id_19', 'id_20', 'id_30', 'id_31', 'id_32', 'id_33',
                   'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', "test_", "isFraud"]

df = df[useful_features]

In [9]:
df.head(2)

Unnamed: 0,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,...,id_31,id_32,id_33,id_36,id_37,id_38,DeviceType,DeviceInfo,test_,isFraud
0,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,...,,,,,,,,,False,0.0
1,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,...,,,,,,,,,False,0.0


In [10]:
df.shape

(1097231, 281)

In [11]:
df.dtypes

TransactionAmt     float16
ProductCD         category
card1                int16
card2              float16
card3              float16
                    ...   
id_38             category
DeviceType        category
DeviceInfo        category
test_                 bool
isFraud            float64
Length: 281, dtype: object

In [12]:
df["card1_count_enc"] = df.groupby("card1").test_.transform("count")

In [13]:
cat_cols = df.select_dtypes("category").columns

for col in tqdm(cat_cols):
    df[col] = df[col].cat.codes

100%|██████████| 23/23 [00:00<00:00, 818.36it/s]


In [14]:
tr = df[~df.test_]
tr.shape

(590540, 282)

In [15]:
tr = tr.sample(100_000)

In [16]:
tr.sample(2)

Unnamed: 0,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,...,id_32,id_33,id_36,id_37,id_38,DeviceType,DeviceInfo,test_,isFraud,card1_count_enc
142429,117.0,4,15845,111.0,150.0,2,195.0,2,272.0,87.0,...,,-1,-1,-1,-1,-1,-1,False,0.0,119
377360,39.0,4,9500,321.0,150.0,3,226.0,2,272.0,87.0,...,,-1,-1,-1,-1,-1,-1,False,0.0,26243


In [17]:
target = "isFraud"
X_cols = list(set(tr.columns) - {target})

In [18]:
# X_cols = [c for c in X_cols if not c.startswith("V")]

In [19]:
# notna = tr[X_cols].notna().mean()
# X_cols = notna.index[notna > 0.2]
# len(X_cols)

In [20]:
X = tr[X_cols]
y = tr[target]

X.shape, y.shape

((100000, 281), (100000,))

In [21]:
est = LGBMClassifier(num_leaves=2, n_estimators=10)

In [22]:
%%time

est.fit(X, y)

CPU times: user 7.44 s, sys: 723 ms, total: 8.16 s
Wall time: 5.26 s


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=10, n_jobs=-1, num_leaves=2, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [23]:
%%time

pred = cross_val_predict(est, X, y, cv=5, method="predict_proba")[:, 1]

CPU times: user 44.7 s, sys: 4.94 s, total: 49.7 s
Wall time: 33.1 s


In [24]:
pred

array([0.02839913, 0.02839913, 0.02839913, ..., 0.02805554, 0.02805554,
       0.02805554])

In [25]:
from sklearn.metrics import roc_auc_score

In [26]:
roc_auc_score(y, pred)

0.7916290608557348

In [27]:
params = {'num_leaves': 256,
          'min_child_samples': 79,
          'objective': 'binary',
          'max_depth': 13,
          'learning_rate': 0.03,
          "boosting_type": "gbdt",
          "subsample_freq": 3,
          "subsample": 0.9,
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3,
          'reg_lambda': 0.3,
          'colsample_bytree': 0.9,
         }

est2 = LGBMClassifier(**params)

In [28]:
%%time

est2.fit(X, y)

CPU times: user 1min 24s, sys: 1.19 s, total: 1min 25s
Wall time: 25.1 s


LGBMClassifier(bagging_seed=11, boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.9, importance_type='split',
               learning_rate=0.03, max_depth=13, metric='auc',
               min_child_samples=79, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=256, objective='binary',
               random_state=None, reg_alpha=0.3, reg_lambda=0.3, silent=True,
               subsample=0.9, subsample_for_bin=200000, subsample_freq=3,
               verbosity=-1)

In [29]:
%%time

pred2 = cross_val_predict(est2, X, y, cv=5, method="predict_proba")[:, 1]

CPU times: user 6min 4s, sys: 5.49 s, total: 6min 9s
Wall time: 1min 51s


In [30]:
roc_auc_score(y, pred2)

0.9132268995972915

In [31]:
pd.DataFrame({
    "col": X_cols,
    "importance": est2.feature_importances_,
}).sort_values("importance", ascending=False).head(20)

Unnamed: 0,col,importance
274,"{V268, V285, V217, V182, R_emaildomain, V151, ...",1305
163,"{V268, V285, V217, V182, R_emaildomain, V151, ...",1161
146,"{V268, V285, V217, V182, R_emaildomain, V151, ...",1125
15,"{V268, V285, V217, V182, R_emaildomain, V151, ...",1123
68,"{V268, V285, V217, V182, R_emaildomain, V151, ...",1037
43,"{V268, V285, V217, V182, R_emaildomain, V151, ...",701
187,"{V268, V285, V217, V182, R_emaildomain, V151, ...",642
139,"{V268, V285, V217, V182, R_emaildomain, V151, ...",626
55,"{V268, V285, V217, V182, R_emaildomain, V151, ...",573
108,"{V268, V285, V217, V182, R_emaildomain, V151, ...",518


In [32]:
PARAMS_SKOPT["lgb_small_trees"]

{'n_estimators': Integer(low=50, high=2000),
 'max_depth': Integer(low=1, high=8),
 'num_leaves': Integer(low=4, high=32),
 'learning_rate': Real(low=0.0001, high=10, prior='log-uniform', transform='identity'),
 'cat_smooth': Real(low=0.01, high=100, prior='log-uniform', transform='identity')}

In [33]:
est3 = BayesSearchCV2(
    LGBMClassifier(),
    PARAMS_SKOPT["lgb_big_trees"],
    scoring="roc_auc",
    cv=5,
    n_iter=25,
    n_jobs=3,
    verbose=1,
)

In [34]:
est3.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 14.8min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 10.2min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  6.0min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 13.2min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 13.5min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 10.4min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  5.3min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  8.1min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  2.9min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 12.5min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  1.9min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 15.2min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  5.2min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:   30.9s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  2.2min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 16.1min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  9.0min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  7.9min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:   20.8s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 21.2min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:   30.8s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 18.4min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 12.6min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  5.6min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 18.2min finished


BayesSearchCV2(cv=5, error_score='raise',
               estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                        colsample_bytree=1.0,
                                        importance_type='split',
                                        learning_rate=0.1, max_depth=-1,
                                        min_child_samples=20,
                                        min_child_weight=0.001,
                                        min_split_gain=0.0, n_estimators=100,
                                        n_jobs=-1, num_leaves=31,
                                        objective=None, random_state=None,
                                        reg_alpha=0.0, reg_lambda=0.0,
                                        sile...
               random_state=None, refit=True, return_train_score=False,
               scoring='roc_auc',
               search_spaces={'cat_smooth': Real(low=0.01, high=100, prior='log-uniform', transform='identi

In [35]:
pred3 = cross_val_predict(est3.best_estimator_, X, y, cv=5, method="predict_proba")[:, 1]

In [36]:
roc_auc_score(y, pred3)

0.928411649525746

In [37]:
repr(est3.best_estimator_)

"LGBMClassifier(boosting_type='gbdt', cat_smooth=100.0, class_weight=None,\n               colsample_bytree=1.0, importance_type='split',\n               learning_rate=0.03958268464619738, max_depth=200,\n               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,\n               n_estimators=324, n_jobs=-1, num_leaves=243, objective=None,\n               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,\n               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)"

In [38]:
est3.best_score_

0.9284961922777616

In [39]:
pd.DataFrame(est3.cv_results_)

Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_cat_smooth,param_learning_rate,param_max_depth,param_n_estimators,param_num_leaves,params
0,0.920713,0.930098,0.93466,0.929502,0.920109,0.927016,0.005684,1,448.362076,60.808497,7.614456,1.809602,1.121284,0.027951,188,347,440,"{'cat_smooth': 1.1212838179340168, 'learning_r..."
1,0.876588,0.880306,0.885854,0.879638,0.870276,0.878532,0.005097,1,313.3449,44.434468,3.484273,0.312763,5.758759,0.001042,161,357,295,"{'cat_smooth': 5.758759483695491, 'learning_ra..."
2,0.921865,0.9254,0.930658,0.928323,0.914633,0.924176,0.005606,1,180.823442,25.874987,3.894945,0.696684,1.11795,0.089945,116,382,98,"{'cat_smooth': 1.1179499887779936, 'learning_r..."
3,0.909752,0.920108,0.929439,0.920795,0.908037,0.917626,0.00787,1,404.535556,58.035957,5.249114,0.374753,0.044411,0.006999,113,423,302,"{'cat_smooth': 0.04441100771869763, 'learning_..."
4,0.872817,0.881687,0.880331,0.875224,0.862577,0.874527,0.0068,1,414.466956,58.447222,5.004453,0.677556,1.960253,0.000291,102,450,342,"{'cat_smooth': 1.9602528092191487, 'learning_r..."
5,0.899114,0.909168,0.915673,0.908226,0.89955,0.906346,0.006277,1,317.668125,47.572313,3.63409,0.410271,15.76861,0.005766,138,293,400,"{'cat_smooth': 15.768610392996079, 'learning_r..."
6,0.888599,0.8964,0.903914,0.895487,0.886524,0.894185,0.006183,1,161.998003,24.148234,2.147656,0.391815,20.84025,0.0059,178,165,336,"{'cat_smooth': 20.840249687189242, 'learning_r..."
7,0.862828,0.87422,0.87937,0.861746,0.861376,0.867908,0.007452,1,248.186615,39.45128,2.924804,0.611296,12.002001,0.000298,167,234,438,"{'cat_smooth': 12.002001139678356, 'learning_r..."
8,0.917652,0.927137,0.927717,0.928203,0.913839,0.922909,0.005982,1,87.568913,12.843199,2.211731,0.330321,20.132734,0.092967,111,101,216,"{'cat_smooth': 20.13273448390463, 'learning_ra..."
9,0.870472,0.877808,0.882025,0.874893,0.86254,0.873548,0.00667,1,381.957181,55.488352,4.802622,1.172295,1.062311,0.000438,197,488,282,"{'cat_smooth': 1.0623114043775155, 'learning_r..."


In [42]:
est3.best_estimator_.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.03958268464619738,
 'max_depth': 200,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 324,
 'n_jobs': -1,
 'num_leaves': 243,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'cat_smooth': 100.0}