In [1]:
import pandas as pd

In [2]:
from src import util

In [28]:
from src.models import run_estimator_cv, PARAMS_SKOPT, get_oof_predictions, BayesSearchCV2

In [4]:
from skopt import BayesSearchCV
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_predict

In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
util.set_context("ieee_fraud_detection")

Files in data directory:
______

ieee_fraud_detection/
    comb.pkl
    raw/
        train_transaction.csv.zip
        train_identity.csv.zip
        test_transaction.csv.zip
        test_identity.csv.zip
        sample_submission.csv.zip
        train_transaction.csv
        train_identity.csv
        test_transaction.csv
        test_identity.csv
        sample_submission.csv
______



In [7]:
util.get_obj_ref(LGBMClassifier())

'LGBMClassifier_04a6eff6'

In [8]:
df = pd.read_pickle(util.get_path("comb.pkl"))
df.shape

(1097231, 435)

In [9]:
df.head(2)

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,isFraud,test_
0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,,,,,,,,,0.0,False
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,,,,,,,,,0.0,False


In [10]:
tr = df[~df.test_]
tr.shape

(590540, 435)

In [11]:
tr.sample(2)

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,isFraud,test_
443000,2.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,2.0,0.0,...,firefox 59.0,24.0,1920x1080,match_status:2,T,F,T,F,0.0,False
128050,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,ie 11.0 for desktop,24.0,1920x1080,match_status:2,T,F,T,T,0.0,False


In [12]:
target = "isFraud"
X_cols = set(tr.columns) - {target}

In [13]:
X_cols = [c for c in X_cols if not c.startswith("V")]

In [14]:
X = tr[X_cols]
y = tr[target]

X.shape, y.shape

((590540, 95), (590540,))

In [15]:
est = LGBMClassifier()

In [16]:
est.fit(X, y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [17]:
pred = cross_val_predict(est, X, y, cv=5, method="predict_proba")[:, 1]

In [18]:
pred

array([0.00022309, 0.00017388, 0.00013443, ..., 0.00064562, 0.00582389,
       0.00531706])

In [19]:
from sklearn.metrics import roc_auc_score

In [20]:
roc_auc_score(y, pred)

0.7375870078411357

In [21]:
params = {'num_leaves': 256,
          'min_child_samples': 79,
          'objective': 'binary',
          'max_depth': 13,
          'learning_rate': 0.03,
          "boosting_type": "gbdt",
          "subsample_freq": 3,
          "subsample": 0.9,
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3,
          'reg_lambda': 0.3,
          'colsample_bytree': 0.9,
          #'categorical_feature': cat_cols
         }

est2 = LGBMClassifier(**params)

In [22]:
est2.fit(X, y)

LGBMClassifier(bagging_seed=11, boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.9, importance_type='split',
               learning_rate=0.03, max_depth=13, metric='auc',
               min_child_samples=79, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=256, objective='binary',
               random_state=None, reg_alpha=0.3, reg_lambda=0.3, silent=True,
               subsample=0.9, subsample_for_bin=200000, subsample_freq=3,
               verbosity=-1)

In [23]:
pred2 = cross_val_predict(est2, X, y, cv=5, method="predict_proba")[:, 1]

In [24]:
roc_auc_score(y, pred2)

0.8135889055927089

In [25]:
pd.DataFrame({
    "col": X_cols,
    "importance": est2.feature_importances_,
}).sort_values("importance", ascending=False).head(20)

Unnamed: 0,col,importance
73,card1,1607
51,card2,1497
61,TransactionDT,1472
17,TransactionAmt,1365
27,addr1,1302
3,C13,881
88,D2,848
92,D15,847
28,C1,792
33,card5,735


In [26]:
PARAMS_SKOPT["lgb_small_trees"]

{'n_estimators': Integer(low=50, high=2000),
 'max_depth': Integer(low=1, high=8),
 'num_leaves': Integer(low=4, high=32),
 'learning_rate': Real(low=0.0001, high=10, prior='log-uniform', transform='identity'),
 'cat_smooth': Real(low=0.01, high=100, prior='log-uniform', transform='identity')}

In [31]:
est3 = BayesSearchCV2(
    LGBMClassifier(),
    PARAMS_SKOPT["lgb_big_trees"],
    scoring="roc_auc",
    cv=5,
    n_iter=25,
    n_jobs=3,
    verbose=1,
)

In [32]:
est3.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 22.3min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  9.2min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 558.0min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  4.3min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  2.9min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  3.8min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  3.4min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


KeyboardInterrupt: 

In [None]:
pred3 = cross_val_predict(est3.best_estimator_, X, y, cv=5, method="predict_proba")[:, 1]

In [None]:
roc_auc_score(y, pred3)

In [None]:
repr(est.best_estimator_)

In [None]:
est.best_score_

In [None]:
pd.DataFrame(est.cv_results_)