In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import datetime
from IPython.display import clear_output
import lightgbm as lgb
from bayes_opt import BayesianOptimization
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier

from generate_features import generate_features_time_series
from read_utils import read_data
from settings import CATEGORICAL_FEATURES, COLUMNS_TO_REMOVE
from sklearn.model_selection import GroupKFold

%matplotlib inline


RANDOM_STATE = 42

In [None]:
%%time
train, test, sample_submission = read_data()
train_test_transformed, encoders = generate_features_time_series(train, test)

In [3]:
train_test_transformed.head()

Unnamed: 0_level_0,TransactionID,C1,C10,C11,C12,C13,C14,C2,C3,C4,...,smoothed_encoded_addr1_on_R_emaildomain_nunique,smoothed_encoded_addr1_on_ProductCD_nunique,smoothed_encoded_R_emaildomain_on_P_emaildomain_nunique,smoothed_encoded_P_emaildomain_on_R_emaildomain_nunique,smoothed_encoded_card2_on_card1_nunique,smoothed_encoded_card4_on_card1_nunique,smoothed_encoded_card6_on_card1_nunique,smoothed_encoded_card2_card3_on_card1_nunique,smoothed_encoded_addr1_on_card1_nunique,smoothed_encoded_card1_TransactionDT_split_TransactionDT_dayOfMonth_TransactionDT_hour_on_TransactionAmt_sum
TransactionDT_to_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-12-01 00:00:00,2987000,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,...,33.0,4.0,,,,21.0,7214.0,,1750.0,68.5
2017-12-01 00:00:01,2987001,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,39.0,5.0,,54.0,21.0,5750.0,7214.0,19.0,2205.0,29.0
2017-12-01 00:01:09,2987002,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,41.0,5.0,,20.0,125.0,9705.0,9994.0,125.0,1886.0,106.95
2017-12-01 00:01:39,2987003,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,...,34.0,4.0,,43.0,34.0,5750.0,9994.0,34.0,1018.0,286.95
2017-12-01 00:01:46,2987004,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,25.0,4.0,,54.0,117.0,5750.0,7214.0,116.0,642.0,50.0


In [None]:
'TransactionAmt_count_within_1min',
 'TransactionAmt_sum_within_1min',
 'TransactionAmt_mean_within_1min',
 'TransactionAmt_std_within_1min',
 'TransactionAmt_count_within_1h',
 'TransactionAmt_sum_within_1h',
 'TransactionAmt_mean_within_1h',
 'TransactionAmt_std_within_1h',
 'TransactionAmt_count_within_1d',
 'TransactionAmt_sum_within_1d',
 'TransactionAmt_mean_within_1d',
 'TransactionAmt_std_within_1d',

In [4]:
train_test_transformed.columns.tolist()

['TransactionID',
 'C1',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'D1',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'DeviceInfo',
 'DeviceType',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P_emaildomain',
 'ProductCD',
 'R_emaildomain',
 'TransactionAmt',
 'TransactionDT',
 'V1',
 'V10',
 'V100',
 'V101',
 'V102',
 'V103',
 'V104',
 'V105',
 'V106',
 'V107',
 'V108',
 'V109',
 'V11',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V119',
 'V12',
 'V120',
 'V121',
 'V122',
 'V123',
 'V124',
 'V125',
 'V126',
 'V126-137_mean',
 'V126-137_mean_with_zeros',
 'V126-137_std',
 'V126_card1_mean',
 'V127',
 'V127_card1_mean',
 'V128',
 'V128_card1_mean',
 'V129',
 'V129_card1_mean',
 'V13',
 'V130',
 'V130_card1_mean',
 'V131',
 'V131_card1_mean',
 'V132',
 'V132_card1_mean',
 'V133',
 'V133_card1_mean',
 'V134',
 'V134_car

In [4]:
ORIGINAL_FEATURES = [
    "smoothed_encoded_TransactionDT_hour_on_TransactionAmt_mean",
"smoothed_encoded_TransactionDT_hour_on_TransactionAmt_std",
"smoothed_encoded_TransactionDT_hour_on_TransactionAmt_nanmedian",
"smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_mean",
"smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_std",
"smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_nanmedian",
"smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_mean",
"smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_std",
"smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_nanmedian",
    "id_12_count",
"id_13_count",
"id_14_count",
"id_15_count",
"id_16_count",
"id_17_count",
"id_18_count",
"id_19_count",
"id_20_count",
"id_21_count",
"id_22_count",
"id_23_count",
"id_24_count",
"id_25_count",
"id_26_count",
"id_27_count",
"id_28_count",
"id_29_count",
"id_30_count",
"id_31_count",
"id_32_count",
"id_33_count",
"id_34_count",
"id_35_count",
"id_36_count",
"id_37_count",
"id_38_count",
    "DeviceInfo",
"DeviceType",
 #  "card1",
"card2",
"card3",
"card4",
"card5",
"card6",
    "id_01",
"id_02",
"id_03",
"id_04",
"id_05",
"id_06",
"id_07",
"id_08",
"id_09",
"id_10",
"id_11",
    "id_12",
"id_13",
"id_14",
"id_15",
"id_16",
"id_17",
"id_18",
"id_19",
"id_20",
"id_21",
"id_22",
"id_23",
"id_24",
"id_25",
"id_26",
"id_27",
"id_28",
"id_29",
"id_30",
"id_31",
"id_32",
"id_33",
"id_34",
"id_35",
"id_36",
"id_37",
"id_38",
    "TransactionDT_dayOfMonth",
"TransactionDT_dayOfWeek",
"TransactionDT_weekOfMonth",
"TransactionDT_hour",
"TransactionDT_split",
"is_holiday",
"is_foreign",
    "device_name",
]


DT_TRANSACTION = ['smoothed_encoded_TransactionDT_hour_on_TransactionAmt_mean',
 'smoothed_encoded_TransactionDT_hour_on_TransactionAmt_std',
 'smoothed_encoded_TransactionDT_hour_on_TransactionAmt_nanmedian',
 'smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_mean',
 'smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_std',
 'smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_nanmedian',
 'smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_mean',
 'smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_std',
 'smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_nanmedian',]


CARD_TRANSACTION = ['smoothed_encoded_card2_on_TransactionAmt_mean',
 'smoothed_encoded_card2_on_TransactionAmt_std',
 'smoothed_encoded_card2_on_TransactionAmt_nanmedian',
 'smoothed_encoded_card3_on_TransactionAmt_mean',
 'smoothed_encoded_card3_on_TransactionAmt_std',
 'smoothed_encoded_card3_on_TransactionAmt_nanmedian',
 'smoothed_encoded_card5_on_TransactionAmt_mean',
 'smoothed_encoded_card5_on_TransactionAmt_std',
 'smoothed_encoded_card5_on_TransactionAmt_nanmedian',
                   'smoothed_encoded_card1_on_TransactionDT_hour_mean',
 'smoothed_encoded_card1_TransactionDT_split_TransactionDT_dayOfMonth_TransactionDT_hour_on_TransactionAmt_sum']



CARD_COUNT = ['card1_count',
 'card2_count',
 'card3_count',
 'card4_count',
 'card5_count',
 'card6_count',]

ID_COUNT = [
             'id_12_count',
 'id_13_count',
 'id_14_count',
 'id_15_count',
 'id_16_count',
 'id_17_count',
 'id_18_count',
 'id_19_count',
 'id_20_count',
 'id_21_count',
 'id_22_count',
 'id_23_count',
 'id_24_count',
 'id_25_count',
 'id_26_count',
 'id_27_count',
 'id_28_count',
 'id_29_count',
 'id_30_count',
 'id_31_count',
 'id_32_count',
 'id_33_count',
 'id_34_count',
 'id_35_count',
 'id_36_count',
 'id_37_count',
 'id_38_count',]

ADDR_COUNT = ['addr1_count',]


ID_COUNT = ['DeviceInfo_count',
 'device_name_count']


EMAIL_DOMAIN_COUNT = ['P_emaildomain_count',
 'R_emaildomain_count',]





ADDR_IS_NULL = ['addr1_isnull',
 'addr2_isnull']

DIST_IS_NULL = ['dist1_isnull',
 'dist2_isnull',]

D_IS_NULL = ['D2_isnull',
 'D3_isnull',
 'D5_isnull',
 'D6_isnull',
 'D7_isnull',
 'D8_isnull',
 'D9_isnull',
 'D10_isnull',
 'D11_isnull',
 'D12_isnull',
 'D13_isnull',
 'D14_isnull',
 'D15_isnull',]


V_IS_NULL = ['V1_isnull',
 'V12_isnull',
 'V35_isnull',
 'V53_isnull',
 'V75_isnull',
 'V95_isnull',
 'V138_isnull',
 'V167_isnull',
 'V322_isnull',]


ID_IS_NULL = ['id_01_isnull',
 'id_02_isnull',
 'id_03_isnull',
 'id_04_isnull',
 'id_05_isnull',
 'id_06_isnull',
 'id_07_isnull',
 'id_08_isnull',
 'id_09_isnull',
 'id_10_isnull',
 'id_11_isnull',
 'id_12_isnull',
 'id_13_isnull',
 'id_14_isnull',
 'id_15_isnull',
 'id_16_isnull',
 'id_17_isnull',
 'id_18_isnull',
 'id_19_isnull',
 'id_20_isnull',
 'id_21_isnull',
 'id_22_isnull',
 'id_23_isnull',
 'id_24_isnull',
 'id_25_isnull',
 'id_26_isnull',
 'id_27_isnull',
 'id_28_isnull',
 'id_29_isnull',
 'id_30_isnull',
 'id_31_isnull',
 'id_32_isnull',
 'id_33_isnull',
 'id_34_isnull',
 'id_35_isnull',
 'id_36_isnull',
 'id_37_isnull',
 'id_38_isnull',
 'DeviceType_isnull',
 'DeviceInfo_isnull',]

M_IS_NULL = ['M1_isnull',
 'M4_isnull',
 'M6_isnull',
 'M7_isnull',]



C_GROUP = [f'C{i}' for i in range(1, 15)]

D_GROUP = [f'D{i}' for i in range(1, 16)]

M_GROUP = [f'M{i}' for i in range(1, 10)]

ID_GROUP = [f'id_0{i}' for i in range(1, 10)] + [f'id_{i}' for i in range(10, 39)] + ['DeviceInfo', 'DeviceType', 'device_name']

EMAIL_DOMAIN = [ 'P_emaildomain','ProductCD','R_emaildomain']

AMT = ['TransactionAmt']

V_GROUP = [f'V{i}' for i in range(1, 340)]


ADDR = [ 'addr1', 'addr2']

DIST = ['dist1', 'dist2']

DATES = ['TransactionDT_dayOfMonth',
 'TransactionDT_dayOfWeek',
 'TransactionDT_weekOfMonth',
 'TransactionDT_hour', 'is_holiday']


DERIVED_ID = ['OS_NAME',
 'OS_V0',
 'OS_V1',
 'OS_V2',
 'OS_V_COMBINED',
 'OS_V_MAJOR',
 'id_33_height',
 'id_33_width']



DERIVED_DOMAINS = ['P_emaildomain_1',
 'P_emaildomain_2',
 'P_emaildomain_3',
 'R_emaildomain_1',
 'R_emaildomain_2',
 'R_emaildomain_3',
 'R=P',
 'R1=P1']




AUC ~ 0.5

CARD_IS_NULL = ['card2_isnull',
 'card3_isnull',
 'card4_isnull',
 'card5_isnull',
 'card6_isnull'] 
 
 
TRAIN AUC ~ 0.71
VALID AUC ~ 0.71

 DERIVED_DOMAINS = ['P_emaildomain_1',
 'P_emaildomain_2',
 'P_emaildomain_3',
 'R_emaildomain_1',
 'R_emaildomain_2',
 'R_emaildomain_3',
 'R=P',
 'R1=P1']
 
 
 AUC ~ 0.5
 DERIVED_ID = ['OS_NAME',
 'OS_V0',
 'OS_V1',
 'OS_V2',
 'OS_V_COMBINED',
 'OS_V_MAJOR',
 'id_33_height',
 'id_33_width']
 
 
 TRAIN AUC ~ 0.63
 VALID AUC ~ 0.55
 DATES = ['TransactionDT_dayOfMonth',
 'TransactionDT_dayOfWeek',
 'TransactionDT_weekOfMonth',
 'TransactionDT_hour', 'is_holiday']
 
 
 TRAIN AUC ~ VALID AUC ~ 0.65
 DIST = ['dist1', 'dist2']
 
 
  TRAIN AUC ~ VALID AUC ~ 0.68
 ADDR = [ 'addr1', 'addr2']
 
 
 TRAIN AUC ~ 0.91 VALID AUC ~ 0.85
 V_GROUP = [f'V{i}' for i in range(1, 340)]
 
 
 TRAIN AUC ~ VALID AUC ~ 0.67
 AMT = ['TransactionAmt']
 
 TRAIN AUC ~ VALID AUC ~ 0.72
 EMAIL_DOMAIN = [ 'P_emaildomain','ProductCD','R_emaildomain']
 
 TRAIN AUC ~ 0.76 VALID AUC ~ 0.68
ID_GROUP = [f'id_0{i}' for i in range(1, 10)] + [f'id_{i}' for i in range(10, 39)] + ['DeviceInfo', 'DeviceType', 'device_name']


 TRAIN AUC ~ VALID AUC ~ 0.76
M_GROUP = [f'M{i}' for i in range(1, 10)]


TRAIN AUC ~ 0.86 VALID AUC ~ 0.82
D_GROUP = [f'D{i}' for i in range(1, 16)]

TRAIN AUC ~ 0.91 VALID AUC ~ 0.86
C_GROUP = [f'C{i}' for i in range(1, 15)]


TRAIN AUC ~ 0.72 VALID AUC ~ 0.75
M_IS_NULL = ['M1_isnull',
 'M4_isnull',
 'M6_isnull',
 'M7_isnull',]
 
 
 
 TRAIN AUC ~ 0.69 VALID AUC ~ 0.69
 ID_IS_NULL = ['id_01_isnull',
 'id_02_isnull',
 'id_03_isnull',
 'id_04_isnull',
 'id_05_isnull',
 'id_06_isnull',
 'id_07_isnull',
 'id_08_isnull',
 'id_09_isnull',
 'id_10_isnull',
 'id_11_isnull',
 'id_12_isnull',
 'id_13_isnull',
 'id_14_isnull',
 'id_15_isnull',
 'id_16_isnull',
 'id_17_isnull',
 'id_18_isnull',
 'id_19_isnull',
 'id_20_isnull',
 'id_21_isnull',
 'id_22_isnull',
 'id_23_isnull',
 'id_24_isnull',
 'id_25_isnull',
 'id_26_isnull',
 'id_27_isnull',
 'id_28_isnull',
 'id_29_isnull',
 'id_30_isnull',
 'id_31_isnull',
 'id_32_isnull',
 'id_33_isnull',
 'id_34_isnull',
 'id_35_isnull',
 'id_36_isnull',
 'id_37_isnull',
 'id_38_isnull',
 'DeviceType_isnull',
 'DeviceInfo_isnull',]
 
 
 
  TRAIN AUC ~ 0.69 VALID AUC ~ 0.7
 V_IS_NULL = ['V1_isnull',
 'V12_isnull',
 'V35_isnull',
 'V53_isnull',
 'V75_isnull',
 'V95_isnull',
 'V138_isnull',
 'V167_isnull',
 'V322_isnull',]
 
 
 TRAIN AUC ~ 0.72 VALID AUC ~ 0.72
 D_IS_NULL = ['D2_isnull',
 'D3_isnull',
 'D5_isnull',
 'D6_isnull',
 'D7_isnull',
 'D8_isnull',
 'D9_isnull',
 'D10_isnull',
 'D11_isnull',
 'D12_isnull',
 'D13_isnull',
 'D14_isnull',
 'D15_isnull',]
 
  TRAIN AUC ~ 0.62 VALID AUC ~ 0.74
 DIST_IS_NULL = ['dist1_isnull',
 'dist2_isnull',]
 
 
 TRAIN AUC ~ 0.63 VALID AUC ~ 0.63
 ADDR_IS_NULL = ['addr1_isnull',
 'addr2_isnull']
 
 
 TRAIN AUC ~ 0.7 VALID AUC ~ 0.7
 EMAIL_DOMAIN_COUNT = ['P_emaildomain_count',
 'R_emaildomain_count',]
 
 
 TRAIN AUC ~ 0.64 VALID AUC ~ 0.65
 ID_COUNT = ['DeviceInfo_count',
 'device_name_count',
 'OS_NAME_count',]
 
 TRAIN AUC ~ 0.68 VALID AUC ~ 0.67
 ADDR_COUNT = ['addr1_count',]
 
 
 
 TRAIN AUC ~ 0.74 VALID AUC ~ 0.7
 ID_COUNT = [
             'id_12_count',
 'id_13_count',
 'id_14_count',
 'id_15_count',
 'id_16_count',
 'id_17_count',
 'id_18_count',
 'id_19_count',
 'id_20_count',
 'id_21_count',
 'id_22_count',
 'id_23_count',
 'id_24_count',
 'id_25_count',
 'id_26_count',
 'id_27_count',
 'id_28_count',
 'id_29_count',
 'id_30_count',
 'id_31_count',
 'id_32_count',
 'id_33_count',
 'id_34_count',
 'id_35_count',
 'id_36_count',
 'id_37_count',
 'id_38_count',]
 
 
  TRAIN AUC ~ 0.8 VALID AUC ~ 0.76
 CARD_COUNT = ['card1_count',
 'card2_count',
 'card3_count',
 'card4_count',
 'card5_count',
 'card6_count',]
 
 
 TRAIN AUC ~ 0.84 VALID AUC ~ 0.79
CARD_TRANSACTION = ['smoothed_encoded_card2_on_TransactionAmt_mean',
 'smoothed_encoded_card2_on_TransactionAmt_std',
 'smoothed_encoded_card2_on_TransactionAmt_nanmedian',
 'smoothed_encoded_card3_on_TransactionAmt_mean',
 'smoothed_encoded_card3_on_TransactionAmt_std',
 'smoothed_encoded_card3_on_TransactionAmt_nanmedian',
 'smoothed_encoded_card5_on_TransactionAmt_mean',
 'smoothed_encoded_card5_on_TransactionAmt_std',
 'smoothed_encoded_card5_on_TransactionAmt_nanmedian',
                   'smoothed_encoded_card1_on_TransactionDT_hour_mean',
 'smoothed_encoded_card1_TransactionDT_split_TransactionDT_dayOfMonth_TransactionDT_hour_on_TransactionAmt_sum']
 
 
 TRAIN AUC ~ 0.6 VALID AUC ~ 0.53
 DT_TRANSACTION = ['smoothed_encoded_TransactionDT_hour_on_TransactionAmt_mean',
 'smoothed_encoded_TransactionDT_hour_on_TransactionAmt_std',
 'smoothed_encoded_TransactionDT_hour_on_TransactionAmt_nanmedian',
 'smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_mean',
 'smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_std',
 'smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_nanmedian',
 'smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_mean',
 'smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_std',
 'smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_nanmedian',]

In [None]:
from sklearn.model_selection import KFold

folds = GroupKFold(n_splits=6)

# COLS_TO_USE = DT_TRANSACTION = CARD_TRANSACTION + CARD_COUNT + ID_COUNT + ADDR_COUNT + EMAIL_DOMAIN_COUNT + DIST_IS_NULL +\
# D_IS_NULL + ID_IS_NULL + M_IS_NULL + C_GROUP + D_GROUP + M_GROUP + EMAIL_DOMAIN + AMT + V_GROUP + ADDR + DIST + DERIVED_DOMAINS# + ORIGINAL_FEATURES


# COLS_TO_USE = [
#         i for i in train_test_transformed.columns
#         if i not in COLUMNS_TO_REMOVE
#     ]

COLS_TO_USE = [
        i for i in train_test_transformed.columns
        if i not in COLUMNS_TO_REMOVE
    ]


gbms = []

preds = []

for fold_ind, (train_idx, val_idx) in enumerate(
    folds.split(
        X=train_test_transformed[:train.shape[0]], 
        y=train_test_transformed[:train.shape[0]]['isFraud'], 
        groups=train_test_transformed[:train.shape[0]]['TransactionDT_split'].tolist()
    )
):
    
    #if fold_ind == 0:
    #    continue
    
    print('Fold', fold_ind)
    
    pars = {
        'num_leaves': 2 ** 7 - 1,
        #'min_data_in_leaf': 10,
        'learning_rate': 0.01,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 51,
        'cat_smooth': 209,
        'lambda_l1': 1.0,
        'lambda_l2': 3.0,
        'max_bin': 100,
        'scale_pos_weight': 7.0,
        #'max_cat_to_onehot': 10,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': ['auc'],
        'num_threads': -1
    }
    
    cat_c = [i for i in CATEGORICAL_FEATURES
        if i in COLS_TO_USE]
    
    cur_train = train_test_transformed[:train.shape[0]].iloc[train_idx]
    cur_test = train_test_transformed[:train.shape[0]].iloc[val_idx]
    
    cur_train = cur_train.sample(frac=1.0)
    
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(cur_train[COLS_TO_USE], cur_train['isFraud'])
    lgb_eval = lgb.Dataset(cur_test[COLS_TO_USE], cur_test['isFraud'], reference=lgb_train)
    
    gbm = lgb.train(
        pars,
        lgb_train,
        num_boost_round=20000,
        valid_sets=(lgb_train, lgb_eval),
        valid_names=('train', 'valid'),
        early_stopping_rounds=100,
        feature_name=COLS_TO_USE,
        categorical_feature=cat_c,
        verbose_eval=100
    )
    
    gbms.append(gbm)
    preds.append(gbm.predict(train_test_transformed[train_test_transformed['TransactionDT_split'] > 17][COLS_TO_USE]))
    
    
    
import scipy


np.array(np.array([i.best_score['valid']['auc'] for i in gbms])).mean()

Fold 0


New categorical_feature is ['DeviceInfo', 'DeviceType', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'OS_NAME', 'OS_V0', 'OS_V1', 'OS_V2', 'OS_V_MAJOR', 'P_emaildomain', 'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3', 'ProductCD', 'R_emaildomain', 'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3', 'addr1', 'addr2', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'device_name', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'is_foreign', 'is_holiday']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[100]	train's auc: 0.954257	valid's auc: 0.887918
[200]	train's auc: 0.966527	valid's auc: 0.893633
[300]	train's auc: 0.97428	valid's auc: 0.898864
[400]	train's auc: 0.979861	valid's auc: 0.901953
[500]	train's auc: 0.984382	valid's auc: 0.904246
[600]	train's auc: 0.987649	valid's auc: 0.90672
[700]	train's auc: 0.990236	valid's auc: 0.907554
[800]	train's auc: 0.992237	valid's auc: 0.908573
[900]	train's auc: 0.993799	valid's auc: 0.908792
[1000]	train's auc: 0.994993	valid's auc: 0.909302
[1100]	train's auc: 0.995973	valid's auc: 0.910115
[1200]	train's auc: 0.996788	valid's auc: 0.910256
[1300]	train's auc: 0.997463	valid's auc: 0.910783
[1400]	train's auc: 0.997943	valid's auc: 0.911117
[1500]	train's auc: 0.99832	valid's auc: 0.911167
Early stopping, best iteration is:
[1428]	train's auc: 0.998053	valid's auc: 0.911224


0.9391991291341545

In [143]:
Training until validation scores don't improve for 100 rounds.
[100]	train's auc: 0.947134	valid's auc: 0.886015
[200]	train's auc: 0.960167	valid's auc: 0.894736
[300]	train's auc: 0.969244	valid's auc: 0.901696
[400]	train's auc: 0.975418	valid's auc: 0.905974
[500]	train's auc: 0.980219	valid's auc: 0.909495
[600]	train's auc: 0.983846	valid's auc: 0.912843
[700]	train's auc: 0.986725	valid's auc: 0.914156
[800]	train's auc: 0.988952	valid's auc: 0.915428
[900]	train's auc: 0.99073	valid's auc: 0.916191
[1000]	train's auc: 0.992291	valid's auc: 0.916507
[1100]	train's auc: 0.993581	valid's auc: 0.917087
[1200]	train's auc: 0.994569	valid's auc: 0.917578
[1300]	train's auc: 0.995363	valid's auc: 0.91822
[1400]	train's auc: 0.996105	valid's auc: 0.918279
[1500]	train's auc: 0.996694	valid's auc: 0.918478
[1600]	train's auc: 0.997205	valid's auc: 0.918784
[1700]	train's auc: 0.997586	valid's auc: 0.918832
[1800]	train's auc: 0.997935	valid's auc: 0.918984
[1900]	train's auc: 0.998201	valid's auc: 0.919123
[2000]	train's auc: 0.998463	valid's auc: 0.919275
[2100]	train's auc: 0.998666	valid's auc: 0.919747
[2200]	train's auc: 0.998834	valid's auc: 0.919848
Early stopping, best iteration is:
[2142]	train's auc: 0.998724	valid's auc: 0.920041
Fold 1
Training until validation scores don't improve for 100 rounds.
[100]	train's auc: 0.945712	valid's auc: 0.912652
[200]	train's auc: 0.959314	valid's auc: 0.922438
[300]	train's auc: 0.96795	valid's auc: 0.928058
[400]	train's auc: 0.974338	valid's auc: 0.932171
[500]	train's auc: 0.979262	valid's auc: 0.935075
[600]	train's auc: 0.983134	valid's auc: 0.936908
[700]	train's auc: 0.986156	valid's auc: 0.938095
[800]	train's auc: 0.988593	valid's auc: 0.938986
[900]	train's auc: 0.99043	valid's auc: 0.939726
[1000]	train's auc: 0.99192	valid's auc: 0.940118
[1100]	train's auc: 0.993217	valid's auc: 0.940235
[1200]	train's auc: 0.994289	valid's auc: 0.940411
[1300]	train's auc: 0.995164	valid's auc: 0.94068
[1400]	train's auc: 0.995856	valid's auc: 0.940848
Early stopping, best iteration is:
[1376]	train's auc: 0.995712	valid's auc: 0.940924
Fold 2
Training until validation scores don't improve for 100 rounds.
[100]	train's auc: 0.944113	valid's auc: 0.910135
[200]	train's auc: 0.957342	valid's auc: 0.919708
[300]	train's auc: 0.966448	valid's auc: 0.927437
[400]	train's auc: 0.972889	valid's auc: 0.931842
[500]	train's auc: 0.977922	valid's auc: 0.93544
[600]	train's auc: 0.981842	valid's auc: 0.937514
[700]	train's auc: 0.985039	valid's auc: 0.939167
[800]	train's auc: 0.987591	valid's auc: 0.94056
[900]	train's auc: 0.989567	valid's auc: 0.941453
[1000]	train's auc: 0.991163	valid's auc: 0.942198
[1100]	train's auc: 0.992508	valid's auc: 0.94268
[1200]	train's auc: 0.993616	valid's auc: 0.942976
[1300]	train's auc: 0.994552	valid's auc: 0.943309
[1400]	train's auc: 0.995313	valid's auc: 0.943499
[1500]	train's auc: 0.996017	valid's auc: 0.943809
[1600]	train's auc: 0.996594	valid's auc: 0.944212
[1700]	train's auc: 0.997032	valid's auc: 0.944484
[1800]	train's auc: 0.9974	valid's auc: 0.94469
[1900]	train's auc: 0.997749	valid's auc: 0.944956
[2000]	train's auc: 0.998057	valid's auc: 0.944988
[2100]	train's auc: 0.998272	valid's auc: 0.945076
[2200]	train's auc: 0.998491	valid's auc: 0.945151
[2300]	train's auc: 0.998677	valid's auc: 0.945165
Early stopping, best iteration is:
[2295]	train's auc: 0.998667	valid's auc: 0.945179
Fold 3
Training until validation scores don't improve for 100 rounds.
[100]	train's auc: 0.945322	valid's auc: 0.90842
[200]	train's auc: 0.958349	valid's auc: 0.918015
[300]	train's auc: 0.966901	valid's auc: 0.923992
[400]	train's auc: 0.973316	valid's auc: 0.928398
[500]	train's auc: 0.978238	valid's auc: 0.931139
[600]	train's auc: 0.982109	valid's auc: 0.932692
[700]	train's auc: 0.985229	valid's auc: 0.934233
[800]	train's auc: 0.987633	valid's auc: 0.935025
[900]	train's auc: 0.989728	valid's auc: 0.935088
[1000]	train's auc: 0.991329	valid's auc: 0.935191
Early stopping, best iteration is:
[954]	train's auc: 0.990654	valid's auc: 0.935335
Fold 4
Training until validation scores don't improve for 100 rounds.
[100]	train's auc: 0.942653	valid's auc: 0.920808
[200]	train's auc: 0.956665	valid's auc: 0.929666
[300]	train's auc: 0.965724	valid's auc: 0.935303
[400]	train's auc: 0.9722	valid's auc: 0.939584
[500]	train's auc: 0.977083	valid's auc: 0.941916
[600]	train's auc: 0.980953	valid's auc: 0.943232
[700]	train's auc: 0.984205	valid's auc: 0.944341
[800]	train's auc: 0.986742	valid's auc: 0.945371
[900]	train's auc: 0.988753	valid's auc: 0.945802
[1000]	train's auc: 0.990381	valid's auc: 0.946467
[1100]	train's auc: 0.991817	valid's auc: 0.947028
[1200]	train's auc: 0.993079	valid's auc: 0.947124
[1300]	train's auc: 0.994085	valid's auc: 0.947315
[1400]	train's auc: 0.994916	valid's auc: 0.947553
[1500]	train's auc: 0.995691	valid's auc: 0.947687
Early stopping, best iteration is:
[1427]	train's auc: 0.995131	valid's auc: 0.947733
Fold 5
Training until validation scores don't improve for 100 rounds.
[100]	train's auc: 0.942516	valid's auc: 0.915814
[200]	train's auc: 0.956524	valid's auc: 0.928355
[300]	train's auc: 0.965541	valid's auc: 0.936015
[400]	train's auc: 0.971929	valid's auc: 0.94099
[500]	train's auc: 0.976952	valid's auc: 0.943803
[600]	train's auc: 0.98089	valid's auc: 0.945914
[700]	train's auc: 0.984088	valid's auc: 0.9477
[800]	train's auc: 0.986731	valid's auc: 0.948889
[900]	train's auc: 0.988857	valid's auc: 0.949588
[1000]	train's auc: 0.990614	valid's auc: 0.950376
[1100]	train's auc: 0.99206	valid's auc: 0.951099
[1200]	train's auc: 0.993171	valid's auc: 0.951714
[1300]	train's auc: 0.994117	valid's auc: 0.952175
[1400]	train's auc: 0.99496	valid's auc: 0.952615
[1500]	train's auc: 0.995706	valid's auc: 0.952897
[1600]	train's auc: 0.99633	valid's auc: 0.952988
[1700]	train's auc: 0.996806	valid's auc: 0.953326
[1800]	train's auc: 0.997283	valid's auc: 0.953523
[1900]	train's auc: 0.997667	valid's auc: 0.953765
[2000]	train's auc: 0.997941	valid's auc: 0.954105
[2100]	train's auc: 0.998207	valid's auc: 0.954291
[2200]	train's auc: 0.998456	valid's auc: 0.954251
[2300]	train's auc: 0.998633	valid's auc: 0.954401
[2400]	train's auc: 0.998815	valid's auc: 0.954517
[2500]	train's auc: 0.998959	valid's auc: 0.95458
[2600]	train's auc: 0.999073	valid's auc: 0.954576
Early stopping, best iteration is:
[2557]	train's auc: 0.999027	valid's auc: 0.954627
0.940575769368861

69

In [10]:
COLS_TO_USE = [
    'C1',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'D1',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
'D8',
'D9',
#  'DeviceInfo',
#  'DeviceType',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P_emaildomain',
 'ProductCD',
 'R_emaildomain',
'TransactionAmt',
    
    'V91', 'V70', 'V317', 'V258', 'V62', 'V312', 'V165', 'V49', 'V48',

#  'V1',
#  'V10',
#  'V100',
#  'V101',
#  'V102',
#  'V103',
#  'V104',
#  'V105',
#  'V106',
#  'V107',
#  'V108',
#  'V109',
#  'V11',
#  'V110',
#  'V111',
#  'V112',
#  'V113',
#  'V114',
#  'V115',
#  'V116',
#  'V117',
#  'V118',
#  'V119',
#  'V12',
#  'V120',
#  'V121',
#  'V122',
#  'V123',
#  'V124',
#  'V125',
#  'V126',
#  'V127',
#  'V128',
#  'V129',
#  'V13',
#  'V130',
#  'V131',
#  'V132',
#  'V133',
#  'V134',
#  'V135',
#  'V136',
#  'V137',
#  'V138',
#  'V139',
#  'V14',
#  'V140',
#  'V141',
#  'V142',
#  'V143',
#  'V144',
#  'V145',
#  'V146',
#  'V147',
#  'V148',
#  'V149',
#  'V15',
#  'V150',
#  'V151',
#  'V152',
#  'V153',
#  'V154',
#  'V155',
#  'V156',
#  'V157',
#  'V158',
#  'V159',
#  'V16',
#  'V160',
#  'V161',
#  'V162',
#  'V163',
#  'V164',
#  'V165',
#  'V166',
#  'V167',
#  'V168',
#  'V169',
#  'V17',
#  'V170',
#  'V171',
#  'V172',
#  'V173',
#  'V174',
#  'V175',
#  'V176',
#  'V177',
#  'V178',
#  'V179',
#  'V18',
#  'V180',
#  'V181',
#  'V182',
#  'V183',
#  'V184',
#  'V185',
#  'V186',
#  'V187',
#  'V188',
#  'V189',
#  'V19',
#  'V190',
#  'V191',
#  'V192',
#  'V193',
#  'V194',
#  'V195',
#  'V196',
#  'V197',
#  'V198',
#  'V199',
#  'V2',
#  'V20',
#  'V200',
#  'V201',
#  'V202',
#  'V203',
#  'V204',
#  'V205',
#  'V206',
#  'V207',
#  'V208',
#  'V209',
#  'V21',
#  'V210',
#  'V211',
#  'V212',
#  'V213',
#  'V214',
#  'V215',
#  'V216',
#  'V217',
#  'V218',
#  'V219',
#  'V22',
#  'V220',
#  'V221',
#  'V222',
#  'V223',
#  'V224',
#  'V225',
#  'V226',
#  'V227',
#  'V228',
#  'V229',
#  'V23',
#  'V230',
#  'V231',
#  'V232',
#  'V233',
#  'V234',
#  'V235',
#  'V236',
#  'V237',
#  'V238',
#  'V239',
#  'V24',
#  'V240',
#  'V241',
#  'V242',
#  'V243',
#  'V244',
#  'V245',
#  'V246',
#  'V247',
#  'V248',
#  'V249',
#  'V25',
#  'V250',
#  'V251',
#  'V252',
#  'V253',
#  'V254',
#  'V255',
#  'V256',
#  'V257',
#  'V258',
#  'V259',
#  'V26',
#  'V260',
#  'V261',
#  'V262',
#  'V263',
#  'V264',
#  'V265',
#  'V266',
#  'V267',
#  'V268',
#  'V269',
#  'V27',
#  'V270',
#  'V271',
#  'V272',
#  'V273',
#  'V274',
#  'V275',
#  'V276',
#  'V277',
#  'V278',
#  'V279',
#  'V28',
#  'V280',
#  'V281',
#  'V282',
#  'V283',
#  'V284',
#  'V285',
#  'V286',
#  'V287',
#  'V288',
#  'V289',
#  'V29',
#  'V290',
#  'V291',
#  'V292',
#  'V293',
#  'V294',
#  'V295',
#  'V296',
#  'V297',
#  'V298',
#  'V299',
#  'V3',
#  'V30',
#  'V300',
#  'V301',
#  'V302',
#  'V303',
#  'V304',
#  'V305',
#  'V306',
#  'V307',
#  'V308',
#  'V309',
#  'V31',
#  'V310',
#  'V311',
#  'V312',
#  'V313',
#  'V314',
#  'V315',
#  'V316',
#  'V317',
#  'V318',
#  'V319',
#  'V32',
#  'V320',
#  'V321',
#  'V322',
#  'V323',
#  'V324',
#  'V325',
#  'V326',
#  'V327',
#  'V328',
#  'V329',
#  'V33',
#  'V330',
#  'V331',
#  'V332',
#  'V333',
#  'V334',
#  'V335',
#  'V336',
#  'V337',
#  'V338',
#  'V339',
#  'V34',
#  'V35',
#  'V36',
#  'V37',
#  'V38',
#  'V39',
#  'V4',
#  'V40',
#  'V41',
#  'V42',
#  'V43',
#  'V44',
#  'V45',
#  'V46',
#  'V47',
#  'V48',
#  'V49',
#  'V5',
#  'V50',
#  'V51',
#  'V52',
#  'V53',
#  'V54',
#  'V55',
#  'V56',
#  'V57',
#  'V58',
#  'V59',
#  'V6',
#  'V60',
#  'V61',
#  'V62',
#  'V63',
#  'V64',
#  'V65',
#  'V66',
#  'V67',
#  'V68',
#  'V69',
#  'V7',
#  'V70',
#  'V71',
#  'V72',
#  'V73',
#  'V74',
#  'V75',
#  'V76',
#  'V77',
#  'V78',
#  'V79',
#  'V8',
#  'V80',
#  'V81',
#  'V82',
#  'V83',
#  'V84',
#  'V85',
#  'V86',
#  'V87',
#  'V88',
#  'V89',
#  'V9',
#  'V90',
#  'V91',
#  'V92',
#  'V93',
#  'V94',
#  'V95',
#  'V96',
#  'V97',
#  'V98',
#  'V99',
 'addr1',
#  'addr2',
#  'card1',
#  'card2',
#  'card3',
#  'card4',
#  'card5',
#  'card6',
 'dist1',
#  'dist2',
#  'id_01',
#  'id_02',
#  'id_03',
#  'id_04',
#  'id_05',
#  'id_06',
#  'id_07',
#  'id_08',
#  'id_09',
#  'id_10',
#  'id_11',
#  'id_12',
#  'id_13',
#  'id_14',
#  'id_15',
#  'id_16',
#  'id_17',
#  'id_18',
#  'id_19',
#  'id_20',
#  'id_21',
#  'id_22',
#  'id_23',
#  'id_24',
#  'id_25',
#  'id_26',
#  'id_27',
#  'id_28',
#  'id_29',
#  'id_30',
#  'id_31',
#  'id_32',
#  'id_33',
#  'id_34',
#  'id_35',
#  'id_36',
#  'id_37',
#  'id_38',
#  'TransactionDT_dayOfMonth',
#  'TransactionDT_dayOfWeek',
#  'TransactionDT_weekOfMonth',
#  'TransactionDT_hour',
#  'is_holiday',
#  'is_foreign',
#  'OS_NAME',
#  'OS_V0',
#  'OS_V1',
#  'OS_V2',
#  'OS_V_COMBINED',
#  'OS_V_MAJOR',
#  'id_33_height',
#  'id_33_width',
#  'P_emaildomain_1',
#  'P_emaildomain_2',
#  'P_emaildomain_3',
#  'R_emaildomain_1',
#  'R_emaildomain_2',
#  'R_emaildomain_3',
#  'R=P',
#  'R1=P1',
#  'addr1_isnull',
 'addr2_isnull',
#  'dist1_isnull',
 'dist2_isnull',
#  'D2_isnull',
#  'D3_isnull',
#  'D5_isnull',
#  'D6_isnull',
#  'D7_isnull',
#  'D8_isnull',
#  'D9_isnull',
#  'D10_isnull',
#  'D11_isnull',
#  'D12_isnull',
#  'D13_isnull',
#  'D14_isnull',
#  'D15_isnull',
#  'M1_isnull',
#  'M4_isnull',
#  'M6_isnull',
#  'M7_isnull',
#  'V1_isnull',
#  'V12_isnull',
#  'V35_isnull',
#  'V53_isnull',
#  'V75_isnull',
#  'V95_isnull',
#  'V138_isnull',
#  'V167_isnull',
#  'V322_isnull',
#  'id_01_isnull',
#  'id_02_isnull',
#  'id_03_isnull',
#  'id_04_isnull',
#  'id_05_isnull',
#  'id_06_isnull',
#  'id_07_isnull',
#  'id_08_isnull',
#  'id_09_isnull',
#  'id_10_isnull',
#  'id_11_isnull',
#  'id_12_isnull',
#  'id_13_isnull',
#  'id_14_isnull',
#  'id_15_isnull',
#  'id_16_isnull',
#  'id_17_isnull',
#  'id_18_isnull',
#  'id_19_isnull',
#  'id_20_isnull',
#  'id_21_isnull',
#  'id_22_isnull',
#  'id_23_isnull',
#  'id_24_isnull',
#  'id_25_isnull',
#  'id_26_isnull',
#  'id_27_isnull',
#  'id_28_isnull',
#  'id_29_isnull',
#  'id_30_isnull',
#  'id_31_isnull',
#  'id_32_isnull',
#  'id_33_isnull',
#  'id_34_isnull',
#  'id_35_isnull',
#  'id_36_isnull',
#  'id_37_isnull',
#  'id_38_isnull',
 'DeviceType_isnull',
 'DeviceInfo_isnull',
 'device_name',
 'card1_count',
 'card2_count',
 'card3_count',
 'card4_count',
 'card5_count',
 'card6_count',
 'addr1_count',
 'addr2_count',
#  'TransactionDT_hour_count',
#  'card1_TransactionDT_hour_count',
 'card1_TransactionDT_hour_count_how_typical',
#  'TransactionDT_dayOfMonth_count',
#  'TransactionDT_weekOfMonth_count',

 'card1_addr1_count_how_typical',
#  'card1_dist1_count',
#  'card1_dist1_count_how_typical',
#  'card2_addr1_count',
#  'card2_addr1_count_how_typical',
#  'card3_addr1_count',
#  'card3_addr1_count_how_typical',
#  'card4_addr1_count',
#  'card4_addr1_count_how_typical',
#  'card6_addr1_count',
#  'card6_addr1_count_how_typical',
#  'card1_P_emaildomain_count',
 'card1_P_emaildomain_count_how_typical',
#  'card1_R_emaildomain_count',
 'card1_R_emaildomain_count_how_typical',
#  'card1_is_foreign_count',
#  'card1_is_foreign_count_how_typical',
#  'card1_ProductCD_count',
#  'card1_ProductCD_count_how_typical',
#  'card1_C1_count',
 'card1_C1_count_how_typical',
#  'card1_C2_count',
 'card1_C2_count_how_typical',
#  'card1_C3_count',
 'card1_C3_count_how_typical',
#  'card1_C4_count',
 'card1_C4_count_how_typical',
#  'card1_C5_count',
 'card1_C5_count_how_typical',
#  'card1_C6_count',
  'card1_C6_count_how_typical',
#  'card1_C7_count',
  'card1_C7_count_how_typical',
#  'card1_C8_count',
  'card1_C8_count_how_typical',
#  'card1_C9_count',
  'card1_C9_count_how_typical',
#  'card1_C10_count',
  'card1_C10_count_how_typical',
#  'card1_C11_count',
  'card1_C11_count_how_typical',
#  'card1_C12_count',
  'card1_C12_count_how_typical',
#  'card1_C13_count',
  'card1_C13_count_how_typical',
#  'card1_C14_count',
  'card1_C14_count_how_typical',
#  'smoothed_encoded_card1_on_TransactionAmt_mean',
#  'smoothed_encoded_card1_on_TransactionAmt_std',
#  'smoothed_encoded_card1_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_card2_on_TransactionAmt_mean',
#  'smoothed_encoded_card2_on_TransactionAmt_std',
#  'smoothed_encoded_card2_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_card3_on_TransactionAmt_mean',
#  'smoothed_encoded_card3_on_TransactionAmt_std',
#  'smoothed_encoded_card3_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_card5_on_TransactionAmt_mean',
#  'smoothed_encoded_card5_on_TransactionAmt_std',
#  'smoothed_encoded_card5_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_TransactionDT_hour_on_TransactionAmt_mean',
#  'smoothed_encoded_TransactionDT_hour_on_TransactionAmt_std',
#  'smoothed_encoded_TransactionDT_hour_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_mean',
#  'smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_std',
#  'smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_mean',
#  'smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_std',
#  'smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_card2_card3_on_TransactionAmt_mean',
#  'smoothed_encoded_card2_card3_on_TransactionAmt_std',
#  'smoothed_encoded_card2_card3_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_card1_card2_card3_on_TransactionAmt_mean',
#  'smoothed_encoded_card1_card2_card3_on_TransactionAmt_std',
#  'smoothed_encoded_card1_card2_card3_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_card1_on_dist1_mean',
#  'smoothed_encoded_card1_on_dist1_std',
#  'smoothed_encoded_card1_on_dist1_nanmedian',
#  'smoothed_encoded_card1_on_TransactionDT_hour_mean',
#  'smoothed_encoded_card1_on_card2_nunique',
 'smoothed_encoded_card1_on_P_emaildomain_nunique',
 'smoothed_encoded_card1_on_R_emaildomain_nunique',
 'smoothed_encoded_card1_on_ProductCD_nunique',
 'smoothed_encoded_card2_on_P_emaildomain_nunique',
 'smoothed_encoded_card2_on_R_emaildomain_nunique',
 'smoothed_encoded_card2_on_ProductCD_nunique',
 'smoothed_encoded_addr1_on_P_emaildomain_nunique',
 'smoothed_encoded_addr1_on_R_emaildomain_nunique',
 'smoothed_encoded_addr1_on_ProductCD_nunique',
 'smoothed_encoded_R_emaildomain_on_P_emaildomain_nunique',
 'smoothed_encoded_P_emaildomain_on_R_emaildomain_nunique',
#  'smoothed_encoded_card2_on_card1_nunique',
#  'smoothed_encoded_card4_on_card1_nunique',
#  'smoothed_encoded_card6_on_card1_nunique',
#  'smoothed_encoded_card2_card3_on_card1_nunique',
'smoothed_encoded_addr1_on_card1_nunique',
 'smoothed_encoded_card1_TransactionDT_split_TransactionDT_dayOfMonth_TransactionDT_hour_on_TransactionAmt_sum',
    
 'mean_time_between_transactions',
 'median_time_between_transactions',
 'time_from_prev_transaction_ratio_to_mean',
 'time_from_prev_transaction_ratio_to_median',
 'time_to_next_transaction_ratio_to_mean',
 'time_to_next_transaction_ratio_to_median',
    

]

In [12]:
COLS_TO_USE

['TransactionAmt',
 'dist1',
 'dist2',
 'addr1_isnull',
 'addr2_isnull',
 'dist1_isnull',
 'dist2_isnull',
 'D2_isnull',
 'D3_isnull',
 'D5_isnull',
 'D6_isnull',
 'D7_isnull',
 'D8_isnull',
 'D9_isnull',
 'D10_isnull',
 'D11_isnull',
 'D12_isnull',
 'D13_isnull',
 'D14_isnull',
 'D15_isnull',
 'M1_isnull',
 'M4_isnull',
 'M6_isnull',
 'M7_isnull',
 'card1_count',
 'card2_count',
 'card3_count',
 'card4_count',
 'card5_count',
 'card6_count',
 'addr1_count',
 'addr2_count',
 'TransactionDT_hour_count',
 'card1_TransactionDT_hour_count',
 'card1_TransactionDT_hour_count_how_typical',
 'TransactionDT_dayOfMonth_count',
 'TransactionDT_weekOfMonth_count',
 'DeviceInfo_count',
 'device_name_count',
 'P_emaildomain_count',
 'R_emaildomain_count',
 'card1_addr1_count',
 'card1_addr1_count_how_typical',
 'card1_dist1_count',
 'card1_dist1_count_how_typical',
 'card2_addr1_count',
 'card2_addr1_count_how_typical',
 'card3_addr1_count',
 'card3_addr1_count_how_typical',
 'card4_addr1_count',
 '

In [None]:
r = gbm.predict(
    train_test_transformed[
        (train_test_transformed['TransactionDT_split'] >= 19) & \
        (train_test_transformed['TransactionDT_split'] < 20)
    ].sample(1000)[COLS_TO_USE],
    pred_contrib=True
)

mean_shap = np.abs(r).mean(axis=0)


shap_imp = pd.DataFrame(sorted(zip(mean_shap, COLS_TO_USE)), columns=['Value','Feature'])
plt.figure(figsize=(10, 20))
sns.barplot(x="Value", y="Feature", data=shap_imp.sort_values(by="Value", ascending=False)[:150])
plt.tight_layout()
plt.show()



In [15]:
COLS_TO_USE = [
    #'card1_addr1_count',
#  'card1_addr1_count_how_typical',
#  'card1_dist1_count',
#  'card1_dist1_count_how_typical',
#  'card2_addr1_count',
#  'card2_addr1_count_how_typical',
#  'card3_addr1_count',
#  'card3_addr1_count_how_typical',
#  'card4_addr1_count',
#  'card4_addr1_count_how_typical',
#  'card6_addr1_count',
#  'card6_addr1_count_how_typical',
#  'card1_P_emaildomain_count',
#  'card1_P_emaildomain_count_how_typical',
#  'card1_R_emaildomain_count',
#  'card1_R_emaildomain_count_how_typical',
#  'card1_is_foreign_count',
#  'card1_is_foreign_count_how_typical',
#  'card1_ProductCD_count',
#  'card1_ProductCD_count_how_typical',
#  'card1_C1_count',
#  'card1_C1_count_how_typical',
#  'card1_C2_count',
#  'card1_C2_count_how_typical',
#  'card1_C3_count',
#  'card1_C3_count_how_typical',
#  'card1_C4_count',
#  'card1_C4_count_how_typical',
#  'card1_C5_count',
#  'card1_C5_count_how_typical',
#  'card1_C6_count',
#  'card1_C6_count_how_typical',
#  'card1_C7_count',
#  'card1_C7_count_how_typical',
#  'card1_C8_count',
#  'card1_C8_count_how_typical',
#  'card1_C9_count',
#  'card1_C9_count_how_typical',
#  'card1_C10_count',
#  'card1_C10_count_how_typical',
#  'card1_C11_count',
#  'card1_C11_count_how_typical',
#  'card1_C12_count',
#  'card1_C12_count_how_typical',
#  'card1_C13_count',
#  'card1_C13_count_how_typical',
#  'card1_C14_count',
#  'card1_C14_count_how_typical',
#  'smoothed_encoded_card1_on_TransactionAmt_mean',
#  'smoothed_encoded_card1_on_TransactionAmt_std',
#  'smoothed_encoded_card1_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_card2_on_TransactionAmt_mean',
#  'smoothed_encoded_card2_on_TransactionAmt_std',
#  'smoothed_encoded_card2_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_card3_on_TransactionAmt_mean',
#  'smoothed_encoded_card3_on_TransactionAmt_std',
#  'smoothed_encoded_card3_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_card4_on_TransactionAmt_mean',
#  'smoothed_encoded_card4_on_TransactionAmt_std',
#  'smoothed_encoded_card4_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_card5_on_TransactionAmt_mean',
#  'smoothed_encoded_card5_on_TransactionAmt_std',
#  'smoothed_encoded_card5_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_TransactionDT_hour_on_TransactionAmt_mean',
#  'smoothed_encoded_TransactionDT_hour_on_TransactionAmt_std',
#  'smoothed_encoded_TransactionDT_hour_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_mean',
#  'smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_std',
#  'smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_mean',
#  'smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_std',
#  'smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_nanmedian',
#  'smoothed_encoded_card1_on_dist1_mean',
#  'smoothed_encoded_card1_on_dist1_std',
#  'smoothed_encoded_card1_on_dist1_nanmedian',
#  'smoothed_encoded_card1_on_TransactionDT_hour_mean',
#  'smoothed_encoded_card1_on_OS_NAME_nunique',
    
 #'smoothed_encoded_card1_on_card2_nunique',
    
#  'smoothed_encoded_card1_on_card3_nunique',
#  'smoothed_encoded_card1_on_card5_nunique',
#  'smoothed_encoded_card1_on_card6_nunique',
# 'smoothed_encoded_card1_on_addr1_nunique',
    
#  'smoothed_encoded_card1_on_dist1_nunique',
#  'smoothed_encoded_card1_on_dist2_nunique',
#  'smoothed_encoded_card1_on_addr2_nunique',
    
#  'smoothed_encoded_card1_on_P_emaildomain_nunique',
#  'smoothed_encoded_card1_on_R_emaildomain_nunique',   # GOOD
    
#  'smoothed_encoded_card1_on_ProductCD_nunique',
    
#  'smoothed_encoded_card2_on_card3_nunique',
#  'smoothed_encoded_card2_on_card4_nunique',
#  'smoothed_encoded_card2_on_card5_nunique',
#  'smoothed_encoded_card2_on_card6_nunique',
#  'smoothed_encoded_card2_on_addr1_nunique',
#  'smoothed_encoded_card2_on_addr2_nunique',
#  'smoothed_encoded_card2_on_dist1_nunique',
#  'smoothed_encoded_card2_on_dist2_nunique',
#  'smoothed_encoded_card2_on_P_emaildomain_nunique',
#  'smoothed_encoded_card2_on_R_emaildomain_nunique',
#  'smoothed_encoded_card2_on_ProductCD_nunique',
#  'smoothed_encoded_card3_on_card4_nunique',
#  'smoothed_encoded_card3_on_card5_nunique',
#  'smoothed_encoded_card3_on_card6_nunique',
#  'smoothed_encoded_card3_on_addr1_nunique',
#  'smoothed_encoded_card3_on_addr2_nunique',
#  'smoothed_encoded_card3_on_dist1_nunique',
#  'smoothed_encoded_card3_on_dist2_nunique',
#  'smoothed_encoded_card3_on_P_emaildomain_nunique',
#  'smoothed_encoded_card3_on_R_emaildomain_nunique',
#  'smoothed_encoded_card3_on_ProductCD_nunique',
#  'smoothed_encoded_card4_on_card5_nunique',
#  'smoothed_encoded_card4_on_card6_nunique',
#  'smoothed_encoded_card4_on_addr1_nunique',
#  'smoothed_encoded_card4_on_addr2_nunique',
#  'smoothed_encoded_card4_on_dist1_nunique',
#  'smoothed_encoded_card4_on_dist2_nunique',
#  'smoothed_encoded_card4_on_P_emaildomain_nunique',
#  'smoothed_encoded_card4_on_R_emaildomain_nunique',
#  'smoothed_encoded_card4_on_ProductCD_nunique',
#  'smoothed_encoded_card5_on_card6_nunique',
#  'smoothed_encoded_card5_on_addr1_nunique',
#  'smoothed_encoded_card5_on_addr2_nunique',
#  'smoothed_encoded_card5_on_dist1_nunique',
#  'smoothed_encoded_card5_on_dist2_nunique',
#  'smoothed_encoded_card5_on_P_emaildomain_nunique',
#  'smoothed_encoded_card5_on_R_emaildomain_nunique',
#  'smoothed_encoded_card5_on_ProductCD_nunique',
#  'smoothed_encoded_addr1_on_addr2_nunique',
#  'smoothed_encoded_addr1_on_dist1_nunique',
#  'smoothed_encoded_addr1_on_dist2_nunique',
    
 'smoothed_encoded_addr1_on_P_emaildomain_nunique',
 'smoothed_encoded_addr1_on_R_emaildomain_nunique',
# 'smoothed_encoded_addr1_on_ProductCD_nunique',
#  'smoothed_encoded_R_emaildomain_on_P_emaildomain_nunique',
    
# 'smoothed_encoded_card1_TransactionDT_split_TransactionDT_dayOfMonth_TransactionDT_hour_on_TransactionAmt_sum'
]

In [5]:
COLS_TO_USE

['C1',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'D1',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'DeviceInfo',
 'DeviceType',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P_emaildomain',
 'ProductCD',
 'R_emaildomain',
 'TransactionAmt',
 'V1',
 'V10',
 'V100',
 'V101',
 'V102',
 'V103',
 'V104',
 'V105',
 'V106',
 'V107',
 'V108',
 'V109',
 'V11',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V119',
 'V12',
 'V120',
 'V121',
 'V122',
 'V123',
 'V124',
 'V125',
 'V126',
 'V127',
 'V128',
 'V129',
 'V13',
 'V130',
 'V131',
 'V132',
 'V133',
 'V134',
 'V135',
 'V136',
 'V137',
 'V138',
 'V139',
 'V14',
 'V140',
 'V141',
 'V142',
 'V143',
 'V144',
 'V145',
 'V146',
 'V147',
 'V148',
 'V149',
 'V15',
 'V150',
 'V151',
 'V152',
 'V153',
 'V154',
 'V155',
 'V156',
 'V157',
 'V158',
 'V159',
 'V16',
 'V160',
 'V161',
 'V

In [20]:
sample_submission['TransactionID'] = train_test_transformed.iloc[train.shape[0]:].index
sample_submission['isFraud'] = np.array(preds).mean(axis=0)
sample_submission.head()

vv = np.array([i.best_score['valid']['auc'] for i in gbms]).mean()
filename = f'submit_ts_kfold_{vv}_mean_shuffled.csv'
sample_submission.to_csv(filename, index=None)
print(filename)

!KAGGLE_USERNAME=merkylove KAGGLE_KEY=d5a91994e09f1cb75e62c8b8b2967594 kaggle competitions submit ieee-fraud-detection -f {filename} -m "auto"

submit_ts_kfold_0.9406398446273224_mean_shuffled.csv
100%|██████████████████████████████████████| 13.8M/13.8M [00:00<00:00, 15.3MB/s]
Successfully submitted to IEEE-CIS Fraud Detection

In [162]:
['C1',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'D1',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P_emaildomain',
 'ProductCD',
 'R_emaildomain',
 'TransactionAmt',
 'V1',
 'V10',
 'V100',
 'V101',
 'V102',
 'V103',
 'V104',
 'V105',
 'V106',
 'V107',
 'V108',
 'V109',
 'V11',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V119',
 'V12',
 'V120',
 'V121',
 'V122',
 'V123',
 'V124',
 'V125',
 'V126',
 'V127',
 'V128',
 'V129',
 'V13',
 'V130',
 'V131',
 'V132',
 'V133',
 'V134',
 'V135',
 'V136',
 'V137',
 'V138',
 'V139',
 'V14',
 'V140',
 'V141',
 'V142',
 'V143',
 'V144',
 'V145',
 'V146',
 'V147',
 'V148',
 'V149',
 'V15',
 'V150',
 'V151',
 'V152',
 'V153',
 'V154',
 'V155',
 'V156',
 'V157',
 'V158',
 'V159',
 'V16',
 'V160',
 'V161',
 'V162',
 'V163',
 'V164',
 'V165',
 'V166',
 'V167',
 'V168',
 'V169',
 'V17',
 'V170',
 'V171',
 'V172',
 'V173',
 'V174',
 'V175',
 'V176',
 'V177',
 'V178',
 'V179',
 'V18',
 'V180',
 'V181',
 'V182',
 'V183',
 'V184',
 'V185',
 'V186',
 'V187',
 'V188',
 'V189',
 'V19',
 'V190',
 'V191',
 'V192',
 'V193',
 'V194',
 'V195',
 'V196',
 'V197',
 'V198',
 'V199',
 'V2',
 'V20',
 'V200',
 'V201',
 'V202',
 'V203',
 'V204',
 'V205',
 'V206',
 'V207',
 'V208',
 'V209',
 'V21',
 'V210',
 'V211',
 'V212',
 'V213',
 'V214',
 'V215',
 'V216',
 'V217',
 'V218',
 'V219',
 'V22',
 'V220',
 'V221',
 'V222',
 'V223',
 'V224',
 'V225',
 'V226',
 'V227',
 'V228',
 'V229',
 'V23',
 'V230',
 'V231',
 'V232',
 'V233',
 'V234',
 'V235',
 'V236',
 'V237',
 'V238',
 'V239',
 'V24',
 'V240',
 'V241',
 'V242',
 'V243',
 'V244',
 'V245',
 'V246',
 'V247',
 'V248',
 'V249',
 'V25',
 'V250',
 'V251',
 'V252',
 'V253',
 'V254',
 'V255',
 'V256',
 'V257',
 'V258',
 'V259',
 'V26',
 'V260',
 'V261',
 'V262',
 'V263',
 'V264',
 'V265',
 'V266',
 'V267',
 'V268',
 'V269',
 'V27',
 'V270',
 'V271',
 'V272',
 'V273',
 'V274',
 'V275',
 'V276',
 'V277',
 'V278',
 'V279',
 'V28',
 'V280',
 'V281',
 'V282',
 'V283',
 'V284',
 'V285',
 'V286',
 'V287',
 'V288',
 'V289',
 'V29',
 'V290',
 'V291',
 'V292',
 'V293',
 'V294',
 'V295',
 'V296',
 'V297',
 'V298',
 'V299',
 'V3',
 'V30',
 'V300',
 'V301',
 'V302',
 'V303',
 'V304',
 'V305',
 'V306',
 'V307',
 'V308',
 'V309',
 'V31',
 'V310',
 'V311',
 'V312',
 'V313',
 'V314',
 'V315',
 'V316',
 'V317',
 'V318',
 'V319',
 'V32',
 'V320',
 'V321',
 'V322',
 'V323',
 'V324',
 'V325',
 'V326',
 'V327',
 'V328',
 'V329',
 'V33',
 'V330',
 'V331',
 'V332',
 'V333',
 'V334',
 'V335',
 'V336',
 'V337',
 'V338',
 'V339',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V4',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V5',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V6',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V7',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V8',
 'V80',
 'V81',
 'V82',
 'V83',
 'V84',
 'V85',
 'V86',
 'V87',
 'V88',
 'V89',
 'V9',
 'V90',
 'V91',
 'V92',
 'V93',
 'V94',
 'V95',
 'V96',
 'V97',
 'V98',
 'V99',
 'addr1',
 'dist1',
 'addr2_isnull',
 'dist2_isnull',
 'DeviceType_isnull',
 'DeviceInfo_isnull',
 'device_name',
 'card1_count',
 'card2_count',
 'card3_count',
 'card4_count',
 'card5_count',
 'card6_count',
 'addr1_count',
 'addr2_count',
 'card1_TransactionDT_hour_count_how_typical',
 'card1_addr1_count_how_typical',
 'card1_P_emaildomain_count_how_typical',
 'card1_R_emaildomain_count_how_typical',
 'card1_C1_count_how_typical',
 'card1_C2_count_how_typical',
 'card1_C3_count_how_typical',
 'card1_C4_count_how_typical',
 'card1_C5_count_how_typical',
 'card1_C6_count_how_typical',
 'card1_C7_count_how_typical',
 'card1_C8_count_how_typical',
 'card1_C9_count_how_typical',
 'card1_C10_count_how_typical',
 'card1_C11_count_how_typical',
 'card1_C12_count_how_typical',
 'card1_C13_count_how_typical',
 'card1_C14_count_how_typical',
 'smoothed_encoded_card1_on_P_emaildomain_nunique',
 'smoothed_encoded_card1_on_R_emaildomain_nunique',
 'smoothed_encoded_card1_on_ProductCD_nunique',
 'smoothed_encoded_card2_on_P_emaildomain_nunique',
 'smoothed_encoded_card2_on_R_emaildomain_nunique',
 'smoothed_encoded_card2_on_ProductCD_nunique',
 'smoothed_encoded_addr1_on_P_emaildomain_nunique',
 'smoothed_encoded_addr1_on_R_emaildomain_nunique',
 'smoothed_encoded_addr1_on_ProductCD_nunique',
 'smoothed_encoded_R_emaildomain_on_P_emaildomain_nunique',
 'smoothed_encoded_P_emaildomain_on_R_emaildomain_nunique',
 'smoothed_encoded_addr1_on_card1_nunique',
 'smoothed_encoded_card1_TransactionDT_split_TransactionDT_dayOfMonth_TransactionDT_hour_on_TransactionAmt_sum']

['C1',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'D1',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P_emaildomain',
 'ProductCD',
 'R_emaildomain',
 'TransactionAmt',
 'V1',
 'V10',
 'V100',
 'V101',
 'V102',
 'V103',
 'V104',
 'V105',
 'V106',
 'V107',
 'V108',
 'V109',
 'V11',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V119',
 'V12',
 'V120',
 'V121',
 'V122',
 'V123',
 'V124',
 'V125',
 'V126',
 'V127',
 'V128',
 'V129',
 'V13',
 'V130',
 'V131',
 'V132',
 'V133',
 'V134',
 'V135',
 'V136',
 'V137',
 'V138',
 'V139',
 'V14',
 'V140',
 'V141',
 'V142',
 'V143',
 'V144',
 'V145',
 'V146',
 'V147',
 'V148',
 'V149',
 'V15',
 'V150',
 'V151',
 'V152',
 'V153',
 'V154',
 'V155',
 'V156',
 'V157',
 'V158',
 'V159',
 'V16',
 'V160',
 'V161',
 'V162',
 'V163',
 'V164',
 'V165

In [163]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [165]:
train_test_transformed[[f'V{i}' for i in range(1, 12)]].corr()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11
V1,1.0,0.022551,0.016818,0.010361,0.009728,0.022909,0.017923,0.030131,0.024366,0.004665,0.004541
V2,0.022551,1.0,0.756247,0.280211,0.26788,0.534092,0.416574,0.663982,0.551424,0.032982,0.035165
V3,0.016818,0.756247,1.0,0.208703,0.319912,0.396648,0.480968,0.494421,0.599841,-0.010145,0.02105
V4,0.010361,0.280211,0.208703,1.0,0.905086,0.236708,0.188142,0.296748,0.243133,-0.004677,-0.00575
V5,0.009728,0.26788,0.319912,0.905086,1.0,0.220123,0.266676,0.284097,0.33562,-0.0274,-0.00917
V6,0.022909,0.534092,0.396648,0.236708,0.220123,1.0,0.78085,0.5929,0.487246,0.043465,0.042622
V7,0.017923,0.416574,0.480968,0.188142,0.266676,0.78085,1.0,0.457812,0.527379,0.011838,0.039588
V8,0.030131,0.663982,0.494421,0.296748,0.284097,0.5929,0.457812,1.0,0.813416,0.065376,0.063721
V9,0.024366,0.551424,0.599841,0.243133,0.33562,0.487246,0.527379,0.813416,1.0,0.032289,0.058662
V10,0.004665,0.032982,-0.010145,-0.004677,-0.0274,0.043465,0.011838,0.065376,0.032289,1.0,0.966152


In [205]:
rest = PCA(n_components=0.99).fit_transform(StandardScaler().fit_transform(train_test_transformed[[f'V{i}' for i in range(1, 340)]].fillna(-0.5)))

In [206]:
rest.shape

(1097231, 149)

In [207]:
folds = GroupKFold(n_splits=6)

scores = []

for fold_ind, (train_idx, val_idx) in enumerate(
    folds.split(
        X=train_test_transformed[:train.shape[0]], 
        y=train_test_transformed[:train.shape[0]]['isFraud'], 
        groups=train_test_transformed[:train.shape[0]]['TransactionDT_split'].tolist()
    )
):
    
    #if fold_ind == 0:
    #    continue
    
    print('Fold', fold_ind)
    
    pars = {
        'num_leaves': 2 ** 7 - 1,
        #'min_data_in_leaf': 10,
        'learning_rate': 0.01,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 51,
        'cat_smooth': 209,
        'lambda_l1': 1.0,
        'lambda_l2': 3.0,
        'max_bin': 100,
        'scale_pos_weight': 7.0,
        #'max_cat_to_onehot': 10,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': ['auc'],
        'num_threads': -1
    }
    
    cat_c = [i for i in CATEGORICAL_FEATURES
        if i in COLS_TO_USE]
    
    cur_train = train_test_transformed[:train.shape[0]].iloc[train_idx]
    cur_test = train_test_transformed[:train.shape[0]].iloc[val_idx]
    
    # create dataset for lightgbm
    #lgb_train = lgb.Dataset(cur_train[COLS_TO_USE], cur_train['isFraud'])
    #lgb_eval = lgb.Dataset(cur_test[COLS_TO_USE], cur_test['isFraud'], reference=lgb_train)
    
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(rest[train_idx, :], cur_train['isFraud'])
    lgb_eval = lgb.Dataset(rest[val_idx, :], cur_test['isFraud'], reference=lgb_train)
    
    gbm = lgb.train(
        pars,
        lgb_train,
        num_boost_round=20000,
        valid_sets=(lgb_train, lgb_eval),
        valid_names=('train', 'valid'),
        early_stopping_rounds=100,
        #feature_name=COLS_TO_USE,
        #categorical_feature=cat_c,
        verbose_eval=100
    )
    
    scores.append(gbm.best_score['valid']['auc'])
    
    
import scipy


print(np.array(scores).mean(axis=0))

Fold 0
Training until validation scores don't improve for 100 rounds.
[100]	train's auc: 0.894964	valid's auc: 0.788835
[200]	train's auc: 0.905982	valid's auc: 0.7931
[300]	train's auc: 0.91458	valid's auc: 0.794623
[400]	train's auc: 0.921769	valid's auc: 0.797659
[500]	train's auc: 0.927747	valid's auc: 0.797867
Early stopping, best iteration is:
[446]	train's auc: 0.924661	valid's auc: 0.798602
Fold 1
Training until validation scores don't improve for 100 rounds.
[100]	train's auc: 0.887927	valid's auc: 0.850761
[200]	train's auc: 0.899322	valid's auc: 0.853699
[300]	train's auc: 0.907985	valid's auc: 0.85597
[400]	train's auc: 0.915423	valid's auc: 0.857067
[500]	train's auc: 0.921983	valid's auc: 0.857819
Early stopping, best iteration is:
[473]	train's auc: 0.920289	valid's auc: 0.858036
Fold 2
Training until validation scores don't improve for 100 rounds.
[100]	train's auc: 0.888587	valid's auc: 0.847537
[200]	train's auc: 0.899313	valid's auc: 0.851353
[300]	train's auc: 0.907

In [12]:
train_test_transformed[['V126-137_mean', 'V126-137_std', 'TransactionAmt'] + [f'V{i}' for i in range(126, 138)]]

Unnamed: 0_level_0,V126-137_mean,V126-137_std,TransactionAmt,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2987000,,,68.500,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
2987001,,,29.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2987002,,,59.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2987003,,,50.000,50.0,1758.0,925.0,0.0,354.0,135.0,50.0,1404.0,790.0,0.0,0.0,0.0
2987004,,,50.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170235,,,94.679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4170236,,,12.173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4170237,,,49.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4170238,,,202.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
(train_test_transformed[[f'V{i}' for i in range(126, 138)]] > 0).sum(axis=1)

TransactionID
2987000    2
2987001    0
2987002    0
2987003    8
2987004    0
          ..
4170235    0
4170236    0
4170237    0
4170238    0
4170239    0
Length: 1097231, dtype: int64

In [21]:
train_test_transformed[[f'V{i}' for i in range(126, 138)]].sum(axis=1) / (train_test_transformed[[f'V{i}' for i in range(126, 138)]] > 0).sum(axis=1)

TransactionID
2987000    117.00
2987001       NaN
2987002       NaN
2987003    683.25
2987004       NaN
            ...  
4170235       NaN
4170236       NaN
4170237       NaN
4170238       NaN
4170239       NaN
Length: 1097231, dtype: float64

In [14]:
train_test_transformed['V126-137_mean_ratio_Amt'] = train_test_transformed['V126-137_mean'] / train_test_transformed['TransactionAmt']
train_test_transformed['V126-137_mean_ratio_std'] = train_test_transformed['V126-137_mean'] / train_test_transformed['V126-137_std']


train_test_transformed['V306-321_mean_ratio_Amt'] = train_test_transformed['V306-321_mean'] / train_test_transformed['TransactionAmt']
train_test_transformed['V306-321_mean_ratio_std'] = train_test_transformed['V306-321_mean'] / train_test_transformed['V306-321_std']

In [98]:
folds = GroupKFold(n_splits=6)


scores = []

CTU = [
    'TransactionAmt_count_within_1min',
 'TransactionAmt_sum_within_1min',
 'TransactionAmt_mean_within_1min',
 'TransactionAmt_std_within_1min',
        'TransactionAmt_count_within_5min',
 'TransactionAmt_sum_within_5min',
 'TransactionAmt_mean_within_5min',
 'TransactionAmt_std_within_5min',
 'TransactionAmt_count_within_1h',
 'TransactionAmt_sum_within_1h',
 'TransactionAmt_mean_within_1h',
 'TransactionAmt_std_within_1h',
 'TransactionAmt_count_within_1d',
 'TransactionAmt_sum_within_1d',
 'TransactionAmt_mean_within_1d',
 'TransactionAmt_std_within_1d',
    'TransactionAmt_count_within_7d',
 'TransactionAmt_sum_within_7d',
 'TransactionAmt_mean_within_7d',
 'TransactionAmt_std_within_7d',
#    'TransactionAmt'
]

for fold_ind, (train_idx, val_idx) in enumerate(
    folds.split(
        X=train_test_transformed[:train.shape[0]], 
        y=train_test_transformed[:train.shape[0]]['isFraud'], 
        groups=train_test_transformed[:train.shape[0]]['TransactionDT_split'].tolist()
    )
):
    
    #if fold_ind == 0:
    #    continue
    
    print('Fold', fold_ind)
    
    pars = {
        'num_leaves': 2 ** 7 - 1,
        #'min_data_in_leaf': 10,
        'learning_rate': 0.01,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 51,
        'cat_smooth': 209,
        'lambda_l1': 1.0,
        'lambda_l2': 3.0,
        'max_bin': 100,
        'scale_pos_weight': 7.0,
        #'max_cat_to_onehot': 10,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': ['auc'],
        'num_threads': -1
    }
    
    #cat_c = [i for i in CATEGORICAL_FEATURES
    #    if i in COLS_TO_USE]
    
    cur_train = train_test_transformed[:train.shape[0]].iloc[train_idx]
    cur_test = train_test_transformed[:train.shape[0]].iloc[val_idx]
    
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(cur_train[CTU], cur_train['isFraud'])
    lgb_eval = lgb.Dataset(cur_test[CTU], cur_test['isFraud'], reference=lgb_train)
    
    gbm = lgb.train(
        pars,
        lgb_train,
        num_boost_round=20000,
        valid_sets=(lgb_train, lgb_eval),
        valid_names=('train', 'valid'),
        early_stopping_rounds=100,
        #feature_name=COLS_TO_USE,
        #categorical_feature=cat_c,
        verbose_eval=100
    )
    
    scores.append(gbm.best_score['valid']['auc'])
    
    
import scipy


print(np.array(scores).mean(axis=0))

Fold 0
Training until validation scores don't improve for 100 rounds.
[100]	train's auc: 0.790423	valid's auc: 0.68068
Early stopping, best iteration is:
[73]	train's auc: 0.785124	valid's auc: 0.681326
Fold 1


MemoryError: 

In [53]:
train_test_transformed['Transaction_to_datetime'] = train_test_transformed['TransactionDT'].apply(
        lambda x: datetime.datetime.strptime('2017-11-30', '%Y-%m-%d') + datetime.timedelta(seconds=x)
    )

In [55]:
train_test_transformed.head()

Unnamed: 0,TransactionID,C1,C10,C11,C12,C13,C14,C2,C3,C4,...,smoothed_encoded_addr1_on_ProductCD_nunique,smoothed_encoded_R_emaildomain_on_P_emaildomain_nunique,smoothed_encoded_P_emaildomain_on_R_emaildomain_nunique,smoothed_encoded_card2_on_card1_nunique,smoothed_encoded_card4_on_card1_nunique,smoothed_encoded_card6_on_card1_nunique,smoothed_encoded_card2_card3_on_card1_nunique,smoothed_encoded_addr1_on_card1_nunique,smoothed_encoded_card1_TransactionDT_split_TransactionDT_dayOfMonth_TransactionDT_hour_on_TransactionAmt_sum,Transaction_to_datetime
0,2987000,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,...,4.0,,,,21.0,7214.0,,1750.0,68.5,2017-12-01 00:00:00
1,2987001,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,5.0,,54.0,21.0,5750.0,7214.0,19.0,2205.0,29.0,2017-12-01 00:00:01
2,2987002,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,5.0,,20.0,125.0,9705.0,9994.0,125.0,1886.0,106.95,2017-12-01 00:01:09
3,2987003,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,...,4.0,,43.0,34.0,5750.0,9994.0,34.0,1018.0,286.95,2017-12-01 00:01:39
4,2987004,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,4.0,,54.0,117.0,5750.0,7214.0,116.0,642.0,50.0,2017-12-01 00:01:46


In [67]:
train_test_transformed.set_index('Transaction_to_datetime', inplace=True)

In [69]:
train_test_transformed[['TransactionID']]

Unnamed: 0_level_0,TransactionID
Transaction_to_datetime,Unnamed: 1_level_1
2017-12-01 00:00:00,2987000
2017-12-01 00:00:01,2987001
2017-12-01 00:01:09,2987002
2017-12-01 00:01:39,2987003
2017-12-01 00:01:46,2987004
...,...
2018-12-30 23:57:59,4170235
2018-12-30 23:58:07,4170236
2018-12-30 23:58:46,4170237
2018-12-30 23:58:57,4170238


In [71]:
ab = train_test_transformed[['card1', 'TransactionAmt']].groupby('card1')\
        .rolling(interval)['TransactionAmt']\
        .count()

In [73]:
ab = ab.reset_index().set_index('Transaction_to_datetime')

In [82]:
train_test_transformed

Unnamed: 0_level_0,TransactionID,C1,C10,C11,C12,C13,C14,C2,C3,C4,...,smoothed_encoded_addr1_on_R_emaildomain_nunique,smoothed_encoded_addr1_on_ProductCD_nunique,smoothed_encoded_R_emaildomain_on_P_emaildomain_nunique,smoothed_encoded_P_emaildomain_on_R_emaildomain_nunique,smoothed_encoded_card2_on_card1_nunique,smoothed_encoded_card4_on_card1_nunique,smoothed_encoded_card6_on_card1_nunique,smoothed_encoded_card2_card3_on_card1_nunique,smoothed_encoded_addr1_on_card1_nunique,smoothed_encoded_card1_TransactionDT_split_TransactionDT_dayOfMonth_TransactionDT_hour_on_TransactionAmt_sum
Transaction_to_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-12-01 00:00:00,2987000,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,...,33.0,4.0,,,,21.0,7214.0,,1750.0,68.500
2017-12-01 00:00:01,2987001,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,39.0,5.0,,54.0,21.0,5750.0,7214.0,19.0,2205.0,29.000
2017-12-01 00:01:09,2987002,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,41.0,5.0,,20.0,125.0,9705.0,9994.0,125.0,1886.0,106.950
2017-12-01 00:01:39,2987003,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,...,34.0,4.0,,43.0,34.0,5750.0,9994.0,34.0,1018.0,286.950
2017-12-01 00:01:46,2987004,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,25.0,4.0,,54.0,117.0,5750.0,7214.0,116.0,642.0,50.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-30 23:57:59,4170235,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,19.0,1.0,50.0,54.0,5.0,5750.0,9994.0,5.0,188.0,150.503
2018-12-30 23:58:07,4170236,1.0,2.0,1.0,1.0,3.0,1.0,3.0,0.0,1.0,...,,,39.0,40.0,16.0,5750.0,9994.0,16.0,,136.363
2018-12-30 23:58:46,4170237,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,28.0,4.0,,40.0,125.0,9705.0,9994.0,125.0,1124.0,49.000
2018-12-30 23:58:57,4170238,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,26.0,4.0,,40.0,14.0,5750.0,9994.0,14.0,830.0,202.000


In [81]:
ab.reset_index().sort_values('Transaction_to_datetime')

Unnamed: 0,Transaction_to_datetime,card1,TransactionAmt
263252,2017-12-01 00:00:00,4248,1.0
629904,2017-12-01 00:00:01,9979,1.0
742960,2017-12-01 00:01:09,11850,1.0
553376,2017-12-01 00:01:39,8796,1.0
737291,2017-12-01 00:01:46,11687,1.0
...,...,...,...
259849,2018-12-30 23:57:59,4146,42.0
672557,2018-12-30 23:58:07,10367,49.0
463601,2018-12-30 23:58:46,7207,4.0
453750,2018-12-30 23:58:57,7165,1.0


In [27]:
train_test_transformed\
        .groupby('card1')['TransactionAmt']\
        .rolling(interval)\
        .count()

Unnamed: 0_level_0,Unnamed: 1_level_0,TransactionAmt,TransactionID
card1,TransactionDT_to_datetime,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2018-02-04 23:36:59,1.0,1.0
1,2018-01-15 21:14:11,1.0,1.0
2,2018-08-24 20:29:07,1.0,1.0
3,2018-10-13 13:52:45,1.0,1.0
3,2018-12-02 03:36:21,1.0,1.0
...,...,...,...
17090,2018-09-17 15:20:22,1.0,1.0
17090,2018-09-18 14:45:34,2.0,2.0
17090,2018-09-25 14:59:11,1.0,1.0
17090,2018-09-26 15:25:35,1.0,1.0


In [93]:
for interval in ['1week']:
    print(interval)
    train_test_transformed[f'TransactionAmt_count_within_{interval}'] = train_test_transformed\
        .groupby('card1')['TransactionAmt']\
        .rolling(interval)\
        .count()\
        .reset_index().sort_values('Transaction_to_datetime')['TransactionAmt'].values
    
    print('count')
    
    train_test_transformed[f'TransactionAmt_sum_within_{interval}'] = train_test_transformed \
        .groupby('card1')['TransactionAmt'] \
        .rolling(interval) \
        .sum()\
        .reset_index().sort_values('Transaction_to_datetime')['TransactionAmt'].values
    
    print('sum')

    train_test_transformed[f'TransactionAmt_mean_within_{interval}'] = train_test_transformed \
        .groupby('card1')['TransactionAmt'] \
        .rolling(interval) \
        .mean()\
        .reset_index().sort_values('Transaction_to_datetime')['TransactionAmt'].values
    
    print('mean')

    train_test_transformed[f'TransactionAmt_std_within_{interval}'] = train_test_transformed \
        .groupby('card1')['TransactionAmt'] \
        .rolling(interval) \
        .std()\
        .reset_index().sort_values('Transaction_to_datetime')['TransactionAmt'].values
    
    print('std')

1week


ValueError: passed window 1week is not compatible with a datetimelike index

In [22]:
encoders['card1'].transform(['12037'])

array([2207])

In [15]:
train_test_transformed['TransactionAmt_mean_within_1h']

TransactionDT_to_datetime
2017-12-01 00:00:00      23.443
2017-12-01 00:00:01      29.000
2017-12-01 00:01:09    2003.350
2017-12-01 00:01:39      51.213
2017-12-01 00:01:46     226.000
                         ...   
2018-12-30 23:57:59      25.000
2018-12-30 23:58:07      20.000
2018-12-30 23:58:46      25.000
2018-12-30 23:58:57      25.000
2018-12-30 23:59:05      25.000
Name: TransactionAmt_mean_within_1h, Length: 1097231, dtype: float64

In [217]:
train_test_transformed[['card1']].head()

Unnamed: 0_level_0,card1
TransactionID,Unnamed: 1_level_1
2987000,2094
2987001,4922
2987002,5827
2987003,4358
2987004,5751


In [222]:
train_test_transformed['time_from_prev_transaction'] = train_test_transformed.groupby('card1')['TransactionDT'].diff()

In [223]:
train_test_transformed[train_test_transformed.card1 == 4358][['time_from_prev_transaction', 'TransactionDT']]

Unnamed: 0_level_0,time_from_prev_transaction,TransactionDT
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1
2987003,,86499
2987062,1102.0,87601
2987076,186.0,87787
2987383,4829.0,92616
2987471,1644.0,94260
...,...,...
4169126,5539.0,34193148
4169373,4425.0,34197573
4169461,1921.0,34199494
4169762,5628.0,34205122


In [227]:
train_test_transformed['time_to_next_transaction'] = train_test_transformed.groupby('card1')['time_from_prev_transaction'].shift(-1)

In [226]:
train_test_transformed[train_test_transformed.card1 == 4358][['TransactionDT']].diff().shift(-1)

Unnamed: 0_level_0,TransactionDT
TransactionID,Unnamed: 1_level_1
2987003,1102.0
2987062,186.0
2987076,4829.0
2987383,1644.0
2987471,737.0
...,...
4169126,4425.0
4169373,1921.0
4169461,5628.0
4169762,4077.0


In [229]:
train_test_transformed[train_test_transformed.card1 == 4358][['TransactionDT', 'time_to_next_transaction', 'time_from_prev_transaction']]

Unnamed: 0_level_0,TransactionDT,time_to_next_transaction,time_from_prev_transaction
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2987003,86499,1102.0,
2987062,87601,186.0,1102.0
2987076,87787,4829.0,186.0
2987383,92616,1644.0,4829.0
2987471,94260,737.0,1644.0
...,...,...,...
4169126,34193148,4425.0,5539.0
4169373,34197573,1921.0,4425.0
4169461,34199494,5628.0,1921.0
4169762,34205122,4077.0,5628.0


In [34]:
train_test_transformed['card2-6_count'] = train_test_transformed.groupby(['card2', 'card3', 'card4', 'card5', 'card6'])['card1'].transform('count')

In [38]:
train_test_transformed['mean_D9'] = train_test_transformed.groupby(['card1'])['D9'].transform('mean')

In [39]:
train_test_transformed['mean_D9']

TransactionID
2987000    0.708333
2987001    0.610897
2987002    0.736111
2987003    0.563248
2987004    0.624999
             ...   
4170235    0.504613
4170236    0.498602
4170237    0.668320
4170238    0.566583
4170239    0.517676
Name: mean_D9, Length: 1097231, dtype: float64

In [None]:
def permutation_importance(X, y, model): 
    perm = {}
    y_true = model.predict_proba(X)[:,1]
    baseline= roc_auc_score(y, y_true)
    for cols in X.columns:
        value = X[cols]
        X[cols] = np.random.permutation(X[cols].values)
        y_true = model.predict_proba(X)[:,1]
        perm[cols] = roc_auc_score(y, y_true) - baseline
        X[cols] = value
    return perm

In [24]:
COLS_TO_USE

['C1',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'D1',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P_emaildomain',
 'ProductCD',
 'R_emaildomain',
 'TransactionAmt',
 'V91',
 'V70',
 'V317',
 'V258',
 'V62',
 'V312',
 'V165',
 'V49',
 'V48',
 'addr1',
 'dist1',
 'addr2_isnull',
 'dist2_isnull',
 'DeviceType_isnull',
 'DeviceInfo_isnull',
 'device_name',
 'card1_count',
 'card2_count',
 'card3_count',
 'card4_count',
 'card5_count',
 'card6_count',
 'addr1_count',
 'addr2_count',
 'card1_TransactionDT_hour_count_how_typical',
 'card1_addr1_count_how_typical',
 'card1_P_emaildomain_count_how_typical',
 'card1_R_emaildomain_count_how_typical',
 'card1_C1_count_how_typical',
 'card1_C2_count_how_typical',
 'card1_C3_count_how_typical',
 'card1_C4_count_how_typical',
 'card1_C5_count_how_typical',
 'card1_C6_count

In [48]:
folds = GroupKFold(n_splits=6)

from sklearn.metrics import roc_auc_score


for C in COLS_TO_USE:
    
    print(C, '\n')
    
    cur_scores = []

    for fold_ind, (train_idx, val_idx) in enumerate(
        folds.split(
            X=train_test_transformed[:train.shape[0]], 
            y=train_test_transformed[:train.shape[0]]['isFraud'], 
            groups=train_test_transformed[:train.shape[0]]['TransactionDT_split'].tolist()
        )
    ):

        cur_test = train_test_transformed[:train.shape[0]].iloc[val_idx].copy()
        cur_test.reset_index(inplace=True)
        tmp = cur_test[C].sample(frac=1.0).values
        cur_test[C] = tmp
        
        cur_preds = gbms[fold_ind].predict(cur_test[COLS_TO_USE])
        
        auc = roc_auc_score(train_test_transformed[:train.shape[0]].iloc[val_idx]['isFraud'], cur_preds)
        best_auc_diff = auc - gbms[fold_ind].best_score['valid']['auc']
        print(f'Feature {C}\tFold {fold_ind} = {auc}\tFrom Best = {best_auc_diff}')
        cur_scores.append(auc)
        
    mean_auc = np.array(cur_scores).mean()
    print(f'Feature avg auc = {mean_auc}')
    
    print('='*30)

C1 

Feature C1	Fold 0 = 0.8991628493554414	From Best = -0.009344590466301428
Feature C1	Fold 1 = 0.9387182853417214	From Best = -0.0044233127663524385
Feature C1	Fold 2 = 0.9425489218962381	From Best = -0.002082196073858511
Feature C1	Fold 3 = 0.9314126680771789	From Best = -0.003336970466762623
Feature C1	Fold 4 = 0.9429034332113593	From Best = -0.0025899740823062123
Feature C1	Fold 5 = 0.9525615713824254	From Best = -0.0027069794710056216
Feature avg auc = 0.9345512882107273
C10 

Feature C10	Fold 0 = 0.9084722356610903	From Best = -3.5204160652613226e-05
Feature C10	Fold 1 = 0.9431462067478046	From Best = 4.608639730729891e-06
Feature C10	Fold 2 = 0.9446355698322868	From Best = 4.4518621902245314e-06
Feature C10	Fold 3 = 0.9346145516149477	From Best = -0.0001350869289938883
Feature C10	Fold 4 = 0.9454620581063634	From Best = -3.1349187302076587e-05
Feature C10	Fold 5 = 0.9552740806761586	From Best = 5.529822727612377e-06
Feature avg auc = 0.9386007837731086
C11 

Feature C11	Fold 0

Feature D11	Fold 0 = 0.9084953316793962	From Best = -1.2108142346645145e-05
Feature D11	Fold 1 = 0.9430604141662684	From Best = -8.118394180545163e-05
Feature D11	Fold 2 = 0.9445222751020041	From Best = -0.00010884286809242916
Feature D11	Fold 3 = 0.9342447592484671	From Best = -0.0005048792954744918
Feature D11	Fold 4 = 0.945104087839165	From Best = -0.0003893194545004919
Feature D11	Fold 5 = 0.9549548384102675	From Best = -0.0003137124431634719
Feature avg auc = 0.9383969510742615
D12 

Feature D12	Fold 0 = 0.9085301736486602	From Best = 2.2733826917309408e-05
Feature D12	Fold 1 = 0.9431340784801945	From Best = -7.519627879370994e-06
Feature D12	Fold 2 = 0.9446489618597487	From Best = 1.7843889652091605e-05
Feature D12	Fold 3 = 0.9347186325439155	From Best = -3.100600002603482e-05
Feature D12	Fold 4 = 0.9456466785752123	From Best = 0.00015327128154685177
Feature D12	Fold 5 = 0.9552082871603345	From Best = -6.026369309652946e-05
Feature avg auc = 0.9386478020446777
D13 

Feature D13	F

Feature M1	Fold 5 = 0.955267610258584	From Best = -9.405948470098124e-07
Feature avg auc = 0.9386310347848964
M2 

Feature M2	Fold 0 = 0.9082623036362014	From Best = -0.0002451361855414502
Feature M2	Fold 1 = 0.9431538079445286	From Best = 1.2209836454779577e-05
Feature M2	Fold 2 = 0.9445531739408758	From Best = -7.794402922078891e-05
Feature M2	Fold 3 = 0.9347539519687409	From Best = 4.313424799340382e-06
Feature M2	Fold 4 = 0.9453730310519831	From Best = -0.00012037624168237393
Feature M2	Fold 5 = 0.9552283415174418	From Best = -4.020933598913867e-05
Feature avg auc = 0.9385541016766287
M3 

Feature M3	Fold 0 = 0.9082806322050685	From Best = -0.00022680761667437377
Feature M3	Fold 1 = 0.9431543534361783	From Best = 1.2755328104407937e-05
Feature M3	Fold 2 = 0.9441964601318431	From Best = -0.0004346578382534716
Feature M3	Fold 3 = 0.9347146953331962	From Best = -3.49432107453751e-05
Feature M3	Fold 4 = 0.9449222955778449	From Best = -0.0005711117158205825
Feature M3	Fold 5 = 0.9549124

Feature V101	Fold 1 = 0.9431432014456792	From Best = 1.6033376053492532e-06
Feature V101	Fold 2 = 0.944633146513032	From Best = 2.028542935406108e-06
Feature V101	Fold 3 = 0.9347377077152436	From Best = -1.1930828697992446e-05
Feature V101	Fold 4 = 0.9454970962600071	From Best = 3.688966341619526e-06
Feature V101	Fold 5 = 0.9552691502092171	From Best = 5.993557861128096e-07
Feature avg auc = 0.9386315893459588
V102 

Feature V102	Fold 0 = 0.9082840646002034	From Best = -0.00022337522153947909
Feature V102	Fold 1 = 0.9431427349738483	From Best = 1.1368657744093014e-06
Feature V102	Fold 2 = 0.9445771490104476	From Best = -5.3968959648931225e-05
Feature V102	Fold 3 = 0.9346632508514134	From Best = -8.638769252811151e-05
Feature V102	Fold 4 = 0.9454012639410516	From Best = -9.214335261387152e-05
Feature V102	Fold 5 = 0.9550998912602359	From Best = -0.00016865959319511958
Feature avg auc = 0.9385280591062001
V103 

Feature V103	Fold 0 = 0.9084174521078223	From Best = -8.998771392054739e-05


Feature V116	Fold 0 = 0.9085065196029244	From Best = -9.202188184787019e-07
Feature V116	Fold 1 = 0.9431388935910637	From Best = -2.704517010121421e-06
Feature V116	Fold 2 = 0.94463218386616	From Best = 1.0658960634124526e-06
Feature V116	Fold 3 = 0.9347501190549944	From Best = 4.805110528360856e-07
Feature V116	Fold 4 = 0.9454932597350119	From Best = -1.475586536292539e-07
Feature V116	Fold 5 = 0.9552744569140977	From Best = 5.906060666749369e-06
Feature avg auc = 0.9386325721273754
V117 

Feature V117	Fold 0 = 0.9085074503505624	From Best = 1.0528819482935603e-08
Feature V117	Fold 1 = 0.9431415981080739	From Best = 0.0
Feature V117	Fold 2 = 0.9446311179700967	From Best = 1.1102230246251565e-16
Feature V117	Fold 3 = 0.9347496385439416	From Best = 0.0
Feature V117	Fold 4 = 0.9454934072936655	From Best = 0.0
Feature V117	Fold 5 = 0.9552692902047293	From Best = 7.393512982956096e-07
Feature avg auc = 0.9386320837451781
V118 

Feature V118	Fold 0 = 0.9085074840427846	From Best = 4.4221041

KeyboardInterrupt: 

In [44]:
USELESS_FEATURES = [
    'C10', 
    'C4',
    'D12',
    'D6',
    'D7',
    'D9',
    'M1',
    'addr2_isnull',
    'dist2_isnull',
    'DeviceType_isnull',
    'DeviceInfo_isnull',
    'addr2_count',
    'card1_C7_count_how_typical',
    'card1_C8_count_how_typical',
    'smoothed_encoded_card2_on_ProductCD_nunique',
    'smoothed_encoded_addr1_on_ProductCD_nunique',
    'time_from_prev_transaction_ratio_to_mean',
    
]
QUESTIONABLE = [
    'time_to_next_transaction_ratio_to_median',
    'time_to_next_transaction_ratio_to_mean',
    'time_from_prev_transaction_ratio_to_median',
    'median_time_between_transactions',
    'mean_time_between_transactions',
    'smoothed_encoded_card1_TransactionDT_split_TransactionDT_dayOfMonth_TransactionDT_hour_on_TransactionAmt_sum',
    'smoothed_encoded_addr1_on_card1_nunique',
    'smoothed_encoded_P_emaildomain_on_R_emaildomain_nunique',
    'smoothed_encoded_R_emaildomain_on_P_emaildomain_nunique',
    'smoothed_encoded_addr1_on_R_emaildomain_nunique',
    'smoothed_encoded_addr1_on_P_emaildomain_nunique',
    'smoothed_encoded_card2_on_R_emaildomain_nunique',
    'smoothed_encoded_card2_on_P_emaildomain_nunique',
    'smoothed_encoded_card1_on_ProductCD_nunique',
    'smoothed_encoded_card1_on_P_emaildomain_nunique',
    'card1_C10_count_how_typical',
    'card1_C9_count_how_typical',
    'card1_C4_count_how_typical',
    'card1_C3_count_how_typical',
    'card1_R_emaildomain_count_how_typical',
    'card1_addr1_count_how_typical',
    'card1_TransactionDT_hour_count_how_typical',
    'addr1_count',
    'card1_count',
    'card2_count',
    'card3_count',
    'card4_count',
    'card5_count',
    'C12',
    'C3',
    'C5',
    'C6',
    'C7',
    'C8',
    'C9',
    'D10',
    'D11',
    'D13',
    'D14',
    'D5',
    'M2',
    'M3',
    'M7',
    'M8',
    'M9',
    'R_emaildomain',
    'V91',
    'V165',
    'V49',
    'V48',
    'dist1',
    
]

In [34]:
train_test_transformed['C1'].sample(frac=1.0)

TransactionID
4129017      3.0
3321360      1.0
3840741      1.0
3674844      1.0
4116162      1.0
           ...  
3321074    104.0
3043911      1.0
3205474      1.0
4057741      1.0
3907149      1.0
Name: C1, Length: 1097231, dtype: float64

In [27]:
train_test_transformed['C1']

TransactionID
2987000    1.0
2987001    1.0
2987002    1.0
2987003    2.0
2987004    1.0
          ... 
4170235    1.0
4170236    1.0
4170237    1.0
4170238    1.0
4170239    1.0
Name: C1, Length: 1097231, dtype: float64

In [4]:
train_test_transformed['TransactionAmt_cumsum_card1'] = train_test_transformed.groupby('card1')['TransactionAmt'].cumsum()

In [5]:
train_test_transformed['Transaction_Number'] = train_test_transformed.groupby('card1').cumcount() + 1

In [6]:
train_test_transformed['TransactionAmt_cumsum_card1_current_mean'] = train_test_transformed['TransactionAmt_cumsum_card1'] / train_test_transformed['Transaction_Number']

In [8]:
train_test_transformed[train_test_transformed.card1==1039][['TransactionAmt', 'TransactionAmt_cumsum_card1_current_mean', 'TransactionAmt_cumsum_card1']]

Unnamed: 0_level_0,TransactionAmt,TransactionAmt_cumsum_card1_current_mean,TransactionAmt_cumsum_card1
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3095765,108.5,108.5,108.5
3095779,117.0,112.75,225.5
3201189,287.64,171.046667,513.14


In [25]:
train_test_transformed[train_test_transformed.card1 == 2207][[    'TransactionAmt_count_within_1min',
 'TransactionAmt_sum_within_1min',
 'TransactionAmt_mean_within_1min',
 'TransactionAmt_std_within_1min',
 'TransactionAmt_count_within_1h',
 'TransactionAmt_sum_within_1h',
 'TransactionAmt_mean_within_1h',
 'TransactionAmt_std_within_1h',
 'TransactionAmt_count_within_1d',
 'TransactionAmt_sum_within_1d',
 'TransactionAmt_mean_within_1d',
 'TransactionAmt_std_within_1d', 'TransactionAmt']]

Unnamed: 0_level_0,TransactionAmt_count_within_1min,TransactionAmt_sum_within_1min,TransactionAmt_mean_within_1min,TransactionAmt_std_within_1min,TransactionAmt_count_within_1h,TransactionAmt_sum_within_1h,TransactionAmt_mean_within_1h,TransactionAmt_std_within_1h,TransactionAmt_count_within_1d,TransactionAmt_sum_within_1d,TransactionAmt_mean_within_1d,TransactionAmt_std_within_1d,TransactionAmt
TransactionDT_to_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-12-01 13:50:10,1.0,19.500,19.500,,2.0,127.450,63.725000,62.543595,10.0,1316.850,131.685000,211.906063,75.00
2017-12-01 15:49:59,1.0,200.000,200.000,,3.0,297.361,99.120333,87.759583,11.0,919.211,83.564636,48.123783,500.00
2017-12-01 16:02:03,1.0,59.000,59.000,,2.0,91.970,45.985000,18.405990,11.0,992.810,90.255455,81.409576,311.95
2017-12-01 18:03:22,1.0,57.950,57.950,,2.0,369.900,184.950000,179.605122,12.0,1091.270,90.939167,74.132766,300.00
2017-12-01 19:02:48,1.0,107.950,107.950,,1.0,107.950,107.950000,,16.0,1844.650,115.290625,107.105981,311.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-28 18:17:49,1.0,44.000,44.000,,1.0,44.000,44.000000,,4.0,246.000,61.500000,25.877919,47.95
2018-12-29 04:42:04,1.0,4.505,4.505,,1.0,4.505,4.505000,,8.0,67.439,8.429875,11.101223,97.00
2018-12-29 13:21:20,1.0,14.017,14.017,,1.0,14.017,14.017000,,16.0,317.421,19.838812,22.365978,117.00
2018-12-29 17:25:37,1.0,424.950,424.950,,1.0,424.950,424.950000,,1.0,424.950,424.950000,,159.00


In [4]:
train[train.card1 == 12037]

Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V312_card1_mean,V313_card1_mean,V314_card1_mean,V315_card1_mean,V316_card1_mean,V317_card1_mean,V318_card1_mean,V319_card1_mean,V320_card1_mean,V321_card1_mean
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2988597,0,136210,75.00,W,12037,595.0,150.0,mastercard,224.0,debit,...,23.734739,13.774564,32.737689,20.58782,20.964348,51.587536,36.63529,6.436623,9.936116,9.091261
2989161,0,143399,500.00,W,12037,595.0,150.0,mastercard,224.0,debit,...,23.734739,13.774564,32.737689,20.58782,20.964348,51.587536,36.63529,6.436623,9.936116,9.091261
2989234,0,144123,311.95,W,12037,595.0,150.0,mastercard,224.0,debit,...,23.734739,13.774564,32.737689,20.58782,20.964348,51.587536,36.63529,6.436623,9.936116,9.091261
2989995,0,151402,300.00,W,12037,595.0,150.0,mastercard,224.0,debit,...,23.734739,13.774564,32.737689,20.58782,20.964348,51.587536,36.63529,6.436623,9.936116,9.091261
2990377,0,154968,311.95,W,12037,595.0,150.0,mastercard,224.0,debit,...,23.734739,13.774564,32.737689,20.58782,20.964348,51.587536,36.63529,6.436623,9.936116,9.091261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3571476,1,15626250,35.95,W,12037,595.0,150.0,mastercard,224.0,debit,...,23.734739,13.774564,32.737689,20.58782,20.964348,51.587536,36.63529,6.436623,9.936116,9.091261
3573321,0,15696395,77.00,W,12037,595.0,150.0,mastercard,224.0,debit,...,23.734739,13.774564,32.737689,20.58782,20.964348,51.587536,36.63529,6.436623,9.936116,9.091261
3574868,0,15726465,82.95,W,12037,595.0,150.0,mastercard,224.0,debit,...,23.734739,13.774564,32.737689,20.58782,20.964348,51.587536,36.63529,6.436623,9.936116,9.091261
3577532,0,15811007,204.97,W,12037,595.0,150.0,mastercard,224.0,debit,...,23.734739,13.774564,32.737689,20.58782,20.964348,51.587536,36.63529,6.436623,9.936116,9.091261


In [6]:
train_test_transformed.card1

TransactionID
2987000     4248
2987001     9979
2987002    11850
2987003     8796
2987004    11687
           ...  
4170235     4146
4170236    10367
4170237     7207
4170238     7165
4170239    12881
Name: card1, Length: 1097231, dtype: int64

In [7]:
train.card1

TransactionID
2987000    13926
2987001     2755
2987002     4663
2987003    18132
2987004     4497
           ...  
3577535     6550
3577536    10444
3577537    12037
3577538     7826
3577539    15066
Name: card1, Length: 590540, dtype: int64