In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import datetime
from IPython.display import clear_output
import lightgbm as lgb
from bayes_opt import BayesianOptimization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


RANDOM_STATE = 42

In [7]:
from generate_features import generate_features_time_series
from read_utils import read_data

from feature_engineering import add_datetime_features, process_id_30, \
    process_id_33, emaildomain_features, count_features, smoothed_encodings, \
    encode_categorical_features, V_features_to_PCA, D_features_to_PCA, \
    C_features_to_PCA, exchange_rate_took_place_feature, generate_uid_features

In [3]:
from settings import CATEGORICAL_FEATURES, COLUMNS_TO_REMOVE

In [4]:
train, test, sample_submission = read_data()

In [5]:
train.head()

Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [6]:
print(CATEGORICAL_FEATURES)

['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'DeviceType', 'DeviceInfo', 'OS_NAME', 'OS_V_MAJOR', 'OS_V0', 'OS_V1', 'OS_V2', 'is_foreign', 'is_holiday', 'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3', 'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38']


In [7]:
%%time
train_test_transformed = generate_features_time_series(train, test)

Starting 2019-09-07 19:58:23.445082
Concatted 2019-09-07 19:58:28.416171
DT FEATURES 2019-09-07 19:58:42.249113


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lambda x: x if ' ' not in x else ' '.join(x.split()[:-1])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lambda x: parse_version(x.split()[-1])[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lambda x: parse_version(x.split()[-1])[1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lambda x: parse_version(x.spli

ids, emaildomain 2019-09-07 19:59:03.803445
Count features 2019-09-07 19:59:23.157600
target encoding 2019-09-07 19:59:53.138141
Mean Encoding 2019-09-07 20:04:40.795936
Encoders 2019-09-07 20:07:35.632144
CPU times: user 25min 11s, sys: 12min 8s, total: 37min 20s
Wall time: 9min 12s


In [8]:
train_test_transformed.head()

Unnamed: 0_level_0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,PCA_V_14_GROUP_0,PCA_V_14_GROUP_1,PCA_V_14_GROUP_2,PCA_V_14_GROUP_3,PCA_V_14_GROUP_4,PCA_V_14_GROUP_5,PCA_V_14_GROUP_6,PCA_C0,PCA_C1,PCA_C2
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,-0.162103,0.031453,0.02512,-0.072115,0.002818,0.01194,-0.011175,-0.331893,-0.318349,-0.089067
2987001,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,-0.162103,0.031453,0.02512,-0.072115,0.002818,0.01194,-0.011175,-0.339774,-0.348925,-0.089575
2987002,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,-0.162103,0.031453,0.02512,-0.072115,0.002818,0.01194,-0.011175,-0.335812,-0.318274,-0.089079
2987003,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,-0.162103,0.031453,0.02512,-0.072115,0.002818,0.01194,-0.011175,-0.260197,-0.233751,-0.087608
2987004,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,-0.162103,0.031453,0.02512,-0.072115,0.002818,0.01194,-0.011175,-0.331422,-0.353153,-0.089598


In [232]:
from feature_engineering import smoothed_encodings

In [8]:
train_test_joined = pd.concat([train, test], sort=True)

In [9]:
train_test_joined = add_datetime_features(train_test_joined)

In [99]:
train_test_joined = train_test_joined = emaildomain_features(train_test_joined)

In [101]:
train_test_joined, encoders = encode_categorical_features(
        train_test_joined,
        [i for i in CATEGORICAL_FEATURES
        if i in COLS_TO_USE]
    )

In [87]:
train_test_joined = smoothed_encodings(
    train_test_joined,
    [['card1', 'TransactionDT_dayOfWeek'], ['card1']],
    'TransactionAmt',
    funcs=[np.nanmedian, 'std', 'sum'],
    m=0
)

In [88]:
train_test_joined['TransactionAmt_ratio_nanmedian'] = train_test_joined['TransactionAmt'] / train_test_joined['smoothed_encoded_card1_on_TransactionAmt_nanmedian']

In [60]:
train_test_joined[['smoothed_encoded_card1_on_TransactionAmt_nanmedian',
 'smoothed_encoded_card1_on_TransactionAmt_std',
'TransactionAmt_ratio_nanmedian', 'smoothed_encoded_card1_TransactionDT_dayOfWeek_on_TransactionAmt_nanmedian',
 'smoothed_encoded_card1_TransactionDT_dayOfWeek_on_TransactionAmt_std']].corr()

Unnamed: 0,smoothed_encoded_card1_on_TransactionAmt_nanmedian,smoothed_encoded_card1_on_TransactionAmt_std,TransactionAmt_ratio_nanmedian,smoothed_encoded_card1_TransactionDT_dayOfWeek_on_TransactionAmt_nanmedian,smoothed_encoded_card1_TransactionDT_dayOfWeek_on_TransactionAmt_std
smoothed_encoded_card1_on_TransactionAmt_nanmedian,1.0,0.453635,0.03514,0.729223,0.337896
smoothed_encoded_card1_on_TransactionAmt_std,0.453635,1.0,0.112576,0.392964,0.745048
TransactionAmt_ratio_nanmedian,0.03514,0.112576,1.0,0.132148,0.170931
smoothed_encoded_card1_TransactionDT_dayOfWeek_on_TransactionAmt_nanmedian,0.729223,0.392964,0.132148,1.0,0.360998
smoothed_encoded_card1_TransactionDT_dayOfWeek_on_TransactionAmt_std,0.337896,0.745048,0.170931,0.360998,1.0


In [29]:
train_test_transformed = train_test_joined

In [89]:
train_test_joined.columns.tolist()

['C1',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'D1',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'DeviceInfo',
 'DeviceType',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P_emaildomain',
 'ProductCD',
 'R_emaildomain',
 'TransactionAmt',
 'TransactionDT',
 'V1',
 'V10',
 'V100',
 'V101',
 'V102',
 'V103',
 'V104',
 'V105',
 'V106',
 'V107',
 'V108',
 'V109',
 'V11',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V119',
 'V12',
 'V120',
 'V121',
 'V122',
 'V123',
 'V124',
 'V125',
 'V126',
 'V127',
 'V128',
 'V129',
 'V13',
 'V130',
 'V131',
 'V132',
 'V133',
 'V134',
 'V135',
 'V136',
 'V137',
 'V138',
 'V139',
 'V14',
 'V140',
 'V141',
 'V142',
 'V143',
 'V144',
 'V145',
 'V146',
 'V147',
 'V148',
 'V149',
 'V15',
 'V150',
 'V151',
 'V152',
 'V153',
 'V154',
 'V155',
 'V156',
 'V157',
 'V158',
 'V159',
 'V16',
 'V

In [None]:
train_test_transformed['mean_diff_card_2'] = train_test_transformed['TransactionAmt'] / train_test_transformed['smoothed_encoded_card1_card2_on_TransactionAmt_nanmedian']
train_test_transformed['mean_diff_card_1'] = train_test_transformed['TransactionAmt'] / train_test_transformed['smoothed_encoded_card1_on_TransactionAmt_nanmedian']

In [541]:
train_test_transformed.columns.tolist()

['C1',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'D1',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'DeviceInfo',
 'DeviceType',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P_emaildomain',
 'ProductCD',
 'R_emaildomain',
 'TransactionAmt',
 'TransactionDT',
 'V1',
 'V10',
 'V100',
 'V101',
 'V102',
 'V103',
 'V104',
 'V105',
 'V106',
 'V107',
 'V108',
 'V109',
 'V11',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V119',
 'V12',
 'V120',
 'V121',
 'V122',
 'V123',
 'V124',
 'V125',
 'V126',
 'V127',
 'V128',
 'V129',
 'V13',
 'V130',
 'V131',
 'V132',
 'V133',
 'V134',
 'V135',
 'V136',
 'V137',
 'V138',
 'V139',
 'V14',
 'V140',
 'V141',
 'V142',
 'V143',
 'V144',
 'V145',
 'V146',
 'V147',
 'V148',
 'V149',
 'V15',
 'V150',
 'V151',
 'V152',
 'V153',
 'V154',
 'V155',
 'V156',
 'V157',
 'V158',
 'V159',
 'V16',
 'V

In [542]:
COLUMNS_TO_REMOVE_FIXED = COLUMNS_TO_REMOVE + ['card1', 'card2', 'addr1', 'addr2', 'smoothed_encoded_card1_on_dist1_mean',
 'smoothed_encoded_card1_on_dist1_std',
 'smoothed_encoded_card2_on_dist1_mean',
 'smoothed_encoded_card2_on_dist1_std','smoothed_encoded_card1_on_TransactionAmt_nunique',
 'smoothed_encoded_card2_on_TransactionAmt_nunique',]

In [11]:
COLUMNS_TO_REMOVE_FIXED

['TransactionID',
 'TransactionDT',
 'isFraud',
 'TransactionDT_split',
 'card1',
 'card2',
 'addr1',
 'addr2',
 'smoothed_encoded_card1_on_dist1_mean',
 'smoothed_encoded_card1_on_dist1_std',
 'smoothed_encoded_card2_on_dist1_mean',
 'smoothed_encoded_card2_on_dist1_std',
 'smoothed_encoded_card1_on_TransactionAmt_nunique',
 'smoothed_encoded_card2_on_TransactionAmt_nunique']

In [360]:
train_test_transformed['TransactionAmt_diff_mean_card'] = train_test_transformed['smoothed_encoded_card1_on_TransactionAmt_nanmedian'] - train_test_transformed['TransactionAmt']
train_test_transformed['TransactionAmt_diff_mean_uid'] = train_test_transformed['smoothed_encoded_uid_on_TransactionAmt_nanmedian'] - train_test_transformed['TransactionAmt']
train_test_transformed['TransactionAmt_diff_mean_uid2'] = train_test_transformed['smoothed_encoded_uid2_on_TransactionAmt_nanmedian'] - train_test_transformed['TransactionAmt']
train_test_transformed['TransactionAmt_diff_mean_uid3'] = train_test_transformed['smoothed_encoded_uid3_on_TransactionAmt_nanmedian'] - train_test_transformed['TransactionAmt']
train_test_transformed['TransactionAmt_diff_mean_uid4'] = train_test_transformed['smoothed_encoded_uid4_on_TransactionAmt_nanmedian'] - train_test_transformed['TransactionAmt']

In [336]:
train_test_transformed['TransactionAmt_diff_mean'] = train_test_transformed['TransactionAmt'] - train_test_transformed['smoothed_encoded_uid_on_TransactionAmt_mean']
train_test_transformed['TransactionAmt_ratio_mean'] = train_test_transformed['TransactionAmt'] / train_test_transformed['smoothed_encoded_uid2_on_TransactionAmt_nanmedian']
train_test_transformed['TransactionAmt_ratio_mean_inv'] = train_test_transformed['smoothed_encoded_uid2_on_TransactionAmt_nanmedian'] / train_test_transformed['TransactionAmt']

In [450]:
train_test_transformed['TransactionAmt_ratio_min'] = train_test_transformed['TransactionAmt'] / train_test_transformed['smoothed_encoded_card1_on_TransactionAmt_min']
train_test_transformed['TransactionAmt_ratio_max'] = train_test_transformed['smoothed_encoded_card1_on_TransactionAmt_max'] / train_test_transformed['TransactionAmt']

In [109]:
COLS_TO_USE = [
    'TransactionAmt',
  #  'TransactionDT_dayOfMonth',
 #'TransactionDT_weekOfMonth',
 #'TransactionDT_hour',
 # 'is_holiday',
 #'is_foreign',
    
 #'addr1_count',
 #'addr2_count',
 #'TransactionDT_hour_count',
 #'TransactionDT_dayOfMonth_count',
 #'TransactionDT_weekOfMonth_count',
 #'DeviceType_count',
 #'DeviceInfo_count',
 #'OS_NAME_count',
 #'P_emaildomain_count',
 #'R_emaildomain_count',
 #'ProductCD_count',
    
    
    
# 'smoothed_encoded_addr1_on_TransactionAmt_mean',
# 'smoothed_encoded_addr2_on_TransactionAmt_mean',
    
    
#    'smoothed_encoded_TransactionDT_hour_on_TransactionAmt_mean',
# 'smoothed_encoded_TransactionDT_dayOfWeek_on_TransactionAmt_mean',
'smoothed_encoded_TransactionDT_dayOfMonth_on_TransactionAmt_mean',
'smoothed_encoded_TransactionDT_weekOfMonth_on_TransactionAmt_mean',
#0.7941

#'smoothed_encoded_card1_on_dist1_mean',
#'smoothed_encoded_card1_on_dist1_std',
# 'smoothed_encoded_card2_on_dist1_mean',
#'smoothed_encoded_card2_on_dist1_std',
    
    #0.822223
    
    
#'smoothed_encoded_card3_on_dist1_mean',
#'smoothed_encoded_card3_on_dist1_std',
#'smoothed_encoded_card4_on_dist1_mean',
#'smoothed_encoded_card4_on_dist1_std',
#'smoothed_encoded_card5_on_dist1_mean',
#'smoothed_encoded_card5_on_dist1_std',
#'smoothed_encoded_card6_on_dist1_mean',
#'smoothed_encoded_card6_on_dist1_std',
    
    
    #0.825289
    
 #   'smoothed_encoded_addr1_on_dist1_mean',
 #'smoothed_encoded_addr1_on_dist1_std',
 #'smoothed_encoded_addr2_on_dist1_mean',
 #'smoothed_encoded_addr2_on_dist1_std',
    
    #0.826418
    
#'smoothed_encoded_card1_on_dist2_mean',
#'smoothed_encoded_card1_on_dist2_std',
#'smoothed_encoded_card2_on_dist2_mean',
#'smoothed_encoded_card2_on_dist2_std',
#'smoothed_encoded_card4_on_dist2_mean',
#'smoothed_encoded_card4_on_dist2_std',
    
    #0.835994
    
 #    'smoothed_encoded_addr1_on_dist2_mean',
 #'smoothed_encoded_addr1_on_dist2_std',
 #'smoothed_encoded_addr2_on_dist2_mean',
 #'smoothed_encoded_addr2_on_dist2_std',
    
    #0.838919
    
#'smoothed_encoded_card1_on_TransactionAmt_nunique',
#'smoothed_encoded_card2_on_TransactionAmt_nunique',
#'smoothed_encoded_card4_on_TransactionAmt_nunique',
    
    #0.840648
    
#'DeviceInfo_count',
     
    #0.843219
    
'smoothed_encoded_uid4_on_TransactionAmt_mean',
'smoothed_encoded_uid4_on_TransactionAmt_std',
'smoothed_encoded_uid5_on_TransactionAmt_std',
    
    #0.846167
    
#'smoothed_encoded_uid4_on_dist1_mean',
#'smoothed_encoded_uid4_on_dist1_std',
#'smoothed_encoded_uid5_on_dist1_mean',
    
    #0.849265
    
'smoothed_encoded_uid2_on_TransactionAmt_nunique',
    
    #0.851577
    
'smoothed_encoded_uid_on_TransactionDT_hour_nunique',
    'smoothed_encoded_uid3_on_TransactionDT_hour_nunique',
'smoothed_encoded_card1_on_TransactionDT_hour_nunique',
    
    
    'TransactionAmt_ratio_min',
    'TransactionAmt_ratio_max'
    
    
#'smoothed_encoded_uid_on_TransactionAmt_min',
#'smoothed_encoded_uid_on_TransactionAmt_max',
#'smoothed_encoded_card1_on_TransactionAmt_min',
#'smoothed_encoded_card1_on_TransactionAmt_max',
    
    
    #'TransactionAmt_diff_mean_card',
    #'TransactionAmt_diff_mean_uid',
    #'TransactionAmt_diff_mean_uid2',
    #'TransactionAmt_diff_mean_uid3',
    #'TransactionAmt_diff_mean_uid4'
    
    #
    #'C1', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C13', 'C14', 'C11', 'C2', 'C9', 'C12', 'C2',
    
 # 'smoothed_encoded_uid_on_C13_mean',
 #'smoothed_encoded_uid_on_C13_std',
 #'smoothed_encoded_uid_on_C13_nanmedian',
 #'smoothed_encoded_uid2_on_C13_mean',
 #'smoothed_encoded_uid2_on_C13_std',
 #'smoothed_encoded_uid2_on_C13_nanmedian',
 #'smoothed_encoded_uid3_on_C13_mean',
 #'smoothed_encoded_uid3_on_C13_std',
 #'smoothed_encoded_uid3_on_C13_nanmedian',
 #'smoothed_encoded_uid5_on_C13_mean',
 #'smoothed_encoded_uid5_on_C13_std',
 #'smoothed_encoded_uid5_on_C13_nanmedian',
    
    #0.9146
    
    #'smoothed_encoded_uid_on_C14_mean',
 #'smoothed_encoded_uid_on_C14_std',
# 'smoothed_encoded_uid_on_C14_nanmedian',
    
    #0.915555
    
    #'TransactionAmt_diff_mean_card',
   


]

COLS_TO_USE = [
    'TransactionAmt', 
    'smoothed_encoded_card1_on_TransactionAmt_nanmedian',
 'smoothed_encoded_card1_on_TransactionAmt_std',
    'TransactionAmt_ratio_nanmedian',
    'smoothed_encoded_card1_on_TransactionAmt_sum',
      'TransactionDT_dayOfMonth',
 'TransactionDT_weekOfMonth',
 'TransactionDT_hour',
  'is_holiday',
 'P_emaildomain',
 'R_emaildomain',
 'ProductCD',
#    'card2',
#    'card3',
#    'card4',
#    'card5',
#    'card6'
]

In [110]:
def train_val_lgb():
    
    #COLS_TO_USE = [
    #    i for i in train_test_transformed.columns
    #    if i not in COLUMNS_TO_REMOVE_FIXED
    #]
    
    pars = {
        'num_leaves': 2 ** 5 - 1,
        'learning_rate': 0.05,
        'feature_fraction': 0.6 ,
        'bagging_fraction': 0.6,
        'bagging_freq': 51,
        'cat_smooth': 209,
        'lambda_l1': 1.0,
        'lambda_l2': 3.0,
        'max_bin': 270,
        'scale_pos_weight': 7.0,
        #'max_cat_to_onehot': 10,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': ['auc'],
        'num_threads': -1
    }
    
    bound = 16
    
    cat_c = [i for i in CATEGORICAL_FEATURES
        if i in COLS_TO_USE]
    
    print(len(COLS_TO_USE))
    
    cur_train = train_test_transformed[
        train_test_transformed['TransactionDT_split'] <= bound
    ]
    
    cur_test = train_test_transformed[
        (train_test_transformed['TransactionDT_split'] > bound) & \
        (train_test_transformed['TransactionDT_split'] < 19)
    ]
    
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(cur_train[COLS_TO_USE], cur_train['isFraud'])
    lgb_eval = lgb.Dataset(cur_test[COLS_TO_USE], cur_test['isFraud'], reference=lgb_train)
    
    gbm = lgb.train(
        pars,
        lgb_train,
        num_boost_round=20000,
        valid_sets=(lgb_train, lgb_eval),
        valid_names=('train', 'valid'),
        early_stopping_rounds=100,
        feature_name=COLS_TO_USE,
        categorical_feature=cat_c,
        verbose_eval=100
    )
    
    return gbm

In [111]:
%%time
lgb_fitted = train_val_lgb()

12


New categorical_feature is ['P_emaildomain', 'ProductCD', 'R_emaildomain', 'is_holiday']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[100]	train's auc: 0.852041	valid's auc: 0.80325
[200]	train's auc: 0.871398	valid's auc: 0.80752
[300]	train's auc: 0.884528	valid's auc: 0.80922
[400]	train's auc: 0.894791	valid's auc: 0.810034
[500]	train's auc: 0.903844	valid's auc: 0.812504
[600]	train's auc: 0.910723	valid's auc: 0.813931
Early stopping, best iteration is:
[598]	train's auc: 0.910609	valid's auc: 0.814022
CPU times: user 7min 35s, sys: 5.32 s, total: 7min 40s
Wall time: 29.7 s


In [224]:
COLS_TO_USE = [
        i for i in train_test_transformed.columns
        if i not in COLUMNS_TO_REMOVE_FIXED
    ]


r = lgb_fitted.predict(
    train_test_transformed[
        (train_test_transformed['TransactionDT_split'] > 16) & \
        (train_test_transformed['TransactionDT_split'] < 19)
    ].sample(10000)[COLS_TO_USE],
    pred_contrib=True
)

NameError: name 'COLUMNS_TO_REMOVE_FIXED' is not defined

In [None]:
mean_shap = np.abs(r).mean(axis=0)

In [None]:
shap_imp = pd.DataFrame(sorted(zip(mean_shap, COLS_TO_USE)), columns=['Value','Feature'])
plt.figure(figsize=(10, 10))
sns.barplot(x="Value", y="Feature", data=shap_imp.sort_values(by="Value", ascending=False)[:60])
plt.tight_layout()
plt.show()

In [None]:
COLS_TO_USE

In [None]:
COLUMNS_TO_REMOVE_FIXED