In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb
import catboost as cbt
import datetime

from sklearn.preprocessing import LabelEncoder
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
%%time
folder_path = 'data/'
train = pd.read_pickle(f'{folder_path}train.pkl')
test = pd.read_pickle(f'{folder_path}test.pkl')

CPU times: user 1.91 s, sys: 2.68 s, total: 4.59 s
Wall time: 4.59 s


In [3]:
N_TRAIN_EXAMPLES = 590540

In [4]:
tr_am = pd.read_csv(f'{folder_path}train_transaction.csv', usecols=['TransactionAmt'], dtype=str)
te_am = pd.read_csv(f'{folder_path}test_transaction.csv', usecols=['TransactionAmt'], dtype=str)
train[['dollars', 'cents']] = tr_am.TransactionAmt.str.split('.', expand=True).astype(int)
test[['dollars', 'cents']] = te_am.TransactionAmt.str.split('.', expand=True).astype(int)

In [5]:
roman_df = pd.read_pickle('subcard_strikes_back_features.pkl')

In [6]:
roman_feature_names = set(roman_df.columns)

In [7]:
basic_features = set(test.columns) - set(['TransactionDT', 'TransactionID'])

In [8]:
MODEL_FEATURES = set(test.columns) - set(['TransactionDT', 'TransactionID'])

In [9]:
CATEGORICAL_FEATURES =  set(['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19',
            'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37',
            'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 
            'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9'])

Говорят, что неплохо бы дропнуть те карты, которые не встречается или в трейне, или в тесте

In [10]:
# давай проверим с и без

for col in ['card1']: 

    print('No intersection in Train', len(train[~train[col].isin(test[col])]))
    print('Intersection in Train', len(train[train[col].isin(test[col])]))
    
    train[col] = np.where(train[col].isin(test[col]), train[col], np.nan)
    test[col]  = np.where(test[col].isin(train[col]), test[col], np.nan)
    print('#'*20)

for col in ['card2','card3','card4','card5','card6']: 
    print('No intersection in Train', col, len(train[~train[col].isin(test[col])]))
    print('Intersection in Train', col, len(train[train[col].isin(test[col])]))
    
    train[col] = np.where(train[col].isin(test[col]), train[col], np.nan)
    test[col]  = np.where(test[col].isin(train[col]), test[col], np.nan)
    print('#'*20)

No intersection in Train 10396
Intersection in Train 580144
####################
No intersection in Train card2 5012
Intersection in Train card2 585528
####################
No intersection in Train card3 47
Intersection in Train card3 590493
####################
No intersection in Train card4 0
Intersection in Train card4 590540
####################
No intersection in Train card5 7279
Intersection in Train card5 583261
####################
No intersection in Train card6 30
Intersection in Train card6 590510
####################


In [11]:
roman_df.reset_index(inplace=True)

In [12]:
all_df = pd.concat([train, test])
all_df.reset_index(inplace=True, drop=True)
all_df = pd.concat([all_df, roman_df], axis=1)

# Фичи по датам

In [13]:
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')
all_df['DT'] = all_df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))

In [14]:
del train, test, roman_df
import gc
gc.collect()

40

In [15]:
def datetime_features(all_df):
    all_df['DT'] = all_df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
    all_df['dayofweek'] = all_df['DT'].dt.dayofweek
    all_df['dayofmonth'] = all_df['DT'].dt.day
    all_df['hour'] = all_df['DT'].dt.hour
    all_df['weekofmonth'] = (all_df['DT'].dt.day - 1) // 7 + 1
    new_features = ['dayofweek', 'dayofmonth', 'hour', 'weekofmonth']
    return new_features, new_features

In [16]:
%%time
a, c = datetime_features(all_df)
MODEL_FEATURES.update(a)
CATEGORICAL_FEATURES.update(c)

CPU times: user 1.47 s, sys: 42.3 ms, total: 1.51 s
Wall time: 1.51 s


In [17]:
all_df['DT_split'] = (all_df['DT'].dt.year - 2017) * 12 + all_df['DT'].dt.month
N_TRAIN = sum(all_df['DT_split'] < 17)

In [18]:
all_df['day'] = ((all_df['DT'].dt.year-2017)*365 + all_df['DT'].dt.dayofyear).astype(np.int16)

# D-features

In [19]:
d_cols = [col for col in all_df.columns if col.startswith('D') and not col[-1:].isalpha() and col != 'D9']

In [20]:
d_cols_notnull = [d + '_notnull' for d in d_cols + ['D9']]
all_df[d_cols_notnull] = all_df[d_cols + ['D9']].notnull()

In [21]:
# Lets transform D8 and D9 column
# As we almost sure it has connection with hours
all_df['D8_not_same_day'] = np.where(all_df['D8']>=1,1,0)
all_df['D8_D9_decimal_dist'] = all_df['D8'].fillna(0)-all_df['D8'].fillna(0).astype(int)
all_df['D8_D9_decimal_dist'] = ((all_df['D8_D9_decimal_dist']-all_df['D9'])**2)**0.5

# почему -1?
all_df['D8'] = all_df['D8'].fillna(-1).astype(int)

In [22]:
MODEL_FEATURES.update(['D8_not_same_day', 'D8_D9_decimal_dist', 'D8_D9_decimal_dist'])

In [23]:
def values_normalization(all_df, period, col, clip=True, minmax=True):
        new_col = col + '_' + period
        df = all_df[[col, period]].copy()
        df[col] = df[col].astype(float)
        if clip:
            df[col] = df[col].clip(0) 

        aggs = df.groupby([period])[col].agg(['min', 'max', 'std', 'mean'])
        
        agg_max = aggs['max'].to_dict()
        agg_min = aggs['min'].to_dict()
        agg_std = aggs['std'].to_dict()
        agg_mean = aggs['mean'].to_dict()

        all_df['temp_min'] = all_df[period].map(agg_max)
        all_df['temp_max'] = all_df[period].map(agg_min)
        all_df['temp_std'] = all_df[period].map(agg_std)
        all_df['temp_mean'] = all_df[period].map(agg_mean)
        

        all_df[new_col + '_min_max'] = ((all_df[col] - all_df['temp_min']) /\
                (all_df['temp_max'] - all_df['temp_min'])).astype(float)
        
        all_df[new_col + '_std_score'] = (all_df[col] - all_df['temp_mean']) / (all_df['temp_std'])

        del all_df['temp_min'], all_df['temp_max'], all_df['temp_std'], all_df['temp_mean']

In [24]:
%%time
for period in ['day']:
    for col in d_cols:
        values_normalization(all_df, period, col, minmax=True)

CPU times: user 38 s, sys: 1min 2s, total: 1min 40s
Wall time: 1min 40s


# Device info

In [25]:
all_df['DeviceInfo'].fillna('', inplace=True)
all_df['id_30'].fillna('', inplace=True)
all_df['id_31'].fillna('', inplace=True)

In [26]:
def add_device_features(all_df):
    all_df['DeviceInfoMajor'] = all_df['DeviceInfo'].str.split(' ', expand=True)[0]
    all_df['DeviceInfoTop'] = all_df['DeviceInfo'].str.split('-', expand=True)[0]
    all_df['DeviceInfoIsRV'] = all_df['DeviceInfoMajor'].apply(lambda x: 'rv' in x)
    return ['DeviceInfoMajor', 'DeviceInfoTop', 'DeviceInfoIsRV'], ['DeviceInfoMajor', 'DeviceInfoTop']

In [27]:
%%time
a, c = add_device_features(all_df)
MODEL_FEATURES.update(a)
CATEGORICAL_FEATURES.update(c)

CPU times: user 4.42 s, sys: 59.2 ms, total: 4.48 s
Wall time: 4.48 s


In [28]:
def split_version(version):
    n_sep_a = version.count('.')
    n_sep_b = version.count('_')
    if n_sep_a > 0:
        return version.split('.')[0]
    elif n_sep_b > 0:
        return version.split('_')[0]
    else:
        return version

def split_os(os):
    spl = os.split(' ')
    if len(spl) > 1:
        os_name = ' '.join(spl[:-1])
        major_version = split_version(spl[-1])
        return os_name, ' '.join([os_name, major_version])
    else:
        return os, ''
# TODO: add minor

In [29]:
def add_os_features(all_df):
    os, version = zip(*all_df['id_30'].apply(lambda x: split_os(x)).values)
    all_df['OSName'] = os
    all_df['OSMajorVersion'] = version
    return ['OSName', 'OSMajorVersion'], ['OSName', 'OSMajorVersion']

In [30]:
%%time
a, c = add_os_features(all_df)
MODEL_FEATURES.update(a)
CATEGORICAL_FEATURES.update(c)

CPU times: user 1.61 s, sys: 23.9 ms, total: 1.63 s
Wall time: 1.63 s


In [31]:
# SHAME ON ME
def get_browser(browser):
    if 'safari' in browser:
        return 'safari'
    
    if 'chrome' in browser:
        return 'chrome'
    
    if browser.startswith('ie'):
        return 'internetexplorer'
    
    if 'edge' in browser:
        return 'edge'
    
    if 'firefox' in browser.lower():
        return 'firefox'
    
    if 'samsung' in browser.lower():
        return 'samsung'
    
    if 'google' in browser:
        return 'google'
    
    if 'opera' in browser:
        return 'opera'
    
    if 'android' in browser.lower():
        return 'android'
    
    return browser

def is_mobile(browser):
    br = browser.lower()
    if 'mobile' in br or 'for android' in br:
        return True
    else:
        return False


In [32]:
def get_browser_features(all_df):
    all_df['Browser'] = all_df['id_31'].apply(lambda x: get_browser(x))
    all_df['IsMobile'] = all_df['id_31'].apply(lambda x: is_mobile(x))
    #all_df['VersionNum'] = all_df['id_31'].fillna('0').\
    #    apply(lambda x: ''.join([i for i in x if i.isnumeric()]))
    #all_df['BrowserVersion'] = all_df['Browser'] + ' ' + all_df['VersionNum'].astype('str')
    
    return ['Browser', 'IsMobile'],  ['Browser', 'IsMobile']

In [33]:
%%time
a, c = get_browser_features(all_df)
MODEL_FEATURES.update(a)
CATEGORICAL_FEATURES.update(c)

CPU times: user 839 ms, sys: 801 µs, total: 840 ms
Wall time: 839 ms


# Some with M

In [34]:
########################### M columns (except M4)
# All these columns are binary encoded 1/0
# We can have some features from it
i_cols = ['M1','M2','M3','M5','M6','M7','M8','M9']

all_df['M_sum'] = all_df[i_cols].sum(axis=1).astype(np.int8)
all_df['M_na'] = all_df[i_cols].isna().sum(axis=1).astype(np.int8)

MODEL_FEATURES.update(['M_sum', 'M_na'])

# NaN count

In [35]:
all_df['notnull_count'] = all_df[basic_features].notnull().sum(axis=1)

In [36]:
MODEL_FEATURES.add('notnull_count')

# UID

In [37]:
cols_for_uid = ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']

In [38]:
all_df[cols_for_uid] = all_df[cols_for_uid].astype(str)

In [42]:
all_df['subcard_categorical_full'] = all_df['card1'] + '_' + all_df['subcard_categorical'].astype('str')
all_df['subcard_categorical_D4_full'] = all_df['card1'] + '_' + all_df['subcard_categorical_D4'].astype('str')

In [44]:
subcard_id_cols = ['addr1', 'DeviceInfo', 'P_emaildomain', 'ProductCD',
                   'id_20', 'id_19', 'card4', 'subcard_categorical_D4']

In [45]:
subcard_ids = []
for col in subcard_id_cols:
    all_df['subcard_' + col] =all_df['subcard_categorical_full'].astype(str) + '_' +\
        all_df[col].astype(str)
    subcard_ids.append('subcard_' + col)

In [50]:
MODEL_FEATURES.update(['subcard_categorical_full', 'subcard_categorical_D4_full'])
CATEGORICAL_FEATURES.update(['subcard_categorical_full', 'subcard_categorical_D4_full'])

In [51]:
MODEL_FEATURES.update(subcard_ids)
CATEGORICAL_FEATURES.update(subcard_ids)

In [52]:
all_df['uid1'] = all_df['card1'] + '_' + all_df['card2']

all_df['uid2'] = all_df['uid1'] + '_' + all_df['card3'] + '_' + all_df['card5']

all_df['uid3'] = all_df['uid2'] + '_' + all_df['addr1'] + '_' + all_df['addr2']

all_df['uid4'] = all_df['uid3'] + '_' + all_df['P_emaildomain']

all_df['uid5'] = all_df['uid3'] + '_' + all_df['R_emaildomain']

In [53]:
uids = [f'uid{i}' for i in range(1, 6)] + ['subcard_categorical_full'] + subcard_ids

In [54]:
uids

['uid1',
 'uid2',
 'uid3',
 'uid4',
 'uid5',
 'subcard_categorical_full',
 'subcard_addr1',
 'subcard_DeviceInfo',
 'subcard_P_emaildomain',
 'subcard_ProductCD',
 'subcard_id_20',
 'subcard_id_19',
 'subcard_card4',
 'subcard_subcard_categorical_D4']

 # TransactionAmt features

Говорят, если в сумме транзакции есть более двух чисел после запятой, то это транзакция в иностранной валюте

In [55]:
all_df['is_foreign'] = all_df['cents'].apply(lambda x: len(str(x)) > 2)

In [56]:
MODEL_FEATURES.add('is_foreign')

А раз уникальных TransactionAmt не очень много, то почему бы не добавить nunique для каждого из айдишников сверху

In [57]:
from sklearn.preprocessing import QuantileTransformer
from collections import Counter

In [58]:
%%time
unique_amt_cols = []
for uid in uids:
    uniques = all_df.groupby(uid)['TransactionAmt'].nunique()
    all_df[uid + '_unique_amt'] = all_df[uid].map(uniques)
    unique_amt_cols.append(uid + '_unique_amt')
    
QT = QuantileTransformer(n_quantiles=500)
all_df[unique_amt_cols] = QT.fit_transform(all_df[unique_amt_cols])

CPU times: user 27.7 s, sys: 12.5 s, total: 40.1 s
Wall time: 40.1 s


In [59]:
MODEL_FEATURES.update(unique_amt_cols)

# Следующая транзакция

In [60]:
all_df.set_index('TransactionDT', inplace=True)

In [61]:
len(all_df)

1097231

In [62]:
%%time
near_cols = []
same_cols = []
for col in [['card1'], ['card1', 'subcard_categorical']] + [[i] for i in uids]:
    print(col)
    all_df[col] = all_df[col].astype(str)
    cur_group = all_df.groupby(col)
    cur_group = cur_group.apply(lambda x: x['TransactionAmt'])
    for i in range(1, 5):
        print(i)
        a = 'is_same_next_transaction_' + str(i)
        b = 'is_same_prev_transaction_' + str(i)
        all_df[a] = (cur_group == cur_group.shift(i)).reset_index().\
            sort_values('TransactionDT')['TransactionAmt'].values
        all_df[b] = (cur_group == cur_group.shift(-i)).reset_index().\
            sort_values('TransactionDT')['TransactionAmt'].values
        same_cols.extend([a, b])
    name = '_'.join(col)
    all_df[f'same_transaction_near_{name}'] = all_df[same_cols].sum(axis=1)
    all_df.drop(same_cols, axis=1, inplace=True)
    near_cols.append(f'same_transaction_near_{name}')

    for c in col:
        all_df[c] = all_df[c].replace({'': np.nan}, inplace=True)

['card1']
1
2
3
4
['card1', 'subcard_categorical']
1
2
3
4
['uid1']
1
2
3
4
['uid2']
1
2
3
4
['uid3']
1
2
3
4
['uid4']
1
2
3
4
['uid5']
1
2
3
4
['subcard_categorical_full']
1
2
3
4
['subcard_addr1']
1
2
3
4
['subcard_DeviceInfo']
1
2
3
4
['subcard_P_emaildomain']
1
2
3
4
['subcard_ProductCD']
1
2
3
4
['subcard_id_20']
1
2
3
4
['subcard_id_19']
1
2
3
4
['subcard_card4']
1
2
3
4
['subcard_subcard_categorical_D4']
1
2
3
4
CPU times: user 16min 4s, sys: 2min 5s, total: 18min 9s
Wall time: 18min 5s


In [63]:
all_df.reset_index(inplace=True)

# Frequency Encoding

In [65]:
def encode_frequency(col, quantile=True):
    return col.map(col.value_counts().to_dict())

In [66]:
%%time
freq_cols = []
for col in list(CATEGORICAL_FEATURES) + uids + ['cents', 'dollars'] :
    all_df[col + '_freq'] = encode_frequency(all_df[col])
    freq_cols.append(col + '_freq')
    
QT = QuantileTransformer(n_quantiles=500)
all_df[freq_cols] = QT.fit_transform(all_df[freq_cols])

  overwrite_input=overwrite_input, interpolation=interpolation


CPU times: user 33.5 s, sys: 14.2 s, total: 47.7 s
Wall time: 47.7 s


In [67]:
MODEL_FEATURES.update(freq_cols)

# Categorical

In [68]:
from multiprocessing import Pool

In [69]:
def encode(col):
    le = LabelEncoder()
    le.fit(all_df[col].astype(str).values)
    return le.transform(all_df[col].astype(str).values)

In [70]:
all_df['cents_categorical'] = all_df['cents'].copy()
all_df['dollars_categorical'] = all_df['dollars'].copy()

In [71]:
CATEGORICAL_FEATURES.update(['cents_categorical', 'dollars_categorical'])

In [72]:
MODEL_FEATURES.update(['cents_categorical', 'dollars_categorical'])

In [103]:
%%time
with Pool(16) as pool:
    encodes = pool.map(encode,CATEGORICAL_FEATURES +['subcard_categorical', 'subcard_categorical_D4'])

for name, enc in zip(CATEGORICAL_FEATURES + ['subcard_categorical', 'subcard_categorical_D4'], encodes):
    all_df[name] = enc

CPU times: user 122 ms, sys: 4.53 s, total: 4.65 s
Wall time: 5.78 s


# Numerical encoding

In [74]:
def calc_smooth_encoding(all_df, by, on, m):
    # Compute the global mean
    mean = all_df[on].mean()
    
    std = all_df[on].std()
    
    median = np.nanmedian(all_df[on])

    # Compute the number of values and the mean of each group
    agg = all_df.groupby(by)[on].agg(['count', 'mean', 'std', np.nanmedian])
    counts = agg['count']
    means = agg['mean']
    stds = agg['std']
    medians = agg['nanmedian']
    

    # Compute the "smoothed" means
    smooth = (counts * means + m * mean) / (counts + m)
    
    smooth_std = (stds * counts + m * std ) / (counts + m)
    
    smooth_median = (medians * counts + m * median ) / (counts + m)

    # Replace each value by the according smoothed mean
    return all_df[by].map(smooth), all_df[by].map(smooth_std), all_df[by].map(smooth_median)

In [75]:
%%time
amt_features = []
for col in uids:
    mean, std, median = calc_smooth_encoding(all_df, col, 'TransactionAmt', 30)
    all_df[col + '_TransactionAmt_mean'] = mean
    all_df[col + '_TransactionAmt_std'] = std
    #all_df[col + '_TransactionAmt_median'] = median
    amt_features.extend([col + '_TransactionAmt_mean', col + '_TransactionAmt_std'])#, col + '_TransactionAmt_median'])

CPU times: user 44.5 s, sys: 1min 12s, total: 1min 57s
Wall time: 1min 57s


In [76]:
%%time
D15_features = []
for col in uids:
    mean, std, median = calc_smooth_encoding(all_df, col, 'D15_day_min_max', 30)
    all_df[col + '_D15_mean'] = mean
    all_df[col + '_D15_std'] = std
    #all_df[col + '_D15_median'] = median
    D15_features.extend([col + '_D15_mean', col + '_D15_std'])#, col + '_D15_median'])

CPU times: user 45.8 s, sys: 1min 14s, total: 2min
Wall time: 2min


In [77]:
%%time
C13_features = []
for col in uids:
    mean, std, median = calc_smooth_encoding(all_df, col, 'C13', 30)
    all_df[col + '_C13_mean'] = mean
    all_df[col + '_C13_std'] = std
    #all_df[col + '_C13_median'] = median
    C13_features.extend([col + '_C13_mean', col + '_C13_std'])#, col + '_C13_median']

CPU times: user 47.3 s, sys: 1min 16s, total: 2min 3s
Wall time: 2min 3s


In [78]:
MODEL_FEATURES.update(D15_features + C13_features)

In [79]:
MODEL_FEATURES.update(amt_features)

In [80]:
values_normalization(all_df, 'day', 'subcard_reg_timestamp')
values_normalization(all_df, 'day', 'subcard_reg_timestamp_D4')

In [81]:
MODEL_FEATURES.add('subcard_reg_timestamp_day_min_max')
MODEL_FEATURES.add('subcard_reg_timestamp_D4_day_min_max')

In [104]:
all_df.to_pickle('superfinal_features.pkl')

# Splits

In [105]:
train_final = all_df[:N_TRAIN_EXAMPLES]
test_final = all_df[N_TRAIN_EXAMPLES:]

In [84]:
def downsample(df, how_strong=2):
    positive = df[df.isFraud == 1]
    negative = df[df.isFraud == 0]
    negative = negative.sample(int(len(negative) / how_strong))
    res = pd.concat([positive, negative])
    return res.sample(len(res))

# Train

In [106]:
add_cols = [col + '_day_min_max' for col in d_cols] 
add_cols = add_cols + [col + '_day_std_score' for col in d_cols]

In [107]:
import sys
sys.path.insert(0, 'stable_features/IEEE_FRAUD/')

In [108]:
from settings import CATEGORICAL_FEATURES as roman_categorical

In [109]:
import pickle

In [110]:
roman_categorical = list(set(roman_categorical) - set(['is_holiday']))

In [111]:
final_features = set(list(MODEL_FEATURES) + list(roman_feature_names))

In [112]:
d_cols = [col for col in all_df.columns if col.startswith('D') 
          and not col[-1:].isalpha() and col != 'D9' and len(col) < 4]

In [113]:
good_cols = list(MODEL_FEATURES - set(['card1']) - set(d_cols) ) + add_cols
good_categotical = list(set(CATEGORICAL_FEATURES) - set(['card1']))

In [114]:
all_good_features = set(good_cols
                        + list(roman_feature_names)) -\
    set(['card1', 'dayofmonth', 'subcard_reg_timestamp', 'subcard_reg_timestamp_D4'])
all_good_categorical = set(good_categotical + roman_categorical) -\
    set(['card1', 'is_holiday', 'dayofmonth', 'subcard_reg_timestamp', 'subcard_reg_timestamp_D4'])

all_good_categorical = all_good_categorical.intersection(all_good_features)

In [115]:
with open('superfinal_feature_names.pkl', 'wb') as f:
    pickle.dump((MODEL_FEATURES, CATEGORICAL_FEATURES, roman_feature_names, roman_categorical,
                all_good_features, all_good_categorical), f)

# Fit KFold (Scary)

In [None]:
import pickle
with open('final_feature_names.pkl', 'rb') as f:
    MODEL_FEATURES, CATEGORICAL_FEATURES, roman_feature_names, roman_categorical,\
                all_good_features, all_good_categorical = pickle.load(f)
    
all_df = pd.read_pickle('final_features.pkl')

In [None]:
all_df.shape

In [116]:
from sklearn.model_selection import GroupKFold

In [117]:
gkf = GroupKFold(6)

In [118]:
params = {'num_leaves': 200,
          'min_child_samples': 40,
          #'min_sum_hessian_in_leaf': 5e-3,
          #'max_bin': 1023,
          #'min_data_in_leaf': 
          #'scale_pos_weight': 2,
          'objective': 'binary',
          'metric': 'auc',
          'max_depth': 13,
          'learning_rate': 1.01,
          "boosting_type": "gbdt",
          "top_rate": 0.6,
          "other_rate": 0.1,
          "bagging_freq": 0,
          'nthread': 16,
          
          'pos_bagging_fraction': 0.8,
          'neg_bagging_fraction': 0.01,
          "bagging_fraction": 0.7,
          "bagging_seed": 11,
          'reg_alpha': 0.3,
          'reg_lambda': 0.3,
          
          'feature_fraction': 0.9,
          'min_data_per_group': 25,
          'cat_smooth': 500,
          'max_cat_to_onehot': 8
          #'categorical_feature': cat_cols
         }

In [None]:
scores = []
all_preds = []
oof_preds = np.zeros(len(train_final))
for train_ids, val_ids in gkf.split(train_final, train_final['isFraud'], train_final['DT_split']):
    df_tr = train_final.loc[train_ids]
    df_val = train_final.loc[val_ids]
    dtrain = lgb.Dataset(df_tr[all_good_features], label=df_tr['isFraud'],
                         categorical_feature=all_good_categorical, free_raw_data=False)
    dval = lgb.Dataset(df_val[all_good_features], label=df_val['isFraud'],
                       categorical_feature=all_good_categorical, free_raw_data=False)
    
    
    model = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=(dval),
                early_stopping_rounds=100, verbose_eval=100)
    preds = model.predict(test_final[all_good_features], num_iteration=model.best_iteration)
    oof = model.predict(df_val[all_good_features], num_iteration=model.best_iteration)
    
    all_preds.append(preds)
    oof_preds[val_ids] = oof
    scores.append(model.best_score)



Training until validation scores don't improve for 100 rounds.


In [None]:
best_score = np.mean([score['valid_0']['auc'] for score in scores])

In [None]:
best_score

In [None]:
from scipy.stats import gmean
gmean_preds = gmean(all_preds, axis=0)
np.corrcoef(gmean_preds, np.mean(all_preds, axis=0))

In [None]:
filename = f'submissions/lgb_stable_features{best_score:0.7}.csv.gz'

In [None]:
sub = pd.read_csv('data/sample_submission.csv')
sub['isFraud'] = gmean_preds
sub.to_csv(filename, index=False)

In [None]:
!KAGGLE_USERNAME=tishur KAGGLE_KEY=28da1297bec180204c1c524afa6f3d2e kaggle competitions submit ieee-fraud-detection -f {filename} -m "x"

# Split importance

In [None]:
feature_imp = pd.DataFrame(sorted(zip(model.feature_importance('split'), good_cols)), columns=['Value','Feature'])
bad_cols = feature_imp[feature_imp['Value'] < 1]['Feature']
len(bad_cols)

In [None]:
feature_imp = pd.DataFrame(sorted(zip(model.feature_importance('split'), all_good_features)), columns=['Value','Feature'])
bad_cols = feature_imp[feature_imp['Value'] < 1]['Feature']
plt.figure(figsize=(10, 20))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False)[:100])
plt.tight_layout()
plt.show()

# SHAP importance

In [None]:
shap_test = test_final.sample(7000)

In [None]:
%%time
res = model.predict(shap_test[all_good_features], pred_contrib=True)
mean_shap = np.abs(res).mean(axis=0)
shap_imp = pd.DataFrame(sorted(zip(mean_shap, all_good_features)), columns=['Value','Feature'])
plt.figure(figsize=(10, 10))
sns.barplot(x="Value", y="Feature", data=shap_imp.sort_values(by="Value", ascending=False)[:60])
plt.tight_layout()
plt.show()