In [None]:
import pandas as pd
import numpy as np

import lightgbm as lgb
import catboost as cbt
import datetime

from sklearn.preprocessing import LabelEncoder
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Column names of the identity table, taken from the CSV header only.
# nrows=0 parses the header without loading any data rows and, unlike the
# `!head` shell escape, does not depend on POSIX tools being available.
identity_cols = pd.read_csv('data/train_identity.csv', nrows=0).columns.tolist()

In [None]:
%%time
folder_path = 'data/'
train = pd.read_pickle(f'{folder_path}train.pkl')
test = pd.read_pickle(f'{folder_path}test.pkl')

In [None]:
N_TRAIN_EXAMPLES = 590540

In [None]:
N_TRAIN_EXAMPLES = len(train)

In [None]:
# Re-read the raw transaction amounts as strings so that the text on each side
# of the decimal point can be split out exactly as it is written in the CSV.
tr_am = pd.read_csv(f'{folder_path}train_transaction.csv', usecols=['TransactionAmt'], dtype=str)
te_am = pd.read_csv(f'{folder_path}test_transaction.csv', usecols=['TransactionAmt'], dtype=str)
# Split on '.' into the integer part ('dollars') and the fractional part ('cents').
# NOTE(review): astype(int) presumably raises if any amount has no '.' (the
# second column would be missing), and it drops leading zeros of the fractional
# part ('05' becomes 5) - confirm both are acceptable for the raw data.
train[['dollars', 'cents']] = tr_am.TransactionAmt.str.split('.', expand=True).astype(int)
test[['dollars', 'cents']] = te_am.TransactionAmt.str.split('.', expand=True).astype(int)

In [None]:
roman_df = pd.read_pickle('all_new_ids_roman_features.pkl')

In [None]:
roman_feature_names = set(roman_df.columns)

In [None]:
MODEL_FEATURES = set(test.columns) - set(['TransactionDT', 'TransactionID'])

In [None]:
basic_features = set(test.columns) - set(['TransactionDT', 'TransactionID'])

In [None]:
CATEGORICAL_FEATURES =  set(['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19',
            'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37',
            'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 
            'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9'])

Говорят, что неплохо бы дропнуть те карты, которые не встречаются либо в трейне, либо в тесте

In [None]:
# Keep a card value only when it occurs in both train and test; values that
# appear on one side only are replaced with NaN.
for col in ['card1']: 

    # Row counts in train whose value is absent from / present in test.
    print('No intersection in Train', len(train[~train[col].isin(test[col])]))
    print('Intersection in Train', len(train[train[col].isin(test[col])]))
    
    # Train is filtered first, so the test-side check on the next line runs
    # against the already-filtered train column.
    train[col] = np.where(train[col].isin(test[col]), train[col], np.nan)
    test[col]  = np.where(test[col].isin(train[col]), test[col], np.nan)
    print('#'*20)

# Same filter for the remaining card columns; the column name is included in
# the printed counts here.
for col in ['card2','card3','card4','card5','card6',]: 
    print('No intersection in Train', col, len(train[~train[col].isin(test[col])]))
    print('Intersection in Train', col, len(train[train[col].isin(test[col])]))
    
    train[col] = np.where(train[col].isin(test[col]), train[col], np.nan)
    test[col]  = np.where(test[col].isin(train[col]), test[col], np.nan)
    print('#'*20)

In [None]:
roman_df.reset_index(inplace=True)

In [None]:
roman_df.head()

In [None]:
# Stack train on top of test (train rows first) and renumber the rows 0..n-1.
all_df = pd.concat([train, test])
all_df.reset_index(inplace=True, drop=True)
# Attach the precomputed roman_df columns side by side.
# NOTE(review): axis=1 concat aligns on the index - assumes roman_df rows are
# in the same train-then-test order with a 0..n-1 index; confirm.
all_df = pd.concat([all_df, roman_df], axis=1)

# Фичи по датам

In [None]:
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')
all_df['DT'] = all_df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))

In [None]:
del train, test, roman_df

In [None]:
import gc

In [None]:
gc.collect()

In [None]:
def datetime_features(all_df):
    """Add calendar features derived from ``TransactionDT`` to ``all_df`` in place.

    ``TransactionDT`` is treated as a number of seconds elapsed since the
    module-level ``START_DATE``. The columns written are:

    * ``DT``          - absolute timestamp of the transaction
    * ``dayofweek``   - ``DT.dt.dayofweek``
    * ``dayofmonth``  - day of the month
    * ``hour``        - hour of the day
    * ``weekofmonth`` - 1-based index of the 7-day block inside the month
      (days 1-7 -> 1, days 8-14 -> 2, ...)

    Returns
    -------
    tuple(list, list)
        Names of the new model features and names of the new categorical
        features. Both lists hold the same four names but are separate
        objects, so a caller may mutate one without affecting the other.
    """
    # Vectorised conversion: one timedelta array instead of one Python call
    # per row.
    all_df['DT'] = pd.Timestamp(START_DATE) + pd.to_timedelta(all_df['TransactionDT'], unit='s')

    dt = all_df['DT'].dt
    all_df['dayofweek'] = dt.dayofweek
    all_df['dayofmonth'] = dt.day
    all_df['hour'] = dt.hour
    all_df['weekofmonth'] = (dt.day - 1) // 7 + 1

    new_features = ['dayofweek', 'dayofmonth', 'hour', 'weekofmonth']
    return new_features, list(new_features)

In [None]:
%%time
a, c = datetime_features(all_df)
MODEL_FEATURES.update(a)
CATEGORICAL_FEATURES.update(c)

In [None]:
# Month index counted from January 2017 (Jan 2017 -> 1, Dec 2017 -> 12,
# Jan 2018 -> 13, ...).
all_df['DT_split'] = (all_df['DT'].dt.year - 2017) * 12 + all_df['DT'].dt.month
# Number of rows whose month index is below 17, i.e. dated before May 2018.
# NOTE(review): the count runs over train and test together and is later used
# as a positional cut-off inside the train part - confirm that no test row
# falls before May 2018.
N_TRAIN = sum(all_df['DT_split'] < 17)

In [None]:
# Running day number: day-of-year plus 365 for every year elapsed since 2017,
# stored as int16.
all_df['day'] = ((all_df['DT'].dt.year-2017)*365 + all_df['DT'].dt.dayofyear).astype(np.int16)
# Running hour number, offset so that hour 0 of day 335 (1 December 2017) is 0.
all_df['hour_day'] = all_df['DT'].dt.hour + 24 * (all_df['day'] - 335)

# D-features

In [None]:
d_cols = [col for col in all_df.columns if col.startswith('D') and not col[-1:].isalpha() and col != 'D9']

In [None]:
d_cols

In [None]:
d_cols_notnull = [d + '_notnull' for d in d_cols + ['D9']]
all_df[d_cols_notnull] = all_df[d_cols + ['D9']].notnull()

In [None]:
# Transform the D8 and D9 columns.
# We are almost sure they are connected with hours.
# Flag rows where D8 is at least 1 (a missing D8 compares False and gives 0).
all_df['D8_not_same_day'] = np.where(all_df['D8']>=1,1,0)
# Fractional part of D8 (0 where D8 is missing) ...
all_df['D8_D9_decimal_dist'] = all_df['D8'].fillna(0)-all_df['D8'].fillna(0).astype(int)
# ... and its absolute distance to D9 (square root of the squared difference);
# the result is NaN where D9 is missing.
all_df['D8_D9_decimal_dist'] = ((all_df['D8_D9_decimal_dist']-all_df['D9'])**2)**0.5
# Finally truncate D8 itself to an integer, with -1 marking missing values.
all_df['D8'] = all_df['D8'].fillna(-1).astype(int)

In [None]:
MODEL_FEATURES.update(['D8_not_same_day', 'D8_D9_decimal_dist', 'D8_D9_decimal_dist'])

In [None]:
def values_normalization(all_df, period, col, clip=True, minmax=True):
    """Normalise ``col`` inside each ``period`` group and store the result in ``all_df``.

    Two columns are added to ``all_df`` in place:

    * ``{col}_{period}_min_max``   = (x - group_min) / (group_max - group_min)
    * ``{col}_{period}_std_score`` = (x - group_mean) / group_std

    Parameters
    ----------
    all_df : pandas.DataFrame
        Frame holding both ``col`` and ``period``; modified in place.
    period : str
        Name of the grouping column.
    col : str
        Name of the column to normalise; its values are cast to float.
    clip : bool, default True
        When true, negative values are replaced with 0 before the group
        statistics are computed and before the values are normalised.
    minmax : bool, default True
        Accepted for backward compatibility; currently unused (both output
        columns are always written).

    Returns
    -------
    None

    Notes
    -----
    A group whose values are all equal has ``group_max == group_min`` and a
    zero standard deviation, so the division is not finite for its rows.
    """
    new_col = col + '_' + period

    df = all_df[[col, period]].copy()
    df[col] = df[col].astype(float)
    if clip:
        df[col] = df[col].clip(0)

    aggs = df.groupby(period)[col].agg(['min', 'max', 'std', 'mean'])

    # Broadcast every group statistic back to row level. Local Series are
    # used instead of temporary columns so that nothing in all_df is clobbered.
    # Each statistic is mapped from its own aggregate: 'min' from min and
    # 'max' from max (the two used to be swapped, which produced 1 - min_max).
    group_min = df[period].map(aggs['min'])
    group_max = df[period].map(aggs['max'])
    group_std = df[period].map(aggs['std'])
    group_mean = df[period].map(aggs['mean'])

    # Normalise the same (float, optionally clipped) values the statistics
    # were computed from, so that min_max stays within [0, 1].
    values = df[col]
    all_df[new_col + '_min_max'] = ((values - group_min) / (group_max - group_min)).astype(float)
    all_df[new_col + '_std_score'] = (values - group_mean) / group_std

In [None]:
%%time
for period in ['day']:
    for col in d_cols:
        values_normalization(all_df, period, col, minmax=True)

In [None]:
for col in ['D1','D2']:
    all_df[col + '_scaled'] = all_df[col] / all_df[:N_TRAIN_EXAMPLES][col].max()

In [None]:
MODEL_FEATURES.update(['D1_scaled', 'D2_scaled'])

# Device info

In [None]:
# Replace missing device / OS / browser strings with '' so that the string
# helpers can run on every row. The result is assigned back to the frame:
# calling fillna(inplace=True) on a column selection acts on a temporary
# object under pandas Copy-on-Write and would leave all_df unchanged.
all_df['DeviceInfo'] = all_df['DeviceInfo'].fillna('')
all_df['id_30'] = all_df['id_30'].fillna('')
all_df['id_31'] = all_df['id_31'].fillna('')

In [None]:
def add_device_features(all_df):
    """Derive coarse device descriptors from ``DeviceInfo`` (in place).

    Columns written:

    * ``DeviceInfoMajor`` - text before the first space of ``DeviceInfo``
    * ``DeviceInfoTop``   - text before the first hyphen of ``DeviceInfo``
    * ``DeviceInfoIsRV``  - whether ``DeviceInfoMajor`` contains ``'rv'``

    Returns the list of new model features and the list of those that are
    categorical.
    """
    device = all_df['DeviceInfo']

    # partition() keeps only head / separator / tail, and the head is exactly
    # the first element a full split would produce.
    all_df['DeviceInfoMajor'] = device.str.partition(' ')[0]
    all_df['DeviceInfoTop'] = device.str.partition('-')[0]
    all_df['DeviceInfoIsRV'] = all_df['DeviceInfoMajor'].map(lambda major: 'rv' in major)

    model_features = ['DeviceInfoMajor', 'DeviceInfoTop', 'DeviceInfoIsRV']
    categorical_features = ['DeviceInfoMajor', 'DeviceInfoTop']
    return model_features, categorical_features

In [None]:
%%time
a, c = add_device_features(all_df)
MODEL_FEATURES.update(a)
CATEGORICAL_FEATURES.update(c)

In [None]:
def split_version(version):
    """Return the major component of a version string.

    The string is cut at the first ``'.'`` when it contains one, otherwise at
    the first ``'_'``; a string with neither separator is returned unchanged.
    """
    for separator in ('.', '_'):
        if separator in version:
            return version.split(separator)[0]
    return version


def split_os(os):
    """Split an OS description into ``(name, 'name major_version')``.

    The last space-separated token is taken as the version and everything
    before it as the OS name. A description without any space is returned as
    ``(os, '')``.
    """
    os_name, space, last_token = os.rpartition(' ')
    if not space:
        # Single token: there is no version part to extract.
        return os, ''
    return os_name, ' '.join([os_name, split_version(last_token)])
# TODO: add minor

In [None]:
def add_os_features(all_df):
    """Add ``OSName`` and ``OSMajorVersion`` columns parsed from ``id_30`` (in place).

    Every ``id_30`` value is passed through ``split_os``; the first element of
    the returned pair goes to ``OSName`` and the second to ``OSMajorVersion``.

    Returns the list of new model features and the list of new categorical
    features (the same two names in both).
    """
    # Collect the pairs first and then take them apart by position. Unlike
    # unpacking zip(*pairs) into two names, this also works for an empty frame.
    parsed = [split_os(value) for value in all_df['id_30']]
    all_df['OSName'] = [name for name, _ in parsed]
    all_df['OSMajorVersion'] = [versioned for _, versioned in parsed]
    return ['OSName', 'OSMajorVersion'], ['OSName', 'OSMajorVersion']

In [None]:
%%time
a, c = add_os_features(all_df)
MODEL_FEATURES.update(a)
CATEGORICAL_FEATURES.update(c)

In [None]:
# SHAME ON ME
def get_browser(browser):
    """Collapse a raw browser string into a browser family name.

    Matching is case-insensitive and the first rule that applies wins, in this
    order: contains ``safari``; contains ``chrome``; starts with ``ie``
    (reported as ``internetexplorer``); contains ``edge``, ``firefox``,
    ``samsung``, ``google``, ``opera`` or ``android``.

    Parameters
    ----------
    browser : str
        Raw browser description.

    Returns
    -------
    str
        The family name, or ``browser`` itself (unchanged, original case)
        when no rule matches.
    """
    # Lowercase once so that every rule treats case the same way; previously
    # only some of the checks were case-insensitive.
    lowered = browser.lower()

    if 'safari' in lowered:
        return 'safari'
    if 'chrome' in lowered:
        return 'chrome'
    if lowered.startswith('ie'):
        return 'internetexplorer'

    for family in ('edge', 'firefox', 'samsung', 'google', 'opera', 'android'):
        if family in lowered:
            return family

    return browser

def is_mobile(browser):
    """Return True when the browser string mentions 'mobile' or 'for android'.

    The check ignores letter case.
    """
    lowered = browser.lower()
    return any(marker in lowered for marker in ('mobile', 'for android'))


In [None]:
def get_browser_features(all_df):
    """Add ``Browser`` and ``IsMobile`` columns computed from ``id_31`` (in place).

    ``Browser`` holds ``get_browser(id_31)`` and ``IsMobile`` holds
    ``is_mobile(id_31)``. Returns the list of new model features and the list
    of new categorical features (the same two names in both).
    """
    raw_browser = all_df['id_31']
    all_df['Browser'] = raw_browser.map(get_browser)
    all_df['IsMobile'] = raw_browser.map(is_mobile)

    # Disabled experiment: a browser-version feature.
    #all_df['VersionNum'] = all_df['id_31'].fillna('0').\
    #    apply(lambda x: ''.join([i for i in x if i.isnumeric()]))
    #all_df['BrowserVersion'] = all_df['Browser'] + ' ' + all_df['VersionNum'].astype('str')

    model_features = ['Browser', 'IsMobile']
    categorical_features = ['Browser', 'IsMobile']
    return model_features, categorical_features

In [None]:
%%time
a, c = get_browser_features(all_df)
MODEL_FEATURES.update(a)
CATEGORICAL_FEATURES.update(c)

In [None]:
all_df.shape

# Some with M

In [None]:
########################### M columns (except M4)
# All these columns are binary encoded 1/0
# We can have some features from it
i_cols = ['M1','M2','M3','M5','M6','M7','M8','M9']

all_df['M_sum'] = all_df[i_cols].sum(axis=1).astype(np.int8)
all_df['M_na'] = all_df[i_cols].isna().sum(axis=1).astype(np.int8)

MODEL_FEATURES.update(['M_sum', 'M_na'])

# NaN count

In [None]:
# Number of non-missing values per row across the raw input columns.
# basic_features is a set, and pandas does not accept a set as a column
# indexer, so it is converted to a list for the selection.
all_df['notnull_count'] = all_df[list(basic_features)].notnull().sum(axis=1)

In [None]:
MODEL_FEATURES.add('notnull_count')

# UID

In [None]:
cols_for_uid = ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']

In [None]:
all_df[cols_for_uid] = all_df[cols_for_uid].astype(str)

In [None]:
def extract_registration_date(df):
    """Estimate a per-row registration date from ``TransactionDT`` and ``D1``.

    ``TransactionDT`` is read as seconds since the module-level ``START_DATE``
    and ``D1`` as a number of days; the registration moment is the transaction
    moment minus ``D1`` days.

    Columns written to ``df`` (in place):

    * ``TransactionDT_to_datetime`` - transaction timestamp (kept in ``df``)
    * ``subcard_reg_timestamp``     - seconds between 1970-01-01 and the
      registration calendar date
    * ``subcard_categorical``       - the registration calendar date as a string

    Returns the same ``df`` object.
    """
    source_col = 'TransactionDT'
    stamp_col = f'{source_col}_to_datetime'
    unix_epoch = datetime.date(1970, 1, 1)

    df[stamp_col] = df[source_col].apply(
        lambda seconds: START_DATE + datetime.timedelta(seconds=seconds)
    )

    # Two helper columns that are dropped again before returning.
    df['card_registered_delta_tmp'] = pd.to_timedelta(df['D1'], unit='day')
    df['subcard_reg_date'] = df['TransactionDT_to_datetime'] - df['card_registered_delta_tmp']

    # Only the calendar date is kept; the time of day is discarded.
    registration_day = df['subcard_reg_date'].dt.date
    df['subcard_reg_timestamp'] = registration_day.apply(
        lambda day: (day - unix_epoch).total_seconds()
    )
    df['subcard_categorical'] = registration_day.astype(str)

    df.drop(columns=['card_registered_delta_tmp', 'subcard_reg_date'], inplace=True)
    return df

In [None]:
all_df = extract_registration_date(all_df)

In [None]:
all_df['subcard_categorical_full'] = all_df['card1'] + '_' + all_df['subcard_categorical'].astype('str')

In [None]:
also_add = [('id_19', 'DeviceType', 'id_15'),
            ('id_19', 'id_15', 'R_emaildomain'),
            ('id_15', 'id_38', 'id_13')]

In [None]:
subcard_id_cols = ['addr1', 'DeviceInfo', 'P_emaildomain', 'ProductCD', 'id_20', 'id_19', 'card4']
subcard_id_cols.extend(also_add)

In [None]:
# Build finer-grained ids by appending one or more extra columns to
# 'subcard_categorical_full'. An entry of subcard_id_cols is either a single
# column name or a tuple of column names.
subcard_ids = []
for coll in subcard_id_cols:
    if not isinstance(coll, str):
        # Tuple entry: join the column names for the new column's name and the
        # row values (as strings) for its content, both with '_'.
        col = '_'.join(coll)
        all_df['subcard_' + col] = all_df['subcard_categorical_full'].astype(str) + '_' +\
            all_df[list(coll)].astype(str).apply(lambda x: '_'.join(x), axis=1)
    else:
        # Single column: append its string value.
        col = coll
        all_df['subcard_' + col] = all_df['subcard_categorical_full'].astype(str) + '_' +\
            all_df[coll].astype(str)
    
    # Remember the name of the id column just created.
    subcard_ids.append('subcard_' + col)

In [None]:
MODEL_FEATURES.update(['subcard_categorical', 'subcard_categorical_full'])
CATEGORICAL_FEATURES.update(['subcard_categorical', 'subcard_categorical_full'])

In [None]:
MODEL_FEATURES.update(subcard_ids)
CATEGORICAL_FEATURES.update(subcard_ids)

In [None]:
all_df['uid1'] = all_df['card1'] + '_' + all_df['card2']

all_df['uid2'] = all_df['uid1'] + '_' + all_df['card3'] + '_' + all_df['card5']

all_df['uid3'] = all_df['uid2'] + '_' + all_df['addr1'] + '_' + all_df['addr2']

all_df['uid4'] = all_df['uid3'] + '_' + all_df['P_emaildomain']

all_df['uid5'] = all_df['uid3'] + '_' + all_df['R_emaildomain']


In [None]:
uids = [f'uid{i}' for i in range(1, 6)] + ['subcard_categorical_full'] + subcard_ids

In [None]:
subcard_ids

 # TransactionAmt features

Говорят, если в сумме транзакции больше двух цифр после запятой, то это транзакция в иностранной валюте

In [None]:
# Flag amounts whose fractional part is written with more than two digits.
# NOTE(review): 'cents' was cast to int when it was created, so leading zeros
# are already gone here (a fractional part written as '095' is seen as 95 and
# is not flagged) - confirm whether the check should use the raw string.
all_df['is_foreign'] = all_df['cents'].apply(lambda x: len(str(x)) > 2)

In [None]:
MODEL_FEATURES.add('is_foreign')

А раз уникальных TransactionAmt не очень много, то почему бы не добавить nunique для каждого из айдишников сверху

In [None]:
from sklearn.preprocessing import QuantileTransformer
from collections import Counter

In [None]:
%%time
unique_amt_cols = []
for uid in uids:
    uniques = all_df.groupby(uid)['TransactionAmt'].nunique()
    all_df[uid + '_unique_amt'] = all_df[uid].map(uniques)
    unique_amt_cols.append(uid + '_unique_amt')
    
QT = QuantileTransformer(n_quantiles=500)
all_df[unique_amt_cols] = QT.fit_transform(all_df[unique_amt_cols])

In [None]:
MODEL_FEATURES.update(unique_amt_cols)

# Следующая транзакция

In [None]:
%%time
# For every id column, count how many of the 4 preceding and 4 following rows
# of the same id carry exactly the same TransactionAmt as the current row.
near_cols = []
for col in ['card1', 'subcard_categorical_full'] + subcard_ids:
    print(col)
    amt_by_id = all_df.groupby(col)['TransactionAmt']

    # The counter is rebuilt from scratch for every id column. The previous
    # version kept extending one shared list of temporary column names, so
    # from the second id column on the same temporary columns were selected
    # several times and the resulting count was multiplied accordingly.
    matches = pd.Series(0, index=all_df.index)
    for i in range(1, 5):
        # diff(i) compares with the row i steps earlier in the group,
        # diff(-i) with the row i steps later; a zero difference is a match.
        matches = matches + (amt_by_id.diff(i) == 0).astype(int)
        matches = matches + (amt_by_id.diff(-i) == 0).astype(int)

    all_df[f'same_transaction_near_{col}'] = matches
    near_cols.append(f'same_transaction_near_{col}')

In [None]:
MODEL_FEATURES.update(near_cols)

# Frequency Encoding

In [None]:
def encode_frequency(col, quantile=True):
    """Replace every value of ``col`` with the number of times it occurs in ``col``.

    Missing values are not counted and stay missing in the result.
    ``quantile`` is accepted but not used.
    """
    occurrences = col.value_counts()
    return col.map(occurrences)

In [None]:
uids

In [None]:
%%time
# Frequency-encode every categorical feature, every uid and the two amount
# parts. Several uid columns are also registered as categorical features, so
# the combined list is de-duplicated (keeping first-seen order): each column
# is encoded once and freq_cols carries no repeated labels.
freq_cols = []
for col in dict.fromkeys(list(CATEGORICAL_FEATURES) + uids + ['cents', 'dollars']):
    all_df[col + '_freq'] = encode_frequency(all_df[col])
    freq_cols.append(col + '_freq')

# Map the raw counts through a quantile transform.
QT = QuantileTransformer(n_quantiles=500)
all_df[freq_cols] = QT.fit_transform(all_df[freq_cols])

In [None]:
MODEL_FEATURES.update(freq_cols)

# Categorical

In [None]:
from multiprocessing import Pool

In [None]:
def encode(col):
    """Label-encode column ``col`` of the module-level ``all_df``.

    The values are converted to strings first; the integer codes produced by
    ``LabelEncoder`` are returned as an array.
    """
    as_text = all_df[col].astype(str).values
    return LabelEncoder().fit_transform(as_text)

In [None]:
all_df['cents_categorical'] = all_df['cents'].copy()
all_df['dollars_categorical'] = all_df['dollars'].copy()

In [None]:
CATEGORICAL_FEATURES.update(['cents_categorical', 'dollars_categorical'])

In [None]:
MODEL_FEATURES.update(['cents_categorical', 'dollars_categorical'])

In [None]:
%%time
# Label-encode all categorical columns in 16 worker processes.
# NOTE(review): encode() reads the module-level all_df, so this presumably
# relies on the workers inheriting it (fork start method) - confirm on the
# target platform.
with Pool(16) as pool:
    encodes = pool.map(encode, CATEGORICAL_FEATURES)

# CATEGORICAL_FEATURES is a set; it is iterated a second time here without
# having been modified in between, which is what pairs each name with its own
# encoded array.
for name, enc in zip(CATEGORICAL_FEATURES, encodes):
    all_df[name] = enc

# Numerical encoding

In [None]:
def calc_smooth_encoding(all_df, by, on, m):
    """Group statistics of ``on`` per ``by``, shrunk towards the global value.

    For the mean, the standard deviation and the median alike, the value
    assigned to a group is

        (group_stat * group_count + m * global_stat) / (group_count + m)

    where ``group_count`` is the number of non-missing ``on`` values in the
    group.

    Parameters
    ----------
    all_df : pandas.DataFrame
    by : str
        Grouping column.
    on : str
        Column whose statistics are computed.
    m : number
        Weight of the global statistic.

    Returns
    -------
    tuple of three pandas.Series
        Smoothed mean, smoothed std and smoothed median, each aligned with the
        rows of ``all_df``.
    """
    target = all_df[on]
    global_mean = target.mean()
    global_std = target.std()
    global_median = np.nanmedian(target)

    per_group = all_df.groupby(by)[on].agg(
        count='count', mean='mean', std='std', median='median'
    )
    group_size = per_group['count']
    total_weight = group_size + m

    def shrink(group_stat, global_stat):
        # Weighted average of the group statistic and the global one.
        return (group_stat * group_size + m * global_stat) / total_weight

    keys = all_df[by]
    return (
        keys.map(shrink(per_group['mean'], global_mean)),
        keys.map(shrink(per_group['std'], global_std)),
        keys.map(shrink(per_group['median'], global_median)),
    )

In [None]:
%%time
amt_features = []
for col in uids:
    mean, std, median = calc_smooth_encoding(all_df, col, 'TransactionAmt', 30)
    all_df[col + '_TransactionAmt_mean'] = mean
    all_df[col + '_TransactionAmt_std'] = std
    #all_df[col + '_TransactionAmt_median'] = median
    amt_features.extend([col + '_TransactionAmt_mean', col + '_TransactionAmt_std'])#, col + '_TransactionAmt_median'])

In [None]:
%%time
D15_features = []
for col in uids:
    mean, std, median = calc_smooth_encoding(all_df, col, 'D15_day_min_max', 30)
    all_df[col + '_D15_mean'] = mean
    all_df[col + '_D15_std'] = std
    #all_df[col + '_D15_median'] = median
    D15_features.extend([col + '_D15_mean', col + '_D15_std'])#, col + '_D15_median'])

In [None]:
%%time
C13_features = []
for col in uids:
    mean, std, median = calc_smooth_encoding(all_df, col, 'C13', 30)
    all_df[col + '_C13_mean'] = mean
    all_df[col + '_C13_std'] = std
    #all_df[col + '_C13_median'] = median
    C13_features.extend([col + '_C13_mean', col + '_C13_std'])#, col + '_C13_median']

In [None]:
MODEL_FEATURES.update(D15_features + C13_features)

In [None]:
MODEL_FEATURES.update(amt_features)

In [None]:
values_normalization(all_df, 'day', 'subcard_reg_timestamp')

In [None]:
MODEL_FEATURES.add('subcard_reg_timestamp_day_min_max')

In [None]:
all_df.to_pickle('many_ids_all_features.pkl')

# Power Transform

In [None]:
from sklearn.preprocessing import power_transform
from scipy.stats import skew


def deskew_this_data(df, col_names, min_skew, method='yeo-johnson'):
    """Power-transform, in place, the columns whose skewness exceeds ``min_skew``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    col_names : iterable of str
        Candidate numeric columns.
    min_skew : float
        A column is transformed only when its sample skewness is strictly
        greater than this value.
    method : str, default 'yeo-johnson'
        Passed through to ``power_transform``.

    Returns
    -------
    None
        ``df`` is left untouched when no column qualifies.
    """
    # Skewness is measured on the non-missing values only: one NaN would
    # otherwise turn the statistic into NaN and the column would never be
    # selected.
    transform_cols = [
        col for col in col_names
        if skew(df[col].dropna()) > min_skew
    ]
    if not transform_cols:
        return

    # Selecting with a list always gives a 2-D frame, even for one column, so
    # no reshape is needed (DataFrame has no reshape method in any case).
    df[transform_cols] = power_transform(df[transform_cols], method=method)

In [None]:
numeric_features = list(MODEL_FEATURES - CATEGORICAL_FEATURES)

In [None]:
all_df['uid1_TransactionAmt_std'].hist(bins=100)

In [None]:
res = power_transform(all_df['uid1_TransactionAmt_std'].values.reshape(-1, 1))

In [None]:
numeric_features = list(MODEL_FEATURES - CATEGORICAL_FEATURES)
mask = all_df[numeric_features].dtypes != 'O'
numeric_final = np.array(numeric_features)[mask]

In [None]:
%%time
res = power_transform(all_df[numeric_final], method='yeo-johnson')

In [None]:
skew(all_df['uid1_TransactionAmt_std'], nan_policy='omit')

In [None]:
numeric_features

# Splits

In [None]:
values_normalization(all_df, 'day', 'subcard_reg_timestamp')

In [None]:
train_final = all_df[:N_TRAIN_EXAMPLES]
test_final = all_df[N_TRAIN_EXAMPLES:]

tr = train_final[:N_TRAIN]
val = train_final[N_TRAIN:]

In [None]:
def downsample(df, how_strong=2, random_state=None):
    """Keep every fraud row and a ``1 / how_strong`` share of the non-fraud rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``isFraud`` column with values 0 / 1.
    how_strong : number, default 2
        Divisor applied to the number of non-fraud rows; the result is
        truncated to an integer.
    random_state : optional
        Passed to both ``DataFrame.sample`` calls. Leave as ``None`` for the
        previous unseeded behaviour, or pass a seed for a reproducible sample.

    Returns
    -------
    pandas.DataFrame
        The retained rows in shuffled order, with their original index labels.
    """
    positive = df[df.isFraud == 1]
    negative = df[df.isFraud == 0]

    n_keep = int(len(negative) / how_strong)
    negative = negative.sample(n_keep, random_state=random_state)

    res = pd.concat([positive, negative])
    # Shuffle so that the fraud rows are not all at the top.
    return res.sample(len(res), random_state=random_state)

In [None]:
tr = downsample(tr, 10)
val = downsample(val, 10)

# Train

In [None]:
add_cols = [col + '_day_min_max' for col in d_cols] 
add_cols = add_cols + [col + '_day_std_score' for col in d_cols]

In [None]:
import sys
sys.path.insert(0, 'old_roman/IEEE_FRAUD/')

In [None]:
from settings import CATEGORICAL_FEATURES as roman_categorical

In [None]:
import pickle

In [None]:
roman_categorical = list(set(roman_categorical) - set(['is_holiday']))

In [None]:
final_features = set(list(MODEL_FEATURES) + list(roman_feature_names))

In [None]:
d_cols = [col for col in all_df.columns if col.startswith('D') 
          and not col[-1:].isalpha() and col != 'D9' and len(col) < 4]

In [None]:
good_cols = list(MODEL_FEATURES - set(['card1']) - set(d_cols) ) + add_cols + ['subcard_reg_timestamp_day_min_max']
good_categotical = list(set(CATEGORICAL_FEATURES) - set(['card1']))

In [None]:
# Final feature set: the selected columns of this notebook plus the roman
# features, minus the columns excluded from modelling.
all_good_features = set(good_cols + list(roman_feature_names)) - set(['card1', 'dayofmonth'])
all_good_categorical = set(good_categotical + roman_categorical) - set(['card1', 'is_holiday', 'dayofmonth'])

# Only a feature that is actually fed to the model can be declared categorical.
all_good_categorical = all_good_categorical.intersection(all_good_features)

# Freeze both collections as sorted lists: pandas does not accept a set as a
# column indexer, and a fixed order keeps the column order (and therefore the
# pairing of names with importances) identical from run to run.
all_good_features = sorted(all_good_features)
all_good_categorical = sorted(all_good_categorical)

In [None]:
with open('device_info_col_names.pkl', 'wb') as f:
    pickle.dump((MODEL_FEATURES, CATEGORICAL_FEATURES, roman_feature_names, roman_categorical,
                all_good_features, all_good_categorical), f)

In [None]:
len(all_good_features)

In [None]:
# Build one train / validation pair from the first GroupKFold split, grouped
# by the month index so that a month never lands on both sides.
# The splitter is created here because this cell runs before any other cell
# defines it.
from sklearn.model_selection import GroupKFold

gkf = GroupKFold(6)
for train_ids, val_ids in gkf.split(train_final, train_final['isFraud'], train_final['DT_split']):
    # split() yields positional indices, hence iloc rather than loc.
    df_tr = train_final.iloc[train_ids]
    df_val = train_final.iloc[val_ids]
    # Columns are selected with a list; a set is not a valid pandas indexer.
    dtrain = lgb.Dataset(df_tr[list(all_good_features)], label=df_tr['isFraud'],
                         categorical_feature=list(all_good_categorical), free_raw_data=False)
    dval = lgb.Dataset(df_val[list(all_good_features)], label=df_val['isFraud'],
                       categorical_feature=list(all_good_categorical), free_raw_data=False)
    break

In [None]:
# LightGBM datasets for the time-based train / validation split.
# Columns are selected with a list because a set is not a valid pandas indexer.
dtrain = lgb.Dataset(tr[list(all_good_features)], label=tr['isFraud'],
                     categorical_feature=list(all_good_categorical), free_raw_data=False)
dval = lgb.Dataset(val[list(all_good_features)], label=val['isFraud'],
                   categorical_feature=list(all_good_categorical), free_raw_data=False)

In [None]:
def fit_on_features(drop_feature):
    """Train on all good features except ``drop_feature`` and return the best scores.

    Uses the module-level ``tr`` / ``val`` frames and ``params``. Training
    runs for at most 10000 rounds with early stopping after 25 rounds; the
    return value is ``model.best_score``.
    """
    excluded = set([drop_feature])
    features = list(set(all_good_features) - excluded)
    categorical = list(set(all_good_categorical) - excluded)

    def build_dataset(frame):
        # Same construction for the training and the validation part.
        return lgb.Dataset(frame[features], label=frame['isFraud'],
                           categorical_feature=categorical,
                           free_raw_data=False)

    dtrain = build_dataset(tr)
    dval = build_dataset(val)

    booster = lgb.train(
        params,
        dtrain,
        num_boost_round=10000,
        valid_sets=(dval, dtrain),
        valid_names=('val', 'train'),
        early_stopping_rounds=25,
        verbose_eval=0,
    )
    return booster.best_score

In [None]:
params = {'num_leaves': 200,
          'min_child_samples': 40,
          #'min_sum_hessian_in_leaf': 5e-3,
          #'max_bin': 1023,
          #'min_data_in_leaf': 
          #'scale_pos_weight': 2,
          'objective': 'binary',
          'metric': 'auc',
          'max_depth': 13,
          'learning_rate': 0.01,
          "boosting_type": "goss",
          "top_rate": 0.6,
          "other_rate": 0.1,
          "bagging_freq": 0,
          'nthread': 16,
          
          'pos_bagging_fraction': 0.8,
          'neg_bagging_fraction': 0.01,
          "bagging_fraction": 0.6,
          "bagging_seed": 11,
          'reg_alpha': 0.3,
          'reg_lambda': 0.3,
          
          'feature_fraction': 0.7,
          'min_data_per_group': 25,
          'cat_smooth': 500,
          'max_cat_to_onehot': 8
          #'categorical_feature': cat_cols
         }

In [None]:
%%time
model = lgb.train(params, dtrain, num_boost_round=10000,
                  valid_sets=(dval, dtrain), valid_names=('val', 'train'),
                early_stopping_rounds=25, verbose_eval=100)

In [None]:
# Validation AUC notes:
# 0.928021
#
# 0.94599  - dropping at -1e-4
# 0.946581 - dropping at -1e-3
# 0.949... - dropping nothing
# 0.949693 - dropped 10

In [None]:
DEFAULT_AUC = 0.9451639985019628

In [None]:
import warnings

In [None]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    feature_metric_loss = {}
    for i, feature in enumerate(all_good_features):
        best_score = fit_on_features(feature)
        feature_metric_loss[feature] = DEFAULT_AUC - best_score['val']['auc']
        print(feature, f'with №{i} loss change:', feature_metric_loss[feature])

In [None]:
import json

In [None]:
with open('feature_auc_loss.json', 'w') as f:
    json.dump(feature_metric_loss, f)

In [None]:
feature_imp = pd.DataFrame.from_dict(feature_metric_loss, orient='index')
feature_imp.reset_index(inplace=True)
feature_imp.columns = ['Feature', 'Value']


In [None]:
feature_imp[feature_imp['Value'] < -0.001]

In [None]:
plt.figure(figsize=(10, 20))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False)[-100:])
plt.tight_layout()
plt.show()

In [None]:
%%time
model = lgb.train(params, dtrain, num_boost_round=10000,
                  valid_sets=(dval, dtrain), valid_names=('val', 'train'),
                early_stopping_rounds=25, verbose_eval=100)

In [None]:
model.best_score

In [None]:
0.945351

In [None]:
# 0.947028 0.946727 0.946764

In [None]:
%%time
model = lgb.train(params, dtrain, num_boost_round=10000,
                  valid_sets=(dval, dtrain), valid_names=('val', 'train'),
                early_stopping_rounds=25, verbose_eval=25)

# Split importance

In [None]:
# Features the model never split on. Importances are paired with the feature
# names reported by the booster itself, so every value is attributed to the
# column it belongs to; good_cols is not the list the model was trained on.
feature_imp = pd.DataFrame(sorted(zip(model.feature_importance('split'), model.feature_name())),
                           columns=['Value','Feature'])
bad_cols = feature_imp[feature_imp['Value'] < 1]['Feature']
len(bad_cols)

In [None]:
feature_imp = pd.DataFrame(sorted(zip(model.feature_importance('split'), all_good_features)), columns=['Value','Feature'])
bad_cols = feature_imp[feature_imp['Value'] < 1]['Feature']
plt.figure(figsize=(10, 20))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False)[:100])
plt.tight_layout()
plt.show()

# SHAP importance

In [None]:
shap_test = test_final.sample(7000)

In [None]:
%%time
res = model.predict(shap_test[all_good_features], pred_contrib=True)
mean_shap = np.abs(res).mean(axis=0)
shap_imp = pd.DataFrame(sorted(zip(mean_shap, all_good_features)), columns=['Value','Feature'])
plt.figure(figsize=(10, 10))
sns.barplot(x="Value", y="Feature", data=shap_imp.sort_values(by="Value", ascending=False)[:60])
plt.tight_layout()
plt.show()

# Fit KFold (Scary)

In [None]:
from sklearn.model_selection import GroupKFold

In [None]:
gkf = GroupKFold(6)

In [None]:
params = {'num_leaves': 200,
          'min_child_samples': 40,
          #'min_sum_hessian_in_leaf': 5e-3,
          #'max_bin': 1023,
          #'min_data_in_leaf': 
          #'scale_pos_weight': 2,
          'objective': 'binary',
          'metric': 'auc',
          'max_depth': 13,
          'learning_rate': 0.01,
          "boosting_type": "gbdt",
          "top_rate": 0.6,
          "other_rate": 0.1,
          "bagging_freq": 0,
          'nthread': 16,
          
          'pos_bagging_fraction': 0.8,
          'neg_bagging_fraction': 0.01,
          "bagging_fraction": 0.7,
          "bagging_seed": 11,
          'reg_alpha': 0.3,
          'reg_lambda': 0.3,
          
          'feature_fraction': 0.9,
          'min_data_per_group': 25,
          'cat_smooth': 500,
          'max_cat_to_onehot': 8
          #'categorical_feature': cat_cols
         }

In [None]:
import gc

In [None]:
del dtrain, dval

In [None]:
gc.collect()

In [None]:
len(all_good_features)

In [None]:
# Grouped cross-validation: one model per fold, each scored on its held-out
# months and used to predict the test set.
scores = []
all_preds = []
# Columns are selected with lists; a set is not a valid pandas indexer.
cv_features = list(all_good_features)
cv_categorical = list(all_good_categorical)
for train_ids, val_ids in gkf.split(train_final, train_final['isFraud'], train_final['DT_split']):
    # split() yields positional indices, hence iloc rather than loc.
    df_tr = train_final.iloc[train_ids]
    df_val = train_final.iloc[val_ids]
    dtrain = lgb.Dataset(df_tr[cv_features], label=df_tr['isFraud'],
                         categorical_feature=cv_categorical, free_raw_data=False)
    dval = lgb.Dataset(df_val[cv_features], label=df_val['isFraud'],
                       categorical_feature=cv_categorical, free_raw_data=False)

    # A single validation set keeps LightGBM's default name 'valid_0'.
    model = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=[dval],
                      early_stopping_rounds=100, verbose_eval=100)
    preds = model.predict(test_final[cv_features], num_iteration=model.best_iteration)

    all_preds.append(preds)
    scores.append(model.best_score)

In [None]:
scores

In [None]:
[score['valid_0']['auc'] for score in scores]

In [None]:
best_score = np.mean([score['valid_0']['auc'] for score in scores])

In [None]:
best_score

In [None]:
from scipy.stats import gmean

In [None]:
from scipy.stats import gmean
gmean_preds = gmean(all_preds, axis=0)
np.corrcoef(gmean_preds, np.mean(all_preds, axis=0))

In [None]:
filename = f'submissions/lgb_with_many_ids{best_score:0.7}.csv.gz'

In [None]:
sub = pd.read_csv('data/sample_submission.csv')
sub['isFraud'] = gmean_preds
sub.to_csv(filename, index=False)

In [None]:
!KAGGLE_USERNAME=tishur KAGGLE_KEY=28da1297bec180204c1c524afa6f3d2e kaggle competitions submit ieee-fraud-detection -f {filename} -m "x"

In [None]:
f'submissions/kfold_without_te_12_folds{best_score:0.4}.csv.gz'

In [None]:
sub2 = pd.read_csv('submissions/gkfold_d_min_max_0.94122.csv.gz')

In [None]:
np.corrcoef(sub.isFraud, sub2.isFraud)

# Permutation Importance

# Adversarial

In [None]:
# Adversarial-validation target: 0 for train rows, 1 for test rows.
# train and test were deleted earlier in the notebook, so the label is derived
# from N_TRAIN_EXAMPLES instead (train rows come first in all_df).
all_df['is_test'] = (np.arange(len(all_df)) >= N_TRAIN_EXAMPLES).astype(int)

In [None]:
adv_df = all_df.sample(100000)

In [None]:
good_cols = list(MODEL_FEATURES - set(['VersionNum'])) #- set(baaad))# -\
#                 set(['BrowserVersion', 'V9', 'id_31', 'id_13']))
good_categotical = list(set(CATEGORICAL_FEATURES))# - set(baaad))# -\
#                        set([ 'BrowserVersion', 'id_31', 'id_13']))\

In [None]:
d_adv = lgb.Dataset(adv_df[good_cols], label=adv_df['is_test'], categorical_feature=good_categotical,
                    free_raw_data=False)

In [None]:
adv_model = lgb.train(params, d_adv,
                      num_boost_round=60,
                      verbose_eval=10)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
pred = adv_model.predict(adv_df[good_cols])

In [None]:
roc_auc_score(adv_df['is_test'], pred)

In [None]:
['card1', 'card2', 'BrowserVersion', 'V9', 'id_31', 'id_13']
baaad = ['id_13_freq', 'D15', 'dist1', 'D11', 'dayofmonth',
         'D10', 'BrowserVersion_freq', 'id_31_freq',
         'dayofmonth_freq', 'D4', 'C12', 'V326', 'id_38',
         'M_na', 'V335', 'id_38_freq', 'V8', 'M9_freq', 'id_01', 'id_34', 'Browser',
         'id_34_freq', 'OSMajorVersion', 'OSMajorVersion_freq','card5', 'id_32', 'id_30_freq', 'id_33_freq'] +\
        [f for f in all_df.columns if f.startswith('D') and len(f) <= 3] +\
        [f for f in all_df.columns if f.startswith('V') and len(f) <= 4] +\
        [f for f in all_df.columns if f.startswith('C') and len(f) <= 4] +\
        [f for f in all_df.columns if f.startswith('id_')
         or f.startswith('uid') or f.startswith('addr')] +\
        [f for f in all_df.columns if f.startswith('M') and (len(f) <= 4 or f.endswith('freq'))]

In [None]:
# Train part of id_32. train was deleted earlier; its rows are the first
# N_TRAIN_EXAMPLES rows of all_df.
# NOTE(review): id_32 in all_df has been label-encoded by this point, so the
# x-axis shows codes rather than the raw values.
all_df[:N_TRAIN_EXAMPLES]['id_32'].hist()

In [None]:
# Test part of id_32. test was deleted earlier; its rows are the rows of
# all_df that follow the first N_TRAIN_EXAMPLES.
# NOTE(review): id_32 in all_df has been label-encoded by this point, so the
# x-axis shows codes rather than the raw values.
all_df[N_TRAIN_EXAMPLES:]['id_32'].hist()

In [None]:
# Train part of V8. train was deleted earlier, so the slice uses
# N_TRAIN_EXAMPLES instead of len(train).
all_df[:N_TRAIN_EXAMPLES]['V8'].hist()

In [None]:
# Test part of V8. train was deleted earlier, so the slice uses
# N_TRAIN_EXAMPLES instead of len(train).
all_df[N_TRAIN_EXAMPLES:]['V8'].hist()

In [None]:
len(good_cols)

In [None]:
all_df['BrowserVersion'].value_counts()

In [None]:
%%time
sample = adv_df.sample(5000)
res = adv_model.predict(sample[good_cols], pred_contrib=True)
mean_shap = np.abs(res).mean(axis=0)
shap_imp = pd.DataFrame(sorted(zip(mean_shap, good_cols)), columns=['Value','Feature'])
plt.figure(figsize=(10, 10))
sns.barplot(x="Value", y="Feature", data=shap_imp.sort_values(by="Value", ascending=False)[:60])
plt.tight_layout()
plt.show()

In [None]:
from eli5.sklearn import PermutationImportance

In [None]:
from lightgbm import LGBMClassifier

In [None]:
lgb_clf = LGBMClassifier(**params)

In [None]:
%%time
lgb_clf.fit(tr[good_cols].fillna(-1), tr['isFraud'], feature_name=good_cols, categorical_feature=good_categotical)

In [None]:
imp = PermutationImportance(lgb_clf, 'roc_auc', refit=False)

In [None]:
%%time
res = imp.fit(val[good_cols].fillna(-1), val['isFraud'])

# PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Columns of all_df whose name starts with 'C' and is at most 3 characters
# long (names such as 'C1' or 'C14'); these are the PCA inputs below.
C_cols = [name for name in all_df.columns if len(name) < 4 and name.startswith('C')]

In [None]:
# Project the C columns onto principal components; missing values are replaced
# with 0 first. A float n_components in (0, 1) asks scikit-learn to keep the
# smallest number of components whose cumulative explained variance exceeds
# that fraction (here 99%).
# NOTE(review): no scaling is applied in this cell, so if the columns were not
# standardized earlier the highest-variance columns will dominate -- confirm
# this is intended.
C_pca = PCA(n_components=0.99).fit_transform(all_df[C_cols].fillna(0))

In [None]:
# Sanity check: shape of the first component column, i.e. (number of rows,).
C_pca[:, 0].shape

In [None]:
# Store every component of C_pca in all_df as a column named 'C_PCA_<k>' and
# keep the new column names in pca_cols.
pca_cols = [f'C_PCA_{k}' for k in range(C_pca.shape[1])]
for k, pca_name in enumerate(pca_cols):
    all_df[pca_name] = C_pca[:, k]
    

In [None]:
# Register the C-group PCA columns as model features (in-place set union).
MODEL_FEATURES |= set(pca_cols)

In [None]:
# Columns of all_df whose name starts with 'V' and is at most 4 characters
# long (names such as 'V1' or 'V339'); these are the PCA inputs below.
V_cols = [name for name in all_df.columns if len(name) < 5 and name.startswith('V')]

In [None]:
%%time
# Project the V columns onto principal components; missing values are replaced
# with 0 first. A float n_components in (0, 1) asks scikit-learn to keep the
# smallest number of components whose cumulative explained variance exceeds
# that fraction (here 99%).
# NOTE(review): no scaling is applied in this cell, so if the columns were not
# standardized earlier the highest-variance columns will dominate -- confirm
# this is intended.
V_pca = PCA(n_components=0.99).fit_transform(all_df[V_cols].fillna(0))

In [None]:
# Store every component of V_pca in all_df as a column named 'V_PCA_<k>' and
# keep the new column names in v_pca_cols.
v_pca_cols = [f'V_PCA_{k}' for k in range(V_pca.shape[1])]
for k, pca_name in enumerate(v_pca_cols):
    all_df[pca_name] = V_pca[:, k]
    

In [None]:
# Register the V-group PCA columns as model features (in-place set union).
MODEL_FEATURES |= set(v_pca_cols)

In [None]:
# Add '<D>_diff' columns holding the row-to-row difference of each raw D column,
# and collect the new column names in diff_cols.
#
# Raw D columns are selected as 'D' followed only by digits. The previous
# filter (`len(col) < 3 and col != 'DT'`) admitted two-character names only, so
# D10..D15 were silently skipped, unlike the 'C' and 'V' selectors above, whose
# length limits allow multi-digit suffixes. The digit test also makes the
# explicit 'DT' exclusion unnecessary.
d_cols = [col for col in all_df.columns if col.startswith('D') and col[1:].isdigit()]
diff_cols = [col + '_diff' for col in d_cols]
for col, diff_col in zip(d_cols, diff_cols):
    # NOTE(review): diff() runs over consecutive rows of all_df in its current
    # order, with no grouping -- confirm that the row order makes this
    # difference meaningful.
    all_df[diff_col] = all_df[col].diff()

In [None]:
# Hard-coded list of feature/column names (raw columns such as 'V143' or
# 'addr1', '*_freq' encodings, 'uid*' aggregates, 'D*_day_min_max', ...).
# NOTE(review): the purpose of `g` is not visible in this part of the notebook;
# presumably a saved feature subset -- confirm, and consider a descriptive name.
g = ['id_33_freq', 'V143', 'id_11', 'id_20_freq', 'V123', 'V323', 'V267', 'uid5_D15_std', 'V269', 'id_24', 'V212', 'V234', 'uid2_D15_std', 'V24', 'V259', 'V316', 'V12', 'V334', 'V115', 'V261', 'V77', 'M6', 'C8', 'V57', 'uid3_D15_std', 'V92', 'id_28', 'V67', 'V34', 'R_emaildomain_2', 'V228', 'C12', 'V4', 'V328', 'V248', 'DeviceType_freq', 'C6', 'V294', 'V106', 'V265', 'id_12_freq', 'V139', 'V250', 'V258', 'V39', 'V18', 'V32', 'V48', 'id_08', 'addr1', 'V80', 'id_02', 'V320', 'M6_freq', 'id_23', 'uid5_TransactionAmt_std', 'OSMajorVersion_freq', 'V111', 'V10', 'P_emaildomain_1_freq', 'V225', 'R_emaildomain_3_freq', 'V199', 'D9', 'V244', 'M7', 'V50', 'OSMajorVersion', 'id_12', 'id_37_freq', 'V313', 'V101', 'id_38', 'V231', 'V216', 'V186', 'V220', 'V293', 'V55', 'uid3_freq', 'V170', 'V84', 'V98', 'V140', 'V256', 'DeviceInfoMajor', 'id_32_freq', 'V181', 'id_25_freq', 'V284', 'V29', 'V58', 'R_emaildomain_1', 'addr2_freq', 'V301', 'id_18', 'dayofmonth_freq', 'V63', 'V298', 'V289', 'V281', 'V255', 'M_na', 'V292', 'M3_freq', 'uid2_D15_mean', 'D12_day_min_max', 'V81', 'V300', 'dayofweek_freq', 'V270', 'uid4_TransactionAmt_std', 'V15', 'V175', 'id_32', 'id_19', 'V69', 'V178', 'V36', 'V260', 'V242', 'D14_day_min_max', 'V251', 'V20', 'DeviceInfo', 'V224', 'V86', 'id_37', 'id_35_freq', 'V156', 'V276', 'V303', 'addr1_freq', 'V253', 'V168', 'V335', 'V132', 'V79', 'V266', 'id_15_freq', 'V208', 'V108', 'V90', 'V185', 'V91', 'decimal_len', 'R_emaildomain_1_freq', 'P_emaildomain_2_freq', 'V133', 'V167', 'C7', 'id_10', 'V271', 'V213', 'V198', 'id_05', 'DeviceInfo_freq', 'V94', 'V126', 'M8_freq', 'V46', 'id_17', 'V155', 'V262', 'id_01', 'V53', 'V273', 'uid4_D15_mean', 'M4', 'V17', 'id_22', 'V286', 'V200', 'V246', 'id_17_freq', 'C10', 'M8', 'id_31_freq', 'V161', 'card5', 'V218', 'V26', 'V162', 'V338', 'V307', 'R_emaildomain', 'V5', 'decimal_value_freq', 'uid3_D15_mean', 'uid5_C13_mean', 'id_16_freq', 'V254', 'is_holiday_freq', 'V183', 'P_emaildomain_2', 'V76', 'P_emaildomain', 'V188', 
'V195', 'V124', 'V75', 'dayofmonth', 'V95', 'D6_day_min_max', 'V35', 'V308', 'uid2_unique_amt', 'id_30', 'V336', 'V193', 'R_emaildomain_2_freq', 'V2', 'V127', 'uid4_freq', 'V45', 'V33', 'M1', 'uid2_TransactionAmt_std', 'weekofmonth', 'V73', 'id_28_freq', 'C1', 'V99', 'DeviceInfoTop', 'V243', 'V215', 'V104', 'uid4_C13_mean', 'card4_freq', 'id_34', 'V222', 'dist2', 'V83', 'M9', 'V49', 'OSName_freq', 'M3', 'V169', 'id_30_freq', 'V235', 'V311', 'D7_day_min_max', 'V205', 'V103', 'V209', 'V203', 'id_36', 'V149', 'D11_day_min_max', 'V290', 'V166', 'M9_freq', 'V236', 'V112', 'id_19_freq', 'V116', 'V31', 'id_14', 'V291', 'uid5_TransactionAmt_mean', 'id_38_freq', 'C13', 'V56', 'IsMobile_freq', 'V330', 'V85', 'V128', 'V152', 'id_36_freq', 'V11', 'V22', 'uid1_TransactionAmt_mean', 'V304', 'M5_freq', 'V25', 'V190', 'card4', 'V54', 'V109', 'P_emaildomain_1', 'id_13', 'Browser_freq', 'V43', 'V202', 'is_holiday', 'V174', 'V191', 'V219', 'V277', 'id_20', 'V23', 'id_23_freq', 'V142', 'id_09', 'D4_day_min_max', 'V157', 'V146', 'V275', 'Browser', 'V135', 'id_31', 'V263', 'V9', 'P_emaildomain_freq', 'V221', 'uid1_D15_mean', 'C5', 'V288', 'C3', 'V37', 'V187', 'D3_day_min_max', 'V165', 'V296', 'V72', 'V299', 'card3_freq', 'V147', 'D2_day_min_max', 'V272', 'V171', 'V52', 'V38', 'uid2_freq', 'V138', 'V238', 'ProductCD_freq', 'V110', 'D8_day_min_max', 'card6_freq', 'id_15', 'V264', 'C9', 'V322', 'uid1_freq', 'uid2_C13_mean', 'id_13_freq', 'V283', 'V237', 'V159', 'V227', 'V19', 'V62', 'V130', 'id_06', 'V207', 'R_emaildomain_3', 'uid4_D15_std', 'V47', 'V158', 'V282', 'V180', 'V102', 'V154', 'V206', 'V339', 'uid5_unique_amt', 'V229', 'IsMobile', 'uid4_unique_amt', 'V136', 'V100', 'uid1_C13_std', 'V177', 'V66', 'uid1_unique_amt', 'V201', 'V105', 'TransactionAmt', 'V214', 'id_07', 'V257', 'V332', 'V319', 'M4_freq', 'V114', 'C14', 'V134', 'V249', 'id_14_freq', 'V8', 'V51', 'D13_day_min_max', 'card6', 'V315', 'hour_freq', 'V245', 'V61', 'V268', 'V239', 'DeviceInfoMajor_freq', 'DeviceInfoTop_freq', 
'V42', 'V148', 'V297', 'V192', 'dayofweek', 'V247', 'V204', 'id_24_freq', 'C11', 'V196', 'uid5_freq', 'V223', 'uid1_C13_mean', 'card2_freq', 'V78', 'V64', 'V295', 'uid1_D15_std', 'V172', 'V333', 'uid4_TransactionAmt_mean', 'V194', 'id_16', 'V280', 'V232', 'uid3_C13_std', 'V189', 'V324', 'V176', 'V321', 'D10_day_min_max', 'V70', 'V306', 'ProductCD', 'card5_freq', 'V60', 'V164', 'V151', 'V44', 'V337', 'V326', 'V13', 'V331', 'V317', 'V30', 'V278', 'OSName', 'id_34_freq', 'id_03', 'card1_freq', 'V7', 'V121', 'uid3_TransactionAmt_mean', 'V71', 'V226', 'M2', 'V302', 'uid3_unique_amt', 'card3', 'V217', 'V150', 'V96', 'V40', 'C2', 'weekofmonth_freq', 'V184', 'C4', 'M5', 'V145', 'V309', 'V173', 'uid5_D15_mean', 'V137', 'V210', 'decimal_value', 'uid5_C13_std', 'V182', 'V87', 'D1_day_min_max', 'V125', 'V252', 'V287', 'V160', 'D15_day_min_max', 'V211', 'V310', 'uid2_C13_std', 'id_26_freq', 'M7_freq', 'V163', 'V312', 'uid1_TransactionAmt_std', 'V131', 'V329', 'R_emaildomain_freq', 'uid2_TransactionAmt_mean', 'V285', 'V274', 'V129', 'V230', 'V74', 'uid4_C13_std', 'V279', 'V3', 'V59', 'dist1', 'uid3_C13_mean', 'V82', 'uid3_TransactionAmt_std', 'V233', 'id_33', 'V144', 'M2_freq', 'V197', 'id_18_freq', 'V153', 'V6', 'V314', 'V97', 'V318', 'V179', 'id_04', 'id_21_freq', 'D5_day_min_max', 'DeviceType', 'V93']

In [None]:
len(g)