## modules

In [1]:
import logging
import os
import random
import sys
import shutil
import multiprocessing as mp
import lightgbm as lgb
from functools import reduce

import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
#import yaml
from attrdict import AttrDict
from steppy.base import BaseTransformer
from steppy.utils import get_logger

## utils

In [2]:
RANDOM_SEED = 0
def create_submission(meta, predictions):
    submission = pd.DataFrame({'SK_ID_CURR': meta['SK_ID_CURR'].tolist(),
                               'TARGET': predictions
                               })
    return submission


def verify_submission(submission, sample_submission):
    assert submission.shape == sample_submission.shape, \
        'Expected submission to have shape {} but got {}'.format(sample_submission.shape, submission.shape)

    for submission_id, correct_id in zip(submission['SK_ID_CURR'].values, sample_submission['SK_ID_CURR'].values):
        assert correct_id == submission_id, \
            'Wrong id: expected {} but got {}'.format(correct_id, submission_id)


def get_logger():
    return logging.getLogger('home-credit')


def init_logger():
    logger = logging.getLogger('home-credit')
    logger.setLevel(logging.INFO)
    message_format = logging.Formatter(fmt='%(asctime)s %(name)s >>> %(message)s',
                                       datefmt='%Y-%m-%d %H-%M-%S')

    # console handler for validation info
    ch_va = logging.StreamHandler(sys.stdout)
    ch_va.setLevel(logging.INFO)

    ch_va.setFormatter(fmt=message_format)

    # add the handlers to the logger
    logger.addHandler(ch_va)

    return logger

def parameter_eval(param):
    try:
        return eval(param)
    except Exception:
        return param


def persist_evaluation_predictions(experiment_directory, y_pred, raw_data, id_column, target_column):
    raw_data.loc[:, 'y_pred'] = y_pred.reshape(-1)
    predictions_df = raw_data.loc[:, [id_column, target_column, 'y_pred']]
    filepath = os.path.join(experiment_directory, 'evaluation_predictions.csv')
    logging.info('evaluation predictions csv shape: {}'.format(predictions_df.shape))
    predictions_df.to_csv(filepath, index=None)


def set_seed(seed=RANDOM_SEED):
    random.seed(seed)
    np.random.seed(seed)


def calculate_rank(predictions):
    rank = (1 + predictions.rank().values) / (predictions.shape[0] + 1)
    return rank


def chunk_groups(groupby_object, chunk_size):
    n_groups = groupby_object.ngroups
    group_chunk, index_chunk = [], []
    for i, (index, df) in enumerate(groupby_object):
        group_chunk.append(df)
        index_chunk.append(index)

        if (i + 1) % chunk_size == 0 or i + 1 == n_groups:
            group_chunk_, index_chunk_ = group_chunk.copy(), index_chunk.copy()
            group_chunk, index_chunk = [], []
            yield index_chunk_, group_chunk_


def parallel_apply(groups, func, index_name='Index', num_workers=1, chunk_size=100000):
    n_chunks = np.ceil(1.0 * groups.ngroups / chunk_size)
    indeces, features = [], []
    for index_chunk, groups_chunk in tqdm(chunk_groups(groups, chunk_size), total=n_chunks):
        with mp.pool.Pool(num_workers) as executor:
            features_chunk = executor.map(func, groups_chunk)
        features.extend(features_chunk)
        indeces.extend(index_chunk)

    features = pd.DataFrame(features)
    features.index = indeces
    features.index.name = index_name
    return features


def read_oof_predictions(prediction_dir, train_filepath, id_column, target_column):
    labels = pd.read_csv(train_filepath, usecols=[id_column, target_column])

    filepaths_train, filepaths_test = [], []
    for filepath in sorted(glob.glob('{}/*'.format(prediction_dir))):
        if filepath.endswith('_oof_train.csv'):
            filepaths_train.append(filepath)
        elif filepath.endswith('_oof_test.csv'):
            filepaths_test.append(filepath)

    train_dfs = []
    for filepath in filepaths_train:
        train_dfs.append(pd.read_csv(filepath))
    train_dfs = reduce(lambda df1, df2: pd.merge(df1, df2, on=[id_column, 'fold_id']), train_dfs)
    train_dfs.columns = _clean_columns(train_dfs, keep_colnames=[id_column, 'fold_id'])
    train_dfs = pd.merge(train_dfs, labels, on=[id_column])

    test_dfs = []
    for filepath in filepaths_test:
        test_dfs.append(pd.read_csv(filepath))
    test_dfs = reduce(lambda df1, df2: pd.merge(df1, df2, on=[id_column, 'fold_id']), test_dfs)
    test_dfs.columns = _clean_columns(test_dfs, keep_colnames=[id_column, 'fold_id'])
    return train_dfs, test_dfs


def _clean_columns(df, keep_colnames):
    new_colnames = keep_colnames
    feature_colnames = df.drop(keep_colnames, axis=1).columns
    for i, colname in enumerate(feature_colnames):
        new_colnames.append('model_{}'.format(i))
    return new_colnames

def safe_div(a, b):
    try:
        return float(a) / float(b)
    except:
        return 0.0

## Config

In [3]:
params = {
"experiment_directory" :"../steppy/",
"sample_submission_filepath":"../input/sample_submission.csv",

# Kaggle
"kaggle_api": 0,
"kaggle_message": 'solution-0815',
  
# Execution
"clean_experiment_directory_before_training": 1,
"num_workers": 10,
"verbose": 10,

# Preprocessing
"fill_missing": True,
"fill_value": 0,

# Feature Extraction
"installments__last_k_trend_periods": [10, 50, 100, 500],
"installments__last_k_agg_periods": [1, 5, 10, 20, 50, 100],
"installments__last_k_agg_period_fractions": [(5,20),(5,50),(10,50),(10,100),(20,100)],
"pos_cash__last_k_trend_periods": [6, 12],
"pos_cash__last_k_agg_periods": [6, 12, 30],
"application_aggregation__use_diffs_only": True,
"use_nan_count": True,
    
# Light GBM
"lgbm_random_search_runs": 0,
"lgbm__device": "cpu",
"lgbm__boosting_type": "gbdt",
"lgbm__objective": "binary",
"lgbm__metric": "auc",
"lgbm__number_boosting_rounds": 5000,
"lgbm__early_stopping_rounds": 100,
"lgbm__learning_rate": 0.02,
"lgbm__max_bin": 300,
"lgbm__max_depth": -1,
"lgbm__num_leaves": 30,
"lgbm__min_child_samples": 70,
"lgbm__subsample": 1.0,
"lgbm__subsample_freq": 1,
"lgbm__colsample_bytree": 0.05,
"lgbm__min_gain_to_split": 0.5,
"lgbm__reg_lambda": 100,
"lgbm__reg_alpha": 0.0,
"lgbm__scale_pos_weight": 1,
"lgbm__is_unbalance": False,


# XGBoost
"xgb_random_search_runs": 0,
"xgb__booster": "gbtree",
"xgb__tree_method": "hist", # gpu_hist  # auto  hist
"xgb__objective": "binary:logistic",
"xgb__eval_metric": "auc",
"xgb__nrounds": 10000,
"xgb__early_stopping_rounds": 100,
"xgb__eta": 0.001,
"xgb__max_leaves": 40,
"xgb__max_depth": 16,
"xgb__max_bin": 255,
"xgb__subsample": 0.5,
"xgb__colsample_bytree": 0.5,
"xgb__colsample_bylevel": 1,
"xgb__min_child_weight": 4,
"xgb__lambda": 0.001,
"xgb__alpha": 0.001,
"xgb__scale_pos_weight": 1,

# Random forest
"rf_random_search_runs": 0,
"rf__n_estimators": 500,
"rf__criterion": "gini",
"rf__max_features": 0.2,
"rf__min_samples_split": 10,
"rf__min_samples_leaf": 5,
"rf__class_weight": 1,

# Logistic regression
"lr_random_search_runs": 0,
"lr__penalty": "l1",
"lr__tol": 0.00001,
"lr__C": 1,
"lr__fit_intercept": 1,
"lr__class_weight": 1,
"lr__solver": "liblinear",
"lr__max_iter": 10000,

# SVC
"svc_random_search_runs": 0,
"svc__kernel": "rbf",
"svc__C": 1,
"svc__degree": 5,
"svc__gamma": "auto",
"svc__coef0": 0.0,
"svc__probability": True,
"svc__tol": 0.00001,
"svc__max_iter": -1,

# Random search
'random_search': {'light_gbm': {'n_runs':0}},

# Postprocessing
"aggregation_method": "rank_mean"
}

In [4]:
RANDOM_SEED = 90211 # 90210
DEV_SAMPLE_SIZE = 1000

ID_COLUMNS = ['SK_ID_CURR']
TARGET_COLUMNS = ['TARGET']

CATEGORICAL_COLUMNS = ['CODE_GENDER',
                       'EMERGENCYSTATE_MODE',
                       'FLAG_CONT_MOBILE',
                       'FLAG_DOCUMENT_3',
                       'FLAG_DOCUMENT_4',
                       'FLAG_DOCUMENT_5',
                       'FLAG_DOCUMENT_6',
                       'FLAG_DOCUMENT_7',
                       'FLAG_DOCUMENT_8',
                       'FLAG_DOCUMENT_9',
                       'FLAG_DOCUMENT_11',
                       'FLAG_DOCUMENT_18',
                       'FLAG_EMAIL',
                       'FLAG_EMP_PHONE',
                       'FLAG_MOBIL',
                       'FLAG_OWN_CAR',
                       'FLAG_OWN_REALTY',
                       'FLAG_PHONE',
                       'FLAG_WORK_PHONE',
                       'FONDKAPREMONT_MODE',
                       'HOUR_APPR_PROCESS_START',
                       'HOUSETYPE_MODE',
                       'LIVE_CITY_NOT_WORK_CITY',
                       'LIVE_REGION_NOT_WORK_REGION',
                       'NAME_CONTRACT_TYPE',
                       'NAME_TYPE_SUITE',
                       'NAME_INCOME_TYPE',
                       'NAME_EDUCATION_TYPE',
                       'NAME_FAMILY_STATUS',
                       'NAME_HOUSING_TYPE',
                       'OCCUPATION_TYPE',
                       'ORGANIZATION_TYPE',
                       'REG_CITY_NOT_LIVE_CITY',
                       'REG_CITY_NOT_WORK_CITY',
                       'REG_REGION_NOT_LIVE_REGION',
                       'REG_REGION_NOT_WORK_REGION',
                       'WALLSMATERIAL_MODE',
                       'WEEKDAY_APPR_PROCESS_START']

NUMERICAL_COLUMNS = ['AMT_ANNUITY',
                     'AMT_CREDIT',
                     'AMT_INCOME_TOTAL',
                     'AMT_REQ_CREDIT_BUREAU_HOUR',
                     'AMT_REQ_CREDIT_BUREAU_DAY',
                     'AMT_REQ_CREDIT_BUREAU_WEEK',
                     'AMT_REQ_CREDIT_BUREAU_MON',
                     'AMT_REQ_CREDIT_BUREAU_QRT',
                     'AMT_REQ_CREDIT_BUREAU_YEAR',
                     'APARTMENTS_AVG',
                     'BASEMENTAREA_AVG',
                     'COMMONAREA_AVG',
                     'CNT_CHILDREN',
                     'CNT_FAM_MEMBERS',
                     'DAYS_BIRTH',
                     'DAYS_EMPLOYED',
                     'DAYS_ID_PUBLISH',
                     'DAYS_LAST_PHONE_CHANGE',
                     'DAYS_REGISTRATION',
                     'DEF_30_CNT_SOCIAL_CIRCLE',
                     'DEF_60_CNT_SOCIAL_CIRCLE',
                     'ELEVATORS_AVG',
                     'ENTRANCES_AVG',
                     'EXT_SOURCE_1',
                     'EXT_SOURCE_2',
                     'EXT_SOURCE_3',
                     'diff_EXT_SOURCE_1',
                     'diff_EXT_SOURCE_2',
                     'diff_EXT_SOURCE_3',
                     'FLOORSMAX_AVG',
                     'FLOORSMIN_AVG',
                     'LANDAREA_AVG',
                     'LIVINGAPARTMENTS_AVG',
                     'LIVINGAREA_AVG',
                     'NONLIVINGAPARTMENTS_AVG',
                     'NONLIVINGAREA_AVG',
                     'OBS_30_CNT_SOCIAL_CIRCLE',
                     'OWN_CAR_AGE',
                     'REGION_POPULATION_RELATIVE',
                     'REGION_RATING_CLIENT',
                     'TOTALAREA_MODE',
                     'YEARS_BEGINEXPLUATATION_AVG',
                     'YEARS_BUILD_AVG']

USELESS_COLUMNS = ['FLAG_DOCUMENT_10',
                   'FLAG_DOCUMENT_12',
                   'FLAG_DOCUMENT_13',
                   'FLAG_DOCUMENT_14',
                   'FLAG_DOCUMENT_15',
                   'FLAG_DOCUMENT_16',
                   'FLAG_DOCUMENT_17',
                   'FLAG_DOCUMENT_19',
                   'FLAG_DOCUMENT_2',
                   'FLAG_DOCUMENT_20',
                   'FLAG_DOCUMENT_21']

HIGHLY_CORRELATED_NUMERICAL_COLUMNS = ['AMT_GOODS_PRICE',
                                       'APARTMENTS_MEDI',
                                       'APARTMENTS_MODE',
                                       'BASEMENTAREA_MEDI',
                                       'BASEMENTAREA_MODE',
                                       'COMMONAREA_MEDI',
                                       'COMMONAREA_MODE',
                                       'ELEVATORS_MEDI',
                                       'ELEVATORS_MODE',
                                       'ENTRANCES_MEDI',
                                       'ENTRANCES_MODE',
                                       'FLAG_EMP_PHONE',
                                       'FLOORSMAX_MEDI',
                                       'FLOORSMAX_MODE',
                                       'FLOORSMIN_MEDI',
                                       'FLOORSMIN_MODE',
                                       'LANDAREA_MEDI',
                                       'LANDAREA_MODE',
                                       'LIVINGAPARTMENTS_MEDI',
                                       'LIVINGAPARTMENTS_MODE',
                                       'LIVINGAREA_MEDI',
                                       'LIVINGAREA_MODE',
                                       'NONLIVINGAPARTMENTS_MEDI',
                                       'NONLIVINGAPARTMENTS_MODE',
                                       'NONLIVINGAREA_MEDI',
                                       'NONLIVINGAREA_MODE',
                                       'OBS_60_CNT_SOCIAL_CIRCLE',
                                       'REGION_RATING_CLIENT_W_CITY',
                                       'YEARS_BEGINEXPLUATATION_MEDI',
                                       'YEARS_BEGINEXPLUATATION_MODE',
                                       'YEARS_BUILD_MEDI',
                                       'YEARS_BUILD_MODE']

cols_to_agg = ['AMT_CREDIT', 
               'AMT_ANNUITY',
               'AMT_INCOME_TOTAL',
               'AMT_GOODS_PRICE', 
               'EXT_SOURCE_1',
               'EXT_SOURCE_2',
               'EXT_SOURCE_3',
               'diff_EXT_SOURCE_1',
               'diff_EXT_SOURCE_2',
               'diff_EXT_SOURCE_3',
               'OWN_CAR_AGE',
               'REGION_POPULATION_RELATIVE',
               'DAYS_REGISTRATION',
               'CNT_CHILDREN',
               'CNT_FAM_MEMBERS',
               'DAYS_ID_PUBLISH',
               'DAYS_BIRTH',
               'DAYS_EMPLOYED'
]
aggs = ['min', 'mean', 'max', 'sum', 'var']
aggregation_pairs = [(col, agg) for col in cols_to_agg for agg in aggs]

APPLICATION_AGGREGATION_RECIPIES = [
    (['NAME_EDUCATION_TYPE', 'CODE_GENDER'], aggregation_pairs),
    (['NAME_FAMILY_STATUS', 'NAME_EDUCATION_TYPE'], aggregation_pairs),
    (['NAME_FAMILY_STATUS', 'CODE_GENDER'], aggregation_pairs),
    (['CODE_GENDER', 'ORGANIZATION_TYPE'], [('AMT_ANNUITY', 'mean'),
                                            ('AMT_INCOME_TOTAL', 'mean'),
                                            ('DAYS_REGISTRATION', 'mean'),
                                            ('diff_EXT_SOURCE_1', 'mean'),
                                            ('EXT_SOURCE_1', 'mean')]),
    (['CODE_GENDER', 'REG_CITY_NOT_WORK_CITY'], [('AMT_ANNUITY', 'mean'),
                                                 ('CNT_CHILDREN', 'mean'),
                                                 ('DAYS_ID_PUBLISH', 'mean')]),
    (['CODE_GENDER', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('EXT_SOURCE_1', 'mean'),
                                                                                           ('diff_EXT_SOURCE_1', 'mean'),
                                                                                           ('diff_EXT_SOURCE_2', 'mean'),
                                                                                           ('EXT_SOURCE_2', 'mean')]),
    (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE'], [('AMT_CREDIT', 'mean'),
                                                  ('AMT_REQ_CREDIT_BUREAU_YEAR', 'mean'),
                                                  ('APARTMENTS_AVG', 'mean'),
                                                  ('BASEMENTAREA_AVG', 'mean'),
                                                  ('EXT_SOURCE_1', 'mean'),
                                                  ('EXT_SOURCE_2', 'mean'),
                                                  ('EXT_SOURCE_3', 'mean'),
                                                  ('diff_EXT_SOURCE_1', 'mean'),
                                                  ('diff_EXT_SOURCE_2', 'mean'),
                                                  ('diff_EXT_SOURCE_3', 'mean'),
                                                  ('NONLIVINGAREA_AVG', 'mean'),
                                                  ('OWN_CAR_AGE', 'mean'),
                                                  ('YEARS_BUILD_AVG', 'mean')]),
    (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('ELEVATORS_AVG', 'mean'),
                                                                            ('EXT_SOURCE_1', 'mean')]),
    (['OCCUPATION_TYPE'], [('AMT_ANNUITY', 'mean'),
                           ('CNT_CHILDREN', 'mean'),
                           ('CNT_FAM_MEMBERS', 'mean'),
                           ('DAYS_BIRTH', 'mean'),
                           ('DAYS_EMPLOYED', 'mean'),
                           ('DAYS_ID_PUBLISH', 'mean'),
                           ('DAYS_REGISTRATION', 'mean'),
                           ('diff_EXT_SOURCE_1', 'mean'),
                           ('diff_EXT_SOURCE_2', 'mean'),
                           ('diff_EXT_SOURCE_3', 'mean'),
                           ('EXT_SOURCE_1', 'mean'),
                           ('EXT_SOURCE_2', 'mean'),
                           ('EXT_SOURCE_3', 'mean')]),
]

BUREAU_AGGREGATION_RECIPIES = [('CREDIT_TYPE', 'count'),
                               ('CREDIT_ACTIVE', 'size')
                               ]
for agg in ['mean', 'min', 'max', 'sum', 'var']:
    for select in ['AMT_ANNUITY',
                   'AMT_CREDIT_SUM',
                   'AMT_CREDIT_SUM_DEBT',
                   'AMT_CREDIT_SUM_LIMIT',
                   'AMT_CREDIT_SUM_OVERDUE',
                   'AMT_CREDIT_MAX_OVERDUE',
                   'CNT_CREDIT_PROLONG',
                   'CREDIT_DAY_OVERDUE',
                   'DAYS_CREDIT',
                   'DAYS_CREDIT_ENDDATE',
                   'DAYS_CREDIT_UPDATE'
                   ]:
        BUREAU_AGGREGATION_RECIPIES.append((select, agg))
BUREAU_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], BUREAU_AGGREGATION_RECIPIES)]

CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES = []
for agg in ['mean', 'min', 'max', 'sum', 'var']:
    for select in ['AMT_BALANCE',
                   'AMT_CREDIT_LIMIT_ACTUAL',
                   'AMT_DRAWINGS_ATM_CURRENT',
                   'AMT_DRAWINGS_CURRENT',
                   'AMT_DRAWINGS_OTHER_CURRENT',
                   'AMT_DRAWINGS_POS_CURRENT',
                   'AMT_PAYMENT_CURRENT',
                   'CNT_DRAWINGS_ATM_CURRENT',
                   'CNT_DRAWINGS_CURRENT',
                   'CNT_DRAWINGS_OTHER_CURRENT',
                   'CNT_INSTALMENT_MATURE_CUM',
                   'MONTHS_BALANCE',
                   'SK_DPD',
                   'SK_DPD_DEF'
                   ]:
        CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES.append((select, agg))
CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES)]

INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = []
for agg in ['mean', 'min', 'max', 'sum', 'var']:
    for select in ['AMT_INSTALMENT',
                   'AMT_PAYMENT',
                   'DAYS_ENTRY_PAYMENT',
                   'DAYS_INSTALMENT',
                   'NUM_INSTALMENT_NUMBER',
                   'NUM_INSTALMENT_VERSION'
                   ]:
        INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES.append((select, agg))
INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES)]

POS_CASH_BALANCE_AGGREGATION_RECIPIES = []
for agg in ['mean', 'min', 'max', 'sum', 'var']:
    for select in ['MONTHS_BALANCE',
                   'SK_DPD',
                   'SK_DPD_DEF'
                   ]:
        POS_CASH_BALANCE_AGGREGATION_RECIPIES.append((select, agg))
POS_CASH_BALANCE_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], POS_CASH_BALANCE_AGGREGATION_RECIPIES)]

PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = []
for agg in ['mean', 'min', 'max', 'sum', 'var']:
    for select in ['AMT_ANNUITY',
                   'AMT_APPLICATION',
                   'AMT_CREDIT',
                   'AMT_DOWN_PAYMENT',
                   'AMT_GOODS_PRICE',
                   'CNT_PAYMENT',
                   'DAYS_DECISION',
                   'HOUR_APPR_PROCESS_START',
                   'RATE_DOWN_PAYMENT'
                   ]:
        PREVIOUS_APPLICATION_AGGREGATION_RECIPIES.append((select, agg))
PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], PREVIOUS_APPLICATION_AGGREGATION_RECIPIES)]

SOLUTION_CONFIG = AttrDict({
    'pipeline': {'experiment_directory': '../input/'
                 },

    'feature_joiner': {'use_nan_count': params["use_nan_count"]
                       },

    'preprocessing': {'impute_missing': {'fill_missing': params["fill_missing"],
                                         'fill_value': params["fill_value"]},
                      'categorical_encoder': {'categorical_columns': CATEGORICAL_COLUMNS
                                              },
                      },

    'applications': {'columns': {'categorical_columns': CATEGORICAL_COLUMNS,
                                 'numerical_columns': NUMERICAL_COLUMNS
                                 },
                     'aggregations': {'groupby_aggregations': APPLICATION_AGGREGATION_RECIPIES,
                                      'use_diffs_only': params["application_aggregation__use_diffs_only"]
                                      },
                     },

    'bureau': {'table_name': 'bureau',
               'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'),
               'groupby_aggregations': BUREAU_AGGREGATION_RECIPIES,
               'num_workers': params["num_workers"],
               },

    'credit_card_balance': {'table_name': 'credit_card_balance',
                            'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'),
                            'groupby_aggregations': CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES,
                            'num_workers': params["num_workers"]
                            },

    'installments_payments': {'table_name': 'installments_payments',
                              'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'),
                              'groupby_aggregations': INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES,
                              'last_k_agg_periods': parameter_eval(params["installments__last_k_agg_periods"]),
                              'last_k_agg_period_fractions': parameter_eval(
                                  params["installments__last_k_agg_period_fractions"]),
                              'last_k_trend_periods': parameter_eval(params["installments__last_k_trend_periods"]),
                              'num_workers': params["num_workers"]
                              },

    'pos_cash_balance': {'table_name': 'POS_CASH_balance',
                         'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'),
                         'groupby_aggregations': POS_CASH_BALANCE_AGGREGATION_RECIPIES,
                         'last_k_agg_periods': parameter_eval(params["pos_cash__last_k_agg_periods"]),
                         'last_k_trend_periods': parameter_eval(params["pos_cash__last_k_trend_periods"]),
                         'num_workers': params["num_workers"]
                         },

    'previous_applications': {'table_name': 'previous_application',
                              'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'),
                              'groupby_aggregations': PREVIOUS_APPLICATION_AGGREGATION_RECIPIES,
                              'numbers_of_applications': [1, 2, 3, 4, 5],
                              'num_workers': params["num_workers"]
                              },

    'light_gbm': {'device': parameter_eval(params["lgbm__device"]),
                  'boosting_type': parameter_eval(params["lgbm__boosting_type"]),
                  'objective': parameter_eval(params["lgbm__objective"]),
                  'metric': parameter_eval(params["lgbm__metric"]),
                  'is_unbalance': parameter_eval(params["lgbm__is_unbalance"]),
                  'scale_pos_weight': parameter_eval(params["lgbm__scale_pos_weight"]),
                  'learning_rate': parameter_eval(params["lgbm__learning_rate"]),
                  'max_bin': parameter_eval(params["lgbm__max_bin"]),
                  'max_depth': parameter_eval(params["lgbm__max_depth"]),
                  'num_leaves': parameter_eval(params["lgbm__num_leaves"]),
                  'min_child_samples': parameter_eval(params["lgbm__min_child_samples"]),
                  'subsample': parameter_eval(params["lgbm__subsample"]),
                  'colsample_bytree': parameter_eval(params["lgbm__colsample_bytree"]),
                  'subsample_freq': parameter_eval(params["lgbm__subsample_freq"]),
                  'min_gain_to_split': parameter_eval(params["lgbm__min_gain_to_split"]),
                  'reg_lambda': parameter_eval(params["lgbm__reg_lambda"]),
                  'reg_alpha': parameter_eval(params["lgbm__reg_alpha"]),
                  'nthread': parameter_eval(params["num_workers"]),
                  'number_boosting_rounds': parameter_eval(params["lgbm__number_boosting_rounds"]),
                  'early_stopping_rounds': parameter_eval(params["lgbm__early_stopping_rounds"]),
                  'verbose': parameter_eval(params["verbose"]),
                  },
    
    'xgboost': {'booster': parameter_eval(params["xgb__booster"]),
                'objective': parameter_eval(params["xgb__objective"]),
                'tree_method': parameter_eval(params["xgb__tree_method"]),
                'eval_metric': parameter_eval(params["xgb__eval_metric"]),
                'eta': parameter_eval(params["xgb__eta"]),
                'max_depth': parameter_eval(params["xgb__max_depth"]),
                'subsample': parameter_eval(params["xgb__subsample"]),
                'colsample_bytree': parameter_eval(params["xgb__colsample_bytree"]),
                'colsample_bylevel': parameter_eval(params["xgb__colsample_bylevel"]),
                'min_child_weight': parameter_eval(params["xgb__min_child_weight"]),
                'lambda': parameter_eval(params["xgb__lambda"]),
                'alpha': parameter_eval(params["xgb__alpha"]),
                'max_bin': parameter_eval(params["xgb__max_bin"]),
                'num_leaves': parameter_eval(params["xgb__max_leaves"]),
                'nthread': parameter_eval(params["num_workers"]),
                'nrounds': parameter_eval(params["xgb__nrounds"]),
                'early_stopping_rounds': parameter_eval(params["xgb__early_stopping_rounds"]),
                'verbose': parameter_eval(params["verbose"]),
                },
    
    'logistic_regression': {'penalty': parameter_eval(params["lr__penalty"]),
                            'tol': parameter_eval(params["lr__tol"]),
                            'C': parameter_eval(params["lr__C"]),
                            'fit_intercept': parameter_eval(params["lr__fit_intercept"]),
                            'class_weight': parameter_eval(params["lr__class_weight"]),
                            'random_state': RANDOM_SEED,
                            'solver': parameter_eval(params["lr__solver"]),
                            'max_iter': parameter_eval(params["lr__max_iter"]),
                            'verbose': parameter_eval(params["verbose"]),
                            'n_jobs': parameter_eval(params["num_workers"]),
                            },

    'svc': {'kernel': parameter_eval(params["svc__kernel"]),
            'C': parameter_eval(params["svc__C"]),
            'degree': parameter_eval(params["svc__degree"]),
            'gamma': parameter_eval(params["svc__gamma"]),
            'coef0': parameter_eval(params["svc__coef0"]),
            'probability': parameter_eval(params["svc__probability"]),
            'tol': parameter_eval(params["svc__tol"]),
            'max_iter': parameter_eval(params["svc__max_iter"]),
            'verbose': parameter_eval(params["verbose"]),
            'random_state': RANDOM_SEED,
            },
    
    'random_search': {'light_gbm': {'n_runs': params["lgbm_random_search_runs"],
                                    'callbacks':
                                        {'neptune_monitor': {'name': 'light_gbm'},
                                         'persist_results': {'filepath': os.path.join(params["experiment_directory"],
                                                                                      'random_search_light_gbm.pkl')}
                                         },
                                    },
                      
                      'xgboost': {'n_runs': params["xgb_random_search_runs"],
                                  'callbacks':
                                      {'neptune_monitor': {'name': 'xgboost'},
                                       'persist_results': {'filepath': os.path.join(params["experiment_directory"],
                                                                                    'random_search_xgboost.pkl')}
                                       },
                                  },
                      
                      'logistic_regression': {'n_runs': params["lr_random_search_runs"],
                                              'callbacks':
                                                  {'neptune_monitor': {'name': 'logistic_regression'},
                                                   'persist_results':
                                                       {'filepath': os.path.join(params["experiment_directory"],
                                                                                 'random_search_logistic_regression.pkl')}
                                                   },
                                              },
                      
                      'svc': {'n_runs': params["svc_random_search_runs"],
                              'callbacks': 
                                  {'neptune_monitor': {'name': 'svc'},
                                   'persist_results': {'filepath': os.path.join(params["experiment_directory"],
                                                                                         'random_search_svc.pkl')}
                                      },
                             }                              
                     }
})


## Setting lightgbm

In [5]:
class LightGBM(BaseTransformer):
    def __init__(self, name=None, **params):
        super().__init__()
        logger.info('initializing LightGBM...')
        self.params = params
        self.training_params = ['number_boosting_rounds', 'early_stopping_rounds']
        self.evaluation_function = None
        #self.callbacks = callbacks(channel_prefix=name)

    @property
    def model_config(self):
        return AttrDict({param: value for param, value in self.params.items()
                         if param not in self.training_params})

    @property
    def training_config(self):
        return AttrDict({param: value for param, value in self.params.items()
                         if param in self.training_params})

    def fit(self,
            X,
            y,
            X_valid,
            y_valid,
            feature_names='auto',
            categorical_features='auto',
            **kwargs):
        evaluation_results = {}

        self._check_target_shape_and_type(y, 'y')
        self._check_target_shape_and_type(y_valid, 'y_valid')
        y = self._format_target(y)
        y_valid = self._format_target(y_valid)
        
        # ここらへん変
        logger.info('LightGBM, train data shape        {}'.format(X.shape)) #なぜか800
        logger.info('LightGBM, validation data shape   {}'.format(X_valid.shape))
        logger.info('LightGBM, train labels shape      {}'.format(y.shape))
        logger.info('LightGBM, validation labels shape {}'.format(y_valid.shape))

        data_train = lgb.Dataset(data=X,
                                 label=y,
                                 feature_name=feature_names,
                                 categorical_feature=categorical_features,
                                 **kwargs)
        data_valid = lgb.Dataset(X_valid,
                                 label=y_valid,
                                 feature_name=feature_names,
                                 categorical_feature=categorical_features,
                                 **kwargs)

        self.estimator = lgb.train(self.model_config,
                                   data_train,
                                   feature_name=feature_names,
                                   categorical_feature=categorical_features,
                                   valid_sets=[data_train, data_valid],
                                   valid_names=['data_train', 'data_valid'],
                                   evals_result=evaluation_results,
                                   num_boost_round=self.training_config.number_boosting_rounds,
                                   early_stopping_rounds=self.training_config.early_stopping_rounds,
                                   verbose_eval=self.model_config.verbose,
                                   feval=self.evaluation_function,
                                   #callbacks=self.callbacks,
                                   **kwargs)
        return self

    def transform(self, X, **kwargs):
        prediction = self.estimator.predict(X)
        return {'prediction': prediction}

    def load(self, filepath):
        self.estimator = joblib.load(filepath)
        return self

    def persist(self, filepath):
        joblib.dump(self.estimator, filepath)

    def _check_target_shape_and_type(self, target, name):
        if not any([isinstance(target, obj_type) for obj_type in [pd.Series, np.ndarray, list]]):
            raise TypeError(
                '"target" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(type(target)))
        try:
            assert len(target.shape) == 1, '"{}" must be 1-D. It is {}-D instead.'.format(name,
                                                                                          len(target.shape))
        except AttributeError:
            print('Cannot determine shape of the {}. '
                  'Type must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead'.format(name,
                                                                                                     type(target)))

    def _format_target(self, target):

        if isinstance(target, pd.Series):
            return target.values
        elif isinstance(target, np.ndarray):
            return target
        elif isinstance(target, list):
            return np.array(target)
        else:
            raise TypeError(
                '"target" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(type(target)))


## set xgboost

In [6]:
import xgboost as xgb

class XGBoost(BaseTransformer):
    def __init__(self, **params):
        super().__init__()
        logger.info('initializing XGBoost...')
        self.params = params
        self.training_params = ['nrounds', 'early_stopping_rounds']
        self.evaluation_function = None

    @property
    def model_config(self):
        return AttrDict({param: value for param, value in self.params.items()
                         if param not in self.training_params})

    @property
    def training_config(self):
        return AttrDict({param: value for param, value in self.params.items()
                         if param in self.training_params})

    def fit(self,
            X, y,
            X_valid, y_valid,
            feature_names=None,
            feature_types=None,
            **kwargs):
        train = xgb.DMatrix(X,
                            label=y,
                            feature_names=feature_names,
                            feature_types=feature_types)
        valid = xgb.DMatrix(X_valid,
                            label=y_valid,
                            feature_names=feature_names,
                            feature_types=feature_types)

        evaluation_results = {}
        self.estimator = xgb.train(params=self.model_config,
                                   dtrain=train,
                                   evals=[(train, 'train'), (valid, 'valid')],
                                   evals_result=evaluation_results,
                                   num_boost_round=self.training_config.nrounds,
                                   early_stopping_rounds=self.training_config.early_stopping_rounds,
                                   verbose_eval=self.model_config.verbose,
                                   feval=self.evaluation_function)
        return self

    def transform(self, X, y=None, feature_names=None, feature_types=None, **kwargs):
        X_DMatrix = xgb.DMatrix(X,
                                label=y,
                                feature_names=feature_names,
                                feature_types=feature_types)
        prediction = self.estimator.predict(X_DMatrix)
        return {'prediction': prediction}

    def load(self, filepath):
        self.estimator = xgb.Booster(params=self.model_config)
        self.estimator.load_model(filepath)
        return self

    def persist(self, filepath):
        self.estimator.save_model(filepath)


## set log_reg and svc

In [7]:
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from toolkit.sklearn_transformers.models import SklearnClassifier

def get_sklearn_classifier(ClassifierClass, normalize, **kwargs):
    class SklearnBinaryClassifier(SklearnClassifier):
        def transform(self, X, y=None, target=1, **kwargs):
            prediction = self.estimator.predict_proba(X)[:, target]
            return {SklearnClassifier.RESULT_KEY: prediction}

    if normalize:
        return SklearnBinaryClassifier(Pipeline([('standarizer', StandardScaler()),
                                                 ('classifier', ClassifierClass(**kwargs))]))

    return SklearnBinaryClassifier(ClassifierClass(**kwargs))

## data_cleaning

In [8]:
import numpy as np
from steppy.base import BaseTransformer
from steppy.utils import get_logger

logger = get_logger()


class ApplicationCleaning(BaseTransformer):
    def __init__(self, **kwargs):
        super().__init__()

    def transform(self, X):
        X['CODE_GENDER'].replace('XNA', np.nan, inplace=True)
        X['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
        X['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace=True)
        X['NAME_FAMILY_STATUS'].replace('Unknown', np.nan, inplace=True)
        X['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace=True)

        return {'X': X}


class BureauCleaning(BaseTransformer):
    def __init__(self, fill_missing=False, fill_value=0, **kwargs):
        self.fill_missing = fill_missing
        self.fill_value = fill_value

    def transform(self, bureau):
        bureau['DAYS_CREDIT_ENDDATE'][bureau['DAYS_CREDIT_ENDDATE'] < -40000] = np.nan
        bureau['DAYS_CREDIT_UPDATE'][bureau['DAYS_CREDIT_UPDATE'] < -40000] = np.nan
        bureau['DAYS_ENDDATE_FACT'][bureau['DAYS_ENDDATE_FACT'] < -40000] = np.nan

        if self.fill_missing:
            bureau['AMT_CREDIT_SUM'].fillna(self.fill_value, inplace=True)
            bureau['AMT_CREDIT_SUM_DEBT'].fillna(self.fill_value, inplace=True)
            bureau['AMT_CREDIT_SUM_OVERDUE'].fillna(self.fill_value, inplace=True)
            bureau['CNT_CREDIT_PROLONG'].fillna(self.fill_value, inplace=True)

        return {'bureau': bureau}


class CreditCardCleaning(BaseTransformer):
    def __init__(self, **kwargs):
        super().__init__()

    def transform(self, credit_card):
        credit_card['AMT_DRAWINGS_ATM_CURRENT'][credit_card['AMT_DRAWINGS_ATM_CURRENT'] < 0] = np.nan
        credit_card['AMT_DRAWINGS_CURRENT'][credit_card['AMT_DRAWINGS_CURRENT'] < 0] = np.nan

        return {'credit_card': credit_card}


class PreviousApplicationCleaning(BaseTransformer):
    def __init__(self, **kwargs):
        super().__init__()

    def transform(self, previous_application):
        previous_application['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
        previous_application['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
        previous_application['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
        previous_application['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
        previous_application['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)

        return {'previous_application': previous_application}


## data_extraction

In [9]:
from copy import deepcopy
from functools import partial

import category_encoders as ce
import numpy as np
import pandas as pd
from scipy.stats import kurtosis, iqr, skew
from sklearn.externals import joblib
from sklearn.linear_model import LinearRegression
from steppy.base import BaseTransformer
from steppy.utils import get_logger

logger = get_logger()


class FeatureJoiner(BaseTransformer):
    def __init__(self, use_nan_count=False, **kwargs):
        super().__init__()
        self.use_nan_count = use_nan_count

    def transform(self, numerical_feature_list, categorical_feature_list, **kwargs):
        features = numerical_feature_list + categorical_feature_list
        for feature in features:
            feature.reset_index(drop=True, inplace=True)
        features = pd.concat(features, axis=1).astype(np.float32)
        if self.use_nan_count:
            features['nan_count'] = features.isnull().sum(axis=1)

        outputs = dict()
        outputs['features'] = features
        outputs['feature_names'] = list(features.columns)
        outputs['categorical_features'] = self._get_feature_names(categorical_feature_list)
        return outputs

    def _get_feature_names(self, dataframes):
        feature_names = []
        for dataframe in dataframes:
            try:
                feature_names.extend(list(dataframe.columns))
            except Exception as e:
                print(e)
                feature_names.append(dataframe.name)

        return feature_names


class CategoricalEncoder(BaseTransformer):
    def __init__(self, **kwargs):
        super().__init__()
        self.categorical_columns = kwargs['categorical_columns']
        params = deepcopy(kwargs)
        params.pop('categorical_columns', None)
        self.params = params
        self.encoder_class = ce.OrdinalEncoder
        self.categorical_encoder = None

    def fit(self, X, y, **kwargs):
        X_ = X[self.categorical_columns]
        self.categorical_encoder = self.encoder_class(cols=self.categorical_columns, **self.params)
        self.categorical_encoder.fit(X_, y)
        return self

    def transform(self, X, **kwargs):
        X_ = X[self.categorical_columns]
        X_ = self.categorical_encoder.transform(X_)
        return {'categorical_features': X_}

    def load(self, filepath):
        self.categorical_encoder = joblib.load(filepath)
        return self

    def persist(self, filepath):
        joblib.dump(self.categorical_encoder, filepath)


class GroupbyAggregateDiffs(BaseTransformer):
    def __init__(self, groupby_aggregations, use_diffs_only=False, **kwargs):
        super().__init__()
        self.groupby_aggregations = groupby_aggregations
        self.use_diffs_only = use_diffs_only
        self.features = []
        self.groupby_feature_names = []

    @property
    def feature_names(self):
        if self.use_diffs_only:
            return self.diff_feature_names
        else:
            return self.groupby_feature_names + self.diff_feature_names

    def fit(self, main_table, **kwargs):
        for groupby_cols, specs in self.groupby_aggregations:
            group_object = main_table.groupby(groupby_cols)
            for select, agg in specs:
                groupby_aggregate_name = self._create_colname_from_specs(groupby_cols, select, agg)
                group_features = group_object[select].agg(agg).reset_index() \
                    .rename(index=str,
                            columns={select: groupby_aggregate_name})[groupby_cols + [groupby_aggregate_name]]

                self.features.append((groupby_cols, group_features))
                self.groupby_feature_names.append(groupby_aggregate_name)
        return self

    def transform(self, main_table, **kwargs):
        main_table = self._merge_grouby_features(main_table)
        main_table = self._add_diff_features(main_table)

        return {'numerical_features': main_table[self.feature_names].astype(np.float32)}

    def _merge_grouby_features(self, main_table):
        for groupby_cols, groupby_features in self.features:
            main_table = main_table.merge(groupby_features,
                                          on=groupby_cols,
                                          how='left')
        return main_table

    def _add_diff_features(self, main_table):
        self.diff_feature_names = []
        for groupby_cols, specs in self.groupby_aggregations:
            for select, agg in specs:
                if agg in ['mean', 'median', 'max', 'min']:
                    groupby_aggregate_name = self._create_colname_from_specs(groupby_cols, select, agg)
                    diff_feature_name = '{}_diff'.format(groupby_aggregate_name)
                    abs_diff_feature_name = '{}_abs_diff'.format(groupby_aggregate_name)

                    main_table[diff_feature_name] = main_table[select] - main_table[groupby_aggregate_name]
                    main_table[abs_diff_feature_name] = np.abs(main_table[select] - main_table[groupby_aggregate_name])

                    self.diff_feature_names.append(diff_feature_name)
                    self.diff_feature_names.append(abs_diff_feature_name)

        return main_table

    def load(self, filepath):
        params = joblib.load(filepath)
        self.features = params['features']
        self.groupby_feature_names = params['groupby_feature_names']
        return self

    def persist(self, filepath):
        params = {'features': self.features,
                  'groupby_feature_names': self.groupby_feature_names}
        joblib.dump(params, filepath)

    def _create_colname_from_specs(self, groupby_cols, agg, select):
        return '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)


class GroupbyAggregate(BaseTransformer):
    def __init__(self, table_name, id_columns, groupby_aggregations, **kwargs):
        super().__init__()
        self.table_name = table_name
        self.id_columns = id_columns
        self.groupby_aggregations = groupby_aggregations

    def fit(self, table, **kwargs):
        features = pd.DataFrame({self.id_columns[0]: table[self.id_columns[0]].unique()})

        for groupby_cols, specs in self.groupby_aggregations:
            group_object = table.groupby(groupby_cols)
            for select, agg in specs:
                groupby_aggregate_name = self._create_colname_from_specs(groupby_cols, select, agg)
                features = features.merge(group_object[select]
                                          .agg(agg)
                                          .reset_index()
                                          .rename(index=str,
                                                  columns={select: groupby_aggregate_name})
                                          [groupby_cols + [groupby_aggregate_name]],
                                          on=groupby_cols,
                                          how='left')
        self.features = features
        return self

    def transform(self, table, **kwargs):
        return {'features_table': self.features}

    def load(self, filepath):
        self.features = joblib.load(filepath)
        return self

    def persist(self, filepath):
        joblib.dump(self.features, filepath)

    def _create_colname_from_specs(self, groupby_cols, select, agg):
        return '{}_{}_{}_{}'.format(self.table_name, '_'.join(groupby_cols), agg, select)


class GroupbyMerge(BaseTransformer):
    def __init__(self, id_columns, **kwargs):
        super().__init__()
        self.id_columns = id_columns

    def _feature_names(self, features):
        feature_names = list(features.columns)
        feature_names.remove(self.id_columns[0])
        return feature_names

    def transform(self, table, features, **kwargs):
        table = table.merge(features,
                            left_on=[self.id_columns[0]],
                            right_on=[self.id_columns[1]],
                            how='left',
                            validate='one_to_one')

        return {'numerical_features': table[self._feature_names(features)].astype(np.float32)}


class BasicHandCraftedFeatures(BaseTransformer):
    def __init__(self, num_workers=1, **kwargs):
        self.num_workers = num_workers
        self.features = None

    @property
    def feature_names(self):
        feature_names = list(self.features.columns)
        feature_names.remove('SK_ID_CURR')
        return feature_names

    def transform(self, **kwargs):
        return {'features_table': self.features}

    def load(self, filepath):
        self.features = joblib.load(filepath)
        return self

    def persist(self, filepath):
        joblib.dump(self.features, filepath)


class ApplicationFeatures(BaseTransformer):
    def __init__(self, categorical_columns, numerical_columns):
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns
        self.engineered_numerical_columns = ['annuity_income_percentage',
                                             'car_to_birth_ratio',
                                             'car_to_employ_ratio',
                                             'children_ratio',
                                             'credit_to_annuity_ratio',
                                             'credit_to_goods_ratio',
                                             'credit_to_income_ratio',
                                             'days_employed_percentage',
                                             'income_credit_percentage',
                                             'income_per_child',
                                             'income_per_person',
                                             'payment_rate',
                                             'phone_to_birth_ratio',
                                             'phone_to_employ_ratio',
                                             'external_sources_weighted',
                                             'external_sources_min',
                                             'external_sources_max',
                                             'external_sources_sum',
                                             'external_sources_mean',
                                             'external_sources_nanmedian',
                                             'short_employment',
                                             'young_age',
                                             'cnt_non_child',
                                             'child_to_non_child_ratio',
                                             'income_per_non_child',
                                             'credit_per_person',
                                             'credit_per_child',
                                             'credit_per_non_child',
                                             ]

    def transform(self, X, **kwargs):
        X['annuity_income_percentage'] = X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL']
        X['car_to_birth_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_BIRTH']
        X['car_to_employ_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_EMPLOYED']
        X['children_ratio'] = X['CNT_CHILDREN'] / X['CNT_FAM_MEMBERS']
        X['credit_to_annuity_ratio'] = X['AMT_CREDIT'] / X['AMT_ANNUITY']
        X['credit_to_goods_ratio'] = X['AMT_CREDIT'] / X['AMT_GOODS_PRICE']
        X['credit_to_income_ratio'] = X['AMT_CREDIT'] / X['AMT_INCOME_TOTAL']
        X['days_employed_percentage'] = X['DAYS_EMPLOYED'] / X['DAYS_BIRTH']
        X['income_credit_percentage'] = X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT']
        X['income_per_child'] = X['AMT_INCOME_TOTAL'] / (1 + X['CNT_CHILDREN'])
        X['income_per_person'] = X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS']
        X['payment_rate'] = X['AMT_ANNUITY'] / X['AMT_CREDIT']
        X['phone_to_birth_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_BIRTH']
        X['phone_to_employ_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_EMPLOYED']
        X['external_sources_weighted'] = X.EXT_SOURCE_1 * 2 + X.EXT_SOURCE_2 * 3 + X.EXT_SOURCE_3 * 4
        X['cnt_non_child'] = X['CNT_FAM_MEMBERS'] - X['CNT_CHILDREN']
        X['child_to_non_child_ratio'] = X['CNT_CHILDREN'] / X['cnt_non_child']
        X['income_per_non_child'] = X['AMT_INCOME_TOTAL'] / X['cnt_non_child']
        X['credit_per_person'] = X['AMT_CREDIT'] / X['CNT_FAM_MEMBERS']
        X['credit_per_child'] = X['AMT_CREDIT'] / (1 + X['CNT_CHILDREN'])
        X['credit_per_non_child'] = X['AMT_CREDIT'] / X['cnt_non_child']
        for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
            X['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))(
                X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)

        for function_name in ['min', 'max', 'mean', 'nanmedian']:
            X['diff_external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))(
                X[['diff_EXT_SOURCE_1', 'diff_EXT_SOURCE_2', 'diff_EXT_SOURCE_3']], axis=1)

        X['short_employment'] = (X['DAYS_EMPLOYED'] < -2000).astype(int)
        X['young_age'] = (X['DAYS_BIRTH'] < -14000).astype(int)

        return {'numerical_features': X[self.engineered_numerical_columns + self.numerical_columns],
                'categorical_features': X[self.categorical_columns]
                }


class BureauFeatures(BasicHandCraftedFeatures):
    def fit(self, bureau, **kwargs):
        bureau['bureau_credit_active_binary'] = (bureau['CREDIT_ACTIVE'] != 'Closed').astype(int)
        bureau['bureau_credit_enddate_binary'] = (bureau['DAYS_CREDIT_ENDDATE'] > 0).astype(int)
        features = pd.DataFrame({'SK_ID_CURR': bureau['SK_ID_CURR'].unique()})

        groupby = bureau.groupby(by=['SK_ID_CURR'])

        g = groupby['DAYS_CREDIT'].agg('count').reset_index()
        g.rename(index=str, columns={'DAYS_CREDIT': 'bureau_number_of_past_loans'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = groupby['CREDIT_TYPE'].agg('nunique').reset_index()
        g.rename(index=str, columns={'CREDIT_TYPE': 'bureau_number_of_loan_types'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = groupby['bureau_credit_active_binary'].agg('mean').reset_index()
        g.rename(index=str, columns={'bureau_credit_active_binary': 'bureau_credit_active_binary'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = groupby['AMT_CREDIT_SUM_DEBT'].agg('sum').reset_index()
        g.rename(index=str, columns={'AMT_CREDIT_SUM_DEBT': 'bureau_total_customer_debt'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = groupby['AMT_CREDIT_SUM'].agg('sum').reset_index()
        g.rename(index=str, columns={'AMT_CREDIT_SUM': 'bureau_total_customer_credit'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = groupby['AMT_CREDIT_SUM_OVERDUE'].agg('sum').reset_index()
        g.rename(index=str, columns={'AMT_CREDIT_SUM_OVERDUE': 'bureau_total_customer_overdue'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = groupby['CNT_CREDIT_PROLONG'].agg('sum').reset_index()
        g.rename(index=str, columns={'CNT_CREDIT_PROLONG': 'bureau_average_creditdays_prolonged'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = groupby['bureau_credit_enddate_binary'].agg('mean').reset_index()
        g.rename(index=str, columns={'bureau_credit_enddate_binary': 'bureau_credit_enddate_percentage'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        features['bureau_average_of_past_loans_per_type'] = \
            features['bureau_number_of_past_loans'] / features['bureau_number_of_loan_types']

        features['bureau_debt_credit_ratio'] = \
            features['bureau_total_customer_debt'] / features['bureau_total_customer_credit']

        features['bureau_overdue_debt_ratio'] = \
            features['bureau_total_customer_overdue'] / features['bureau_total_customer_debt']

        self.features = features
        return self


class CreditCardBalanceFeatures(BasicHandCraftedFeatures):
    def fit(self, credit_card, **kwargs):
        static_features = self._static_features(credit_card, **kwargs)
        dynamic_features = self._dynamic_features(credit_card, **kwargs)

        self.features = pd.merge(static_features,
                                 dynamic_features,
                                 on=['SK_ID_CURR'],
                                 validate='one_to_one')
        return self

    def _static_features(self, credit_card, **kwargs):
        credit_card['number_of_installments'] = credit_card.groupby(
            by=['SK_ID_CURR', 'SK_ID_PREV'])['CNT_INSTALMENT_MATURE_CUM'].agg('max').reset_index()[
            'CNT_INSTALMENT_MATURE_CUM']

        credit_card['credit_card_max_loading_of_credit_limit'] = credit_card.groupby(
            by=['SK_ID_CURR', 'SK_ID_PREV', 'AMT_CREDIT_LIMIT_ACTUAL']).apply(
            lambda x: x.AMT_BALANCE.max() / x.AMT_CREDIT_LIMIT_ACTUAL.max()).reset_index()[0]

        features = pd.DataFrame({'SK_ID_CURR': credit_card['SK_ID_CURR'].unique()})

        groupby = credit_card.groupby(by=['SK_ID_CURR'])

        g = groupby['SK_ID_PREV'].agg('nunique').reset_index()
        g.rename(index=str, columns={'SK_ID_PREV': 'credit_card_number_of_loans'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = groupby['SK_DPD'].agg('mean').reset_index()
        g.rename(index=str, columns={'SK_DPD': 'credit_card_average_of_days_past_due'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = groupby['AMT_DRAWINGS_ATM_CURRENT'].agg('sum').reset_index()
        g.rename(index=str, columns={'AMT_DRAWINGS_ATM_CURRENT': 'credit_card_drawings_atm'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = groupby['AMT_DRAWINGS_CURRENT'].agg('sum').reset_index()
        g.rename(index=str, columns={'AMT_DRAWINGS_CURRENT': 'credit_card_drawings_total'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = groupby['number_of_installments'].agg('sum').reset_index()
        g.rename(index=str, columns={'number_of_installments': 'credit_card_total_installments'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = groupby['credit_card_max_loading_of_credit_limit'].agg('mean').reset_index()
        g.rename(index=str,
                 columns={'credit_card_max_loading_of_credit_limit': 'credit_card_avg_loading_of_credit_limit'},
                 inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        features['credit_card_cash_card_ratio'] = features['credit_card_drawings_atm'] / features[
            'credit_card_drawings_total']

        features['credit_card_installments_per_loan'] = (
            features['credit_card_total_installments'] / features['credit_card_number_of_loans'])

        return features

    def _dynamic_features(self, credit_card, **kwargs):
        features = pd.DataFrame({'SK_ID_CURR': credit_card['SK_ID_CURR'].unique()})

        credit_card_sorted = credit_card.sort_values(['SK_ID_CURR', 'MONTHS_BALANCE'])

        groupby = credit_card_sorted.groupby(by=['SK_ID_CURR'])
        credit_card_sorted['credit_card_monthly_diff'] = groupby['AMT_BALANCE'].diff()
        groupby = credit_card_sorted.groupby(by=['SK_ID_CURR'])

        g = groupby['credit_card_monthly_diff'].agg('mean').reset_index()
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        return features


class POSCASHBalanceFeatures(BasicHandCraftedFeatures):
    def __init__(self, last_k_agg_periods, last_k_trend_periods, num_workers=1, **kwargs):
        super().__init__(num_workers=num_workers)
        self.last_k_agg_periods = last_k_agg_periods
        self.last_k_trend_periods = last_k_trend_periods

        self.num_workers = num_workers
        self.features = None

    def fit(self, pos_cash, **kwargs):
        pos_cash['is_contract_status_completed'] = pos_cash['NAME_CONTRACT_STATUS'] == 'Completed'
        pos_cash['pos_cash_paid_late'] = (pos_cash['SK_DPD'] > 0).astype(int)
        pos_cash['pos_cash_paid_late_with_tolerance'] = (pos_cash['SK_DPD_DEF'] > 0).astype(int)

        features = pd.DataFrame({'SK_ID_CURR': pos_cash['SK_ID_CURR'].unique()})
        groupby = pos_cash.groupby(['SK_ID_CURR'])
        func = partial(POSCASHBalanceFeatures.generate_features,
                       agg_periods=self.last_k_agg_periods,
                       trend_periods=self.last_k_trend_periods)
        g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=self.num_workers).reset_index()
        features = features.merge(g, on='SK_ID_CURR', how='left')

        self.features = features
        return self

    @staticmethod
    def generate_features(gr, agg_periods, trend_periods):
        one_time = POSCASHBalanceFeatures.one_time_features(gr)
        all = POSCASHBalanceFeatures.all_installment_features(gr)
        agg = POSCASHBalanceFeatures.last_k_installment_features(gr, agg_periods)
        trend = POSCASHBalanceFeatures.trend_in_last_k_installment_features(gr, trend_periods)
        last = POSCASHBalanceFeatures.last_loan_features(gr)
        features = {**one_time, **all, **agg, **trend, **last}
        return pd.Series(features)

    @staticmethod
    def one_time_features(gr):
        gr_ = gr.copy()
        gr_.sort_values(['MONTHS_BALANCE'], inplace=True)
        features = {}

        features['pos_cash_remaining_installments'] = gr_['CNT_INSTALMENT_FUTURE'].tail(1)
        features['pos_cash_completed_contracts'] = gr_['is_contract_status_completed'].agg('sum')

        return features

    @staticmethod
    def all_installment_features(gr):
        return POSCASHBalanceFeatures.last_k_installment_features(gr, periods=[10e16])

    @staticmethod
    def last_k_installment_features(gr, periods):
        gr_ = gr.copy()
        gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)

        features = {}
        for period in periods:
            if period > 10e10:
                period_name = 'all_installment_'
                gr_period = gr_.copy()
            else:
                period_name = 'last_{}_'.format(period)
                gr_period = gr_.iloc[:period]

            features = add_features_in_group(features, gr_period, 'pos_cash_paid_late',
                                             ['count', 'mean'],
                                             period_name)
            features = add_features_in_group(features, gr_period, 'pos_cash_paid_late_with_tolerance',
                                             ['count', 'mean'],
                                             period_name)
            features = add_features_in_group(features, gr_period, 'SK_DPD',
                                             ['sum', 'mean', 'max', 'std', 'skew', 'kurt'],
                                             period_name)
            features = add_features_in_group(features, gr_period, 'SK_DPD_DEF',
                                             ['sum', 'mean', 'max', 'std', 'skew', 'kurt'],
                                             period_name)
        return features

    @staticmethod
    def trend_in_last_k_installment_features(gr, periods):
        gr_ = gr.copy()
        gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)

        features = {}
        for period in periods:
            gr_period = gr_.iloc[:period]

            features = add_trend_feature(features, gr_period,
                                         'SK_DPD', '{}_period_trend_'.format(period)
                                         )
            features = add_trend_feature(features, gr_period,
                                         'SK_DPD_DEF', '{}_period_trend_'.format(period)
                                         )
            features = add_trend_feature(features, gr_period,
                                         'CNT_INSTALMENT_FUTURE', '{}_period_trend_'.format(period)
                                         )
        return features

    @staticmethod
    def last_loan_features(gr):
        gr_ = gr.copy()
        gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)
        last_installment_id = gr_['SK_ID_PREV'].iloc[0]
        gr_ = gr_[gr_['SK_ID_PREV'] == last_installment_id]

        features={}
        features = add_features_in_group(features, gr_, 'pos_cash_paid_late',
                                         ['count', 'sum', 'mean'],
                                         'last_loan_')
        features = add_features_in_group(features, gr_, 'pos_cash_paid_late_with_tolerance',
                                         ['mean'],
                                         'last_loan_')
        features = add_features_in_group(features, gr_, 'SK_DPD',
                                         ['sum', 'mean', 'max', 'std'],
                                         'last_loan_')
        features = add_features_in_group(features, gr_, 'SK_DPD_DEF',
                                         ['sum', 'mean', 'max', 'std'],
                                         'last_loan_')

        return features


class PreviousApplicationFeatures(BasicHandCraftedFeatures):
    def __init__(self, numbers_of_applications=[], num_workers=1, **kwargs):
        super().__init__(num_workers=num_workers)
        self.numbers_of_applications = numbers_of_applications

    def fit(self, prev_applications, **kwargs):
        features = pd.DataFrame({'SK_ID_CURR': prev_applications['SK_ID_CURR'].unique()})

        prev_app_sorted = prev_applications.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])
        prev_app_sorted_groupby = prev_app_sorted.groupby(by=['SK_ID_CURR'])

        prev_app_sorted['previous_application_prev_was_approved'] = (
            prev_app_sorted['NAME_CONTRACT_STATUS'] == 'Approved').astype('int')
        g = prev_app_sorted_groupby['previous_application_prev_was_approved'].last().reset_index()
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        prev_app_sorted['previous_application_prev_was_refused'] = (
            prev_app_sorted['NAME_CONTRACT_STATUS'] == 'Refused').astype('int')
        g = prev_app_sorted_groupby['previous_application_prev_was_refused'].last().reset_index()
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = prev_app_sorted_groupby['SK_ID_PREV'].agg('nunique').reset_index()
        g.rename(index=str, columns={'SK_ID_PREV': 'previous_application_number_of_prev_application'}, inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = prev_app_sorted.groupby(by=['SK_ID_CURR'])['previous_application_prev_was_refused'].mean().reset_index()
        g.rename(index=str, columns={
            'previous_application_prev_was_refused': 'previous_application_fraction_of_refused_applications'},
                 inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        prev_app_sorted['prev_applications_prev_was_revolving_loan'] = (
            prev_app_sorted['NAME_CONTRACT_TYPE'] == 'Revolving loans').astype('int')
        g = prev_app_sorted.groupby(by=['SK_ID_CURR'])[
            'prev_applications_prev_was_revolving_loan'].last().reset_index()
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        for number in self.numbers_of_applications:
            prev_applications_tail = prev_app_sorted_groupby.tail(number)

            tail_groupby = prev_applications_tail.groupby(by=['SK_ID_CURR'])

            g = tail_groupby['CNT_PAYMENT'].agg('mean').reset_index()
            g.rename(index=str,
                     columns={'CNT_PAYMENT': 'previous_application_term_of_last_{}_credits_mean'.format(number)},
                     inplace=True)
            features = features.merge(g, on=['SK_ID_CURR'], how='left')

            g = tail_groupby['DAYS_DECISION'].agg('mean').reset_index()
            g.rename(index=str,
                     columns={'DAYS_DECISION': 'previous_application_days_decision_about_last_{}_credits_mean'.format(
                         number)},
                     inplace=True)
            features = features.merge(g, on=['SK_ID_CURR'], how='left')

            g = tail_groupby['DAYS_FIRST_DRAWING'].agg('mean').reset_index()
            g.rename(index=str,
                     columns={
                         'DAYS_FIRST_DRAWING': 'previous_application_days_first_drawing_last_{}_credits_mean'.format(
                             number)},
                     inplace=True)
            features = features.merge(g, on=['SK_ID_CURR'], how='left')

        self.features = features
        return self


class InstallmentPaymentsFeatures(BasicHandCraftedFeatures):
    def __init__(self, last_k_agg_periods, last_k_agg_period_fractions, last_k_trend_periods, num_workers=1, **kwargs):
        super().__init__(num_workers=num_workers)
        self.last_k_agg_periods = last_k_agg_periods
        self.last_k_agg_period_fractions = last_k_agg_period_fractions
        self.last_k_trend_periods = last_k_trend_periods

        self.num_workers = num_workers
        self.features = None

    def fit(self, installments, **kwargs):
        installments['installment_paid_late_in_days'] = installments['DAYS_ENTRY_PAYMENT'] - installments[
            'DAYS_INSTALMENT']
        installments['installment_paid_late'] = (installments['installment_paid_late_in_days'] > 0).astype(int)
        installments['installment_paid_over_amount'] = installments['AMT_PAYMENT'] - installments['AMT_INSTALMENT']
        installments['installment_paid_over'] = (installments['installment_paid_over_amount'] > 0).astype(int)

        features = pd.DataFrame({'SK_ID_CURR': installments['SK_ID_CURR'].unique()})
        groupby = installments.groupby(['SK_ID_CURR'])

        func = partial(InstallmentPaymentsFeatures.generate_features,
                       agg_periods=self.last_k_agg_periods,
                       period_fractions=self.last_k_agg_period_fractions,
                       trend_periods=self.last_k_trend_periods)
        g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=self.num_workers).reset_index()
        features = features.merge(g, on='SK_ID_CURR', how='left')

        self.features = features
        return self

    @staticmethod
    def generate_features(gr, agg_periods, trend_periods, period_fractions):
        all = InstallmentPaymentsFeatures.all_installment_features(gr)
        agg = InstallmentPaymentsFeatures.last_k_installment_features_with_fractions(gr,
                                                                                     agg_periods,
                                                                                     period_fractions)
        trend = InstallmentPaymentsFeatures.trend_in_last_k_installment_features(gr, trend_periods)
        last = InstallmentPaymentsFeatures.last_loan_features(gr)
        features = {**all, **agg, **trend, **last}
        return pd.Series(features)

    @staticmethod
    def all_installment_features(gr):
        return InstallmentPaymentsFeatures.last_k_installment_features(gr, periods=[10e16])

    @staticmethod
    def last_k_installment_features_with_fractions(gr, periods, period_fractions):
        features = InstallmentPaymentsFeatures.last_k_installment_features(gr, periods)

        for short_period, long_period in period_fractions:
            short_feature_names = get_feature_names_by_period(features, short_period)
            long_feature_names = get_feature_names_by_period(features, long_period)

            for short_feature, long_feature in zip(short_feature_names, long_feature_names):
                old_name_chunk = '_{}_'.format(short_period)
                new_name_chunk = '_{}by{}_fraction_'.format(short_period, long_period)
                fraction_feature_name = short_feature.replace(old_name_chunk, new_name_chunk)
                features[fraction_feature_name] = safe_div(features[short_feature], features[long_feature])
        return features

    @staticmethod
    def last_k_installment_features(gr, periods):
        gr_ = gr.copy()
        gr_.sort_values(['DAYS_INSTALMENT'], ascending=False, inplace=True)

        features = {}
        for period in periods:
            if period > 10e10:
                period_name = 'all_installment_'
                gr_period = gr_.copy()
            else:
                period_name = 'last_{}_'.format(period)
                gr_period = gr_.iloc[:period]

            features = add_features_in_group(features, gr_period, 'NUM_INSTALMENT_VERSION',
                                             ['sum', 'mean', 'max', 'min', 'std', 'median', 'skew', 'kurt', 'iqr'],
                                             period_name)

            features = add_features_in_group(features, gr_period, 'installment_paid_late_in_days',
                                             ['sum', 'mean', 'max', 'min', 'std', 'median', 'skew', 'kurt', 'iqr'],
                                             period_name)
            features = add_features_in_group(features, gr_period, 'installment_paid_late',
                                             ['count', 'mean'],
                                             period_name)
            features = add_features_in_group(features, gr_period, 'installment_paid_over_amount',
                                             ['sum', 'mean', 'max', 'min', 'std', 'median', 'skew', 'kurt', 'iqr'],
                                             period_name)
            features = add_features_in_group(features, gr_period, 'installment_paid_over',
                                             ['count', 'mean'],
                                             period_name)
        return features

    @staticmethod
    def trend_in_last_k_installment_features(gr, periods):
        gr_ = gr.copy()
        gr_.sort_values(['DAYS_INSTALMENT'], ascending=False, inplace=True)

        features = {}
        for period in periods:
            gr_period = gr_.iloc[:period]

            features = add_trend_feature(features, gr_period,
                                         'installment_paid_late_in_days', '{}_period_trend_'.format(period)
                                         )
            features = add_trend_feature(features, gr_period,
                                         'installment_paid_over_amount', '{}_period_trend_'.format(period)
                                         )
        return features

    @staticmethod
    def last_loan_features(gr):
        gr_ = gr.copy()
        gr_.sort_values(['DAYS_INSTALMENT'], ascending=False, inplace=True)
        last_installment_id = gr_['SK_ID_PREV'].iloc[0]
        gr_ = gr_[gr_['SK_ID_PREV'] == last_installment_id]

        features = {}
        features = add_features_in_group(features, gr_,
                                         'installment_paid_late_in_days',
                                         ['sum', 'mean', 'max', 'min', 'std'],
                                         'last_loan_')
        features = add_features_in_group(features, gr_,
                                         'installment_paid_late',
                                         ['count', 'mean'],
                                         'last_loan_')
        features = add_features_in_group(features, gr_,
                                         'installment_paid_over_amount',
                                         ['sum', 'mean', 'max', 'min', 'std'],
                                         'last_loan_')
        features = add_features_in_group(features, gr_,
                                         'installment_paid_over',
                                         ['count', 'mean'],
                                         'last_loan_')
        return features


class ConcatFeatures(BaseTransformer):
    def transform(self, **kwargs):
        features_concat = []
        for _, feature in kwargs.items():
            feature.reset_index(drop=True, inplace=True)
            features_concat.append(feature)
        features_concat = pd.concat(features_concat, axis=1)
        return {'concatenated_features': features_concat}


def add_features_in_group(features, gr_, feature_name, aggs, prefix):
    for agg in aggs:
        if agg == 'sum':
            features['{}{}_sum'.format(prefix, feature_name)] = gr_[feature_name].sum()
        elif agg == 'mean':
            features['{}{}_mean'.format(prefix, feature_name)] = gr_[feature_name].mean()
        elif agg == 'max':
            features['{}{}_max'.format(prefix, feature_name)] = gr_[feature_name].max()
        elif agg == 'min':
            features['{}{}_min'.format(prefix, feature_name)] = gr_[feature_name].min()
        elif agg == 'std':
            features['{}{}_std'.format(prefix, feature_name)] = gr_[feature_name].std()
        elif agg == 'count':
            features['{}{}_count'.format(prefix, feature_name)] = gr_[feature_name].count()
        elif agg == 'skew':
            features['{}{}_skew'.format(prefix, feature_name)] = skew(gr_[feature_name])
        elif agg == 'kurt':
            features['{}{}_kurt'.format(prefix, feature_name)] = kurtosis(gr_[feature_name])
        elif agg == 'iqr':
            features['{}{}_iqr'.format(prefix, feature_name)] = iqr(gr_[feature_name])
        elif agg == 'median':
            features['{}{}_median'.format(prefix, feature_name)] = gr_[feature_name].median()

    return features


def add_trend_feature(features, gr, feature_name, prefix):
    y = gr[feature_name].values
    try:
        x = np.arange(0, len(y)).reshape(-1, 1)
        lr = LinearRegression()
        lr.fit(x, y)
        trend = lr.coef_[0]
    except:
        trend = np.nan
    features['{}{}'.format(prefix, feature_name)] = trend
    return features


def get_feature_names_by_period(features, period):
    return sorted([feat for feat in features.keys() if '_{}_'.format(period) in feat])


## pipeline_blocks

In [10]:
experiment_directory= '../steppy/'

In [11]:
from functools import partial

from sklearn.metrics import roc_auc_score
from steppy.adapter import Adapter, E
from steppy.base import Step, make_transformer, IdentityOperation

#from .models import get_sklearn_classifier, XGBoost, LightGBM

def feature_extraction(config, train_mode, suffix, **kwargs):
    if train_mode:
        application, application_valid = _application(config, train_mode, suffix, **kwargs)
        bureau_cleaned = _bureau_cleaning(config, suffix, **kwargs)
        bureau, bureau_valid = _bureau(
            bureau_cleaned,
            config,
            train_mode,
            suffix,
            **kwargs)
        credit_card_balance_cleaned = _credit_card_balance_cleaning(config, suffix, **kwargs)
        credit_card_balance, credit_card_balance_valid = _credit_card_balance(
            credit_card_balance_cleaned,
            config,
            train_mode,
            suffix,
            **kwargs)
        pos_cash_balance, pos_cash_balance_valid = _pos_cash_balance(config, train_mode, suffix, **kwargs)
        previous_application_cleaned = _previous_application_cleaning(config, suffix, **kwargs)
        previous_application, previous_application_valid = _previous_application(
            previous_application_cleaned,
            config,
            train_mode,
            suffix,
            **kwargs)
        installment_payments, installment_payments_valid = _installment_payments(config, train_mode, suffix, **kwargs)

        application_agg, application_agg_valid = _application_groupby_agg(config, train_mode, suffix, **kwargs)
        bureau_agg, bureau_agg_valid = _bureau_groupby_agg(
            bureau_cleaned,
            config,
            train_mode,
            suffix,
            **kwargs)
        credit_card_balance_agg, credit_card_balance_agg_valid = _credit_card_balance_groupby_agg(
            credit_card_balance_cleaned,
            config,
            train_mode, suffix,
            **kwargs)
        installments_payments_agg, installments_payments_agg_valid = _installments_payments_groupby_agg(
            config,
            train_mode, suffix,
            **kwargs)
        pos_cash_balance_agg, pos_cash_balance_agg_valid = _pos_cash_balance_groupby_agg(
            config,
            train_mode, suffix,
            **kwargs)
        previous_applications_agg, previous_applications_agg_valid = _previous_applications_groupby_agg(
            previous_application_cleaned,
            config,
            train_mode, suffix,
            **kwargs)

        categorical_encoder, categorical_encoder_valid = _categorical_encoders(config, train_mode, suffix, **kwargs)

        feature_combiner, feature_combiner_valid = _join_features(
            numerical_features=[application,
                                application_agg,
                                bureau,
                                bureau_agg,
                                credit_card_balance,
                                credit_card_balance_agg,
                                installment_payments,
                                installments_payments_agg,
                                pos_cash_balance,
                                pos_cash_balance_agg,
                                previous_application,
                                previous_applications_agg,
                                ],
            numerical_features_valid=[application_valid,
                                      application_agg_valid,
                                      bureau_valid,
                                      bureau_agg_valid,
                                      credit_card_balance_valid,
                                      credit_card_balance_agg_valid,
                                      installment_payments_valid,
                                      installments_payments_agg_valid,
                                      pos_cash_balance_valid,
                                      pos_cash_balance_agg_valid,
                                      previous_application_valid,
                                      previous_applications_agg_valid,
                                      ],
            categorical_features=[categorical_encoder
                                  ],
            categorical_features_valid=[categorical_encoder_valid
                                        ],
            config=config,
            train_mode=train_mode,
            suffix=suffix,
            **kwargs)

        return feature_combiner, feature_combiner_valid
    else:
        application = _application(config, train_mode, suffix, **kwargs)
        bureau_cleaned = _bureau_cleaning(config, suffix, **kwargs)
        bureau = _bureau(bureau_cleaned, config, train_mode, suffix, **kwargs)
        credit_card_balance_cleaned = _credit_card_balance_cleaning(config, suffix, **kwargs)
        credit_card_balance = _credit_card_balance(credit_card_balance_cleaned, config, train_mode, suffix, **kwargs)
        pos_cash_balance = _pos_cash_balance(config, train_mode, suffix, **kwargs)
        previous_application_cleaned = _previous_application_cleaning(config, suffix, **kwargs)
        previous_application = _previous_application(previous_application_cleaned, config, train_mode, suffix, **kwargs)
        installment_payments = _installment_payments(config, train_mode, suffix, **kwargs)

        application_agg = _application_groupby_agg(config, train_mode, suffix, **kwargs)
        bureau_agg = _bureau_groupby_agg(bureau_cleaned, config, train_mode, suffix, **kwargs)
        credit_card_balance_agg = _credit_card_balance_groupby_agg(credit_card_balance_cleaned,
                                                                   config, train_mode, suffix, **kwargs)
        installments_payments_agg = _installments_payments_groupby_agg(config, train_mode, suffix, **kwargs)
        pos_cash_balance_agg = _pos_cash_balance_groupby_agg(config, train_mode, suffix, **kwargs)
        previous_applications_agg = _previous_applications_groupby_agg(previous_application_cleaned,
                                                                       config, train_mode, suffix, **kwargs)
        categorical_encoder = _categorical_encoders(config, train_mode, suffix, **kwargs)
        feature_combiner = _join_features(numerical_features=[application,
                                                              application_agg,
                                                              bureau,
                                                              bureau_agg,
                                                              credit_card_balance,
                                                              credit_card_balance_agg,
                                                              installment_payments,
                                                              installments_payments_agg,
                                                              pos_cash_balance,
                                                              pos_cash_balance_agg,
                                                              previous_application,
                                                              previous_applications_agg,
                                                              ],
                                          numerical_features_valid=[],
                                          categorical_features=[categorical_encoder
                                                                ],
                                          categorical_features_valid=[],
                                          config=config,
                                          train_mode=train_mode,
                                          suffix=suffix,
                                          **kwargs)

        return feature_combiner


def preprocessing_fillna(features, config, train_mode, suffix, **kwargs):
    if train_mode:
        features_train, features_valid = features
        fillna = Step(name='fillna{}'.format(suffix),
                      transformer=_fillna(**config.preprocessing),
                      input_steps=[features_train, features_valid],
                      adapter=Adapter({'X': E(features_train.name, 'features'),
                                       'X_valid': E(features_valid.name, 'features'),
                                       }),
                      experiment_directory=config.pipeline.experiment_directory,
                      **kwargs
                      )
    else:
        fillna = Step(name='fillna{}'.format(suffix),
                      transformer=_fillna(**config.preprocessing),
                      input_steps=[features],
                      adapter=Adapter({'X': E(features.name, 'features')}),
                      experiment_directory=config.pipeline.experiment_directory,
                      **kwargs
                      )
    return fillna


def _join_features(numerical_features,
                   numerical_features_valid,
                   categorical_features,
                   categorical_features_valid,
                   config, train_mode, suffix,
                   **kwargs):
    if train_mode:
        persist_output = True
        cache_output = True
        load_persisted_output = True
    else:
        persist_output = False
        cache_output = True
        load_persisted_output = False

    feature_joiner = Step(name='feature_joiner{}'.format(suffix),
                          transformer=FeatureJoiner(**config.feature_joiner),
                          input_steps=numerical_features + categorical_features,
                          adapter=Adapter({
                              'numerical_feature_list': [
                                  E(feature.name, 'numerical_features') for feature in numerical_features],
                              'categorical_feature_list': [
                                  E(feature.name, 'categorical_features') for feature in categorical_features],
                          }),
                          experiment_directory=experiment_directory,
                          persist_output=persist_output,
                          cache_output=cache_output,
                          load_persisted_output=load_persisted_output)
    if train_mode:
        feature_joiner_valid = Step(name='feature_joiner_valid{}'.format(suffix),
                                    transformer=feature_joiner,
                                    input_steps=numerical_features_valid + categorical_features_valid,
                                    adapter=Adapter({
                                        'numerical_feature_list': [
                                            E(feature.name,
                                              'numerical_features') for feature in numerical_features_valid],
                                        'categorical_feature_list': [
                                            E(feature.name,
                                              'categorical_features') for feature in categorical_features_valid],
                                    }),
                                    experiment_directory=experiment_directory,
                                    persist_output=persist_output,
                                    cache_output=cache_output,
                                    load_persisted_output=load_persisted_output)

        return feature_joiner, feature_joiner_valid

    else:
        return feature_joiner


def _categorical_encoders(config, train_mode, suffix, **kwargs):
    categorical_encoder = Step(name='categorical_encoder{}'.format(suffix),
                               transformer=CategoricalEncoder(**config.preprocessing.categorical_encoder),
                               input_data=['application'],
                               adapter=Adapter({'X': E('application', 'X'),
                                                'y': E('application', 'y')}
                                               ),
                               experiment_directory=experiment_directory,
                               **kwargs)
    if train_mode:
        categorical_encoder_valid = Step(name='categorical_encoder_valid{}'.format(suffix),
                                         transformer=categorical_encoder,
                                         input_data=['application'],
                                         adapter=Adapter(
                                             {'X': E('application', 'X_valid'),
                                              'y': E('application', 'y_valid')}
                                         ),
                                         experiment_directory=experiment_directory,
                                         **kwargs)
        return categorical_encoder, categorical_encoder_valid
    else:
        return categorical_encoder


def _application_groupby_agg(config, train_mode, suffix, **kwargs):
    if train_mode:
        application_cleaning, application_cleaning_valid = _application_cleaning(config, train_mode, suffix, **kwargs)
    else:
        application_cleaning = _application_cleaning(config, train_mode, suffix, **kwargs)

    application_groupby_agg = Step(name='application_groupby_agg{}'.format(suffix),
                                   transformer=GroupbyAggregateDiffs(**config.applications.aggregations),
                                   input_steps=[application_cleaning],
                                   adapter=Adapter({'main_table': E(application_cleaning.name, 'X')}),
                                   experiment_directory=experiment_directory,
                                   **kwargs)

    if train_mode:

        application_groupby_agg_valid = Step(name='application_groupby_agg_valid{}'.format(suffix),
                                             transformer=application_groupby_agg,
                                             input_steps=[application_cleaning_valid],
                                             adapter=Adapter({'main_table': E(application_cleaning_valid.name, 'X')}),
                                             experiment_directory=experiment_directory,
                                             **kwargs)

        return application_groupby_agg, application_groupby_agg_valid

    else:
        return application_groupby_agg


def _bureau_groupby_agg(bureau_cleaned, config, train_mode, suffix, **kwargs):
    bureau_groupby_agg = Step(name='bureau_groupby_agg',
                              transformer=GroupbyAggregate(**config.bureau),
                              input_steps=[bureau_cleaned],
                              adapter=Adapter({'table': E(bureau_cleaned.name, 'bureau')}),
                              experiment_directory=experiment_directory,
                              **kwargs)

    bureau_agg_merge = Step(name='bureau_agg_merge{}'.format(suffix),
                            transformer=GroupbyMerge(**config.bureau),
                            input_data=['application'],
                            input_steps=[bureau_groupby_agg],
                            adapter=Adapter({'table': E('application', 'X'),
                                             'features': E(bureau_groupby_agg.name, 'features_table')}),
                            experiment_directory=experiment_directory, **kwargs)

    if train_mode:
        bureau_agg_merge_valid = Step(name='bureau_agg_merge_valid{}'.format(suffix),
                                      transformer=bureau_agg_merge,
                                      input_data=['application'],
                                      input_steps=[bureau_groupby_agg],
                                      adapter=Adapter({'table': E('application', 'X_valid'),
                                                       'features': E(bureau_groupby_agg.name, 'features_table')}),
                                      experiment_directory=experiment_directory, **kwargs)
        return bureau_agg_merge, bureau_agg_merge_valid
    else:
        return bureau_agg_merge


def _credit_card_balance_groupby_agg(credit_card_balance_cleaned, config, train_mode, suffix, **kwargs):
    credit_card_balance_groupby_agg = Step(name='credit_card_balance_groupby_agg',
                                           transformer=GroupbyAggregate(**config.credit_card_balance),
                                           input_steps=[credit_card_balance_cleaned],
                                           adapter=Adapter({'table': E(credit_card_balance_cleaned.name,
                                                                       'credit_card')}),
                                           experiment_directory=experiment_directory,
                                           **kwargs)

    credit_card_balance_agg_merge = Step(name='credit_card_balance_agg_merge{}'.format(suffix),
                                         transformer=GroupbyMerge(**config.credit_card_balance),
                                         input_data=['application'],
                                         input_steps=[credit_card_balance_groupby_agg],
                                         adapter=Adapter({'table': E('application', 'X'),
                                                          'features': E(credit_card_balance_groupby_agg.name,
                                                                        'features_table')}),
                                         experiment_directory=experiment_directory, **kwargs)

    if train_mode:
        credit_card_balance_agg_merge_valid = Step(name='credit_card_balance_agg_merge_valid{}'.format(suffix),
                                                   transformer=credit_card_balance_agg_merge,
                                                   input_data=['application'],
                                                   input_steps=[credit_card_balance_groupby_agg],
                                                   adapter=Adapter({'table': E('application', 'X_valid'),
                                                                    'features': E(credit_card_balance_groupby_agg.name,
                                                                                  'features_table')}),
                                                   experiment_directory=experiment_directory, **kwargs)
        return credit_card_balance_agg_merge, credit_card_balance_agg_merge_valid
    else:
        return credit_card_balance_agg_merge


def _installments_payments_groupby_agg(config, train_mode, suffix, **kwargs):
    installments_payments_groupby_agg = Step(name='installments_payments_groupby_agg',
                                             transformer=GroupbyAggregate(**config.installments_payments),
                                             input_data=['installments_payments'],
                                             adapter=Adapter({'table': E('installments_payments', 'X')}),
                                             experiment_directory=experiment_directory,
                                             **kwargs)

    installments_payments_agg_merge = Step(name='installments_payments_agg_merge{}'.format(suffix),
                                           transformer=GroupbyMerge(**config.installments_payments),
                                           input_data=['application'],
                                           input_steps=[installments_payments_groupby_agg],
                                           adapter=Adapter({'table': E('application', 'X'),
                                                            'features': E(installments_payments_groupby_agg.name,
                                                                          'features_table')}),
                                           experiment_directory=experiment_directory,
                                           **kwargs)

    if train_mode:
        installments_payments_agg_merge_valid = Step(name='installments_payments_agg_merge_valid{}'.format(suffix),
                                                     transformer=installments_payments_agg_merge,
                                                     input_data=['application'],
                                                     input_steps=[installments_payments_groupby_agg],
                                                     adapter=Adapter({'table': E('application', 'X_valid'),
                                                                      'features': E(
                                                                          installments_payments_groupby_agg.name,
                                                                          'features_table')}),
                                                     experiment_directory=experiment_directory,
                                                     **kwargs)
        return installments_payments_agg_merge, installments_payments_agg_merge_valid
    else:
        return installments_payments_agg_merge


def _pos_cash_balance_groupby_agg(config, train_mode, suffix, **kwargs):
    pos_cash_balance_groupby_agg = Step(name='pos_cash_balance_groupby_agg',
                                        transformer=GroupbyAggregate(**config.pos_cash_balance),
                                        input_data=['pos_cash_balance'],
                                        adapter=Adapter({'table': E('pos_cash_balance', 'X')}),
                                        experiment_directory=experiment_directory,
                                        **kwargs)

    pos_cash_balance_agg_merge = Step(name='pos_cash_balance_agg_merge{}'.format(suffix),
                                      transformer=GroupbyMerge(**config.pos_cash_balance),
                                      input_data=['application'],
                                      input_steps=[pos_cash_balance_groupby_agg],
                                      adapter=Adapter({'table': E('application', 'X'),
                                                       'features': E(pos_cash_balance_groupby_agg.name,
                                                                     'features_table')}),
                                      experiment_directory=experiment_directory, 
                                      **kwargs)

    if train_mode:
        pos_cash_balance_agg_merge_valid = Step(name='pos_cash_balance_agg_merge_valid{}'.format(suffix),
                                                transformer=pos_cash_balance_agg_merge,
                                                input_data=['application'],
                                                input_steps=[pos_cash_balance_groupby_agg],
                                                adapter=Adapter({'table': E('application', 'X_valid'),
                                                                 'features': E(
                                                                     pos_cash_balance_groupby_agg.name,
                                                                     'features_table')}),
                                                experiment_directory=experiment_directory,
                                                **kwargs)
        return pos_cash_balance_agg_merge, pos_cash_balance_agg_merge_valid
    else:
        return pos_cash_balance_agg_merge


def _previous_applications_groupby_agg(previous_application_cleaned, config, train_mode, suffix, **kwargs):
    previous_applications_groupby_agg = Step(name='previous_applications_groupby_agg',
                                             transformer=GroupbyAggregate(**config.previous_applications),
                                             input_steps=[previous_application_cleaned],
                                             adapter=Adapter({'table': E(previous_application_cleaned.name,
                                                                         'previous_application')}),
                                             experiment_directory=experiment_directory, 
                                             **kwargs)

    previous_applications_agg_merge = Step(name='previous_applications_agg_merge{}'.format(suffix),
                                           transformer=GroupbyMerge(**config.previous_applications),
                                           input_data=['application'],
                                           input_steps=[previous_applications_groupby_agg],
                                           adapter=Adapter({'table': E('application', 'X'),
                                                            'features': E(previous_applications_groupby_agg.name,
                                                                          'features_table')}),
                                           experiment_directory=experiment_directory,
                                           **kwargs)

    if train_mode:
        previous_applications_agg_merge_valid = Step(name='previous_applications_agg_merge_valid{}'.format(suffix),
                                                     transformer=previous_applications_agg_merge,
                                                     input_data=['application'],
                                                     input_steps=[previous_applications_groupby_agg],
                                                     adapter=Adapter({'table': E('application', 'X_valid'),
                                                                      'features': E(
                                                                          previous_applications_groupby_agg.name,
                                                                          'features_table')}),
                                                     experiment_directory=experiment_directory,
                                                     **kwargs)
        return previous_applications_agg_merge, previous_applications_agg_merge_valid
    else:
        return previous_applications_agg_merge


def _application_cleaning(config, train_mode, suffix, **kwargs):
    application_cleaning = Step(name='application_cleaning{}'.format(suffix),
                                transformer=ApplicationCleaning(**config.preprocessing.impute_missing),
                                input_data=['application'],
                                adapter=Adapter({'X': E('application', 'X')}),
                                experiment_directory=experiment_directory,
                                **kwargs)
    #print(application_cleaning.shape)
    
    if train_mode:
        application_cleaning_valid = Step(name='application_cleaning_valid{}'.format(suffix),
                                          transformer=ApplicationCleaning(),
                                          input_data=['application'],
                                          adapter=Adapter({'X': E('application', 'X_valid')}),
                                          experiment_directory=experiment_directory,
                                          **kwargs)
        return application_cleaning, application_cleaning_valid
    else:
        return application_cleaning


def _application(config, train_mode, suffix, **kwargs):
    if train_mode:
        application_cleaning, application_cleaning_valid = _application_cleaning(config, train_mode, suffix, **kwargs)
    else:
        application_cleaning = _application_cleaning(config, train_mode, suffix, **kwargs)
    
    application = Step(name='application_hand_crafted{}'.format(suffix),
                       transformer=ApplicationFeatures(**config.applications.columns),
                       input_steps=[application_cleaning],
                       adapter=Adapter({'X': E(application_cleaning.name, 'X')}),
                       experiment_directory=experiment_directory,
                       **kwargs)
    
    
    if train_mode:
        application_valid = Step(name='application_hand_crafted_valid{}'.format(suffix),
                                 transformer=application,
                                 input_steps=[application_cleaning_valid],
                                 adapter=Adapter({'X': E(application_cleaning_valid.name, 'X')}),
                                 experiment_directory=experiment_directory,
                                 **kwargs)
        return application, application_valid
    else:
        return application


def _bureau_cleaning(config, suffix, **kwargs):
    bureau_cleaning = Step(name='bureau_cleaning',
                           transformer=BureauCleaning(**config.preprocessing.impute_missing),
                           input_data=['bureau'],
                           adapter=Adapter({'bureau': E('bureau', 'X')}),
                           experiment_directory=experiment_directory,
                           **kwargs)

    return bureau_cleaning


def _bureau(bureau_cleaned, config, train_mode, suffix, **kwargs):
    bureau_hand_crafted = Step(name='bureau_hand_crafted',
                               transformer=BureauFeatures(**config.bureau),
                               input_steps=[bureau_cleaned],
                               adapter=Adapter({'bureau': E(bureau_cleaned.name, 'bureau')}),
                               experiment_directory=experiment_directory,
                               **kwargs)

    bureau_hand_crafted_merge = Step(name='bureau_hand_crafted_merge{}'.format(suffix),
                                     transformer=GroupbyMerge(**config.bureau),
                                     input_data=['application'],
                                     input_steps=[bureau_hand_crafted],
                                     adapter=Adapter({'table': E('application', 'X'),
                                                      'features': E(bureau_hand_crafted.name, 'features_table')}),
                                     experiment_directory=experiment_directory, **kwargs)

    if train_mode:
        bureau_hand_crafted_merge_valid = Step(name='bureau_hand_crafted_merge_valid{}'.format(suffix),
                                               transformer=bureau_hand_crafted_merge,
                                               input_data=['application'],
                                               input_steps=[bureau_hand_crafted],
                                               adapter=Adapter({'table': E('application', 'X_valid'),
                                                                'features': E(bureau_hand_crafted.name,
                                                                              'features_table')}),
                                               experiment_directory=experiment_directory, **kwargs)
        return bureau_hand_crafted_merge, bureau_hand_crafted_merge_valid
    else:
        return bureau_hand_crafted_merge


def _credit_card_balance_cleaning(config, suffix, **kwargs):
    credit_card_balance_cleaning = Step(name='credit_card_balance_cleaning{}'.format(suffix),
                                        transformer=CreditCardCleaning(
                                            **config.preprocessing.impute_missing),
                                        input_data=['credit_card_balance'],
                                        adapter=Adapter({'credit_card': E('credit_card_balance', 'X')}),
                                        experiment_directory=experiment_directory,
                                        **kwargs)

    return credit_card_balance_cleaning


def _credit_card_balance(credit_card_balance_cleaned, config, train_mode, suffix, **kwargs):
    credit_card_balance_hand_crafted = Step(name='credit_card_balance_hand_crafted',
                                            transformer=CreditCardBalanceFeatures(**config.credit_card_balance),
                                            input_steps=[credit_card_balance_cleaned],
                                            adapter=Adapter({'credit_card': E(credit_card_balance_cleaned.name,
                                                                              'credit_card')}),
                                            experiment_directory=experiment_directory,
                                            **kwargs)

    credit_card_balance_hand_crafted_merge = Step(name='credit_card_balance_hand_crafted_merge{}'.format(suffix),
                                                  transformer=GroupbyMerge(**config.credit_card_balance),
                                                  input_data=['application'],
                                                  input_steps=[credit_card_balance_hand_crafted],
                                                  adapter=Adapter({'table': E('application', 'X'),
                                                                   'features': E(credit_card_balance_hand_crafted.name,
                                                                                 'features_table')}),
                                                  experiment_directory=experiment_directory, **kwargs)

    if train_mode:
        credit_card_balance_hand_crafted_merge_valid = Step(
            name='credit_card_balance_hand_crafted_merge_valid{}'.format(suffix),
            transformer=credit_card_balance_hand_crafted_merge,
            input_data=['application'],
            input_steps=[credit_card_balance_hand_crafted],
            adapter=Adapter({'table': E('application', 'X_valid'),
                             'features': E(credit_card_balance_hand_crafted.name,
                                           'features_table')}),
            experiment_directory=experiment_directory, **kwargs)
        return credit_card_balance_hand_crafted_merge, credit_card_balance_hand_crafted_merge_valid
    else:
        return credit_card_balance_hand_crafted_merge


def _pos_cash_balance(config, train_mode, suffix, **kwargs):
    pos_cash_balance_hand_crafted = Step(name='pos_cash_balance_hand_crafted',
                                         transformer=POSCASHBalanceFeatures(**config.pos_cash_balance),
                                         input_data=['pos_cash_balance'],
                                         adapter=Adapter({'pos_cash': E('pos_cash_balance', 'X')}),
                                         experiment_directory=experiment_directory,
                                         **kwargs)

    pos_cash_balance_hand_crafted_merge = Step(name='pos_cash_balance_hand_crafted_merge{}'.format(suffix),
                                               transformer=GroupbyMerge(**config.pos_cash_balance),
                                               input_data=['application'],
                                               input_steps=[pos_cash_balance_hand_crafted],
                                               adapter=Adapter({'table': E('application', 'X'),
                                                                'features': E(pos_cash_balance_hand_crafted.name,
                                                                              'features_table')}),
                                               experiment_directory=experiment_directory, **kwargs)

    if train_mode:
        pos_cash_balance_hand_crafted_merge_valid = Step(
            name='pos_cash_balance_hand_crafted_merge_valid{}'.format(suffix),
            transformer=pos_cash_balance_hand_crafted_merge,
            input_data=['application'],
            input_steps=[pos_cash_balance_hand_crafted],
            adapter=Adapter({'table': E('application', 'X_valid'),
                             'features': E(
                                 pos_cash_balance_hand_crafted.name,
                                 'features_table')}),
            experiment_directory=experiment_directory,
            **kwargs)
        return pos_cash_balance_hand_crafted_merge, pos_cash_balance_hand_crafted_merge_valid
    else:
        return pos_cash_balance_hand_crafted_merge


def _previous_application_cleaning(config, suffix, **kwargs):
    previous_application_cleaning = Step(name='previous_application_cleaning{}'.format(suffix),
                                         transformer=PreviousApplicationCleaning(
                                             **config.preprocessing.impute_missing),
                                         input_data=['previous_application'],
                                         adapter=Adapter({'previous_application': E('previous_application', 'X')}),
                                         experiment_directory=experiment_directory,
                                         **kwargs)

    return previous_application_cleaning


def _previous_application(previous_application_cleaned, config, train_mode, suffix, **kwargs):
    previous_applications_hand_crafted = Step(name='previous_applications_hand_crafted',
                                              transformer=PreviousApplicationFeatures(
                                                  **config.previous_applications),
                                              input_steps=[previous_application_cleaned],
                                              adapter=Adapter(
                                                  {'prev_applications': E(previous_application_cleaned.name,
                                                                          'previous_application')}),
                                              experiment_directory=experiment_directory,
                                              **kwargs)

    previous_applications_hand_crafted_merge = Step(name='previous_applications_hand_crafted_merge{}'.format(suffix),
                                                    transformer=GroupbyMerge(**config.previous_applications),
                                                    input_data=['application'],
                                                    input_steps=[previous_applications_hand_crafted],
                                                    adapter=Adapter({'table': E('application', 'X'),
                                                                     'features': E(
                                                                         previous_applications_hand_crafted.name,
                                                                         'features_table')}),
                                                    experiment_directory=experiment_directory, **kwargs)

    if train_mode:
        previous_applications_hand_crafted_merge_valid = Step(
            name='previous_applications_hand_crafted_merge_valid{}'.format(suffix),
            transformer=previous_applications_hand_crafted_merge,
            input_data=['application'],
            input_steps=[previous_applications_hand_crafted],
            adapter=Adapter({'table': E('application', 'X_valid'),
                             'features': E(
                                 previous_applications_hand_crafted.name,
                                 'features_table')}),
            experiment_directory=experiment_directory,
            **kwargs)
        return previous_applications_hand_crafted_merge, previous_applications_hand_crafted_merge_valid
    else:
        return previous_applications_hand_crafted_merge


def _installment_payments(config, train_mode, suffix, **kwargs):
    installment_payments_hand_crafted = Step(name='installment_payments_hand_crafted',
                                             transformer=InstallmentPaymentsFeatures(**config.installments_payments),
                                             input_data=['installments_payments'],
                                             adapter=Adapter({'installments': E('installments_payments', 'X')}),
                                             experiment_directory=experiment_directory,
                                             **kwargs)

    installment_payments_hand_crafted_merge = Step(name='installment_payments_hand_crafted_merge{}'.format(suffix),
                                                   transformer=GroupbyMerge(**config.installments_payments),
                                                   input_data=['application'],
                                                   input_steps=[installment_payments_hand_crafted],
                                                   adapter=Adapter({'table': E('application', 'X'),
                                                                    'features': E(
                                                                        installment_payments_hand_crafted.name,
                                                                        'features_table')}),
                                                   experiment_directory=experiment_directory, **kwargs)

    if train_mode:
        installment_payments_hand_crafted_merge_valid = Step(
            name='installment_payments_hand_crafted_merge_valid{}'.format(suffix),
            transformer=installment_payments_hand_crafted_merge,
            input_data=['application'],
            input_steps=[installment_payments_hand_crafted],
            adapter=Adapter({'table': E('application', 'X_valid'),
                             'features': E(
                                 installment_payments_hand_crafted.name,
                                 'features_table')}),
            experiment_directory=experiment_directory,
            **kwargs)
        return installment_payments_hand_crafted_merge, installment_payments_hand_crafted_merge_valid
    else:
        return installment_payments_hand_crafted_merge


def _fillna(fillna_value):
    def _inner_fillna(X, X_valid=None):
        if X_valid is None:
            return {'X': X.fillna(fillna_value)}
        else:
            return {'X': X.fillna(fillna_value),
                    'X_valid': X_valid.fillna(fillna_value)}

    return make_transformer(_inner_fillna)


## def_model

In [12]:
def classifier_light_gbm(features, config, train_mode, suffix, **kwargs):
    model_name = 'light_gbm{}'.format(suffix)

    if train_mode:
        features_train, features_valid = features
 
        transformer = LightGBM(name=model_name, **config.light_gbm)

        light_gbm = Step(name=model_name,
                         transformer=transformer,
                         input_data=['application'],
                         input_steps=[features_train, features_valid],
                         adapter=Adapter({'X': E(features_train.name, 'features'),
                                          'y': E('application', 'y'),
                                          'feature_names': E(features_train.name, 'feature_names'),
                                          'categorical_features': E(features_train.name, 'categorical_features'),
                                          'X_valid': E(features_valid.name, 'features'),
                                          'y_valid': E('application', 'y_valid'),
                                          }),
                         force_fitting=True,
                         experiment_directory=experiment_directory,
                         **kwargs)
    else:
        light_gbm = Step(name=model_name,
                         transformer=LightGBM(name=model_name, **config.light_gbm),
                         input_steps=[features],
                         adapter=Adapter({'X': E(features.name, 'features')}),
                         experiment_directory=experiment_directory,
                         **kwargs)
    return light_gbm

def classifier_xgb(features, config, train_mode, suffix, **kwargs):
    if train_mode:
        features_train, features_valid = features
        if config.random_search.xgboost.n_runs:
            transformer = RandomSearchOptimizer(TransformerClass=XGBoost,
                                                params=config.xgboost,
                                                train_input_keys=[],
                                                valid_input_keys=['X_valid', 'y_valid'],
                                                score_func=roc_auc_score,
                                                maximize=True,
                                                n_runs=config.random_search.xgboost.n_runs,
                                                #callbacks=[
                                                #    PersistResults(
                                                #        **config.random_search.xgboost.callbacks.persist_results)]
                                                )
        else:
            transformer = XGBoost(**config.xgboost)

        xgboost = Step(name='xgboost{}'.format(suffix),
                       transformer=transformer,
                       input_data=['application'],
                       input_steps=[features_train, features_valid],
                       adapter=Adapter({'X': E(features_train.name, 'features'),
                                        'y': E('application', 'y'),
                                        'feature_names': E(features_train.name, 'feature_names'),
                                        'X_valid': E(features_valid.name, 'features'),
                                        'y_valid': E('application', 'y_valid'),
                                        }),
                       experiment_directory=config.pipeline.experiment_directory,
                       **kwargs)
    else:
        xgboost = Step(name='xgboost{}'.format(suffix),
                       transformer=XGBoost(**config.xgboost),
                       input_steps=[features],
                       adapter=Adapter({'X': E(features.name, 'features')}),
                       experiment_directory=config.pipeline.experiment_directory,
                       **kwargs)
    return xgboost

def classifier_sklearn(sklearn_features,
                       ClassifierClass,
                       full_config,
                       clf_name,
                       train_mode,
                       suffix,
                       normalize,
                       **kwargs):
    config, model_params, rs_config = full_config
    if train_mode:
        if config.random_search.random_forest.n_runs:
            transformer = RandomSearchOptimizer(
                partial(get_sklearn_classifier,
                        ClassifierClass=ClassifierClass,
                        normalize=normalize),
                model_params,
                train_input_keys=[],
                valid_input_keys=['X_valid', 'y_valid'],
                score_func=roc_auc_score,
                maximize=True,
                n_runs=rs_config.n_runs,
                callbacks=[#NeptuneMonitor(**rs_config.callbacks.neptune_monitor),
                           PersistResults(**rs_config.callbacks.persist_results)]
            )
        else:
            transformer = get_sklearn_classifier(ClassifierClass, normalize, **model_params)

        sklearn_clf = Step(name='{}{}'.format(clf_name, suffix),
                           transformer=transformer,
                           input_data=['application'],
                           input_steps=[sklearn_features],
                           adapter=Adapter({'X': E(sklearn_features.name, 'X'),
                                            'y': E('application', 'y'),
                                            'X_valid': E(sklearn_features.name, 'X_valid'),
                                            'y_valid': E('application', 'y_valid'),
                                            }),
                           experiment_directory=config.pipeline.experiment_directory,
                           **kwargs)
    else:
        sklearn_clf = Step(name='{}{}'.format(clf_name, suffix),
                           transformer=get_sklearn_classifier(ClassifierClass, normalize, **model_params),
                           input_steps=[sklearn_features],
                           adapter=Adapter({'X': E(sklearn_features.name, 'X')}),
                           experiment_directory=config.pipeline.experiment_directory,
                           **kwargs)
    return sklearn_clf

## pipeline

In [13]:
from functools import partial

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import roc_auc_score
from steppy.adapter import Adapter, E
from steppy.base import Step, make_transformer, IdentityOperation

def lightGBM(config, train_mode, suffix=''):
    if train_mode:
        features, features_valid = feature_extraction(config,
                                                             train_mode,
                                                             suffix,
                                                             persist_output=False,
                                                             cache_output=False,
                                                             load_persisted_output=False)
        light_gbm = classifier_light_gbm((features, features_valid),
                                                config,
                                                train_mode, suffix)
    else:
        features = feature_extraction(config,
                                             train_mode,
                                             suffix,
                                             cache_output=False)
        light_gbm = classifier_light_gbm(features,
                                                config,
                                                train_mode, suffix)

    return light_gbm


def lightGBM_stacking(config, train_mode, suffix=''):
    features = stacking_features(config, train_mode, suffix,
                                        persist_output=False,
                                        cache_output=False,
                                        load_persisted_output=False)

    light_gbm = classifier_light_gbm_stacking(features, config, train_mode, suffix,
                                                     cache_output=False)
    return light_gbm


def xgboost(config, train_mode, suffix=''):
    if train_mode:
        features, features_valid = feature_extraction(config,
                                                             train_mode,
                                                             suffix,
                                                             persist_output=True,
                                                             cache_output=True,
                                                             load_persisted_output=True)
        xgb = classifier_xgb((features, features_valid),
                                    config,
                                    train_mode,
                                    suffix)
    else:
        features = feature_extraction(config,
                                             train_mode,
                                             suffix,
                                             cache_output=True)
        xgb = classifier_xgb(features,
                                    config,
                                    train_mode,
                                    suffix)

    return xgb


def sklearn_main(config, ClassifierClass, clf_name, train_mode, suffix='', normalize=False):
    model_params = getattr(config, clf_name)
    random_search_config = getattr(config.random_search, clf_name)
    full_config = (config, model_params, random_search_config)
    if train_mode:
        features, features_valid = feature_extraction(config,
                                                             train_mode,
                                                             suffix,
                                                             persist_output=True,
                                                             cache_output=True,
                                                             load_persisted_output=True)

        sklearn_preproc = preprocessing_fillna((features, features_valid), config, train_mode, suffix)
    else:
        features = feature_extraction(config,
                                             train_mode,
                                             suffix,
                                             cache_output=True)
        sklearn_preproc = preprocessing_fillna(features, config, train_mode, suffix)

    sklearn_clf = classifier_sklearn(sklearn_preproc,
                                            ClassifierClass,
                                            full_config,
                                            clf_name,
                                            train_mode,
                                            suffix,
                                            normalize)
    return sklearn_clf

# make log_reg def
def sklearn_main(config, ClassifierClass, clf_name, train_mode, suffix='', normalize=False):
    model_params = getattr(config, clf_name)
    random_search_config = getattr(config.random_search, clf_name)
    full_config = (config, model_params, random_search_config)
    if train_mode:
        features, features_valid = feature_extraction(config,
                                                             train_mode,
                                                             suffix,
                                                             persist_output=True,
                                                             cache_output=True,
                                                             load_persisted_output=True)

        sklearn_preproc = preprocessing_fillna((features, features_valid), config, train_mode, suffix)
    else:
        features = feature_extraction(config,
                                             train_mode,
                                             suffix,
                                             cache_output=True)
        sklearn_preproc = preprocessing_fillna(features, config, train_mode, suffix)

    sklearn_clf = classifier_sklearn(sklearn_preproc,
                                            ClassifierClass,
                                            full_config,
                                            clf_name,
                                            train_mode,
                                            suffix,
                                            normalize)
    return sklearn_clf


PIPELINES = {'lightGBM': lightGBM,
             'lightGBM_stacking': lightGBM_stacking,
             'XGBoost': xgboost,
             'log_reg': {'train': partial(sklearn_main,
                                          ClassifierClass=LogisticRegression,
                                          clf_name='logistic_regression',
                                          train_mode=True,
                                          normalize=True),
                         'inference': partial(sklearn_main,
                                              ClassifierClass=LogisticRegression,
                                              clf_name='logistic_regression',
                                              train_mode=False,
                                              normalize=True)
                         },
             'svc': {'train': partial(sklearn_main,
                                      ClassifierClass=SVC,
                                      clf_name='svc',
                                      train_mode=True,
                                      normalize=True),
                     'inference': partial(sklearn_main,
                                          ClassifierClass=SVC,
                                          clf_name='svc',
                                          train_mode=False,
                                          normalize=True)
                     }
             }


## manage pipeline

In [14]:
#DEV_SAMPLE_SIZE = 1000

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

def _read_data(dev_mode, read_train=True, read_test=False):
    print('Reading data...')
    if dev_mode:
        nrows = DEV_SAMPLE_SIZE
        #nrows = None
        print('running in "dev-mode". Sample size is: {}'.format(nrows))
    else:
        nrows = None

    raw_data = {}

    if read_train:
        #raw_data['application_train'] = pd.read_csv('../input/application_train.csv', nrows=nrows)
        raw_data['application_train'] = pd.read_csv('../input/application_train_v2.csv', nrows=nrows)
        print(raw_data['application_train'].shape)

    if read_test:
        #raw_data['application_test'] = pd.read_csv('../input/application_test.csv', nrows=nrows)
        raw_data['application_test'] = pd.read_csv('../input/application_test_v2.csv', nrows=nrows)
    
    raw_data['bureau'] = pd.read_csv('../input/bureau.csv', nrows=nrows)
    raw_data['credit_card_balance'] = pd.read_csv('../input/credit_card_balance.csv', nrows=nrows)
    raw_data['pos_cash_balance'] = pd.read_csv('../input/POS_CASH_balance.csv', nrows=nrows)
    raw_data['previous_application'] = pd.read_csv('../input/previous_application.csv', nrows=nrows)
    raw_data['bureau_balance'] = pd.read_csv('../input/bureau_balance.csv', nrows=nrows)
    raw_data['installments_payments'] = pd.read_csv('../input/installments_payments.csv', nrows=nrows)
    return AttrDict(raw_data)

def _get_fold_generator(target_values):
    if params["stratified_cv"]:
        cv = StratifiedKFold(n_splits=params["n_cv_splits"], shuffle=True, random_state=RANDOM_SEED)
        cv.get_n_splits(target_values)
        fold_generator = cv.split(target_values, target_values)
    else:
        cv = KFold(n_splits=params["n_cv_splits"], shuffle=True, random_state=RANDOM_SEED)
        fold_generator = cv.split(target_values)
    return fold_generator

def train(pipeline_name, dev_mode):
    logger.info('TRAINING')
    if bool(params["clean_experiment_directory_before_training"]) and os.path.isdir(params["experiment_directory"]):
        logger.info('Cleaning experiment_directory...')
        shutil.rmtree(params["experiment_directory"])

    tables = _read_data(dev_mode, read_train=True, read_test=False)

    logger.info('Shuffling and splitting into train and test...')
    train_data_split, valid_data_split = train_test_split(tables.application_train,
                                                          test_size=0.2,
                                                          random_state=RANDOM_SEED,
                                                          shuffle=1)

    logger.info('Target mean in train: {}'.format(train_data_split[TARGET_COLUMNS].mean()))
    logger.info('Target mean in valid: {}'.format(valid_data_split[TARGET_COLUMNS].mean()))
    logger.info('Train shape: {}'.format(train_data_split.shape))
    logger.info('Valid shape: {}'.format(valid_data_split.shape))

    train_data = {'application': {'X': train_data_split.drop(TARGET_COLUMNS, axis=1),
                                  'y': train_data_split[TARGET_COLUMNS].values.reshape(-1),
                                  'X_valid': valid_data_split.drop(TARGET_COLUMNS, axis=1),
                                  'y_valid': valid_data_split[TARGET_COLUMNS].values.reshape(-1)
                                  },
                  'bureau_balance': {'X': tables.bureau_balance},
                  'bureau': {'X': tables.bureau},
                  'credit_card_balance': {'X': tables.credit_card_balance},
                  'installments_payments': {'X': tables.installments_payments},
                  'pos_cash_balance': {'X': tables.pos_cash_balance},
                  'previous_application': {'X': tables.previous_application},
                  }

    pipeline = PIPELINES[pipeline_name](config=SOLUTION_CONFIG, train_mode=True)
    pipeline.clean_cache()
    logger.info('Start pipeline fit and transform')
    pipeline.fit_transform(train_data)
    pipeline.clean_cache()


In [15]:
#train(pipeline_name = 'lightGBM', dev_mode = False)
#train(pipeline_name = 'XGBoost', dev_mode = True)
#train(pipeline_name = 'log_reg', dev_mode = True)

In [16]:
def make_submission(submission_filepath):
    logger.info('making Kaggle submit...')
    os.system('kaggle competitions submit -c home-credit-default-risk -f {} -m {}'
              .format(submission_filepath, params["kaggle_message"]))

def predict(pipeline_name, dev_mode, submit_predictions):
    logger.info('PREDICTION')

    tables = _read_data(dev_mode, read_train=False, read_test=True)

    test_data = {'application': {'X': tables.application_test,
                                 'y': None,
                                 },
                 'bureau_balance': {'X': tables.bureau_balance},
                 'bureau': {'X': tables.bureau},
                 'credit_card_balance': {'X': tables.credit_card_balance},
                 'installments_payments': {'X': tables.installments_payments},
                 'pos_cash_balance': {'X': tables.pos_cash_balance},
                 'previous_application': {'X': tables.previous_application},
                 }

    pipeline = PIPELINES[pipeline_name](config=SOLUTION_CONFIG, train_mode=False)

    pipeline.clean_cache()
    logger.info('Start pipeline transform')
    output = pipeline.transform(test_data)
    pipeline.clean_cache()
    y_pred = output['prediction']

    #if not dev_mode:
    logger.info('creating submission file...')
    submission = create_submission(tables.application_test, y_pred)

    logger.info('verifying submission...')
    sample_submission = pd.read_csv(params["sample_submission_filepath"])
    verify_submission(submission, sample_submission)

    #submission_filepath = os.path.join(params["experiment_directory"], 'submission_%s.csv' % RANDOM_SEED)
    submission_filepath = os.path.join('../output/', 'submission_addval_%s.csv' % RANDOM_SEED)
    
    submission.to_csv(submission_filepath, index=None, encoding='utf-8')
    logger.info('submission persisted to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission.head()))

    if submit_predictions and params["kaggle_api"]:
        make_submission(submission_filepath)

In [17]:
#RANDOM_SEED = 1
#train(pipeline_name = 'lightGBM', dev_mode = False)
#predict(pipeline_name = 'lightGBM', dev_mode = False,submit_predictions = True)

In [18]:
import gc
gc.collect()

0

In [19]:
import gc

for RANDOM_SEED in range(7,10):
    gc.collect()
    train(pipeline_name = 'lightGBM', dev_mode = False)
    predict(pipeline_name = 'lightGBM', dev_mode = False,submit_predictions = True)

2018-08-29 15:40:45 steppy >>> TRAINING
2018-08-29 15:40:45 steppy >>> Cleaning experiment_directory...
Reading data...
(307511, 125)
2018-08-29 15:41:19 steppy >>> Shuffling and splitting into train and test...
2018-08-29 15:41:19 steppy >>> Target mean in train: TARGET    0.080697
dtype: float64
2018-08-29 15:41:19 steppy >>> Target mean in valid: TARGET    0.080858
dtype: float64
2018-08-29 15:41:19 steppy >>> Train shape: (246008, 125)
2018-08-29 15:41:19 steppy >>> Valid shape: (61503, 125)
2018-08-29 15:41:19 steppy >>> initializing Step application_cleaning...
2018-08-29 15:41:19 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 15:41:19 steppy >>> done: initializing experiment directories
2018-08-29 15:41:19 steppy >>> Step application_cleaning initialized
2018-08-29 15:41:19 steppy >>> initializing Step application_cleaning_valid...
2018-08-29 15:41:19 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 15:41:19 steppy >>> done: 

2018-08-29 15:41:19 steppy >>> initializing Step application_groupby_agg...
2018-08-29 15:41:19 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 15:41:19 steppy >>> done: initializing experiment directories
2018-08-29 15:41:19 steppy >>> Step application_groupby_agg initialized
2018-08-29 15:41:19 steppy >>> initializing Step application_groupby_agg_valid...
2018-08-29 15:41:19 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 15:41:19 steppy >>> done: initializing experiment directories
2018-08-29 15:41:19 steppy >>> Step application_groupby_agg_valid initialized
2018-08-29 15:41:19 steppy >>> initializing Step bureau_groupby_agg...
2018-08-29 15:41:19 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 15:41:19 steppy >>> done: initializing experiment directories
2018-08-29 15:41:19 steppy >>> Step bureau_groupby_agg initialized
2018-08-29 15:41:19 steppy >>> initializing Step bureau_agg_merge...
2018-08-29 15:

  r = func(a, **kwargs)


2018-08-29 15:41:20 steppy >>> Step application_hand_crafted, persisting transformer to the ../steppy/transformers/application_hand_crafted
2018-08-29 15:41:20 steppy >>> Step application_cleaning, adapting inputs...
2018-08-29 15:41:20 steppy >>> Step application_cleaning, loading transformer from the ../steppy/transformers/application_cleaning
2018-08-29 15:41:20 steppy >>> Step application_cleaning, transforming...
2018-08-29 15:41:20 steppy >>> Step application_groupby_agg, adapting inputs...
2018-08-29 15:41:20 steppy >>> Step application_groupby_agg, fitting and transforming...
2018-08-29 15:42:59 steppy >>> Step application_groupby_agg, persisting transformer to the ../steppy/transformers/application_groupby_agg
2018-08-29 15:42:59 steppy >>> Step bureau_cleaning, adapting inputs...
2018-08-29 15:42:59 steppy >>> Step bureau_cleaning, fitting and transforming...
2018-08-29 15:43:00 steppy >>> Step bureau_cleaning, persisting transformer to the ../steppy/transformers/bureau_clean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


2018-08-29 15:43:02 steppy >>> Step bureau_hand_crafted, persisting transformer to the ../steppy/transformers/bureau_hand_crafted
2018-08-29 15:43:02 steppy >>> Step bureau_hand_crafted_merge, adapting inputs...
2018-08-29 15:43:02 steppy >>> Step bureau_hand_crafted_merge, fitting and transforming...
2018-08-29 15:43:03 steppy >>> Step bureau_hand_crafted_merge, persisting transformer to the ../steppy/transformers/bureau_hand_crafted_merge
2018-08-29 15:43:03 steppy >>> Step bureau_cleaning, adapting inputs...
2018-08-29 15:43:03 steppy >>> Step bureau_cleaning, loading transformer from the ../steppy/transformers/bureau_cleaning
2018-08-29 15:43:03 steppy >>> Step bureau_cleaning, transforming...
2018-08-29 15:43:03 steppy >>> Step bureau_groupby_agg, adapting inputs...
2018-08-29 15:43:03 steppy >>> Step bureau_groupby_agg, fitting and transforming...
2018-08-29 15:43:13 steppy >>> Step bureau_groupby_agg, persisting transformer to the ../steppy/transformers/bureau_groupby_agg
2018-0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


2018-08-29 15:44:09 steppy >>> Step credit_card_balance_hand_crafted, persisting transformer to the ../steppy/transformers/credit_card_balance_hand_crafted
2018-08-29 15:44:09 steppy >>> Step credit_card_balance_hand_crafted_merge, adapting inputs...
2018-08-29 15:44:09 steppy >>> Step credit_card_balance_hand_crafted_merge, fitting and transforming...
2018-08-29 15:44:10 steppy >>> Step credit_card_balance_hand_crafted_merge, persisting transformer to the ../steppy/transformers/credit_card_balance_hand_crafted_merge
2018-08-29 15:44:10 steppy >>> Step credit_card_balance_cleaning, adapting inputs...
2018-08-29 15:44:10 steppy >>> Step credit_card_balance_cleaning, loading transformer from the ../steppy/transformers/credit_card_balance_cleaning
2018-08-29 15:44:10 steppy >>> Step credit_card_balance_cleaning, transforming...
2018-08-29 15:44:10 steppy >>> Step credit_card_balance_groupby_agg, adapting inputs...
2018-08-29 15:44:10 steppy >>> Step credit_card_balance_groupby_agg, fittin

  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
 

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out

2018-08-29 16:16:21 steppy >>> Step installment_payments_hand_crafted, persisting transformer to the ../steppy/transformers/installment_payments_hand_crafted
2018-08-29 16:16:34 steppy >>> Step installment_payments_hand_crafted_merge, adapting inputs...
2018-08-29 16:16:34 steppy >>> Step installment_payments_hand_crafted_merge, fitting and transforming...
2018-08-29 16:16:36 steppy >>> Step installment_payments_hand_crafted_merge, persisting transformer to the ../steppy/transformers/installment_payments_hand_crafted_merge
2018-08-29 16:16:36 steppy >>> Step installments_payments_groupby_agg, adapting inputs...
2018-08-29 16:16:36 steppy >>> Step installments_payments_groupby_agg, fitting and transforming...
2018-08-29 16:16:47 steppy >>> Step installments_payments_groupby_agg, persisting transformer to the ../steppy/transformers/installments_payments_groupby_agg
2018-08-29 16:16:47 steppy >>> Step installments_payments_agg_merge, adapting inputs...
2018-08-29 16:16:47 steppy >>> Step 

100%|██████████| 4/4.0 [15:22<00:00, 230.60s/it]


2018-08-29 16:33:24 steppy >>> Step pos_cash_balance_hand_crafted, persisting transformer to the ../steppy/transformers/pos_cash_balance_hand_crafted
2018-08-29 16:34:30 steppy >>> Step pos_cash_balance_hand_crafted_merge, adapting inputs...
2018-08-29 16:34:30 steppy >>> Step pos_cash_balance_hand_crafted_merge, fitting and transforming...
2018-08-29 16:34:34 steppy >>> Step pos_cash_balance_hand_crafted_merge, persisting transformer to the ../steppy/transformers/pos_cash_balance_hand_crafted_merge
2018-08-29 16:34:34 steppy >>> Step pos_cash_balance_groupby_agg, adapting inputs...
2018-08-29 16:34:34 steppy >>> Step pos_cash_balance_groupby_agg, fitting and transforming...
2018-08-29 16:34:39 steppy >>> Step pos_cash_balance_groupby_agg, persisting transformer to the ../steppy/transformers/pos_cash_balance_groupby_agg
2018-08-29 16:34:39 steppy >>> Step pos_cash_balance_agg_merge, adapting inputs...
2018-08-29 16:34:39 steppy >>> Step pos_cash_balance_agg_merge, fitting and transform

2018-08-29 16:36:51 steppy >>> Step credit_card_balance_hand_crafted, loading transformer from the ../steppy/transformers/credit_card_balance_hand_crafted
2018-08-29 16:36:51 steppy >>> Step credit_card_balance_hand_crafted, transforming...
2018-08-29 16:36:51 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, adapting inputs...
2018-08-29 16:36:51 steppy >>> copying transformer from ../steppy/transformers/credit_card_balance_hand_crafted_merge to ../steppy/transformers/credit_card_balance_hand_crafted_merge_valid
2018-08-29 16:36:51 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, loading transformer from the ../steppy/transformers/credit_card_balance_hand_crafted_merge_valid
2018-08-29 16:36:51 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, transforming...
2018-08-29 16:36:52 steppy >>> Step credit_card_balance_cleaning, adapting inputs...
2018-08-29 16:36:52 steppy >>> Step credit_card_balance_cleaning, loading transformer from the ../steppy/tr

2018-08-29 16:37:58 steppy >>> Step feature_joiner_valid, adapting inputs...
2018-08-29 16:37:58 steppy >>> copying transformer from ../steppy/transformers/feature_joiner to ../steppy/transformers/feature_joiner_valid
2018-08-29 16:37:58 steppy >>> Step feature_joiner_valid, loading transformer from the ../steppy/transformers/feature_joiner_valid
2018-08-29 16:37:58 steppy >>> Step feature_joiner_valid, transforming...
2018-08-29 16:37:59 steppy >>> Step feature_joiner_valid, caching outputs to the ../steppy/cache/feature_joiner_valid
2018-08-29 16:37:59 steppy >>> Step feature_joiner_valid, persisting output to the ../steppy/cache/feature_joiner_valid
2018-08-29 16:38:01 steppy >>> Step feature_joiner_valid, persisting outputs to the ../steppy/outputs/feature_joiner_valid
2018-08-29 16:38:01 steppy >>> Step feature_joiner_valid, persisting output to the ../steppy/outputs/feature_joiner_valid
2018-08-29 16:38:06 steppy >>> Step light_gbm, adapting inputs...
2018-08-29 16:38:06 steppy >



Training until validation scores don't improve for 100 rounds.
[10]	data_train's auc: 0.709932	data_valid's auc: 0.709421
[20]	data_train's auc: 0.720138	data_valid's auc: 0.716462
[30]	data_train's auc: 0.724383	data_valid's auc: 0.720824
[40]	data_train's auc: 0.729759	data_valid's auc: 0.725991
[50]	data_train's auc: 0.73066	data_valid's auc: 0.726567
[60]	data_train's auc: 0.735689	data_valid's auc: 0.730621
[70]	data_train's auc: 0.737813	data_valid's auc: 0.732127
[80]	data_train's auc: 0.741587	data_valid's auc: 0.735357
[90]	data_train's auc: 0.743677	data_valid's auc: 0.736762
[100]	data_train's auc: 0.747439	data_valid's auc: 0.739686
[110]	data_train's auc: 0.749797	data_valid's auc: 0.741401
[120]	data_train's auc: 0.753321	data_valid's auc: 0.744552
[130]	data_train's auc: 0.756234	data_valid's auc: 0.746893
[140]	data_train's auc: 0.758725	data_valid's auc: 0.749006
[150]	data_train's auc: 0.761176	data_valid's auc: 0.751037
[160]	data_train's auc: 0.763667	data_valid's a

[1370]	data_train's auc: 0.849351	data_valid's auc: 0.793314
[1380]	data_train's auc: 0.849697	data_valid's auc: 0.793339
[1390]	data_train's auc: 0.850042	data_valid's auc: 0.793324
[1400]	data_train's auc: 0.850345	data_valid's auc: 0.793356
[1410]	data_train's auc: 0.850703	data_valid's auc: 0.793416
[1420]	data_train's auc: 0.851031	data_valid's auc: 0.793415
[1430]	data_train's auc: 0.851401	data_valid's auc: 0.793431
[1440]	data_train's auc: 0.851759	data_valid's auc: 0.793456
[1450]	data_train's auc: 0.852089	data_valid's auc: 0.793509
[1460]	data_train's auc: 0.852436	data_valid's auc: 0.793511
[1470]	data_train's auc: 0.852797	data_valid's auc: 0.793519
[1480]	data_train's auc: 0.853156	data_valid's auc: 0.793559
[1490]	data_train's auc: 0.853495	data_valid's auc: 0.793606
[1500]	data_train's auc: 0.853835	data_valid's auc: 0.793619
[1510]	data_train's auc: 0.854165	data_valid's auc: 0.793651
[1520]	data_train's auc: 0.854494	data_valid's auc: 0.793718
[1530]	data_train's auc:

2018-08-29 16:53:05 steppy >>> done: initializing experiment directories
2018-08-29 16:53:05 steppy >>> Step credit_card_balance_hand_crafted_merge initialized
2018-08-29 16:53:05 steppy >>> initializing Step pos_cash_balance_hand_crafted...
2018-08-29 16:53:05 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 16:53:05 steppy >>> done: initializing experiment directories
2018-08-29 16:53:05 steppy >>> Step pos_cash_balance_hand_crafted initialized
2018-08-29 16:53:05 steppy >>> initializing Step pos_cash_balance_hand_crafted_merge...
2018-08-29 16:53:05 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 16:53:05 steppy >>> done: initializing experiment directories
2018-08-29 16:53:05 steppy >>> Step pos_cash_balance_hand_crafted_merge initialized
2018-08-29 16:53:05 steppy >>> initializing Step previous_application_cleaning...
2018-08-29 16:53:05 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 16:53:05 steppy >

2018-08-29 16:53:05 steppy >>> Step application_groupby_agg, loading transformer from the ../steppy/transformers/application_groupby_agg
2018-08-29 16:53:06 steppy >>> Step application_groupby_agg, transforming...
2018-08-29 16:53:31 steppy >>> Step bureau_cleaning, adapting inputs...
2018-08-29 16:53:31 steppy >>> Step bureau_cleaning, loading transformer from the ../steppy/transformers/bureau_cleaning
2018-08-29 16:53:31 steppy >>> Step bureau_cleaning, transforming...
2018-08-29 16:53:31 steppy >>> Step bureau_hand_crafted, adapting inputs...
2018-08-29 16:53:31 steppy >>> Step bureau_hand_crafted, loading transformer from the ../steppy/transformers/bureau_hand_crafted
2018-08-29 16:53:31 steppy >>> Step bureau_hand_crafted, transforming...
2018-08-29 16:53:31 steppy >>> Step bureau_hand_crafted_merge, adapting inputs...
2018-08-29 16:53:31 steppy >>> Step bureau_hand_crafted_merge, loading transformer from the ../steppy/transformers/bureau_hand_crafted_merge
2018-08-29 16:53:31 ste

2018-08-29 16:54:22 steppy >>> Step previous_applications_agg_merge, loading transformer from the ../steppy/transformers/previous_applications_agg_merge
2018-08-29 16:54:22 steppy >>> Step previous_applications_agg_merge, transforming...
2018-08-29 16:54:22 steppy >>> Step categorical_encoder, adapting inputs...
2018-08-29 16:54:22 steppy >>> Step categorical_encoder, loading transformer from the ../steppy/transformers/categorical_encoder
2018-08-29 16:54:22 steppy >>> Step categorical_encoder, transforming...
2018-08-29 16:54:24 steppy >>> Step feature_joiner, adapting inputs...
2018-08-29 16:54:24 steppy >>> Step feature_joiner, loading transformer from the ../steppy/transformers/feature_joiner
2018-08-29 16:54:24 steppy >>> Step feature_joiner, transforming...
2018-08-29 16:54:25 steppy >>> Step feature_joiner, caching outputs to the ../steppy/cache/feature_joiner
2018-08-29 16:54:25 steppy >>> Step feature_joiner, persisting output to the ../steppy/cache/feature_joiner
2018-08-29 1

2018-08-29 16:55:03 steppy >>> done: initializing experiment directories
2018-08-29 16:55:03 steppy >>> Step previous_applications_hand_crafted_merge_valid initialized
2018-08-29 16:55:03 steppy >>> initializing Step installment_payments_hand_crafted...
2018-08-29 16:55:03 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 16:55:03 steppy >>> done: initializing experiment directories
2018-08-29 16:55:03 steppy >>> Step installment_payments_hand_crafted initialized
2018-08-29 16:55:03 steppy >>> initializing Step installment_payments_hand_crafted_merge...
2018-08-29 16:55:03 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 16:55:03 steppy >>> done: initializing experiment directories
2018-08-29 16:55:03 steppy >>> Step installment_payments_hand_crafted_merge initialized
2018-08-29 16:55:03 steppy >>> initializing Step installment_payments_hand_crafted_merge_valid...
2018-08-29 16:55:03 steppy >>> initializing experiment directories under

2018-08-29 16:55:03 steppy >>> done: initializing experiment directories
2018-08-29 16:55:03 steppy >>> Step feature_joiner_valid initialized
2018-08-29 16:55:03 steppy >>> initializing LightGBM...
2018-08-29 16:55:03 steppy >>> initializing Step light_gbm...
2018-08-29 16:55:03 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 16:55:03 steppy >>> done: initializing experiment directories
2018-08-29 16:55:03 steppy >>> Step light_gbm initialized
2018-08-29 16:55:03 steppy >>> cleaning cache...
2018-08-29 16:55:03 steppy >>> cleaning cache done
2018-08-29 16:55:03 steppy >>> Start pipeline fit and transform
2018-08-29 16:55:03 steppy >>> Step application_cleaning, adapting inputs...
2018-08-29 16:55:03 steppy >>> Step application_cleaning, fitting and transforming...
2018-08-29 16:55:03 steppy >>> Step application_cleaning, persisting transformer to the ../steppy/transformers/application_cleaning
2018-08-29 16:55:03 steppy >>> Step application_hand_crafted, adap

  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
 

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
 

2018-08-29 17:30:03 steppy >>> Step installment_payments_hand_crafted, persisting transformer to the ../steppy/transformers/installment_payments_hand_crafted
2018-08-29 17:30:20 steppy >>> Step installment_payments_hand_crafted_merge, adapting inputs...
2018-08-29 17:30:20 steppy >>> Step installment_payments_hand_crafted_merge, fitting and transforming...
2018-08-29 17:30:22 steppy >>> Step installment_payments_hand_crafted_merge, persisting transformer to the ../steppy/transformers/installment_payments_hand_crafted_merge
2018-08-29 17:30:22 steppy >>> Step installments_payments_groupby_agg, adapting inputs...
2018-08-29 17:30:22 steppy >>> Step installments_payments_groupby_agg, fitting and transforming...
2018-08-29 17:30:32 steppy >>> Step installments_payments_groupby_agg, persisting transformer to the ../steppy/transformers/installments_payments_groupby_agg
2018-08-29 17:30:32 steppy >>> Step installments_payments_agg_merge, adapting inputs...
2018-08-29 17:30:32 steppy >>> Step 

100%|██████████| 4/4.0 [15:32<00:00, 233.17s/it]


2018-08-29 17:47:20 steppy >>> Step pos_cash_balance_hand_crafted, persisting transformer to the ../steppy/transformers/pos_cash_balance_hand_crafted
2018-08-29 17:48:26 steppy >>> Step pos_cash_balance_hand_crafted_merge, adapting inputs...
2018-08-29 17:48:26 steppy >>> Step pos_cash_balance_hand_crafted_merge, fitting and transforming...
2018-08-29 17:48:30 steppy >>> Step pos_cash_balance_hand_crafted_merge, persisting transformer to the ../steppy/transformers/pos_cash_balance_hand_crafted_merge
2018-08-29 17:48:30 steppy >>> Step pos_cash_balance_groupby_agg, adapting inputs...
2018-08-29 17:48:30 steppy >>> Step pos_cash_balance_groupby_agg, fitting and transforming...
2018-08-29 17:48:35 steppy >>> Step pos_cash_balance_groupby_agg, persisting transformer to the ../steppy/transformers/pos_cash_balance_groupby_agg
2018-08-29 17:48:35 steppy >>> Step pos_cash_balance_agg_merge, adapting inputs...
2018-08-29 17:48:35 steppy >>> Step pos_cash_balance_agg_merge, fitting and transform

2018-08-29 17:50:41 steppy >>> Step credit_card_balance_hand_crafted, loading transformer from the ../steppy/transformers/credit_card_balance_hand_crafted
2018-08-29 17:50:41 steppy >>> Step credit_card_balance_hand_crafted, transforming...
2018-08-29 17:50:41 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, adapting inputs...
2018-08-29 17:50:41 steppy >>> copying transformer from ../steppy/transformers/credit_card_balance_hand_crafted_merge to ../steppy/transformers/credit_card_balance_hand_crafted_merge_valid
2018-08-29 17:50:41 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, loading transformer from the ../steppy/transformers/credit_card_balance_hand_crafted_merge_valid
2018-08-29 17:50:41 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, transforming...
2018-08-29 17:50:41 steppy >>> Step credit_card_balance_cleaning, adapting inputs...
2018-08-29 17:50:41 steppy >>> Step credit_card_balance_cleaning, loading transformer from the ../steppy/tr

2018-08-29 17:51:48 steppy >>> Step feature_joiner_valid, adapting inputs...
2018-08-29 17:51:48 steppy >>> copying transformer from ../steppy/transformers/feature_joiner to ../steppy/transformers/feature_joiner_valid
2018-08-29 17:51:48 steppy >>> Step feature_joiner_valid, loading transformer from the ../steppy/transformers/feature_joiner_valid
2018-08-29 17:51:48 steppy >>> Step feature_joiner_valid, transforming...
2018-08-29 17:51:49 steppy >>> Step feature_joiner_valid, caching outputs to the ../steppy/cache/feature_joiner_valid
2018-08-29 17:51:49 steppy >>> Step feature_joiner_valid, persisting output to the ../steppy/cache/feature_joiner_valid
2018-08-29 17:51:51 steppy >>> Step feature_joiner_valid, persisting outputs to the ../steppy/outputs/feature_joiner_valid
2018-08-29 17:51:51 steppy >>> Step feature_joiner_valid, persisting output to the ../steppy/outputs/feature_joiner_valid
2018-08-29 17:51:56 steppy >>> Step light_gbm, adapting inputs...
2018-08-29 17:51:56 steppy >

[1150]	data_train's auc: 0.841731	data_valid's auc: 0.788125
[1160]	data_train's auc: 0.842115	data_valid's auc: 0.788178
[1170]	data_train's auc: 0.842518	data_valid's auc: 0.788217
[1180]	data_train's auc: 0.842908	data_valid's auc: 0.788282
[1190]	data_train's auc: 0.843292	data_valid's auc: 0.788378
[1200]	data_train's auc: 0.843663	data_valid's auc: 0.788434
[1210]	data_train's auc: 0.844049	data_valid's auc: 0.788475
[1220]	data_train's auc: 0.844436	data_valid's auc: 0.788528
[1230]	data_train's auc: 0.844817	data_valid's auc: 0.78863
[1240]	data_train's auc: 0.845205	data_valid's auc: 0.788669
[1250]	data_train's auc: 0.845606	data_valid's auc: 0.788776
[1260]	data_train's auc: 0.845975	data_valid's auc: 0.788827
[1270]	data_train's auc: 0.846365	data_valid's auc: 0.78881
[1280]	data_train's auc: 0.84675	data_valid's auc: 0.788862
[1290]	data_train's auc: 0.847128	data_valid's auc: 0.788934
[1300]	data_train's auc: 0.847485	data_valid's auc: 0.789007
[1310]	data_train's auc: 0.

[2500]	data_train's auc: 0.882166	data_valid's auc: 0.79154
[2510]	data_train's auc: 0.882382	data_valid's auc: 0.791569
[2520]	data_train's auc: 0.882637	data_valid's auc: 0.791575
[2530]	data_train's auc: 0.882871	data_valid's auc: 0.791577
[2540]	data_train's auc: 0.883096	data_valid's auc: 0.791582
[2550]	data_train's auc: 0.883323	data_valid's auc: 0.791596
[2560]	data_train's auc: 0.883545	data_valid's auc: 0.791605
[2570]	data_train's auc: 0.883779	data_valid's auc: 0.791612
[2580]	data_train's auc: 0.884006	data_valid's auc: 0.791632
[2590]	data_train's auc: 0.884229	data_valid's auc: 0.791632
[2600]	data_train's auc: 0.88446	data_valid's auc: 0.791633
[2610]	data_train's auc: 0.884701	data_valid's auc: 0.791631
[2620]	data_train's auc: 0.884892	data_valid's auc: 0.791639
[2630]	data_train's auc: 0.885115	data_valid's auc: 0.791655
[2640]	data_train's auc: 0.885341	data_valid's auc: 0.791681
[2650]	data_train's auc: 0.885577	data_valid's auc: 0.791675
[2660]	data_train's auc: 0

2018-08-29 18:12:14 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 18:12:14 steppy >>> done: initializing experiment directories
2018-08-29 18:12:14 steppy >>> Step previous_application_cleaning initialized
2018-08-29 18:12:14 steppy >>> initializing Step previous_applications_hand_crafted...
2018-08-29 18:12:14 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 18:12:14 steppy >>> done: initializing experiment directories
2018-08-29 18:12:14 steppy >>> Step previous_applications_hand_crafted initialized
2018-08-29 18:12:14 steppy >>> initializing Step previous_applications_hand_crafted_merge...
2018-08-29 18:12:14 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 18:12:14 steppy >>> done: initializing experiment directories
2018-08-29 18:12:14 steppy >>> Step previous_applications_hand_crafted_merge initialized
2018-08-29 18:12:14 steppy >>> initializing Step installment_payments_hand_crafted...
2018-08-29 18

2018-08-29 18:12:41 steppy >>> Step bureau_hand_crafted_merge, transforming...
2018-08-29 18:12:41 steppy >>> Step bureau_cleaning, adapting inputs...
2018-08-29 18:12:41 steppy >>> Step bureau_cleaning, loading transformer from the ../steppy/transformers/bureau_cleaning
2018-08-29 18:12:41 steppy >>> Step bureau_cleaning, transforming...
2018-08-29 18:12:41 steppy >>> Step bureau_groupby_agg, adapting inputs...
2018-08-29 18:12:41 steppy >>> Step bureau_groupby_agg, loading transformer from the ../steppy/transformers/bureau_groupby_agg
2018-08-29 18:12:42 steppy >>> Step bureau_groupby_agg, transforming...
2018-08-29 18:12:42 steppy >>> Step bureau_agg_merge, adapting inputs...
2018-08-29 18:12:42 steppy >>> Step bureau_agg_merge, loading transformer from the ../steppy/transformers/bureau_agg_merge
2018-08-29 18:12:42 steppy >>> Step bureau_agg_merge, transforming...
2018-08-29 18:12:42 steppy >>> Step credit_card_balance_cleaning, adapting inputs...
2018-08-29 18:12:42 steppy >>> Ste

2018-08-29 18:13:36 steppy >>> Step light_gbm, adapting inputs...
2018-08-29 18:13:36 steppy >>> Step light_gbm, loading transformer from the ../steppy/transformers/light_gbm
2018-08-29 18:13:37 steppy >>> Step light_gbm, transforming...
2018-08-29 18:13:39 steppy >>> cleaning cache...
2018-08-29 18:13:39 steppy >>> removing ../steppy/cache/feature_joiner
2018-08-29 18:13:39 steppy >>> cleaning cache done
2018-08-29 18:13:39 steppy >>> creating submission file...
2018-08-29 18:13:39 steppy >>> verifying submission...
2018-08-29 18:13:39 steppy >>> submission persisted to ../output/submission_addval_8.csv
2018-08-29 18:13:39 steppy >>> submission head 

   SK_ID_CURR    TARGET
0      100001  0.036106
1      100005  0.129785
2      100013  0.035736
3      100028  0.040761
4      100038  0.159314
2018-08-29 18:13:41 steppy >>> TRAINING
2018-08-29 18:13:41 steppy >>> Cleaning experiment_directory...
Reading data...
(307511, 125)
2018-08-29 18:14:13 steppy >>> Shuffling and splitting into t

2018-08-29 18:14:13 steppy >>> done: initializing experiment directories
2018-08-29 18:14:13 steppy >>> Step installment_payments_hand_crafted_merge_valid initialized
2018-08-29 18:14:13 steppy >>> initializing Step application_cleaning...
2018-08-29 18:14:13 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 18:14:13 steppy >>> done: initializing experiment directories
2018-08-29 18:14:13 steppy >>> Step application_cleaning initialized
2018-08-29 18:14:13 steppy >>> initializing Step application_cleaning_valid...
2018-08-29 18:14:13 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 18:14:13 steppy >>> done: initializing experiment directories
2018-08-29 18:14:13 steppy >>> Step application_cleaning_valid initialized
2018-08-29 18:14:13 steppy >>> initializing Step application_groupby_agg...
2018-08-29 18:14:13 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 18:14:13 steppy >>> done: initializing experiment di

2018-08-29 18:14:13 steppy >>> Step application_hand_crafted, fitting and transforming...
2018-08-29 18:14:14 steppy >>> Step application_hand_crafted, persisting transformer to the ../steppy/transformers/application_hand_crafted
2018-08-29 18:14:14 steppy >>> Step application_cleaning, adapting inputs...
2018-08-29 18:14:14 steppy >>> Step application_cleaning, loading transformer from the ../steppy/transformers/application_cleaning
2018-08-29 18:14:14 steppy >>> Step application_cleaning, transforming...
2018-08-29 18:14:14 steppy >>> Step application_groupby_agg, adapting inputs...
2018-08-29 18:14:14 steppy >>> Step application_groupby_agg, fitting and transforming...
2018-08-29 18:15:52 steppy >>> Step application_groupby_agg, persisting transformer to the ../steppy/transformers/application_groupby_agg
2018-08-29 18:15:52 steppy >>> Step bureau_cleaning, adapting inputs...
2018-08-29 18:15:52 steppy >>> Step bureau_cleaning, fitting and transforming...
2018-08-29 18:15:52 steppy >

  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
 

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
 

2018-08-29 18:49:15 steppy >>> Step installment_payments_hand_crafted, persisting transformer to the ../steppy/transformers/installment_payments_hand_crafted
2018-08-29 18:49:33 steppy >>> Step installment_payments_hand_crafted_merge, adapting inputs...
2018-08-29 18:49:33 steppy >>> Step installment_payments_hand_crafted_merge, fitting and transforming...
2018-08-29 18:49:35 steppy >>> Step installment_payments_hand_crafted_merge, persisting transformer to the ../steppy/transformers/installment_payments_hand_crafted_merge
2018-08-29 18:49:35 steppy >>> Step installments_payments_groupby_agg, adapting inputs...
2018-08-29 18:49:35 steppy >>> Step installments_payments_groupby_agg, fitting and transforming...
2018-08-29 18:49:45 steppy >>> Step installments_payments_groupby_agg, persisting transformer to the ../steppy/transformers/installments_payments_groupby_agg
2018-08-29 18:49:46 steppy >>> Step installments_payments_agg_merge, adapting inputs...
2018-08-29 18:49:46 steppy >>> Step 

100%|██████████| 4/4.0 [15:34<00:00, 233.61s/it]


2018-08-29 19:06:35 steppy >>> Step pos_cash_balance_hand_crafted, persisting transformer to the ../steppy/transformers/pos_cash_balance_hand_crafted
2018-08-29 19:07:40 steppy >>> Step pos_cash_balance_hand_crafted_merge, adapting inputs...
2018-08-29 19:07:40 steppy >>> Step pos_cash_balance_hand_crafted_merge, fitting and transforming...
2018-08-29 19:07:44 steppy >>> Step pos_cash_balance_hand_crafted_merge, persisting transformer to the ../steppy/transformers/pos_cash_balance_hand_crafted_merge
2018-08-29 19:07:44 steppy >>> Step pos_cash_balance_groupby_agg, adapting inputs...
2018-08-29 19:07:44 steppy >>> Step pos_cash_balance_groupby_agg, fitting and transforming...
2018-08-29 19:07:49 steppy >>> Step pos_cash_balance_groupby_agg, persisting transformer to the ../steppy/transformers/pos_cash_balance_groupby_agg
2018-08-29 19:07:49 steppy >>> Step pos_cash_balance_agg_merge, adapting inputs...
2018-08-29 19:07:49 steppy >>> Step pos_cash_balance_agg_merge, fitting and transform

2018-08-29 19:10:00 steppy >>> Step credit_card_balance_hand_crafted, loading transformer from the ../steppy/transformers/credit_card_balance_hand_crafted
2018-08-29 19:10:00 steppy >>> Step credit_card_balance_hand_crafted, transforming...
2018-08-29 19:10:00 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, adapting inputs...
2018-08-29 19:10:00 steppy >>> copying transformer from ../steppy/transformers/credit_card_balance_hand_crafted_merge to ../steppy/transformers/credit_card_balance_hand_crafted_merge_valid
2018-08-29 19:10:00 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, loading transformer from the ../steppy/transformers/credit_card_balance_hand_crafted_merge_valid
2018-08-29 19:10:00 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, transforming...
2018-08-29 19:10:01 steppy >>> Step credit_card_balance_cleaning, adapting inputs...
2018-08-29 19:10:01 steppy >>> Step credit_card_balance_cleaning, loading transformer from the ../steppy/tr

2018-08-29 19:11:07 steppy >>> Step feature_joiner_valid, adapting inputs...
2018-08-29 19:11:07 steppy >>> copying transformer from ../steppy/transformers/feature_joiner to ../steppy/transformers/feature_joiner_valid
2018-08-29 19:11:07 steppy >>> Step feature_joiner_valid, loading transformer from the ../steppy/transformers/feature_joiner_valid
2018-08-29 19:11:07 steppy >>> Step feature_joiner_valid, transforming...
2018-08-29 19:11:08 steppy >>> Step feature_joiner_valid, caching outputs to the ../steppy/cache/feature_joiner_valid
2018-08-29 19:11:08 steppy >>> Step feature_joiner_valid, persisting output to the ../steppy/cache/feature_joiner_valid
2018-08-29 19:11:11 steppy >>> Step feature_joiner_valid, persisting outputs to the ../steppy/outputs/feature_joiner_valid
2018-08-29 19:11:11 steppy >>> Step feature_joiner_valid, persisting output to the ../steppy/outputs/feature_joiner_valid
2018-08-29 19:11:17 steppy >>> Step light_gbm, adapting inputs...
2018-08-29 19:11:17 steppy >

[1150]	data_train's auc: 0.841723	data_valid's auc: 0.79023
[1160]	data_train's auc: 0.842136	data_valid's auc: 0.790296
[1170]	data_train's auc: 0.842539	data_valid's auc: 0.790342
[1180]	data_train's auc: 0.842914	data_valid's auc: 0.790404
[1190]	data_train's auc: 0.843326	data_valid's auc: 0.79046
[1200]	data_train's auc: 0.843684	data_valid's auc: 0.790535
[1210]	data_train's auc: 0.844072	data_valid's auc: 0.790625
[1220]	data_train's auc: 0.844461	data_valid's auc: 0.790672
[1230]	data_train's auc: 0.844869	data_valid's auc: 0.790742
[1240]	data_train's auc: 0.845242	data_valid's auc: 0.790777
[1250]	data_train's auc: 0.845634	data_valid's auc: 0.790816
[1260]	data_train's auc: 0.846001	data_valid's auc: 0.790834
[1270]	data_train's auc: 0.846377	data_valid's auc: 0.79084
[1280]	data_train's auc: 0.846733	data_valid's auc: 0.790896
[1290]	data_train's auc: 0.847123	data_valid's auc: 0.790969
[1300]	data_train's auc: 0.847502	data_valid's auc: 0.790988
[1310]	data_train's auc: 0.

2018-08-29 19:25:45 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 19:25:45 steppy >>> done: initializing experiment directories
2018-08-29 19:25:45 steppy >>> Step credit_card_balance_cleaning initialized
2018-08-29 19:25:45 steppy >>> initializing Step credit_card_balance_hand_crafted...
2018-08-29 19:25:45 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 19:25:45 steppy >>> done: initializing experiment directories
2018-08-29 19:25:45 steppy >>> Step credit_card_balance_hand_crafted initialized
2018-08-29 19:25:45 steppy >>> initializing Step credit_card_balance_hand_crafted_merge...
2018-08-29 19:25:45 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 19:25:45 steppy >>> done: initializing experiment directories
2018-08-29 19:25:45 steppy >>> Step credit_card_balance_hand_crafted_merge initialized
2018-08-29 19:25:45 steppy >>> initializing Step pos_cash_balance_hand_crafted...
2018-08-29 19:25:45 steppy

2018-08-29 19:25:45 steppy >>> Step application_cleaning, transforming...
2018-08-29 19:25:45 steppy >>> Step application_hand_crafted, adapting inputs...
2018-08-29 19:25:45 steppy >>> Step application_hand_crafted, loading transformer from the ../steppy/transformers/application_hand_crafted
2018-08-29 19:25:45 steppy >>> Step application_hand_crafted, transforming...
2018-08-29 19:25:45 steppy >>> Step application_cleaning, adapting inputs...
2018-08-29 19:25:45 steppy >>> Step application_cleaning, loading transformer from the ../steppy/transformers/application_cleaning
2018-08-29 19:25:45 steppy >>> Step application_cleaning, transforming...
2018-08-29 19:25:45 steppy >>> Step application_groupby_agg, adapting inputs...
2018-08-29 19:25:45 steppy >>> Step application_groupby_agg, loading transformer from the ../steppy/transformers/application_groupby_agg
2018-08-29 19:25:45 steppy >>> Step application_groupby_agg, transforming...
2018-08-29 19:26:11 steppy >>> Step bureau_cleaning,

2018-08-29 19:27:02 steppy >>> Step previous_application_cleaning, adapting inputs...
2018-08-29 19:27:02 steppy >>> Step previous_application_cleaning, loading transformer from the ../steppy/transformers/previous_application_cleaning
2018-08-29 19:27:02 steppy >>> Step previous_application_cleaning, transforming...
2018-08-29 19:27:02 steppy >>> Step previous_applications_groupby_agg, adapting inputs...
2018-08-29 19:27:02 steppy >>> Step previous_applications_groupby_agg, loading transformer from the ../steppy/transformers/previous_applications_groupby_agg
2018-08-29 19:27:02 steppy >>> Step previous_applications_groupby_agg, transforming...
2018-08-29 19:27:02 steppy >>> Step previous_applications_agg_merge, adapting inputs...
2018-08-29 19:27:02 steppy >>> Step previous_applications_agg_merge, loading transformer from the ../steppy/transformers/previous_applications_agg_merge
2018-08-29 19:27:02 steppy >>> Step previous_applications_agg_merge, transforming...
2018-08-29 19:27:03 st

In [19]:
import gc

for RANDOM_SEED in range(6,9):
    gc.collect()
    train(pipeline_name = 'lightGBM', dev_mode = False)
    predict(pipeline_name = 'lightGBM', dev_mode = False,submit_predictions = True)

2018-08-29 10:34:34 steppy >>> TRAINING
2018-08-29 10:34:34 steppy >>> Cleaning experiment_directory...
Reading data...
(307511, 125)
2018-08-29 10:35:08 steppy >>> Shuffling and splitting into train and test...
2018-08-29 10:35:08 steppy >>> Target mean in train: TARGET    0.080632
dtype: float64
2018-08-29 10:35:08 steppy >>> Target mean in valid: TARGET    0.081118
dtype: float64
2018-08-29 10:35:08 steppy >>> Train shape: (246008, 125)
2018-08-29 10:35:08 steppy >>> Valid shape: (61503, 125)
2018-08-29 10:35:09 steppy >>> initializing Step application_cleaning...
2018-08-29 10:35:09 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 10:35:09 steppy >>> done: initializing experiment directories
2018-08-29 10:35:09 steppy >>> Step application_cleaning initialized
2018-08-29 10:35:09 steppy >>> initializing Step application_cleaning_valid...
2018-08-29 10:35:09 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 10:35:09 steppy >>> done: 

2018-08-29 10:35:09 steppy >>> initializing Step application_groupby_agg...
2018-08-29 10:35:09 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 10:35:09 steppy >>> done: initializing experiment directories
2018-08-29 10:35:09 steppy >>> Step application_groupby_agg initialized
2018-08-29 10:35:09 steppy >>> initializing Step application_groupby_agg_valid...
2018-08-29 10:35:09 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 10:35:09 steppy >>> done: initializing experiment directories
2018-08-29 10:35:09 steppy >>> Step application_groupby_agg_valid initialized
2018-08-29 10:35:09 steppy >>> initializing Step bureau_groupby_agg...
2018-08-29 10:35:09 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 10:35:09 steppy >>> done: initializing experiment directories
2018-08-29 10:35:09 steppy >>> Step bureau_groupby_agg initialized
2018-08-29 10:35:09 steppy >>> initializing Step bureau_agg_merge...
2018-08-29 10:

  r = func(a, **kwargs)


2018-08-29 10:35:10 steppy >>> Step application_hand_crafted, persisting transformer to the ../steppy/transformers/application_hand_crafted
2018-08-29 10:35:10 steppy >>> Step application_cleaning, adapting inputs...
2018-08-29 10:35:10 steppy >>> Step application_cleaning, loading transformer from the ../steppy/transformers/application_cleaning
2018-08-29 10:35:10 steppy >>> Step application_cleaning, transforming...
2018-08-29 10:35:10 steppy >>> Step application_groupby_agg, adapting inputs...
2018-08-29 10:35:10 steppy >>> Step application_groupby_agg, fitting and transforming...
2018-08-29 10:36:49 steppy >>> Step application_groupby_agg, persisting transformer to the ../steppy/transformers/application_groupby_agg
2018-08-29 10:36:49 steppy >>> Step bureau_cleaning, adapting inputs...
2018-08-29 10:36:49 steppy >>> Step bureau_cleaning, fitting and transforming...
2018-08-29 10:36:50 steppy >>> Step bureau_cleaning, persisting transformer to the ../steppy/transformers/bureau_clean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


2018-08-29 10:36:52 steppy >>> Step bureau_hand_crafted, persisting transformer to the ../steppy/transformers/bureau_hand_crafted
2018-08-29 10:36:52 steppy >>> Step bureau_hand_crafted_merge, adapting inputs...
2018-08-29 10:36:52 steppy >>> Step bureau_hand_crafted_merge, fitting and transforming...
2018-08-29 10:36:53 steppy >>> Step bureau_hand_crafted_merge, persisting transformer to the ../steppy/transformers/bureau_hand_crafted_merge
2018-08-29 10:36:53 steppy >>> Step bureau_cleaning, adapting inputs...
2018-08-29 10:36:53 steppy >>> Step bureau_cleaning, loading transformer from the ../steppy/transformers/bureau_cleaning
2018-08-29 10:36:53 steppy >>> Step bureau_cleaning, transforming...
2018-08-29 10:36:53 steppy >>> Step bureau_groupby_agg, adapting inputs...
2018-08-29 10:36:53 steppy >>> Step bureau_groupby_agg, fitting and transforming...
2018-08-29 10:37:03 steppy >>> Step bureau_groupby_agg, persisting transformer to the ../steppy/transformers/bureau_groupby_agg
2018-0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


2018-08-29 10:38:02 steppy >>> Step credit_card_balance_hand_crafted, persisting transformer to the ../steppy/transformers/credit_card_balance_hand_crafted
2018-08-29 10:38:02 steppy >>> Step credit_card_balance_hand_crafted_merge, adapting inputs...
2018-08-29 10:38:02 steppy >>> Step credit_card_balance_hand_crafted_merge, fitting and transforming...
2018-08-29 10:38:03 steppy >>> Step credit_card_balance_hand_crafted_merge, persisting transformer to the ../steppy/transformers/credit_card_balance_hand_crafted_merge
2018-08-29 10:38:03 steppy >>> Step credit_card_balance_cleaning, adapting inputs...
2018-08-29 10:38:03 steppy >>> Step credit_card_balance_cleaning, loading transformer from the ../steppy/transformers/credit_card_balance_cleaning
2018-08-29 10:38:03 steppy >>> Step credit_card_balance_cleaning, transforming...
2018-08-29 10:38:03 steppy >>> Step credit_card_balance_groupby_agg, adapting inputs...
2018-08-29 10:38:03 steppy >>> Step credit_card_balance_groupby_agg, fittin

  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
 

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out

2018-08-29 11:11:06 steppy >>> Step installment_payments_hand_crafted, persisting transformer to the ../steppy/transformers/installment_payments_hand_crafted
2018-08-29 11:11:31 steppy >>> Step installment_payments_hand_crafted_merge, adapting inputs...
2018-08-29 11:11:31 steppy >>> Step installment_payments_hand_crafted_merge, fitting and transforming...
2018-08-29 11:11:33 steppy >>> Step installment_payments_hand_crafted_merge, persisting transformer to the ../steppy/transformers/installment_payments_hand_crafted_merge
2018-08-29 11:11:33 steppy >>> Step installments_payments_groupby_agg, adapting inputs...
2018-08-29 11:11:33 steppy >>> Step installments_payments_groupby_agg, fitting and transforming...
2018-08-29 11:11:43 steppy >>> Step installments_payments_groupby_agg, persisting transformer to the ../steppy/transformers/installments_payments_groupby_agg
2018-08-29 11:11:43 steppy >>> Step installments_payments_agg_merge, adapting inputs...
2018-08-29 11:11:43 steppy >>> Step 

100%|██████████| 4/4.0 [16:15<00:00, 243.91s/it]


2018-08-29 11:29:15 steppy >>> Step pos_cash_balance_hand_crafted, persisting transformer to the ../steppy/transformers/pos_cash_balance_hand_crafted
2018-08-29 11:30:22 steppy >>> Step pos_cash_balance_hand_crafted_merge, adapting inputs...
2018-08-29 11:30:22 steppy >>> Step pos_cash_balance_hand_crafted_merge, fitting and transforming...
2018-08-29 11:30:26 steppy >>> Step pos_cash_balance_hand_crafted_merge, persisting transformer to the ../steppy/transformers/pos_cash_balance_hand_crafted_merge
2018-08-29 11:30:26 steppy >>> Step pos_cash_balance_groupby_agg, adapting inputs...
2018-08-29 11:30:26 steppy >>> Step pos_cash_balance_groupby_agg, fitting and transforming...
2018-08-29 11:30:31 steppy >>> Step pos_cash_balance_groupby_agg, persisting transformer to the ../steppy/transformers/pos_cash_balance_groupby_agg
2018-08-29 11:30:32 steppy >>> Step pos_cash_balance_agg_merge, adapting inputs...
2018-08-29 11:30:32 steppy >>> Step pos_cash_balance_agg_merge, fitting and transform

2018-08-29 11:32:45 steppy >>> Step credit_card_balance_hand_crafted, loading transformer from the ../steppy/transformers/credit_card_balance_hand_crafted
2018-08-29 11:32:45 steppy >>> Step credit_card_balance_hand_crafted, transforming...
2018-08-29 11:32:45 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, adapting inputs...
2018-08-29 11:32:45 steppy >>> copying transformer from ../steppy/transformers/credit_card_balance_hand_crafted_merge to ../steppy/transformers/credit_card_balance_hand_crafted_merge_valid
2018-08-29 11:32:45 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, loading transformer from the ../steppy/transformers/credit_card_balance_hand_crafted_merge_valid
2018-08-29 11:32:45 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, transforming...
2018-08-29 11:32:45 steppy >>> Step credit_card_balance_cleaning, adapting inputs...
2018-08-29 11:32:45 steppy >>> Step credit_card_balance_cleaning, loading transformer from the ../steppy/tr

2018-08-29 11:33:54 steppy >>> Step feature_joiner_valid, adapting inputs...
2018-08-29 11:33:54 steppy >>> copying transformer from ../steppy/transformers/feature_joiner to ../steppy/transformers/feature_joiner_valid
2018-08-29 11:33:54 steppy >>> Step feature_joiner_valid, loading transformer from the ../steppy/transformers/feature_joiner_valid
2018-08-29 11:33:54 steppy >>> Step feature_joiner_valid, transforming...
2018-08-29 11:33:55 steppy >>> Step feature_joiner_valid, caching outputs to the ../steppy/cache/feature_joiner_valid
2018-08-29 11:33:55 steppy >>> Step feature_joiner_valid, persisting output to the ../steppy/cache/feature_joiner_valid
2018-08-29 11:34:00 steppy >>> Step feature_joiner_valid, persisting outputs to the ../steppy/outputs/feature_joiner_valid
2018-08-29 11:34:00 steppy >>> Step feature_joiner_valid, persisting output to the ../steppy/outputs/feature_joiner_valid
2018-08-29 11:34:03 steppy >>> Step light_gbm, adapting inputs...
2018-08-29 11:34:04 steppy >



Training until validation scores don't improve for 100 rounds.
[10]	data_train's auc: 0.712147	data_valid's auc: 0.706147
[20]	data_train's auc: 0.720177	data_valid's auc: 0.71546
[30]	data_train's auc: 0.724317	data_valid's auc: 0.720013
[40]	data_train's auc: 0.730492	data_valid's auc: 0.724678
[50]	data_train's auc: 0.730936	data_valid's auc: 0.725517
[60]	data_train's auc: 0.735989	data_valid's auc: 0.730458
[70]	data_train's auc: 0.738323	data_valid's auc: 0.732623
[80]	data_train's auc: 0.741876	data_valid's auc: 0.735698
[90]	data_train's auc: 0.743957	data_valid's auc: 0.737643
[100]	data_train's auc: 0.747587	data_valid's auc: 0.740956
[110]	data_train's auc: 0.750227	data_valid's auc: 0.743021
[120]	data_train's auc: 0.753974	data_valid's auc: 0.745763
[130]	data_train's auc: 0.756786	data_valid's auc: 0.747925
[140]	data_train's auc: 0.75917	data_valid's auc: 0.749797
[150]	data_train's auc: 0.761646	data_valid's auc: 0.751702
[160]	data_train's auc: 0.764	data_valid's auc: 

[1370]	data_train's auc: 0.850378	data_valid's auc: 0.791784
[1380]	data_train's auc: 0.850709	data_valid's auc: 0.791784
[1390]	data_train's auc: 0.851072	data_valid's auc: 0.791829
[1400]	data_train's auc: 0.851408	data_valid's auc: 0.791801
[1410]	data_train's auc: 0.851784	data_valid's auc: 0.791828
[1420]	data_train's auc: 0.852124	data_valid's auc: 0.791863
[1430]	data_train's auc: 0.852507	data_valid's auc: 0.791885
[1440]	data_train's auc: 0.852871	data_valid's auc: 0.791925
[1450]	data_train's auc: 0.853232	data_valid's auc: 0.791942
[1460]	data_train's auc: 0.853577	data_valid's auc: 0.791929
[1470]	data_train's auc: 0.853908	data_valid's auc: 0.791888
[1480]	data_train's auc: 0.854274	data_valid's auc: 0.791897
[1490]	data_train's auc: 0.854625	data_valid's auc: 0.791917
[1500]	data_train's auc: 0.85495	data_valid's auc: 0.791973
[1510]	data_train's auc: 0.855291	data_valid's auc: 0.791994
[1520]	data_train's auc: 0.855656	data_valid's auc: 0.792064
[1530]	data_train's auc: 

2018-08-29 11:49:37 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 11:49:37 steppy >>> done: initializing experiment directories
2018-08-29 11:49:37 steppy >>> Step previous_applications_hand_crafted_merge initialized
2018-08-29 11:49:37 steppy >>> initializing Step installment_payments_hand_crafted...
2018-08-29 11:49:37 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 11:49:37 steppy >>> done: initializing experiment directories
2018-08-29 11:49:37 steppy >>> Step installment_payments_hand_crafted initialized
2018-08-29 11:49:37 steppy >>> initializing Step installment_payments_hand_crafted_merge...
2018-08-29 11:49:37 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 11:49:37 steppy >>> done: initializing experiment directories
2018-08-29 11:49:37 steppy >>> Step installment_payments_hand_crafted_merge initialized
2018-08-29 11:49:37 steppy >>> initializing Step application_cleaning...
2018-08-29 11:49:37

2018-08-29 11:50:04 steppy >>> Step bureau_agg_merge, adapting inputs...
2018-08-29 11:50:04 steppy >>> Step bureau_agg_merge, loading transformer from the ../steppy/transformers/bureau_agg_merge
2018-08-29 11:50:04 steppy >>> Step bureau_agg_merge, transforming...
2018-08-29 11:50:04 steppy >>> Step credit_card_balance_cleaning, adapting inputs...
2018-08-29 11:50:04 steppy >>> Step credit_card_balance_cleaning, loading transformer from the ../steppy/transformers/credit_card_balance_cleaning
2018-08-29 11:50:04 steppy >>> Step credit_card_balance_cleaning, transforming...
2018-08-29 11:50:04 steppy >>> Step credit_card_balance_hand_crafted, adapting inputs...
2018-08-29 11:50:04 steppy >>> Step credit_card_balance_hand_crafted, loading transformer from the ../steppy/transformers/credit_card_balance_hand_crafted
2018-08-29 11:50:04 steppy >>> Step credit_card_balance_hand_crafted, transforming...
2018-08-29 11:50:04 steppy >>> Step credit_card_balance_hand_crafted_merge, adapting input

2018-08-29 11:51:02 steppy >>> submission head 

   SK_ID_CURR    TARGET
0      100001  0.039521
1      100005  0.114334
2      100013  0.029662
3      100028  0.039953
4      100038  0.136539
2018-08-29 11:51:04 steppy >>> TRAINING
2018-08-29 11:51:04 steppy >>> Cleaning experiment_directory...
Reading data...
(307511, 125)
2018-08-29 11:51:37 steppy >>> Shuffling and splitting into train and test...
2018-08-29 11:51:38 steppy >>> Target mean in train: TARGET    0.08114
dtype: float64
2018-08-29 11:51:38 steppy >>> Target mean in valid: TARGET    0.079086
dtype: float64
2018-08-29 11:51:38 steppy >>> Train shape: (246008, 125)
2018-08-29 11:51:38 steppy >>> Valid shape: (61503, 125)
2018-08-29 11:51:38 steppy >>> initializing Step application_cleaning...
2018-08-29 11:51:38 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 11:51:38 steppy >>> done: initializing experiment directories
2018-08-29 11:51:38 steppy >>> Step application_cleaning initialized
2018-08-

2018-08-29 11:51:38 steppy >>> done: initializing experiment directories
2018-08-29 11:51:38 steppy >>> Step application_cleaning_valid initialized
2018-08-29 11:51:38 steppy >>> initializing Step application_groupby_agg...
2018-08-29 11:51:38 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 11:51:38 steppy >>> done: initializing experiment directories
2018-08-29 11:51:38 steppy >>> Step application_groupby_agg initialized
2018-08-29 11:51:38 steppy >>> initializing Step application_groupby_agg_valid...
2018-08-29 11:51:38 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 11:51:38 steppy >>> done: initializing experiment directories
2018-08-29 11:51:38 steppy >>> Step application_groupby_agg_valid initialized
2018-08-29 11:51:38 steppy >>> initializing Step bureau_groupby_agg...
2018-08-29 11:51:38 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 11:51:38 steppy >>> done: initializing experiment directories
20

2018-08-29 11:53:17 steppy >>> Step application_groupby_agg, persisting transformer to the ../steppy/transformers/application_groupby_agg
2018-08-29 11:53:17 steppy >>> Step bureau_cleaning, adapting inputs...
2018-08-29 11:53:17 steppy >>> Step bureau_cleaning, fitting and transforming...
2018-08-29 11:53:17 steppy >>> Step bureau_cleaning, persisting transformer to the ../steppy/transformers/bureau_cleaning
2018-08-29 11:53:17 steppy >>> Step bureau_hand_crafted, adapting inputs...
2018-08-29 11:53:17 steppy >>> Step bureau_hand_crafted, fitting and transforming...
2018-08-29 11:53:20 steppy >>> Step bureau_hand_crafted, persisting transformer to the ../steppy/transformers/bureau_hand_crafted
2018-08-29 11:53:20 steppy >>> Step bureau_hand_crafted_merge, adapting inputs...
2018-08-29 11:53:20 steppy >>> Step bureau_hand_crafted_merge, fitting and transforming...
2018-08-29 11:53:21 steppy >>> Step bureau_hand_crafted_merge, persisting transformer to the ../steppy/transformers/bureau_

  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
 

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out

2018-08-29 12:26:53 steppy >>> Step installment_payments_hand_crafted, persisting transformer to the ../steppy/transformers/installment_payments_hand_crafted
2018-08-29 12:27:09 steppy >>> Step installment_payments_hand_crafted_merge, adapting inputs...
2018-08-29 12:27:09 steppy >>> Step installment_payments_hand_crafted_merge, fitting and transforming...
2018-08-29 12:27:11 steppy >>> Step installment_payments_hand_crafted_merge, persisting transformer to the ../steppy/transformers/installment_payments_hand_crafted_merge
2018-08-29 12:27:11 steppy >>> Step installments_payments_groupby_agg, adapting inputs...
2018-08-29 12:27:11 steppy >>> Step installments_payments_groupby_agg, fitting and transforming...
2018-08-29 12:27:22 steppy >>> Step installments_payments_groupby_agg, persisting transformer to the ../steppy/transformers/installments_payments_groupby_agg
2018-08-29 12:27:22 steppy >>> Step installments_payments_agg_merge, adapting inputs...
2018-08-29 12:27:22 steppy >>> Step 

100%|██████████| 4/4.0 [15:58<00:00, 239.66s/it]


2018-08-29 12:44:36 steppy >>> Step pos_cash_balance_hand_crafted, persisting transformer to the ../steppy/transformers/pos_cash_balance_hand_crafted
2018-08-29 12:45:45 steppy >>> Step pos_cash_balance_hand_crafted_merge, adapting inputs...
2018-08-29 12:45:45 steppy >>> Step pos_cash_balance_hand_crafted_merge, fitting and transforming...
2018-08-29 12:45:48 steppy >>> Step pos_cash_balance_hand_crafted_merge, persisting transformer to the ../steppy/transformers/pos_cash_balance_hand_crafted_merge
2018-08-29 12:45:48 steppy >>> Step pos_cash_balance_groupby_agg, adapting inputs...
2018-08-29 12:45:48 steppy >>> Step pos_cash_balance_groupby_agg, fitting and transforming...
2018-08-29 12:45:54 steppy >>> Step pos_cash_balance_groupby_agg, persisting transformer to the ../steppy/transformers/pos_cash_balance_groupby_agg
2018-08-29 12:45:54 steppy >>> Step pos_cash_balance_agg_merge, adapting inputs...
2018-08-29 12:45:54 steppy >>> Step pos_cash_balance_agg_merge, fitting and transform

2018-08-29 12:48:09 steppy >>> Step credit_card_balance_hand_crafted, loading transformer from the ../steppy/transformers/credit_card_balance_hand_crafted
2018-08-29 12:48:09 steppy >>> Step credit_card_balance_hand_crafted, transforming...
2018-08-29 12:48:09 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, adapting inputs...
2018-08-29 12:48:09 steppy >>> copying transformer from ../steppy/transformers/credit_card_balance_hand_crafted_merge to ../steppy/transformers/credit_card_balance_hand_crafted_merge_valid
2018-08-29 12:48:09 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, loading transformer from the ../steppy/transformers/credit_card_balance_hand_crafted_merge_valid
2018-08-29 12:48:09 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, transforming...
2018-08-29 12:48:09 steppy >>> Step credit_card_balance_cleaning, adapting inputs...
2018-08-29 12:48:09 steppy >>> Step credit_card_balance_cleaning, loading transformer from the ../steppy/tr

2018-08-29 12:49:14 steppy >>> Step feature_joiner_valid, adapting inputs...
2018-08-29 12:49:14 steppy >>> copying transformer from ../steppy/transformers/feature_joiner to ../steppy/transformers/feature_joiner_valid
2018-08-29 12:49:14 steppy >>> Step feature_joiner_valid, loading transformer from the ../steppy/transformers/feature_joiner_valid
2018-08-29 12:49:14 steppy >>> Step feature_joiner_valid, transforming...
2018-08-29 12:49:16 steppy >>> Step feature_joiner_valid, caching outputs to the ../steppy/cache/feature_joiner_valid
2018-08-29 12:49:16 steppy >>> Step feature_joiner_valid, persisting output to the ../steppy/cache/feature_joiner_valid
2018-08-29 12:49:19 steppy >>> Step feature_joiner_valid, persisting outputs to the ../steppy/outputs/feature_joiner_valid
2018-08-29 12:49:19 steppy >>> Step feature_joiner_valid, persisting output to the ../steppy/outputs/feature_joiner_valid
2018-08-29 12:49:24 steppy >>> Step light_gbm, adapting inputs...
2018-08-29 12:49:24 steppy >

[1150]	data_train's auc: 0.841554	data_valid's auc: 0.791383
[1160]	data_train's auc: 0.841947	data_valid's auc: 0.791478
[1170]	data_train's auc: 0.842335	data_valid's auc: 0.791483
[1180]	data_train's auc: 0.842714	data_valid's auc: 0.791538
[1190]	data_train's auc: 0.843112	data_valid's auc: 0.791581
[1200]	data_train's auc: 0.843497	data_valid's auc: 0.791637
[1210]	data_train's auc: 0.843911	data_valid's auc: 0.79172
[1220]	data_train's auc: 0.844319	data_valid's auc: 0.791783
[1230]	data_train's auc: 0.844718	data_valid's auc: 0.791842
[1240]	data_train's auc: 0.845036	data_valid's auc: 0.791876
[1250]	data_train's auc: 0.84547	data_valid's auc: 0.791925
[1260]	data_train's auc: 0.845871	data_valid's auc: 0.792003
[1270]	data_train's auc: 0.846265	data_valid's auc: 0.792035
[1280]	data_train's auc: 0.846626	data_valid's auc: 0.792048
[1290]	data_train's auc: 0.847011	data_valid's auc: 0.792082
[1300]	data_train's auc: 0.847377	data_valid's auc: 0.792116
[1310]	data_train's auc: 0

[2500]	data_train's auc: 0.882493	data_valid's auc: 0.794472
[2510]	data_train's auc: 0.882737	data_valid's auc: 0.794479
[2520]	data_train's auc: 0.882968	data_valid's auc: 0.794468
[2530]	data_train's auc: 0.883212	data_valid's auc: 0.794449
[2540]	data_train's auc: 0.883482	data_valid's auc: 0.794483
[2550]	data_train's auc: 0.883706	data_valid's auc: 0.794529
[2560]	data_train's auc: 0.883943	data_valid's auc: 0.794547
[2570]	data_train's auc: 0.884176	data_valid's auc: 0.794544
[2580]	data_train's auc: 0.884408	data_valid's auc: 0.794502
[2590]	data_train's auc: 0.884629	data_valid's auc: 0.794469
[2600]	data_train's auc: 0.884865	data_valid's auc: 0.794482
[2610]	data_train's auc: 0.885076	data_valid's auc: 0.794478
[2620]	data_train's auc: 0.8853	data_valid's auc: 0.794467
[2630]	data_train's auc: 0.885486	data_valid's auc: 0.79448
[2640]	data_train's auc: 0.885715	data_valid's auc: 0.79447
[2650]	data_train's auc: 0.885955	data_valid's auc: 0.79449
Early stopping, best iteratio

2018-08-29 13:07:36 steppy >>> done: initializing experiment directories
2018-08-29 13:07:36 steppy >>> Step installments_payments_groupby_agg initialized
2018-08-29 13:07:36 steppy >>> initializing Step installments_payments_agg_merge...
2018-08-29 13:07:36 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 13:07:36 steppy >>> done: initializing experiment directories
2018-08-29 13:07:36 steppy >>> Step installments_payments_agg_merge initialized
2018-08-29 13:07:36 steppy >>> initializing Step pos_cash_balance_groupby_agg...
2018-08-29 13:07:36 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 13:07:36 steppy >>> done: initializing experiment directories
2018-08-29 13:07:36 steppy >>> Step pos_cash_balance_groupby_agg initialized
2018-08-29 13:07:36 steppy >>> initializing Step pos_cash_balance_agg_merge...
2018-08-29 13:07:36 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 13:07:36 steppy >>> done: initializ

2018-08-29 13:08:06 steppy >>> Step installments_payments_groupby_agg, adapting inputs...
2018-08-29 13:08:06 steppy >>> Step installments_payments_groupby_agg, loading transformer from the ../steppy/transformers/installments_payments_groupby_agg
2018-08-29 13:08:06 steppy >>> Step installments_payments_groupby_agg, transforming...
2018-08-29 13:08:06 steppy >>> Step installments_payments_agg_merge, adapting inputs...
2018-08-29 13:08:06 steppy >>> Step installments_payments_agg_merge, loading transformer from the ../steppy/transformers/installments_payments_agg_merge
2018-08-29 13:08:06 steppy >>> Step installments_payments_agg_merge, transforming...
2018-08-29 13:08:06 steppy >>> Step pos_cash_balance_hand_crafted, adapting inputs...
2018-08-29 13:08:06 steppy >>> Step pos_cash_balance_hand_crafted, loading transformer from the ../steppy/transformers/pos_cash_balance_hand_crafted
2018-08-29 13:08:53 steppy >>> Step pos_cash_balance_hand_crafted, transforming...
2018-08-29 13:08:53 st

2018-08-29 13:09:36 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 13:09:36 steppy >>> done: initializing experiment directories
2018-08-29 13:09:36 steppy >>> Step bureau_hand_crafted_merge_valid initialized
2018-08-29 13:09:36 steppy >>> initializing Step credit_card_balance_cleaning...
2018-08-29 13:09:36 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 13:09:36 steppy >>> done: initializing experiment directories
2018-08-29 13:09:36 steppy >>> Step credit_card_balance_cleaning initialized
2018-08-29 13:09:36 steppy >>> initializing Step credit_card_balance_hand_crafted...
2018-08-29 13:09:36 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 13:09:36 steppy >>> done: initializing experiment directories
2018-08-29 13:09:36 steppy >>> Step credit_card_balance_hand_crafted initialized
2018-08-29 13:09:36 steppy >>> initializing Step credit_card_balance_hand_crafted_merge...
2018-08-29 13:09:36 steppy >>> ini

2018-08-29 13:09:36 steppy >>> Step installments_payments_groupby_agg initialized
2018-08-29 13:09:36 steppy >>> initializing Step installments_payments_agg_merge...
2018-08-29 13:09:36 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 13:09:36 steppy >>> done: initializing experiment directories
2018-08-29 13:09:36 steppy >>> Step installments_payments_agg_merge initialized
2018-08-29 13:09:36 steppy >>> initializing Step installments_payments_agg_merge_valid...
2018-08-29 13:09:36 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 13:09:36 steppy >>> done: initializing experiment directories
2018-08-29 13:09:36 steppy >>> Step installments_payments_agg_merge_valid initialized
2018-08-29 13:09:36 steppy >>> initializing Step pos_cash_balance_groupby_agg...
2018-08-29 13:09:36 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 13:09:36 steppy >>> done: initializing experiment directories
2018-08-29 13:09:36 steppy

2018-08-29 13:12:28 steppy >>> Step credit_card_balance_cleaning, adapting inputs...
2018-08-29 13:12:28 steppy >>> Step credit_card_balance_cleaning, loading transformer from the ../steppy/transformers/credit_card_balance_cleaning
2018-08-29 13:12:28 steppy >>> Step credit_card_balance_cleaning, transforming...
2018-08-29 13:12:28 steppy >>> Step credit_card_balance_groupby_agg, adapting inputs...
2018-08-29 13:12:28 steppy >>> Step credit_card_balance_groupby_agg, fitting and transforming...
2018-08-29 13:12:36 steppy >>> Step credit_card_balance_groupby_agg, persisting transformer to the ../steppy/transformers/credit_card_balance_groupby_agg
2018-08-29 13:12:36 steppy >>> Step credit_card_balance_agg_merge, adapting inputs...
2018-08-29 13:12:36 steppy >>> Step credit_card_balance_agg_merge, fitting and transforming...
2018-08-29 13:12:36 steppy >>> Step credit_card_balance_agg_merge, persisting transformer to the ../steppy/transformers/credit_card_balance_agg_merge
2018-08-29 13:12

  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
 

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  interpolation=interpolation)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  interpolation=interpolation)
  return np.nanmean(a, axis, out

2018-08-29 13:46:03 steppy >>> Step installment_payments_hand_crafted, persisting transformer to the ../steppy/transformers/installment_payments_hand_crafted
2018-08-29 13:46:17 steppy >>> Step installment_payments_hand_crafted_merge, adapting inputs...
2018-08-29 13:46:17 steppy >>> Step installment_payments_hand_crafted_merge, fitting and transforming...
2018-08-29 13:46:19 steppy >>> Step installment_payments_hand_crafted_merge, persisting transformer to the ../steppy/transformers/installment_payments_hand_crafted_merge
2018-08-29 13:46:20 steppy >>> Step installments_payments_groupby_agg, adapting inputs...
2018-08-29 13:46:20 steppy >>> Step installments_payments_groupby_agg, fitting and transforming...
2018-08-29 13:46:30 steppy >>> Step installments_payments_groupby_agg, persisting transformer to the ../steppy/transformers/installments_payments_groupby_agg
2018-08-29 13:46:30 steppy >>> Step installments_payments_agg_merge, adapting inputs...
2018-08-29 13:46:30 steppy >>> Step 

100%|██████████| 4/4.0 [15:44<00:00, 236.04s/it]


2018-08-29 14:03:30 steppy >>> Step pos_cash_balance_hand_crafted, persisting transformer to the ../steppy/transformers/pos_cash_balance_hand_crafted
2018-08-29 14:04:34 steppy >>> Step pos_cash_balance_hand_crafted_merge, adapting inputs...
2018-08-29 14:04:34 steppy >>> Step pos_cash_balance_hand_crafted_merge, fitting and transforming...
2018-08-29 14:04:37 steppy >>> Step pos_cash_balance_hand_crafted_merge, persisting transformer to the ../steppy/transformers/pos_cash_balance_hand_crafted_merge
2018-08-29 14:04:37 steppy >>> Step pos_cash_balance_groupby_agg, adapting inputs...
2018-08-29 14:04:37 steppy >>> Step pos_cash_balance_groupby_agg, fitting and transforming...
2018-08-29 14:04:42 steppy >>> Step pos_cash_balance_groupby_agg, persisting transformer to the ../steppy/transformers/pos_cash_balance_groupby_agg
2018-08-29 14:04:42 steppy >>> Step pos_cash_balance_agg_merge, adapting inputs...
2018-08-29 14:04:42 steppy >>> Step pos_cash_balance_agg_merge, fitting and transform

2018-08-29 14:06:52 steppy >>> Step credit_card_balance_hand_crafted, loading transformer from the ../steppy/transformers/credit_card_balance_hand_crafted
2018-08-29 14:06:52 steppy >>> Step credit_card_balance_hand_crafted, transforming...
2018-08-29 14:06:52 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, adapting inputs...
2018-08-29 14:06:52 steppy >>> copying transformer from ../steppy/transformers/credit_card_balance_hand_crafted_merge to ../steppy/transformers/credit_card_balance_hand_crafted_merge_valid
2018-08-29 14:06:52 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, loading transformer from the ../steppy/transformers/credit_card_balance_hand_crafted_merge_valid
2018-08-29 14:06:52 steppy >>> Step credit_card_balance_hand_crafted_merge_valid, transforming...
2018-08-29 14:06:52 steppy >>> Step credit_card_balance_cleaning, adapting inputs...
2018-08-29 14:06:52 steppy >>> Step credit_card_balance_cleaning, loading transformer from the ../steppy/tr

2018-08-29 14:07:57 steppy >>> Step feature_joiner_valid, adapting inputs...
2018-08-29 14:07:57 steppy >>> copying transformer from ../steppy/transformers/feature_joiner to ../steppy/transformers/feature_joiner_valid
2018-08-29 14:07:57 steppy >>> Step feature_joiner_valid, loading transformer from the ../steppy/transformers/feature_joiner_valid
2018-08-29 14:07:57 steppy >>> Step feature_joiner_valid, transforming...
2018-08-29 14:07:58 steppy >>> Step feature_joiner_valid, caching outputs to the ../steppy/cache/feature_joiner_valid
2018-08-29 14:07:58 steppy >>> Step feature_joiner_valid, persisting output to the ../steppy/cache/feature_joiner_valid
2018-08-29 14:08:01 steppy >>> Step feature_joiner_valid, persisting outputs to the ../steppy/outputs/feature_joiner_valid
2018-08-29 14:08:01 steppy >>> Step feature_joiner_valid, persisting output to the ../steppy/outputs/feature_joiner_valid
2018-08-29 14:08:06 steppy >>> Step light_gbm, adapting inputs...
2018-08-29 14:08:06 steppy >

[1150]	data_train's auc: 0.84165	data_valid's auc: 0.792344
[1160]	data_train's auc: 0.842043	data_valid's auc: 0.792402
[1170]	data_train's auc: 0.842421	data_valid's auc: 0.79243
[1180]	data_train's auc: 0.842818	data_valid's auc: 0.792473
[1190]	data_train's auc: 0.843232	data_valid's auc: 0.792537
[1200]	data_train's auc: 0.843596	data_valid's auc: 0.79259
[1210]	data_train's auc: 0.843981	data_valid's auc: 0.79265
[1220]	data_train's auc: 0.844386	data_valid's auc: 0.792734
[1230]	data_train's auc: 0.844787	data_valid's auc: 0.792788
[1240]	data_train's auc: 0.845158	data_valid's auc: 0.792843
[1250]	data_train's auc: 0.845555	data_valid's auc: 0.792904
[1260]	data_train's auc: 0.845918	data_valid's auc: 0.792954
[1270]	data_train's auc: 0.846307	data_valid's auc: 0.792995
[1280]	data_train's auc: 0.846668	data_valid's auc: 0.793045
[1290]	data_train's auc: 0.847054	data_valid's auc: 0.793084
[1300]	data_train's auc: 0.847415	data_valid's auc: 0.79311
[1310]	data_train's auc: 0.84

2018-08-29 14:23:23 steppy >>> done: initializing experiment directories
2018-08-29 14:23:23 steppy >>> Step bureau_cleaning initialized
2018-08-29 14:23:23 steppy >>> initializing Step bureau_hand_crafted...
2018-08-29 14:23:23 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 14:23:23 steppy >>> done: initializing experiment directories
2018-08-29 14:23:23 steppy >>> Step bureau_hand_crafted initialized
2018-08-29 14:23:23 steppy >>> initializing Step bureau_hand_crafted_merge...
2018-08-29 14:23:23 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 14:23:23 steppy >>> done: initializing experiment directories
2018-08-29 14:23:23 steppy >>> Step bureau_hand_crafted_merge initialized
2018-08-29 14:23:23 steppy >>> initializing Step credit_card_balance_cleaning...
2018-08-29 14:23:23 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 14:23:23 steppy >>> done: initializing experiment directories
2018-08-29 14:23:23

2018-08-29 14:23:23 steppy >>> done: initializing experiment directories
2018-08-29 14:23:23 steppy >>> Step feature_joiner initialized
2018-08-29 14:23:23 steppy >>> initializing LightGBM...
2018-08-29 14:23:23 steppy >>> initializing Step light_gbm...
2018-08-29 14:23:23 steppy >>> initializing experiment directories under ../steppy/
2018-08-29 14:23:23 steppy >>> done: initializing experiment directories
2018-08-29 14:23:23 steppy >>> Step light_gbm initialized
2018-08-29 14:23:23 steppy >>> cleaning cache...
2018-08-29 14:23:23 steppy >>> cleaning cache done
2018-08-29 14:23:23 steppy >>> Start pipeline transform
2018-08-29 14:23:23 steppy >>> Step application_cleaning, adapting inputs...
2018-08-29 14:23:23 steppy >>> Step application_cleaning, loading transformer from the ../steppy/transformers/application_cleaning
2018-08-29 14:23:23 steppy >>> Step application_cleaning, transforming...
2018-08-29 14:23:23 steppy >>> Step application_hand_crafted, adapting inputs...
2018-08-29 1

2018-08-29 14:24:40 steppy >>> Step previous_application_cleaning, transforming...
2018-08-29 14:24:40 steppy >>> Step previous_applications_hand_crafted, adapting inputs...
2018-08-29 14:24:40 steppy >>> Step previous_applications_hand_crafted, loading transformer from the ../steppy/transformers/previous_applications_hand_crafted
2018-08-29 14:24:40 steppy >>> Step previous_applications_hand_crafted, transforming...
2018-08-29 14:24:40 steppy >>> Step previous_applications_hand_crafted_merge, adapting inputs...
2018-08-29 14:24:40 steppy >>> Step previous_applications_hand_crafted_merge, loading transformer from the ../steppy/transformers/previous_applications_hand_crafted_merge
2018-08-29 14:24:40 steppy >>> Step previous_applications_hand_crafted_merge, transforming...
2018-08-29 14:24:40 steppy >>> Step previous_application_cleaning, adapting inputs...
2018-08-29 14:24:40 steppy >>> Step previous_application_cleaning, loading transformer from the ../steppy/transformers/previous_app

In [20]:
'''
import gc

for RANDOM_SEED in range(4,7):
    gc.collect()
    train(pipeline_name = 'lightGBM', dev_mode = False)
    predict(pipeline_name = 'lightGBM', dev_mode = False,submit_predictions = True)
'''

"\nimport gc\n\nfor RANDOM_SEED in range(4,7):\n    gc.collect()\n    train(pipeline_name = 'lightGBM', dev_mode = False)\n    predict(pipeline_name = 'lightGBM', dev_mode = False,submit_predictions = True)\n"

In [21]:
#RANDOM_SEED = 1
#train(pipeline_name = 'lightGBM', dev_mode = True)
#predict(pipeline_name = 'lightGBM', dev_mode = False,submit_predictions = True)

## oof prediction(not yet...)

In [22]:
def train_evaluate_cv_first_level(pipeline_name, dev_mode):
    if bool(params["clean_experiment_directory_before_training"]) and os.path.isdir(params["experiment_directory"]):
        logger.info('Cleaning experiment_directory...')
        shutil.rmtree(params.experiment_directory)

    tables = _read_data(dev_mode, read_train=True, read_test=False)

    target_values = tables.application_train[TARGET_COLUMNS].values.reshape(-1)
    fold_generator = _get_fold_generator(target_values)

    fold_scores = []
    for fold_id, (train_idx, valid_idx) in enumerate(fold_generator):
        (train_data_split,
         valid_data_split) = tables.application_train.iloc[train_idx], tables.application_train.iloc[valid_idx]

        logger.info('Started fold {}'.format(fold_id))
        logger.info('Target mean in train: {}'.format(train_data_split[cfg.TARGET_COLUMNS].mean()))
        logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMNS].mean()))
        logger.info('Train shape: {}'.format(train_data_split.shape))
        logger.info('Valid shape: {}'.format(valid_data_split.shape))

        score, _, _ = _fold_fit_evaluate_loop(train_data_split, valid_data_split, tables, fold_id, pipeline_name,
                                              model_level='first')

        logger.info('Fold {} ROC_AUC {}'.format(fold_id, score))
        ctx.channel_send('Fold {} ROC_AUC'.format(fold_id), 0, score)

        fold_scores.append(score)

    score_mean, score_std = np.mean(fold_scores), np.std(fold_scores)

    logger.info('ROC_AUC mean {}, ROC_AUC std {}'.format(score_mean, score_std))
    ctx.channel_send('ROC_AUC', 0, score_mean)
    ctx.channel_send('ROC_AUC STD', 0, score_std)