In [None]:
from IPython.display import clear_output
from google.colab import files
# Upload the Kaggle API token (kaggle.json) through the Colab file picker.
files.upload()
# Copy the token into ~/.kaggle and restrict it to owner read/write.
# NOTE(review): chmod targets /root/.kaggle while cp targets ~/.kaggle - this assumes ~ is /root.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
# Download the competition archive, unpack it into the working directory, then delete the zip.
!kaggle competitions download -c geekbrains-competitive-data-analysis
!unzip geekbrains-competitive-data-analysis.zip
!rm geekbrains-competitive-data-analysis.zip

# Install the extra third-party packages quietly (-qq); versions are not pinned.
!pip install -qq BorutaShap catboost optuna 
!pip install -qq category_encoders
# Wipe the output produced by the commands above.
clear_output()

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, List, Tuple
from pprint import pprint
from tqdm import tqdm
import time
import re
import os
from functools import partial

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from scipy.stats import kurtosis, iqr, skew

from BorutaShap import BorutaShap
import category_encoders as ce

from lightgbm import LGBMClassifier
import optuna
from optuna.samplers import TPESampler

import gc
gc.enable()

In [None]:
# --- Global configuration -------------------------------------------------
# Random seed shared by numpy and the helpers below.
seed = 42
# Join key of every table and the name of the label column.
ID_COL, TARGET = 'APPLICATION_NUMBER', 'TARGET'

np.random.seed(seed)

# Display settings: 2-digit numpy output, 4-decimal pandas floats, all columns visible.
np.set_printoptions(precision=2)
pd.set_option('display.float_format', '{:.4f}'.format)
pd.set_option('display.max_columns', None)
plt.style.use('ggplot')

In [None]:
def group(df_to_agg, prefix, aggregations, aggregate_by=ID_COL):
    """Aggregate ``df_to_agg`` per ``aggregate_by`` and flatten the column names.

    Args:
        df_to_agg: Frame to aggregate.
        prefix: String prepended to every resulting column name.
        aggregations: Mapping passed straight to ``DataFrameGroupBy.agg``.
        aggregate_by: Grouping key(s); defaults to ``ID_COL``.

    Returns:
        A frame with the grouping key as a regular column and one column per
        (source column, aggregation) pair named ``<prefix><COLUMN>_<AGG>``
        (aggregation name upper-cased).
    """
    aggregated = df_to_agg.groupby(aggregate_by).agg(aggregations)
    flat_names = [
        '{}{}_{}'.format(prefix, pair[0], pair[1].upper())
        for pair in aggregated.columns.tolist()
    ]
    aggregated.columns = pd.Index(flat_names)
    return aggregated.reset_index()


def group_and_merge(df_to_agg, df_to_merge, prefix, aggregations, aggregate_by=ID_COL):
    """Aggregate ``df_to_agg`` with ``group`` and left-join the result onto ``df_to_merge``.

    The join key is ``aggregate_by``; rows of ``df_to_merge`` without a match
    keep NaN in the new columns.
    """
    aggregated = group(df_to_agg, prefix, aggregations, aggregate_by=aggregate_by)
    merged = df_to_merge.merge(aggregated, on=aggregate_by, how='left')
    return merged


def do_mean(df, group_cols, counted, agg_name):
    """Return ``df`` with an extra column ``agg_name`` holding the per-group mean of ``counted``.

    Groups are defined by the list ``group_cols``; the statistic is joined back
    onto every row with a left merge, so the input frame itself is not modified.
    """
    subset = df[group_cols + [counted]]
    per_group = subset.groupby(group_cols)[counted].mean()
    lookup = per_group.reset_index().rename(columns={counted: agg_name})
    merged = df.merge(lookup, on=group_cols, how='left')
    # Release the intermediates right away; the frames involved can be large.
    del subset, per_group, lookup
    gc.collect()
    return merged


def do_median(df, group_cols, counted, agg_name):
    """Return ``df`` with an extra column ``agg_name`` holding the per-group median of ``counted``.

    Groups are defined by the list ``group_cols``; the statistic is joined back
    onto every row with a left merge, so the input frame itself is not modified.
    """
    subset = df[group_cols + [counted]]
    per_group = subset.groupby(group_cols)[counted].median()
    lookup = per_group.reset_index().rename(columns={counted: agg_name})
    merged = df.merge(lookup, on=group_cols, how='left')
    # Release the intermediates right away; the frames involved can be large.
    del subset, per_group, lookup
    gc.collect()
    return merged


def do_std(df, group_cols, counted, agg_name):
    """Return ``df`` with an extra column ``agg_name`` holding the per-group std of ``counted``.

    Uses the pandas groupby ``std`` (so single-row groups yield NaN). The
    statistic is joined back onto every row with a left merge; the input frame
    itself is not modified.
    """
    subset = df[group_cols + [counted]]
    per_group = subset.groupby(group_cols)[counted].std()
    lookup = per_group.reset_index().rename(columns={counted: agg_name})
    merged = df.merge(lookup, on=group_cols, how='left')
    # Release the intermediates right away; the frames involved can be large.
    del subset, per_group, lookup
    gc.collect()
    return merged


def do_sum(df, group_cols, counted, agg_name):
    """Return ``df`` with an extra column ``agg_name`` holding the per-group sum of ``counted``.

    Groups are defined by the list ``group_cols``; the statistic is joined back
    onto every row with a left merge, so the input frame itself is not modified.
    """
    subset = df[group_cols + [counted]]
    per_group = subset.groupby(group_cols)[counted].sum()
    lookup = per_group.reset_index().rename(columns={counted: agg_name})
    merged = df.merge(lookup, on=group_cols, how='left')
    # Release the intermediates right away; the frames involved can be large.
    del subset, per_group, lookup
    gc.collect()
    return merged


def one_hot_encoder(df, categorical_columns=None, nan_as_category=True):
    """One-hot encode categorical columns with ``pd.get_dummies``.

    Args:
        df: Input frame (not modified).
        categorical_columns: Columns to encode; when empty/None every column
            whose dtype equals ``'object'`` is encoded.
        nan_as_category: Forwarded as ``dummy_na`` - adds an indicator column
            for missing values.

    Returns:
        ``(encoded_frame, new_columns)`` where ``new_columns`` lists the columns
        that did not exist in the input.
    """
    columns_before = df.columns.tolist()
    to_encode = categorical_columns
    if not to_encode:
        to_encode = [name for name in df.columns if df[name].dtype == 'object']
    encoded = pd.get_dummies(df, columns=to_encode, dummy_na=nan_as_category)
    created = [name for name in encoded.columns if name not in columns_before]
    return encoded, created


def label_encoder(df, categorical_columns=None):
    """Integer-encode categorical columns in place using ``pd.factorize``.

    Args:
        df: Frame to encode; the selected columns are overwritten.
        categorical_columns: Columns to encode; when empty/None every column
            whose dtype equals ``'object'`` is encoded.

    Returns:
        ``(df, encoded_columns)`` - the same frame object and the list of
        columns that were encoded.
    """
    targets = categorical_columns or [name for name in df.columns if df[name].dtype == 'object']
    for name in targets:
        codes, _ = pd.factorize(df[name])
        df[name] = codes
    return df, targets


def add_features(feature_name, aggs, features, feature_names, groupby, on='SK_ID_CURR'):
    """Aggregate one column of a groupby object and merge each statistic into ``features``.

    Args:
        feature_name: Column of ``groupby`` to aggregate.
        aggs: Iterable of aggregation names. ``'kurt'`` and ``'iqr'`` are mapped
            to ``scipy.stats.kurtosis`` / ``scipy.stats.iqr``; anything else is
            passed to pandas unchanged.
        features: Frame the new columns are left-merged into.
        feature_names: List extended **in place** with the new column names.
        groupby: A pandas groupby object that exposes ``feature_name``.
        on: Merge key - must be the column ``groupby`` was grouped by. Defaults
            to ``'SK_ID_CURR'`` for backward compatibility; pass ``ID_COL`` for
            the tables used in this notebook.

    Returns:
        ``(features, feature_names)`` with one ``<feature_name>_<agg>`` column per aggregation.
    """
    feature_names.extend(['{}_{}'.format(feature_name, agg) for agg in aggs])

    for agg in aggs:
        if agg == 'kurt':
            agg_func = kurtosis
        elif agg == 'iqr':
            agg_func = iqr
        else:
            agg_func = agg

        new_name = '{}_{}'.format(feature_name, agg)
        g = groupby[feature_name].agg(agg_func).reset_index().rename(
            index=str, columns={feature_name: new_name})
        features = features.merge(g, on=on, how='left')
    return features, feature_names


def add_features_in_group(features, gr_, feature_name, aggs, prefix):
    """Store summary statistics of ``gr_[feature_name]`` in ``features``.

    For every recognised name in ``aggs`` the value is written under the key
    ``<prefix><feature_name>_<agg>``. Recognised names are sum, mean, max, min,
    std, count, skew, kurt, iqr and median; any other name is ignored.

    Args:
        features: Mapping-like container that receives the values (mutated and returned).
        gr_: Object indexable by ``feature_name`` whose result offers the pandas
            reduction methods (e.g. a DataFrame).
        feature_name: Column to summarise.
        aggs: Iterable of aggregation names.
        prefix: Prefix for the generated keys.
    """
    # (name, reducer) pairs, scanned in the same order as the original if/elif chain.
    reducers = (
        ('sum', lambda values: values.sum()),
        ('mean', lambda values: values.mean()),
        ('max', lambda values: values.max()),
        ('min', lambda values: values.min()),
        ('std', lambda values: values.std()),
        ('count', lambda values: values.count()),
        ('skew', skew),
        ('kurt', kurtosis),
        ('iqr', iqr),
        ('median', lambda values: values.median()),
    )
    for agg in aggs:
        for name, reducer in reducers:
            if agg == name:
                key = '{}{}_{}'.format(prefix, feature_name, name)
                features[key] = reducer(gr_[feature_name])
                break
    return features


def add_trend_feature(features, gr, feature_name, prefix):
    """Store the linear trend (slope) of ``gr[feature_name]`` in ``features``.

    The values are regressed on their position ``0..n-1`` with
    ``LinearRegression``; the fitted slope is written under the key
    ``<prefix><feature_name>``. If the fit fails for any ordinary reason
    (e.g. empty or NaN-containing input) the slope is recorded as NaN.

    Args:
        features: Mapping-like container that receives the value (mutated and returned).
        gr: Object indexable by ``feature_name`` whose result exposes ``.values``.
        feature_name: Column whose trend is computed.
        prefix: Prefix for the generated key.
    """
    y = gr[feature_name].values
    try:
        x = np.arange(0, len(y)).reshape(-1, 1)
        lr = LinearRegression()
        lr.fit(x, y)
        trend = lr.coef_[0]
    except Exception:
        # Best-effort feature: a failed fit becomes NaN. ``Exception`` (rather
        # than a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        trend = np.nan
    features['{}{}'.format(prefix, feature_name)] = trend
    return features


def reduce_memory(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and report the saving.

    * ``uint*`` columns are left untouched.
    * ``object`` and ``category`` columns are converted to ``category``.
    * ``int*`` columns become the first of int8/int16/int32/int64 whose range
      strictly contains the column's min and max.
    * every other column becomes float32 when its min/max lie strictly inside
      the float32 range, float64 otherwise.

    Memory usage before/after is printed. Returns the same (mutated) frame.
    """
    bytes_per_mb = 1024 ** 2
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        type_name = str(col_type)

        if type_name[:4] == 'uint':
            continue
        if not (col_type != object and type_name != 'category'):
            df[col] = df[col].astype('category')
            continue

        c_min, c_max = df[col].min(), df[col].max()
        if type_name[:3] == 'int':
            # Smallest signed integer type that holds the observed range.
            for candidate in signed_ints:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif c_min > float32_limits.min and c_max < float32_limits.max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

def calculate_permutation_importance(estimator,
                                     metric: callable,
                                     x_valid: pd.DataFrame,
                                     y_valid: pd.DataFrame,
                                     maximize: bool = True,
                                     probas: bool = False
                                     ) -> Tuple[pd.Series, pd.Series]:
    """Permutation importance of every column of ``x_valid``.

    Each feature is shuffled in turn (with the global ``seed`` re-applied
    before every shuffle), the estimator re-scores the perturbed data and the
    change against the unperturbed score is recorded.

    Args:
        estimator: Fitted model exposing ``predict`` (and optionally ``predict_proba``).
        metric: Callable ``metric(y_true, y_pred) -> float``.
        x_valid: Validation features; never modified.
        y_valid: Validation target.
        maximize: True when a larger metric is better; controls the sign of the
            delta so that a positive delta always means "the feature helped".
        probas: Score on ``predict_proba(...)[:, 1]`` when the estimator has it,
            otherwise on ``predict``.

    Returns:
        ``(scores, delta)`` - two Series indexed by feature name, both sorted in
        descending order: the metric after shuffling each feature, and the
        resulting drop in quality.
    """
    def _predict(estimator, x_valid, probas=True):
        if hasattr(estimator, "predict_proba") and probas:
            y_pred = estimator.predict_proba(x_valid)[:, 1]
        else:
            y_pred = estimator.predict(x_valid)
        return y_pred

    y_pred = _predict(estimator, x_valid, probas)
    base_score = metric(y_valid, y_pred)
    scores, delta = {}, {}

    # One working copy for the whole loop: a column is shuffled, scored and then
    # restored, instead of deep-copying the full frame once per feature.
    x_shuffled = x_valid.copy(deep=True)

    for feature in tqdm(x_valid.columns):
        np.random.seed(seed)
        x_shuffled[feature] = np.random.permutation(x_valid[feature])

        y_pred = _predict(estimator, x_shuffled, probas)
        feature_score = metric(y_valid, y_pred)

        # Restore from the untouched input so the next feature starts clean
        # (this also restores the original dtype of the column).
        x_shuffled[feature] = x_valid[feature]

        if maximize:
            delta[feature] = base_score - feature_score
        else:
            delta[feature] = feature_score - base_score

        scores[feature] = feature_score

    scores, delta = pd.Series(scores), pd.Series(delta)
    scores = scores.sort_values(ascending=False)
    delta = delta.sort_values(ascending=False)

    return scores, delta

In [None]:
# INSTALLMENTS TREND PERIODS
# Window sizes for the installment trend features.
INSTALLMENTS_LAST_K_TREND_PERIODS = [12, 24, 60, 120]

# Cross-validation setup: number of folds and early-stopping patience (rounds).
NUM_FOLDS = 10
EARLY_STOPPING = 100

# Keyword arguments for LGBMClassifier.
LIGHTGBM_PARAMS = dict(
    boosting_type='goss',
    n_estimators=10000,
    learning_rate=0.005134,
    num_leaves=54,
    max_depth=10,
    subsample_for_bin=240000,
    reg_alpha=0.436193,
    reg_lambda=0.479169,
    colsample_bytree=0.508716,
    min_split_gain=0.024766,
    subsample=1,
    is_unbalance=False,
    silent=-1,
    verbose=-1,
)

In [None]:
def get_train_test(path=os.getcwd(), num_rows=None):
    """Load train/test applications, join the client profile and engineer features.

    Reads ``train.csv``, ``test.csv`` and ``client_profile.csv`` from ``path``,
    stacks train on top of test, left-joins the profile on ``ID_COL``, cleans a
    few columns, derives ratio and group-statistic features, integer-encodes
    every object column and downcasts the dtypes.

    Args:
        path: Directory holding the CSV files. The default is evaluated once,
            when the function is defined.
        num_rows: Optional row limit forwarded to every ``pd.read_csv`` call.

    Returns:
        One DataFrame with the train rows first, followed by the test rows.
    """
    def get_age_label(days_birth):
        """Return the age group label (int) for an age expressed in days."""
        age_years = days_birth / 365.25
        if age_years < 27:
            return 1
        elif age_years < 40:
            return 2
        elif age_years < 50:
            return 3
        elif age_years < 65:
            return 4
        elif age_years < 99:
            return 5
        else:
            # Also reached for NaN, because every comparison above is False.
            return 0

    train = pd.read_csv(os.path.join(path, 'train.csv'), nrows=num_rows)
    test = pd.read_csv(os.path.join(path, 'test.csv'), nrows=num_rows)
    total = pd.concat([train, test], axis=0)
    del train, test; gc.collect()

    clients = pd.read_csv(os.path.join(path, 'client_profile.csv'), nrows=num_rows)
    df = total.merge(clients, on=ID_COL, how='left')
    del total, clients; gc.collect()

    # Data cleaning
    df.loc[df['GENDER'] == 'XNA', 'GENDER'] = df['GENDER'].mode()[0]
    df['GENDER'] = df['GENDER'].map({'F': 0, 'M': 1})
    df.loc[df['FAMILY_STATUS'] == 'Unknown', 'FAMILY_STATUS'] = 'Civil marriage'
    # 365243 is treated as a "missing" sentinel. Assign the result back: a
    # chained ``df[col].replace(..., inplace=True)`` does nothing under pandas
    # Copy-on-Write.
    df['DAYS_ON_LAST_JOB'] = df['DAYS_ON_LAST_JOB'].replace(365243, np.nan)
    # Cap outliers (> 50) at one above the largest regular value.
    qrt_col = 'AMT_REQ_CREDIT_BUREAU_QRT'
    qrt_cap = df.loc[df[qrt_col] <= 50, qrt_col].max() + 1
    df.loc[df[qrt_col] > 50, qrt_col] = qrt_cap

    # Flag_document features - count and kurtosis
    docs = [f for f in df.columns if 'FLAG_' in f]
    df['DOCUMENT_COUNT'] = df[docs].sum(axis=1)
    df['NEW_DOC_KURT'] = df[docs].kurtosis(axis=1)
    # Categorical age
    df['AGE_RANGE'] = df['AGE'].apply(get_age_label)

    # New features based on External ratings
    ext_cols = ['EXTERNAL_SCORING_RATING_1', 'EXTERNAL_SCORING_RATING_2', 'EXTERNAL_SCORING_RATING_3']
    df['EXT_SOURCES_PROD'] = df['EXTERNAL_SCORING_RATING_1'] * df['EXTERNAL_SCORING_RATING_2'] * df['EXTERNAL_SCORING_RATING_3']
    df['EXT_SOURCES_WEIGHTED'] = df.EXTERNAL_SCORING_RATING_1 * 2 + df.EXTERNAL_SCORING_RATING_2 * 1 + df.EXTERNAL_SCORING_RATING_3 * 3

    # Row-wise statistics over the three ratings; the numpy function is looked
    # up by name with getattr instead of eval.
    for function_name in ['min', 'max', 'mean', 'nanmedian', 'var']:
        feature_name = 'EXT_SOURCES_{}'.format(function_name.upper())
        df[feature_name] = getattr(np, function_name)(df[ext_cols], axis=1)

    # Credit ratios
    df['CREDIT_TO_ANNUITY_RATIO'] = df['AMOUNT_CREDIT'] / df['AMOUNT_ANNUITY']
    # Income ratios
    df['ANNUITY_TO_INCOME_RATIO'] = df['AMOUNT_ANNUITY'] / df['TOTAL_SALARY']
    df['CREDIT_TO_INCOME_RATIO'] = df['AMOUNT_CREDIT'] / df['TOTAL_SALARY']
    df['INCOME_TO_EMPLOYED_RATIO'] = df['TOTAL_SALARY'] / df['DAYS_ON_LAST_JOB']
    df['INCOME_TO_BIRTH_RATIO'] = df['TOTAL_SALARY'] / df['AGE']
    # Time ratios
    df['EMPLOYED_TO_BIRTH_RATIO'] = df['DAYS_ON_LAST_JOB'] / df['AGE']
    df['CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['AGE']
    df['CAR_TO_EMPLOYED_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_ON_LAST_JOB']

    # Groupby: Statistics for applications in the same group.
    # (Named group_cols so the module-level ``group`` helper is not shadowed.)
    group_cols = ['EDUCATION_LEVEL', 'AGE_RANGE', 'GENDER']
    df = do_median(df, group_cols, 'EXT_SOURCES_MEAN', 'GROUP_EXT_SOURCES_MEDIAN')
    df = do_std(df, group_cols, 'EXT_SOURCES_MEAN', 'GROUP_EXT_SOURCES_STD')
    df = do_mean(df, group_cols, 'TOTAL_SALARY', 'GROUP_INCOME_MEAN')
    df = do_std(df, group_cols, 'TOTAL_SALARY', 'GROUP_INCOME_STD')
    df = do_mean(df, group_cols, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP_CREDIT_TO_ANNUITY_MEAN')
    df = do_std(df, group_cols, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP_CREDIT_TO_ANNUITY_STD')
    df = do_mean(df, group_cols, 'AMOUNT_CREDIT', 'GROUP_CREDIT_MEAN')
    df = do_mean(df, group_cols, 'AMOUNT_ANNUITY', 'GROUP_ANNUITY_MEAN')
    df = do_std(df, group_cols, 'AMOUNT_ANNUITY', 'GROUP_ANNUITY_STD')

    # Encode categorical features (pd.factorize via label_encoder)
    df, _ = label_encoder(df, None)
    df = reduce_memory(df)

    return df

In [None]:
df = get_train_test()

# get_train_test stacks train above test, so the rows with a known TARGET are
# the first `border` rows.
border = len(df) - df[TARGET].isna().sum()
df_train = df.iloc[:border, :]
X, y = df_train.drop(TARGET, axis=1), df_train[TARGET]
# Build the test matrix as a new frame instead of dropping in place on a slice
# of df (avoids SettingWithCopyWarning and works under Copy-on-Write).
df_test = df.iloc[border:, :].drop(TARGET, axis=1)
del df_train; gc.collect()
X.shape, df_test.shape

Memory usage of dataframe is 113.39 MB
Memory usage after optimization is: 56.70 MB
Decreased by 50.0%


((110093, 52), (165141, 52))

In [None]:
# Preview the raw credit-bureau table (bki.csv) to see its columns.
# The name `bureau` is rebound to the aggregated features further down.
bureau = pd.read_csv('bki.csv')
bureau.head(1)

Unnamed: 0,APPLICATION_NUMBER,BUREAU_ID,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,123538884,5223613,Active,currency 1,718.0,0,377.0,,19386.81,0,675000.0,320265.495,0.0,0.0,Consumer credit,39.0,


In [None]:
# Aggregation specs consumed by get_bureau(): {column: [aggregations]}.
# Key order determines the order of the generated feature columns.

# Applied to every bureau row of an application (prefix 'BUREAU_').
BUREAU_AGG = {
    'BUREAU_ID': ['nunique'],
    'DAYS_CREDIT': ['min', 'max', 'mean'],
    'DAYS_CREDIT_ENDDATE': ['min', 'max'],
    'AMT_CREDIT_MAX_OVERDUE': ['max', 'mean'],
    'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
    'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
    'AMT_CREDIT_SUM_OVERDUE': ['max', 'mean', 'sum'],
    'AMT_ANNUITY': ['mean'],
    'DEBT_CREDIT_DIFF': ['mean', 'sum'],
    # Categorical
    'CREDIT_ACTIVE_Active': ['mean'],
    'CREDIT_ACTIVE_Closed': ['mean'],
    'CREDIT_ACTIVE_Sold': ['mean'],
    'CREDIT_TYPE_Consumer credit': ['mean'],
    'CREDIT_TYPE_Credit card': ['mean'],
    'CREDIT_TYPE_Car loan': ['mean'],
    'CREDIT_TYPE_Mortgage': ['mean'],
    'CREDIT_TYPE_Microloan': ['mean'],
}

# Applied to rows with CREDIT_ACTIVE_Active == 1 (prefix 'BUREAU_ACTIVE_').
BUREAU_ACTIVE_AGG = {
    'DAYS_CREDIT': ['max', 'mean'],
    'DAYS_CREDIT_ENDDATE': ['min', 'max'],
    'AMT_CREDIT_MAX_OVERDUE': ['max', 'mean'],
    'AMT_CREDIT_SUM': ['max', 'sum'],
    'AMT_CREDIT_SUM_DEBT': ['mean', 'sum'],
    'AMT_CREDIT_SUM_OVERDUE': ['max', 'mean'],
    'DAYS_CREDIT_UPDATE': ['min', 'mean'],
    'DEBT_PERCENTAGE': ['mean'],
    'DEBT_CREDIT_DIFF': ['mean'],
    'CREDIT_TO_ANNUITY_RATIO': ['mean'],
}

# Applied to rows with CREDIT_ACTIVE_Closed == 1 (prefix 'BUREAU_CLOSED_').
BUREAU_CLOSED_AGG = {
    'DAYS_CREDIT': ['max', 'var'],
    'DAYS_CREDIT_ENDDATE': ['max'],
    'AMT_CREDIT_MAX_OVERDUE': ['max', 'mean'],
    'AMT_CREDIT_SUM_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
    'AMT_CREDIT_SUM_DEBT': ['max', 'sum'],
    'DAYS_CREDIT_UPDATE': ['max'],
    'ENDDATE_DIFF': ['mean'],
}

# Applied once per credit type (consumer credit, credit card, mortgage, car loan, microloan).
BUREAU_LOAN_TYPE_AGG = {
    'DAYS_CREDIT': ['mean', 'max'],
    'AMT_CREDIT_MAX_OVERDUE': ['mean', 'max'],
    'AMT_CREDIT_SUM': ['mean', 'max'],
    'AMT_CREDIT_SUM_DEBT': ['mean', 'max'],
    'DEBT_PERCENTAGE': ['mean'],
    'DEBT_CREDIT_DIFF': ['mean'],
    'DAYS_CREDIT_ENDDATE': ['max'],
}

# Applied to the DAYS_CREDIT-filtered subsets (prefix 'BUREAU_LAST{6,12}M_').
BUREAU_TIME_AGG = {
    'AMT_CREDIT_MAX_OVERDUE': ['max', 'mean'],
    'AMT_CREDIT_SUM_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM': ['max', 'sum'],
    'AMT_CREDIT_SUM_DEBT': ['mean', 'sum'],
    'DEBT_PERCENTAGE': ['mean'],
    'DEBT_CREDIT_DIFF': ['mean'],
}

def get_bureau(path=os.getcwd(), num_rows=None):
    """Build per-application aggregate features from the credit-bureau table ``bki.csv``.

    Steps: derive duration / debt features per loan, one-hot encode the object
    columns, then aggregate per ``ID_COL`` for all loans, active loans, closed
    loans, each main credit type and two DAYS_CREDIT windows. Finally the max
    overdue of the last loan (by DAYS_CREDIT order) and two debt/credit ratios
    are added.

    Args:
        path: Directory holding ``bki.csv``. The default is evaluated once,
            when the function is defined.
        num_rows: Optional row limit forwarded to ``pd.read_csv``.

    Returns:
        DataFrame with one row per ``ID_COL`` value present in the bureau table.
    """
    bureau = pd.read_csv(os.path.join(path, 'bki.csv'), nrows=num_rows)
    # Credit duration and credit/account end date difference
    bureau['CREDIT_DURATION'] = -bureau['DAYS_CREDIT'] + bureau['DAYS_CREDIT_ENDDATE']
    bureau['ENDDATE_DIFF'] = bureau['DAYS_CREDIT_ENDDATE'] - bureau['DAYS_ENDDATE_FACT']
    # Credit to debt ratio and difference (ratio is inf when the debt is 0)
    bureau['DEBT_PERCENTAGE'] = bureau['AMT_CREDIT_SUM'] / bureau['AMT_CREDIT_SUM_DEBT']
    bureau['DEBT_CREDIT_DIFF'] = bureau['AMT_CREDIT_SUM'] - bureau['AMT_CREDIT_SUM_DEBT']
    bureau['CREDIT_TO_ANNUITY_RATIO'] = bureau['AMT_CREDIT_SUM'] / bureau['AMT_ANNUITY']

    # One-hot encoder (all object columns; no NaN indicator)
    bureau, _ = one_hot_encoder(bureau, nan_as_category=False)

    # General loans aggregations
    agg_bureau = group(bureau, 'BUREAU_', BUREAU_AGG)
    # Active and closed loans aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    agg_bureau = group_and_merge(active, agg_bureau, 'BUREAU_ACTIVE_', BUREAU_ACTIVE_AGG)
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    agg_bureau = group_and_merge(closed, agg_bureau, 'BUREAU_CLOSED_', BUREAU_CLOSED_AGG)
    del active, closed; gc.collect()
    # Aggregations for the main loan types
    for credit_type in ['Consumer credit', 'Credit card', 'Mortgage', 'Car loan', 'Microloan']:
        type_df = bureau[bureau['CREDIT_TYPE_' + credit_type] == 1]
        prefix = 'BUREAU_' + credit_type.split(' ')[0].upper() + '_'
        agg_bureau = group_and_merge(type_df, agg_bureau, prefix, BUREAU_LOAN_TYPE_AGG)
        del type_df; gc.collect()
    # Time based aggregations: last x months
    # NOTE(review): the filter assumes DAYS_CREDIT counts *negative* days before
    # the application. If this dataset stores positive day counts, the filter
    # keeps every row - confirm the sign convention against the data.
    for time_frame in [6, 12]:
        prefix = "BUREAU_LAST{}M_".format(time_frame)
        time_frame_df = bureau[bureau['DAYS_CREDIT'] >= -30*time_frame]
        agg_bureau = group_and_merge(time_frame_df, agg_bureau, prefix, BUREAU_TIME_AGG)
        del time_frame_df; gc.collect()

    # Last loan max overdue (last non-null value after sorting by DAYS_CREDIT)
    sort_bureau = bureau.sort_values(by=['DAYS_CREDIT'])
    gr = sort_bureau.groupby(ID_COL)['AMT_CREDIT_MAX_OVERDUE'].last().reset_index()
    # ``columns=`` is required: a bare mapping renames index labels, which left
    # the column under its raw name instead of BUREAU_LAST_LOAN_MAX_OVERDUE.
    gr = gr.rename(columns={'AMT_CREDIT_MAX_OVERDUE': 'BUREAU_LAST_LOAN_MAX_OVERDUE'})
    agg_bureau = agg_bureau.merge(gr, on=ID_COL, how='left')
    # Ratios: total debt/total credit and active loans debt/ active loans credit
    agg_bureau['BUREAU_DEBT_OVER_CREDIT'] = \
        agg_bureau['BUREAU_AMT_CREDIT_SUM_DEBT_SUM']/agg_bureau['BUREAU_AMT_CREDIT_SUM_SUM']
    agg_bureau['BUREAU_ACTIVE_DEBT_OVER_CREDIT'] = \
        agg_bureau['BUREAU_ACTIVE_AMT_CREDIT_SUM_DEBT_SUM']/agg_bureau['BUREAU_ACTIVE_AMT_CREDIT_SUM_SUM']
    return agg_bureau

In [None]:
# Replace the raw preview frame with the per-application bureau aggregates.
bureau = get_bureau()

In [None]:
# Preview the raw previous-applications table to see its columns.
history = pd.read_csv('applications_history.csv')
history.head(1)

Unnamed: 0,PREV_APPLICATION_NUMBER,APPLICATION_NUMBER,NAME_CONTRACT_TYPE,AMOUNT_ANNUITY,AMT_APPLICATION,AMOUNT_CREDIT,AMOUNT_PAYMENT,AMOUNT_GOODS_PAYMENT,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,SELLERPLACE_AREA,CNT_PAYMENT,NAME_YIELD_GROUP,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,49298709,123595216,,1730.43,17145.0,17145.0,0.0,17145.0,Approved,73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,35,12.0,middle,365243.0,42.0,300.0,42.0,37.0,0.0


In [None]:
# Preview the raw installment-payments table to see its columns.
payments = pd.read_csv('payments.csv')
payments.head(1)

Unnamed: 0,PREV_APPLICATION_NUMBER,APPLICATION_NUMBER,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,49011181,123664960,1.0,5,1002.0,1015.0,12156.615,12156.615


In [None]:
# Aggregation specs consumed by get_previous_applications(): {column: [aggregations]}.
# Key order determines the order of the generated feature columns.

# Applied to approved loans whose DAYS_LAST_DUE equals 365243 (prefix 'PREV_ACTIVE_').
PREVIOUS_ACTIVE_AGG = {
    'PREV_APPLICATION_NUMBER': ['nunique'],
    'SIMPLE_INTERESTS': ['mean'],
    'AMOUNT_ANNUITY': ['max', 'sum'],
    'AMT_APPLICATION': ['max', 'mean'],
    'AMOUNT_CREDIT': ['sum'],
    'AMT_DOWN_PAYMENT': ['max', 'mean'],
    'DAYS_DECISION': ['min', 'mean'],
    'CNT_PAYMENT': ['mean', 'sum'],
    'DAYS_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
    # Engineered features
    'AMT_PAYMENT': ['sum'],
    'INSTALMENT_PAYMENT_DIFF': ['mean', 'max'],
    'REMAINING_DEBT': ['max', 'mean', 'sum'],
    'REPAYMENT_RATIO': ['mean'],
}

# Applied to every previous application (prefix 'PREV_').
PREVIOUS_AGG = {
    'PREV_APPLICATION_NUMBER': ['nunique'],
    'AMOUNT_ANNUITY': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['max', 'mean'],
    'DAYS_TERMINATION': ['max'],
    # Engineered features
    'CREDIT_TO_ANNUITY_RATIO': ['mean', 'max'],
    'APPLICATION_CREDIT_DIFF': ['min', 'max', 'mean'],
    'APPLICATION_CREDIT_RATIO': ['min', 'max', 'mean', 'var'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
}

# Applied to rows with NAME_CONTRACT_STATUS_Approved == 1 (prefix 'APPROVED_').
PREVIOUS_APPROVED_AGG = {
    'PREV_APPLICATION_NUMBER': ['nunique'],
    'AMOUNT_ANNUITY': ['min', 'max', 'mean'],
    'AMOUNT_CREDIT': ['min', 'max', 'mean'],
    'AMOUNT_GOODS_PAYMENT': ['max'],
    'CNT_PAYMENT': ['max', 'mean'],
    'DAYS_TERMINATION': ['mean'],
    'DAYS_DECISION': ['min', 'mean'],
    # Engineered features
    'CREDIT_TO_ANNUITY_RATIO': ['mean', 'max'],
    'APPLICATION_CREDIT_DIFF': ['max'],
    'APPLICATION_CREDIT_RATIO': ['min', 'max', 'mean'],
    # The following features are only for approved applications
    'DAYS_FIRST_DRAWING': ['max', 'mean'],
    'DAYS_FIRST_DUE': ['min', 'mean'],
    'DAYS_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
    'DAYS_LAST_DUE': ['max', 'mean'],
    'DAYS_LAST_DUE_DIFF': ['min', 'max', 'mean'],
    'SIMPLE_INTERESTS': ['min', 'max', 'mean'],
}

# Aggregations for refused previous applications (prefix 'REFUSED_').
# 'DAYS_DECISION' used to be listed twice with the same value; a dict literal
# silently keeps only one entry, so the duplicate is removed (result unchanged).
PREVIOUS_REFUSED_AGG = {
    'AMT_APPLICATION': ['max', 'mean'],
    'AMOUNT_CREDIT': ['min', 'max'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['max', 'mean'],
    # Engineered features
    'APPLICATION_CREDIT_DIFF': ['min', 'max', 'mean', 'var'],
    'APPLICATION_CREDIT_RATIO': ['min', 'mean'],
    'NAME_CONTRACT_TYPE_Cash': ['mean'],
    'NAME_CONTRACT_TYPE_Credit Card': ['mean'],
}

# Applied once per contract type ('Cash', 'Credit Card') in get_previous_applications().
PREVIOUS_LOAN_TYPE_AGG = {
    'AMOUNT_CREDIT': ['sum'],
    'AMOUNT_ANNUITY': ['mean', 'max'],
    'SIMPLE_INTERESTS': ['min', 'mean', 'max', 'var'],
    'APPLICATION_CREDIT_DIFF': ['min', 'var'],
    'APPLICATION_CREDIT_RATIO': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['max'],
    'DAYS_LAST_DUE_1ST_VERSION': ['max', 'mean'],
    'CNT_PAYMENT': ['mean'],
}

# Applied to previous applications that have at least one payment flagged late (prefix 'PREV_LATE_').
PREVIOUS_LATE_PAYMENTS_AGG = {
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'DAYS_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
    # Engineered features
    'APPLICATION_CREDIT_DIFF': ['min'],
    'NAME_CONTRACT_TYPE_Cash': ['mean'],
    'NAME_CONTRACT_TYPE_Credit Card': ['mean'],
}

# Applied to the DAYS_DECISION-filtered subsets (prefix 'PREV_LAST{12,24}M_').
PREVIOUS_TIME_AGG = {
    'AMOUNT_CREDIT': ['sum'],
    'AMOUNT_ANNUITY': ['mean', 'max'],
    'SIMPLE_INTERESTS': ['mean', 'max'],
    'DAYS_DECISION': ['min', 'mean'],
    'DAYS_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
    # Engineered features
    'APPLICATION_CREDIT_DIFF': ['min'],
    'APPLICATION_CREDIT_RATIO': ['min', 'max', 'mean'],
    'NAME_CONTRACT_TYPE_Cash': ['mean'],
    'NAME_CONTRACT_TYPE_Credit Card': ['mean'],
}

def get_previous_applications(path=os.getcwd(), num_rows=None):
    """Build per-application aggregates from ``applications_history.csv`` and ``payments.csv``.

    Steps: one-hot encode the main categorical columns, derive ratio features,
    aggregate the still-active approved loans (using the payments table for the
    amounts already paid), then aggregate all / approved / refused / per-type /
    late-payment / recent applications per ``ID_COL``. Columns with fewer than
    two distinct values are dropped at the end.

    Args:
        path: Directory holding the two CSV files. The default is evaluated
            once, when the function is defined.
        num_rows: Optional row limit forwarded to both ``pd.read_csv`` calls.

    Returns:
        DataFrame with one row per ``ID_COL`` value present in the history table.
    """
    prev = pd.read_csv(os.path.join(path, 'applications_history.csv'), nrows=num_rows)
    pay = pd.read_csv(os.path.join(path, 'payments.csv'), nrows=num_rows)
    # Renamed so it cannot clash with AMT_PAYMENT coming from the payments table.
    prev = prev.rename(columns={'AMOUNT_PAYMENT': 'AMT_DOWN_PAYMENT'})

    # One-hot encode most important categorical features
    ohe_columns = [
        'NAME_CONTRACT_STATUS', 'NAME_CONTRACT_TYPE',
        'NAME_TYPE_SUITE', 'NAME_YIELD_GROUP',
        'NAME_PRODUCT_TYPE', 'NAME_CLIENT_TYPE']
    prev, categorical_cols = one_hot_encoder(prev, ohe_columns, nan_as_category=False)

    # Feature engineering: ratios and difference
    prev['APPLICATION_CREDIT_DIFF'] = prev['AMT_APPLICATION'] - prev['AMOUNT_CREDIT']
    prev['APPLICATION_CREDIT_RATIO'] = prev['AMT_APPLICATION'] / prev['AMOUNT_CREDIT']
    prev['CREDIT_TO_ANNUITY_RATIO'] = prev['AMOUNT_CREDIT']/prev['AMOUNT_ANNUITY']
    prev['DOWN_PAYMENT_TO_CREDIT'] = prev['AMT_DOWN_PAYMENT'] / prev['AMOUNT_CREDIT']
    # Interest ratio on previous application (simplified)
    total_payment = prev['AMOUNT_ANNUITY'] * prev['CNT_PAYMENT']
    prev['SIMPLE_INTERESTS'] = (total_payment/prev['AMOUNT_CREDIT'] - 1)/prev['CNT_PAYMENT']

    # Active loans - approved and not complete yet (last_due 365243).
    # Explicit copy: a column is added to `approved` below.
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1].copy()
    active_df = approved[approved['DAYS_LAST_DUE'] == 365243]
    # Find how much was already paid in active loans (using the payments csv)
    active_pay = pay[pay['PREV_APPLICATION_NUMBER'].isin(active_df['PREV_APPLICATION_NUMBER'])]
    active_pay_agg = active_pay.groupby('PREV_APPLICATION_NUMBER')[['AMT_INSTALMENT', 'AMT_PAYMENT']].sum()
    active_pay_agg = active_pay_agg.reset_index()
    # Active loans: difference of what was paid and installments
    active_pay_agg['INSTALMENT_PAYMENT_DIFF'] = active_pay_agg['AMT_INSTALMENT'] - active_pay_agg['AMT_PAYMENT']
    # Merge with active_df
    active_df = active_df.merge(active_pay_agg, on='PREV_APPLICATION_NUMBER', how='left')
    active_df['REMAINING_DEBT'] = active_df['AMOUNT_CREDIT'] - active_df['AMT_PAYMENT']
    active_df['REPAYMENT_RATIO'] = active_df['AMT_PAYMENT'] / active_df['AMOUNT_CREDIT']
    # Perform aggregations for active applications
    active_agg_df = group(active_df, 'PREV_ACTIVE_', PREVIOUS_ACTIVE_AGG)
    active_agg_df['TOTAL_REPAYMENT_RATIO'] = active_agg_df['PREV_ACTIVE_AMT_PAYMENT_SUM']/\
                                             active_agg_df['PREV_ACTIVE_AMOUNT_CREDIT_SUM']
    del active_pay, active_pay_agg, active_df; gc.collect()

    # Change 365243 sentinel values to nan (missing). Assign the result back:
    # a chained ``prev[col].replace(..., inplace=True)`` does nothing under
    # pandas Copy-on-Write.
    for col in ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                'DAYS_LAST_DUE', 'DAYS_TERMINATION']:
        prev[col] = prev[col].replace(365243, np.nan)
    # Days last due difference (scheduled x done)
    prev['DAYS_LAST_DUE_DIFF'] = prev['DAYS_LAST_DUE_1ST_VERSION'] - prev['DAYS_LAST_DUE']
    # NOTE(review): `approved` was taken before the sentinel replacement, so its
    # DAYS_* columns (and this difference) still contain 365243. Kept as-is to
    # leave the APPROVED_* feature values unchanged - confirm whether intended.
    approved['DAYS_LAST_DUE_DIFF'] = approved['DAYS_LAST_DUE_1ST_VERSION'] - approved['DAYS_LAST_DUE']

    # Categorical features
    categorical_agg = {key: ['mean'] for key in categorical_cols}
    # Perform general aggregations
    agg_prev = group(prev, 'PREV_', {**PREVIOUS_AGG, **categorical_agg})
    # Merge active loans dataframe on agg_prev
    agg_prev = agg_prev.merge(active_agg_df, how='left', on=ID_COL)
    del active_agg_df; gc.collect()
    # Aggregations for approved and refused loans
    agg_prev = group_and_merge(approved, agg_prev, 'APPROVED_', PREVIOUS_APPROVED_AGG)
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    agg_prev = group_and_merge(refused, agg_prev, 'REFUSED_', PREVIOUS_REFUSED_AGG)
    del approved, refused; gc.collect()
    # Aggregations for Credit Card and Cash loans
    for loan_type in ['Cash', 'Credit Card']:
        type_df = prev[prev['NAME_CONTRACT_TYPE_{}'.format(loan_type)] == 1]
        prefix = 'PREV_' + loan_type.split(" ")[0] + '_'
        agg_prev = group_and_merge(type_df, agg_prev, prefix, PREVIOUS_LOAN_TYPE_AGG)
        del type_df; gc.collect()

    # Get the PREV_APPLICATION_NUMBER for loans with late payments (days past due).
    # NOTE(review): "late" means DAYS_ENTRY_PAYMENT > DAYS_INSTALMENT; this is
    # only right if larger day values are later in time - confirm for this dataset.
    pay['LATE_PAYMENT'] = ((pay['DAYS_ENTRY_PAYMENT'] - pay['DAYS_INSTALMENT']) > 0).astype(int)
    dpd_id = pay.loc[pay['LATE_PAYMENT'] > 0, 'PREV_APPLICATION_NUMBER'].unique()
    # Aggregations for loans with late payments. The merged result is kept;
    # previously it was assigned to a temporary and deleted, so the PREV_LATE_*
    # features never reached the output.
    agg_prev = group_and_merge(prev[prev['PREV_APPLICATION_NUMBER'].isin(dpd_id)], agg_prev,
                               'PREV_LATE_', PREVIOUS_LATE_PAYMENTS_AGG)
    del dpd_id; gc.collect()
    # Aggregations for loans in the last x months
    # NOTE(review): the filter assumes DAYS_DECISION counts *negative* days
    # before the application; with positive day counts it keeps every row.
    for time_frame in [12, 24]:
        time_frame_df = prev[prev['DAYS_DECISION'] >= -30*time_frame]
        prefix = 'PREV_LAST{}M_'.format(time_frame)
        agg_prev = group_and_merge(time_frame_df, agg_prev, prefix, PREVIOUS_TIME_AGG)
        del time_frame_df; gc.collect()
    del prev; gc.collect()
    # Drop columns that carry no information (fewer than two distinct non-null values).
    constant_cols = [col for col in agg_prev.columns if agg_prev[col].nunique() < 2]
    agg_prev = agg_prev.drop(columns=constant_cols)
    return agg_prev

In [None]:
# Per-application aggregates of the previous applications and their payments.
pay_history = get_previous_applications()

In [None]:
# Attach the bureau and previous-application aggregates to the training matrix.
# Left joins keep every row of X; applications without history get NaN in the new columns.
# NOTE: this cell reassigns X, so running it twice merges the same tables a second time.
X = pd.merge(X, bureau, on=ID_COL, how='left')
X = pd.merge(X, pay_history, on=ID_COL, how='left')

In [None]:
def print_scores(folds_scores, train_scores):
    """Print per-fold train/validation scores followed by their mean +/- std."""
    labelled = (("Train", train_scores), ("Valid", folds_scores))
    for label, values in labelled:
        print(f"{label} score by each fold: {values}")
    for label, values in labelled:
        print(f"{label} mean score by each fold:{np.mean(values):.5f} +/- {np.std(values):.5f}")
    print("*" * 50)
    
def lightgbm_cross_validation(params, X, y, cv, categorical=None, rounds=50, verbose=True):
    """Train one LGBMClassifier per CV fold and collect out-of-fold predictions.

    Args:
        params: Keyword arguments for ``LGBMClassifier``.
        X: Feature frame.
        y: Target, aligned with ``X`` row by row.
        cv: Splitter exposing ``split(X, y)`` that yields positional index arrays.
        categorical: Categorical feature names/indices forwarded to ``fit`` as
            ``categorical_feature``; empty/None means ``"auto"``.
        rounds: Early-stopping patience.
        verbose: Print per-fold and summary scores.

    Returns:
        ``(estimators, oof_preds)`` - the fitted models (one per fold) and the
        out-of-fold positive-class probabilities as a 1-D array of length ``len(X)``.
    """
    def _take(data, positions):
        # cv.split yields *positions*, so pandas objects must be indexed with
        # iloc (label-based access breaks on any non-default index).
        return data.iloc[positions] if hasattr(data, "iloc") else data[positions]

    estimators, folds_scores, train_scores = [], [], []

    if not categorical:
        categorical = "auto"

    oof_preds = np.zeros(X.shape[0])
    if verbose:
        print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        x_train, x_valid = _take(X, train_idx), _take(X, valid_idx)
        y_train, y_valid = _take(y, train_idx), _take(y, valid_idx)

        model = LGBMClassifier(**params)
        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords are
        # kept as in the rest of the notebook; newer LightGBM releases expect
        # callbacks instead - confirm against the installed version.
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            eval_metric="auc",
            categorical_feature=categorical,
            verbose=0,
            early_stopping_rounds=rounds
        )
        train_score = model.predict_proba(x_train)[:, 1]
        train_score = roc_auc_score(y_train, train_score)
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        folds_scores.append(round(score, 5))
        train_scores.append(round(train_score, 5))
        if verbose:
            print(f"Fold {fold + 1}, Train score = {train_score:.5f}, Valid score = {score:.5f}")
        estimators.append(model)

    if verbose:
        print_scores(folds_scores, train_scores)
        print(f"OOF-score: {roc_auc_score(y, oof_preds):.5f}")
    return estimators, oof_preds

In [None]:
def add_ratios_features(df):
    """Append ratio features built from existing columns.

    Every new column is ``df[numerator] / df[denominator]``. The frame is
    modified in place and the same object is returned, so pass a copy when
    the input must stay untouched.
    """
    ratio_specs = (
        # (new column, numerator, denominator)
        # bureau credit relative to income
        ('BUREAU_INCOME_CREDIT_RATIO', 'BUREAU_AMT_CREDIT_SUM_MEAN', 'TOTAL_SALARY'),
        ('BUREAU_ACTIVE_CREDIT_TO_INCOME_RATIO', 'BUREAU_ACTIVE_AMT_CREDIT_SUM_SUM', 'TOTAL_SALARY'),
        # approved previous credit relative to the current credit
        ('CURRENT_TO_APPROVED_CREDIT_MIN_RATIO', 'APPROVED_AMOUNT_CREDIT_MIN', 'AMOUNT_CREDIT'),
        ('CURRENT_TO_APPROVED_CREDIT_MAX_RATIO', 'APPROVED_AMOUNT_CREDIT_MAX', 'AMOUNT_CREDIT'),
        ('CURRENT_TO_APPROVED_CREDIT_MEAN_RATIO', 'APPROVED_AMOUNT_CREDIT_MEAN', 'AMOUNT_CREDIT'),
        # approved previous annuity relative to the current annuity
        ('CURRENT_TO_APPROVED_ANNUITY_MAX_RATIO', 'APPROVED_AMOUNT_ANNUITY_MAX', 'AMOUNT_ANNUITY'),
        ('CURRENT_TO_APPROVED_ANNUITY_MEAN_RATIO', 'APPROVED_AMOUNT_ANNUITY_MEAN', 'AMOUNT_ANNUITY'),
        # previous credit-to-annuity ratio relative to the current one
        ('CTA_CREDIT_TO_ANNUITY_MAX_RATIO', 'APPROVED_CREDIT_TO_ANNUITY_RATIO_MAX', 'CREDIT_TO_ANNUITY_RATIO'),
        ('CTA_CREDIT_TO_ANNUITY_MEAN_RATIO', 'APPROVED_CREDIT_TO_ANNUITY_RATIO_MEAN', 'CREDIT_TO_ANNUITY_RATIO'),
        # day counts relative to age and to time on the last job
        ('DAYS_DECISION_MEAN_TO_BIRTH', 'APPROVED_DAYS_DECISION_MEAN', 'AGE'),
        ('DAYS_CREDIT_MEAN_TO_BIRTH', 'BUREAU_DAYS_CREDIT_MEAN', 'AGE'),
        ('DAYS_DECISION_MEAN_TO_EMPLOYED', 'APPROVED_DAYS_DECISION_MEAN', 'DAYS_ON_LAST_JOB'),
        ('DAYS_CREDIT_MEAN_TO_EMPLOYED', 'BUREAU_DAYS_CREDIT_MEAN', 'DAYS_ON_LAST_JOB'),
    )
    for new_col, numerator, denominator in ratio_specs:
        df[new_col] = df[numerator] / df[denominator]
    return df

In [None]:
# add_ratios_features writes its columns in place, so hand it a deep copy
# and keep X itself free of the ratio features.
X_ = X.copy(deep=True).pipe(add_ratios_features)

In [None]:
# Permutation importance on a 20% hold-out split.
# The hold-out part is also the early-stopping eval set of the model being scored.
x_train, x_test, y_train, y_test = train_test_split(
    X_, y, test_size=0.2, random_state=seed
)

permutation_model = LGBMClassifier(**LIGHTGBM_PARAMS)
permutation_model.fit(
    x_train, y_train,
    eval_set=[(x_test, y_test)],
    early_stopping_rounds=EARLY_STOPPING,
    verbose=-1,
)

scores, deltas = calculate_permutation_importance(
    permutation_model,
    x_valid=x_test,
    y_valid=y_test,
    probas=True,
    metric=roc_auc_score,
)

# Keep only the index labels (feature names) whose delta is strictly positive.
deltas = deltas[deltas > 0].index.tolist()
deltas

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1232]	valid_0's binary_logloss: 0.245117


100%|██████████| 375/375 [30:23<00:00,  4.86s/it]


['EXT_SOURCES_MEAN',
 'CREDIT_TO_ANNUITY_RATIO',
 'EXTERNAL_SCORING_RATING_3',
 'NAME_CONTRACT_TYPE',
 'EMPLOYED_TO_BIRTH_RATIO',
 'DAYS_ON_LAST_JOB',
 'EXT_SOURCES_MIN',
 'GROUP_EXT_SOURCES_MEDIAN',
 'EXT_SOURCES_NANMEDIAN',
 'AGE',
 'AMOUNT_ANNUITY',
 'EXTERNAL_SCORING_RATING_1',
 'GROUP_INCOME_STD',
 'CAR_TO_BIRTH_RATIO',
 'OWN_CAR_AGE',
 'EXT_SOURCES_MAX',
 'GENDER',
 'GROUP_EXT_SOURCES_STD',
 'FAMILY_STATUS',
 'EDUCATION_LEVEL',
 'ANNUITY_TO_INCOME_RATIO',
 'GROUP_CREDIT_TO_ANNUITY_MEAN',
 'INCOME_TO_EMPLOYED_RATIO',
 'GROUP_ANNUITY_MEAN',
 'REFUSED_APPLICATION_CREDIT_DIFF_MAX',
 'DAYS_DECISION_MEAN_TO_EMPLOYED',
 'GROUP_INCOME_MEAN',
 'PREV_LAST24M_DAYS_LAST_DUE_1ST_VERSION_MAX',
 'EXTERNAL_SCORING_RATING_2',
 'CURRENT_TO_APPROVED_ANNUITY_MEAN_RATIO',
 'AMT_CREDIT_MAX_OVERDUE',
 'PREV_Cash_SIMPLE_INTERESTS_MEAN',
 'BUREAU_CLOSED_AMT_CREDIT_MAX_OVERDUE_MAX',
 'PREV_Cash_DAYS_LAST_DUE_1ST_VERSION_MEAN',
 'EXT_SOURCES_PROD',
 'APPROVED_CREDIT_TO_ANNUITY_RATIO_MAX',
 'GROUP_ANNUITY_S

In [None]:
# Hard-coded feature subset used for the final model. It appears to be the
# `deltas > 0` selection printed by the permutation-importance cell above,
# pasted here so that slow cell (about 30 minutes in the captured output) does
# not have to be re-run. NOTE(review): regenerate this list if the feature
# engineering or LIGHTGBM_PARAMS change - confirm it is still in sync.
deltas = ['EXT_SOURCES_MEAN', 'CREDIT_TO_ANNUITY_RATIO', 'EXTERNAL_SCORING_RATING_3', 'NAME_CONTRACT_TYPE', 'EMPLOYED_TO_BIRTH_RATIO',
          'DAYS_ON_LAST_JOB', 'EXT_SOURCES_MIN', 'GROUP_EXT_SOURCES_MEDIAN', 'EXT_SOURCES_NANMEDIAN', 'AGE', 'AMOUNT_ANNUITY',
          'EXTERNAL_SCORING_RATING_1', 'GROUP_INCOME_STD', 'CAR_TO_BIRTH_RATIO', 'OWN_CAR_AGE', 'EXT_SOURCES_MAX', 'GENDER',
          'GROUP_EXT_SOURCES_STD', 'FAMILY_STATUS', 'EDUCATION_LEVEL', 'ANNUITY_TO_INCOME_RATIO', 'GROUP_CREDIT_TO_ANNUITY_MEAN',
          'INCOME_TO_EMPLOYED_RATIO', 'GROUP_ANNUITY_MEAN', 'REFUSED_APPLICATION_CREDIT_DIFF_MAX', 'DAYS_DECISION_MEAN_TO_EMPLOYED',
          'GROUP_INCOME_MEAN', 'PREV_LAST24M_DAYS_LAST_DUE_1ST_VERSION_MAX', 'EXTERNAL_SCORING_RATING_2', 
          'CURRENT_TO_APPROVED_ANNUITY_MEAN_RATIO', 'AMT_CREDIT_MAX_OVERDUE', 'PREV_Cash_SIMPLE_INTERESTS_MEAN',
          'BUREAU_CLOSED_AMT_CREDIT_MAX_OVERDUE_MAX', 'PREV_Cash_DAYS_LAST_DUE_1ST_VERSION_MEAN', 'EXT_SOURCES_PROD',
          'APPROVED_CREDIT_TO_ANNUITY_RATIO_MAX', 'GROUP_ANNUITY_STD', 'PREV_LAST12M_DAYS_LAST_DUE_1ST_VERSION_MIN',
          'BUREAU_CREDIT_DAYS_CREDIT_MAX', 'APPROVED_AMOUNT_ANNUITY_MEAN', 'BUREAU_CLOSED_AMT_CREDIT_MAX_OVERDUE_MEAN',
          'PREV_LAST12M_DAYS_LAST_DUE_1ST_VERSION_MEAN', 'BUREAU_CLOSED_DAYS_CREDIT_UPDATE_MAX', 
          'PREV_NAME_YIELD_GROUP_low_action_MEAN', 'BUREAU_DEBT_CREDIT_DIFF_SUM', 'PREV_Cash_SIMPLE_INTERESTS_MAX',
          'PREV_NAME_PRODUCT_TYPE_XNA_MEAN', 'PREV_CREDIT_TO_ANNUITY_RATIO_MAX', 'APPROVED_DAYS_FIRST_DUE_MIN',
          'PREV_APPLICATION_CREDIT_DIFF_MIN', 'BUREAU_CREDIT_AMT_CREDIT_SUM_MEAN', 'PREV_Cash_APPLICATION_CREDIT_RATIO_MEAN',
          'BUREAU_ACTIVE_DAYS_CREDIT_ENDDATE_MAX', 'AMOUNT_CREDIT', 'PREV_NAME_YIELD_GROUP_middle_MEAN', 
          'PREV_LAST12M_SIMPLE_INTERESTS_MEAN', 'PREV_NAME_CONTRACT_TYPE_Cash_MEAN', 'BUREAU_CLOSED_ENDDATE_DIFF_MEAN',
          'BUREAU_DAYS_CREDIT_ENDDATE_MIN', 'PREV_NAME_CLIENT_TYPE_Refreshed_MEAN', 'REFUSED_APPLICATION_CREDIT_RATIO_MEAN',
          'REFUSED_AMT_APPLICATION_MAX', 'APPROVED_DAYS_FIRST_DRAWING_MEAN', 'APPROVED_AMOUNT_GOODS_PAYMENT_MAX',
          'PREV_NAME_PRODUCT_TYPE_x-sell_MEAN', 'CURRENT_TO_APPROVED_ANNUITY_MAX_RATIO', 'BUREAU_CREDIT_TYPE_Consumer credit_MEAN',
          'REFUSED_AMOUNT_CREDIT_MIN', 'REFUSED_DAYS_DECISION_MIN', 'PREV_Cash_SIMPLE_INTERESTS_MIN',
          'PREV_LAST24M_AMOUNT_ANNUITY_MEAN', 'APPROVED_CNT_PAYMENT_MEAN', 'PREV_LAST12M_NAME_CONTRACT_TYPE_Credit Card_MEAN',
          'PREV_ACTIVE_AMOUNT_ANNUITY_SUM', 'REGION_POPULATION', 'BUREAU_AMT_CREDIT_SUM_DEBT_SUM', 'APPROVED_AMOUNT_ANNUITY_MAX',
          'REFUSED_APPLICATION_CREDIT_RATIO_MIN', 'DAYS_CREDIT_MEAN_TO_BIRTH', 'BUREAU_LAST6M_AMT_CREDIT_SUM_DEBT_MEAN',
          'PREV_NAME_CONTRACT_STATUS_Approved_MEAN', 'PREV_LAST12M_AMOUNT_CREDIT_SUM', 'PREV_CNT_PAYMENT_MAX',
          'APPROVED_AMOUNT_CREDIT_MAX', 'BUREAU_LAST6M_DEBT_CREDIT_DIFF_MEAN', 'BUREAU_CONSUMER_DAYS_CREDIT_MAX',
          'PREV_LAST24M_APPLICATION_CREDIT_RATIO_MIN', 'PREV_NAME_CLIENT_TYPE_Repeater_MEAN', 'BUREAU_CAR_AMT_CREDIT_SUM_MAX',
          'BUREAU_DAYS_CREDIT_ENDDATE_MAX', 'PREV_APPLICATION_CREDIT_RATIO_MIN', 'PREV_NAME_CONTRACT_TYPE_Credit Card_MEAN',
          'REFUSED_APPLICATION_CREDIT_DIFF_MEAN', 'PREV_Cash_AMOUNT_CREDIT_SUM', 'EXT_SOURCES_VAR', 'CTA_CREDIT_TO_ANNUITY_MAX_RATIO',
          'CHILDRENS', 'PREV_Cash_AMOUNT_ANNUITY_MAX', 'BUREAU_CONSUMER_AMT_CREDIT_MAX_OVERDUE_MEAN', 'BUREAU_AMT_CREDIT_SUM_MEAN',
          'CAR_TO_EMPLOYED_RATIO', 'BUREAU_CAR_DEBT_PERCENTAGE_MEAN', 'PREV_LAST24M_APPLICATION_CREDIT_RATIO_MEAN',
          'BUREAU_LAST12M_AMT_CREDIT_SUM_MAX', 'PREV_LAST12M_AMOUNT_ANNUITY_MEAN', 'PREV_Cash_APPLICATION_CREDIT_DIFF_MIN',
          'CTA_CREDIT_TO_ANNUITY_MEAN_RATIO', 'PREV_APPLICATION_CREDIT_DIFF_MEAN', 'BUREAU_AMT_CREDIT_MAX_OVERDUE_MEAN',
          'BUREAU_CAR_DAYS_CREDIT_ENDDATE_MAX', 'BUREAU_ACTIVE_AMT_CREDIT_SUM_MAX', 'EXT_SOURCES_WEIGHTED',
          'PREV_NAME_TYPE_SUITE_Group of people_MEAN', 'BUREAU_CLOSED_AMT_CREDIT_SUM_MAX', 'AGE_RANGE',
          'BUREAU_AMT_CREDIT_MAX_OVERDUE_MAX', 'PREV_LAST24M_SIMPLE_INTERESTS_MEAN', 'PREV_ACTIVE_CNT_PAYMENT_SUM',
          'PREV_ACTIVE_SIMPLE_INTERESTS_MEAN', 'BUREAU_CAR_AMT_CREDIT_SUM_MEAN', 'PREV_APPLICATION_CREDIT_RATIO_MEAN',
          'BUREAU_CAR_DAYS_CREDIT_MEAN', 'BUREAU_DAYS_CREDIT_MIN', 'DAYS_DECISION_MEAN_TO_BIRTH',
          'APPROVED_APPLICATION_CREDIT_DIFF_MAX', 'PREV_LAST12M_APPLICATION_CREDIT_RATIO_MEAN',
          'BUREAU_CAR_AMT_CREDIT_MAX_OVERDUE_MEAN', 'APPROVED_DAYS_LAST_DUE_1ST_VERSION_MEAN', 'FLAG_PHONE',
          'BUREAU_LAST12M_AMT_CREDIT_MAX_OVERDUE_MEAN', 'PREV_Cash_APPLICATION_CREDIT_RATIO_MAX',
          'PREV_Credit_APPLICATION_CREDIT_DIFF_VAR', 'APPROVED_AMOUNT_CREDIT_MIN', 'BUREAU_CAR_AMT_CREDIT_SUM_DEBT_MEAN',
          'PREV_LAST24M_DAYS_LAST_DUE_1ST_VERSION_MIN', 'REFUSED_CNT_PAYMENT_MAX', 'BUREAU_CAR_AMT_CREDIT_SUM_DEBT_MAX',
          'PREV_NAME_CONTRACT_STATUS_Unused offer_MEAN', 'PREV_LAST12M_DAYS_DECISION_MIN', 'BUREAU_CAR_AMT_CREDIT_MAX_OVERDUE_MAX',
          'PREV_LAST24M_DAYS_LAST_DUE_1ST_VERSION_MEAN', 'BUREAU_MORTGAGE_DEBT_CREDIT_DIFF_MEAN', 'PREV_NAME_TYPE_SUITE_Children_MEAN',
          'BUREAU_CAR_DEBT_CREDIT_DIFF_MEAN', 'BUREAU_DEBT_CREDIT_DIFF_MEAN', 'PREV_LAST24M_NAME_CONTRACT_TYPE_Cash_MEAN',
          'BUREAU_CLOSED_AMT_CREDIT_SUM_DEBT_SUM', 'REFUSED_NAME_CONTRACT_TYPE_Credit Card_MEAN',
          'BUREAU_MICROLOAN_DAYS_CREDIT_ENDDATE_MAX', 'BUREAU_LAST6M_AMT_CREDIT_MAX_OVERDUE_MEAN',
          'PREV_ACTIVE_INSTALMENT_PAYMENT_DIFF_MAX', 'PREV_LAST24M_DAYS_DECISION_MIN', 'BUREAU_LAST12M_AMT_CREDIT_SUM_DEBT_SUM',
          'BUREAU_MICROLOAN_AMT_CREDIT_MAX_OVERDUE_MEAN', 'APPROVED_DAYS_LAST_DUE_1ST_VERSION_MAX',
          'BUREAU_LAST12M_AMT_CREDIT_MAX_OVERDUE_MAX', 'DOCUMENT_COUNT', 'BUREAU_MORTGAGE_AMT_CREDIT_SUM_DEBT_MAX',
          'BUREAU_ACTIVE_AMT_CREDIT_SUM_OVERDUE_MAX', 'PREV_Credit_APPLICATION_CREDIT_RATIO_MIN',
          'PREV_NAME_TYPE_SUITE_Other_A_MEAN', 'BUREAU_BUREAU_ID_NUNIQUE', 'BUREAU_LAST6M_AMT_CREDIT_MAX_OVERDUE_MAX',
          'BUREAU_MICROLOAN_AMT_CREDIT_SUM_DEBT_MAX']

In [None]:
# Cross-validate LightGBM on the selected feature subset with shuffled K-fold splits.
kfold_lgbm6 = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=seed)
estimators_lgbm6, oof_preds_lgbm6 = lightgbm_cross_validation(
    params=LIGHTGBM_PARAMS,
    X=X_[deltas],
    y=y,
    cv=kfold_lgbm6,
    rounds=EARLY_STOPPING,
)

Wed Aug 31 16:30:52 2022, Cross-Validation, 110093 rows, 164 cols
Fold 1, Train score = 0.86773, Valid score = 0.74653
Fold 2, Train score = 0.86409, Valid score = 0.73794
Fold 3, Train score = 0.84495, Valid score = 0.72629
Fold 4, Train score = 0.83559, Valid score = 0.72070
Fold 5, Train score = 0.72415, Valid score = 0.72269
Fold 6, Train score = 0.87117, Valid score = 0.72261
Fold 7, Train score = 0.86119, Valid score = 0.73162
Fold 8, Train score = 0.84422, Valid score = 0.71605
Fold 9, Train score = 0.85642, Valid score = 0.71965
Fold 10, Train score = 0.86587, Valid score = 0.71075
Train score by each fold: [0.86773, 0.86409, 0.84495, 0.83559, 0.72415, 0.87117, 0.86119, 0.84422, 0.85642, 0.86587]
Valid score by each fold: [0.74653, 0.73794, 0.72629, 0.7207, 0.72269, 0.72261, 0.73162, 0.71605, 0.71965, 0.71075]
Train mean score by each fold:0.84354 +/- 0.04131
Valid mean score by each fold:0.72548 +/- 0.01008
**************************************************
OOF-score: 0.71758


In [None]:
# Assemble the test frame with the same joins and ratio features as the training frame.
test_data_ = (
    df_test.copy(deep=True)
    .merge(bureau, on=ID_COL, how='left')
    .merge(pay_history, on=ID_COL, how='left')
    .pipe(add_ratios_features)
)

In [None]:
test_preds = np.array([model.predict_proba(test_data_[deltas])[:,1] for model in estimators_lgbm6]).T
# test_preds = stats.gmean(test_preds, axis=1)
test_preds = np.mean(test_preds, axis=1)
test_preds = pd.DataFrame({ID_COL: test_data_[ID_COL], TARGET: test_preds})
test_preds.to_csv('submission.csv', index=False)
!kaggle competitions submit -c geekbrains-competitive-data-analysis -f submission.csv -m "Message"

100% 4.73M/4.73M [00:02<00:00, 1.87MB/s]
Successfully submitted to GeekBrains Competitive Data Analysis