In [None]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
# from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:


@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Parameters
    ----------
    title : str
        Label printed in front of the elapsed time.

    Nothing is printed if the wrapped block raises, because the report
    happens only after control returns normally from the ``yield``.
    """
    started_at = time.time()
    yield
    elapsed_seconds = time.time() - started_at
    report = "{} - done in {:.0f}s".format(title, elapsed_seconds)
    print(report)

# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df`` with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nan_as_category : bool
        Passed to ``get_dummies`` as ``dummy_na`` (adds a dummy column for NaN).

    Returns
    -------
    (pandas.DataFrame, list)
        The encoded frame and the names of the columns that were not present
        in the input.
    """
    columns_before = set(df.columns)
    # Collect the object-typed columns; these are the ones to be expanded.
    object_columns = []
    for name, dtype in df.dtypes.items():
        if dtype == 'object':
            object_columns.append(name)
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

# Preprocess application_train.csv and application_test.csv
def application_train_test(num_rows = None, nan_as_category = False):
    """Load the application train/test tables, stack them and add engineered features.

    Parameters
    ----------
    num_rows : int or None
        Passed as ``nrows`` to ``pd.read_csv`` for both files (None reads all rows).
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, minus rows whose ``CODE_GENDER`` is 'XNA'.
        ``reset_index()`` (without ``drop``) leaves the per-file row label in an
        ``index`` column. The three binary flags are factorized, the remaining object
        columns are one-hot encoded, and the ratio/log features below are appended.
    """
    # Read data and merge
    df = pd.read_csv('data/application_train.csv', nrows= num_rows)
    test_df = pd.read_csv('data/application_test.csv', nrows= num_rows)
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows.
    df = pd.concat([df, test_df]).reset_index()
    del test_df
    gc.collect()
    # Optional: Remove 4 applications with XNA CODE_GENDER (train set).
    # .copy() so the column assignments below write to an independent frame.
    df = df[df['CODE_GENDER'] != 'XNA'].copy()

    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], _ = pd.factorize(df[bin_feature])
    # Categorical features with One-Hot encode
    df, _ = one_hot_encoder(df, nan_as_category)

    # 365243 is used as a placeholder in DAYS_EMPLOYED -> NaN.
    # Assign the result back: an in-place replace on df[col] is a chained operation
    # that does not reach df under pandas Copy-on-Write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)
    df['MONTH_EMPLOYED'] = df['DAYS_EMPLOYED'] / 30
    df['YEARS_EMPLOYED'] = df['DAYS_EMPLOYED'] / 365

    df['DAYS_LAST_PHONE_CHANGE'] = df['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan)
    # Some simple new features (percentages)
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['AMT_INCOME_TOTAL_LOG'] = np.log1p(df['AMT_INCOME_TOTAL'])
    df['AMT_CREDIT_LOG'] = np.log1p(df['AMT_CREDIT'])
    df['INCOME_CREDIT_PERC_LOG'] = df['AMT_INCOME_TOTAL_LOG'] - df['AMT_CREDIT_LOG']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    # NOTE(review): this subtracts the raw family size from a log value, unlike the other
    # *_LOG features which subtract two logs. Left unchanged; confirm the intent.
    df['INCOME_PER_PERSON_LOG'] = df['AMT_INCOME_TOTAL_LOG'] - df['CNT_FAM_MEMBERS']

    df['AMT_ANNUITY_LOG'] = np.log1p(df['AMT_ANNUITY'])
    # The original first wrote the plain ratio to ANNUITY_INCOME_PERC and immediately
    # overwrote it with this log difference; only the surviving assignment is kept, so
    # the output is unchanged. The plain ratio is available as 'annuity_income_percentage'.
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY_LOG'] - df['AMT_INCOME_TOTAL_LOG']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    df['PAYMENT_RATE_LOG'] = df['AMT_ANNUITY_LOG'] - df['AMT_CREDIT_LOG']

    # Household composition features
    df['cnt_non_child'] = df['CNT_FAM_MEMBERS'] - df['CNT_CHILDREN']
    df['child_to_non_child_ratio'] = df['CNT_CHILDREN'] / df['cnt_non_child']
    df['income_per_non_child'] = df['AMT_INCOME_TOTAL'] / df['cnt_non_child']
    df['credit_per_person_log'] = df['AMT_CREDIT_LOG'] / df['CNT_FAM_MEMBERS']
    df['credit_per_person'] = df['AMT_CREDIT'] / df['CNT_FAM_MEMBERS']
    df['credit_per_child'] = df['AMT_CREDIT'] / (1 + df['CNT_CHILDREN'])
    df['credit_per_child_log'] = df['AMT_CREDIT_LOG'] / (1 + df['CNT_CHILDREN'])
    df['credit_per_non_child'] = df['AMT_CREDIT'] / df['cnt_non_child']
    df['annuity_income_percentage'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['annuity_income_percentage_log'] = df['AMT_ANNUITY_LOG'] / df['AMT_INCOME_TOTAL_LOG']

    # Ratios between car age, phone change, employment, birth and credit amounts
    df['car_to_birth_ratio'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['car_to_employ_ratio'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['children_ratio'] = df['CNT_CHILDREN'] / df['CNT_FAM_MEMBERS']
    df['credit_to_annuity_ratio'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['credit_to_goods_ratio'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    df['credit_to_income_ratio'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['phone_to_birth_ratio'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
    df['phone_to_employ_ratio'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
    return df

# Preprocess bureau.csv and bureau_balance.csv
def bureau_and_balance(num_rows = None, nan_as_category = True):
    """Aggregate bureau.csv (joined with bureau_balance.csv) to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Passed as ``nrows`` to ``pd.read_csv`` for both files.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder`` for both tables.

    Returns
    -------
    pandas.DataFrame
        Indexed by SK_ID_CURR. Columns are prefixed ``BURO_`` (all credits),
        ``ACTIVE_`` (rows with CREDIT_ACTIVE_Active == 1) and ``CLOSED_``
        (rows with CREDIT_ACTIVE_Closed == 1), each followed by
        ``<column>_<AGGREGATION>``.
    """
    bureau = pd.read_csv('data/bureau.csv', nrows = num_rows)
    bb = pd.read_csv('data/bureau_balance.csv', nrows = num_rows)
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # Bureau balance: Perform aggregations and merge with bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau = bureau.drop(columns=['SK_ID_BUREAU'])
    del bb, bb_agg
    gc.collect()

    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var', 'median'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean', 'median'],
        'DAYS_CREDIT_UPDATE': ['mean', 'median'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean', 'median'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean', 'median'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum', 'median'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum', 'median'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean', 'median'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum', 'median'],
        'AMT_ANNUITY': ['max', 'mean', 'median'],
        'CNT_CREDIT_PROLONG': ['sum', 'median'],
        'MONTHS_BALANCE_MIN': ['min', 'median'],
        'MONTHS_BALANCE_MAX': ['max', 'median'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum', 'median']
    }
    # Blank out day offsets below -40000. The original used chained indexing
    # (bureau[col][mask] = nan), which writes to a temporary under Copy-on-Write;
    # Series.mask returns a new column (upcast to float if needed) assigned back.
    for day_col in ['DAYS_CREDIT_ENDDATE', 'DAYS_CREDIT_UPDATE', 'DAYS_ENDDATE_FACT']:
        bureau[day_col] = bureau[day_col].mask(bureau[day_col] < -40000)
    # Bureau and bureau_balance categorical features
    cat_aggregations = {}
    for cat in bureau_cat:
        cat_aggregations[cat] = ['mean']
    # The bureau_balance dummies were already averaged per SK_ID_BUREAU above,
    # so their column names carry a "_MEAN" suffix at this point.
    for cat in bb_cat:
        cat_aggregations[cat + "_MEAN"] = ['mean']

    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index(['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
    del active, active_agg
    gc.collect()
    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
    del closed, closed_agg, bureau
    gc.collect()
    return bureau_agg

# Preprocess previous_applications.csv
def previous_applications(num_rows = None, nan_as_category = True):
    """Aggregate previous_application.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Passed as ``nrows`` to ``pd.read_csv``.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``. The original ignored this argument and
        always passed True; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by SK_ID_CURR. Columns are prefixed ``PREV_`` (all applications),
        ``APPROVED_`` (NAME_CONTRACT_STATUS_Approved == 1) and ``REFUSED_``
        (NAME_CONTRACT_STATUS_Refused == 1), each followed by
        ``<column>_<AGGREGATION>``.
    """
    prev = pd.read_csv('data/previous_application.csv', nrows = num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category)
    # 365243 is used as a placeholder in the DAYS_* columns -> NaN.
    # Assigned back instead of a chained in-place replace, which would not
    # reach `prev` under pandas Copy-on-Write.
    for day_col in ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                    'DAYS_LAST_DUE', 'DAYS_TERMINATION']:
        prev[day_col] = prev[day_col].replace(365243, np.nan)
    # Add feature: value ask / value received percentage
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    prev['DIF_ASK_GOOD'] = prev['AMT_GOODS_PRICE'] / prev['AMT_APPLICATION']

    # NaN-aware statistics of log1p-transformed values, shared by several columns.
    def log_mean(x):
        return np.nanmean(np.log1p(x))

    def log_std(x):
        return np.nanstd(np.log1p(x))

    def log_sum(x):
        return np.nansum(np.log1p(x))

    # Previous applications numeric features. (name, func) tuples name the output
    # column; the order inside each list fixes the output column order.
    num_aggregations = {
        'AMT_ANNUITY': [('max', np.nanmax), ('mean', np.nanmean), ('sum', np.nansum), ('std', np.nanstd),
                        ('log_mean', log_mean), ('log_std', log_std), ('median', np.nanmedian),
                        ('log_sum', log_sum)],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': [('max', np.nanmax), ('mean', np.nanmean), ('sum', np.nansum), ('std', np.nanstd),
                       ('log_mean', log_mean), ('median', np.nanmedian), ('log_std', log_std),
                       ('log_sum', log_sum)],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var', 'median'],
        'AMT_DOWN_PAYMENT': [('max', np.nanmax), ('mean', np.nanmean), ('median', np.nanmedian),
                             ('sum', np.nansum), ('std', np.nanstd),
                             ('log_mean', log_mean), ('log_sum', log_sum)],
        'AMT_GOODS_PRICE': [('max', np.nanmax), ('mean', np.nanmean), ('sum', np.nansum), ('std', np.nanstd),
                            ('log_mean', log_mean), ('median', np.nanmedian), ('log_sum', log_sum)],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean', 'median'],
        'CNT_PAYMENT': ['mean', 'sum', 'median'],
        'DIF_ASK_GOOD': ['mean', 'median'],
    }
    # Previous applications categorical features
    cat_aggregations = {cat: ['mean'] for cat in cat_cols}

    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg

# Preprocess POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category = True):
    """Aggregate POS_CASH_balance.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Passed as ``nrows`` to ``pd.read_csv``.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``. The original ignored this argument and
        always passed True; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by SK_ID_CURR with ``POS_<column>_<AGGREGATION>`` columns plus
        ``POS_COUNT``, the number of rows per SK_ID_CURR.
    """
    pos = pd.read_csv('data/POS_CASH_balance.csv', nrows = num_rows)
    pos, cat_cols = one_hot_encoder(pos, nan_as_category)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size', 'median'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    # Each dummy column is averaged, giving the share of rows in that category.
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    grouped = pos.groupby('SK_ID_CURR')
    pos_agg = grouped.agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = grouped.size()
    del pos, grouped
    gc.collect()
    return pos_agg
    
# Preprocess installments_payments.csv
def installments_payments(num_rows = None, nan_as_category = True):
    """Aggregate installments_payments.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Passed as ``nrows`` to ``pd.read_csv``.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``. The original ignored this argument and
        always passed True; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by SK_ID_CURR with ``INSTAL_<column>_<AGGREGATION>`` columns plus
        ``INSTAL_COUNT``, the number of rows per SK_ID_CURR.
    """
    ins = pd.read_csv('data/installments_payments.csv', nrows = num_rows)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # Days past due (DPD) and days before due (DBD), floored at 0.
    # DPBD keeps the signed "days before due" value.
    days_late = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    days_early = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    # where(cond, 0) keeps positive values and maps everything else, NaN included,
    # to 0 -- the same result as the original row-wise `x if x > 0 else 0`.
    ins['DPD'] = days_late.where(days_late > 0, 0)
    ins['DBD'] = days_early.where(days_early > 0, 0)
    ins['DPBD'] = days_early

    # 0/1 flags for "paid late" / "paid early". They are not part of the
    # aggregations below, exactly as in the original.
    ins['DPD_ratio'] = (ins['DPD'] > 0).astype(int)
    ins['DBD_ratio'] = (ins['DBD'] > 0).astype(int)

    # NaN-aware statistics of log1p-transformed values, shared by several columns.
    def log_mean(x):
        return np.nanmean(np.log1p(x))

    def log_std(x):
        return np.nanstd(np.log1p(x))

    def log_sum(x):
        return np.nansum(np.log1p(x))

    days_stats = [('max', np.nanmax), ('mean', np.nanmean), ('sum', np.nansum), ('std', np.nanstd),
                  ('log_mean', log_mean), ('median', np.nanmedian),
                  ('log_std', log_std), ('log_sum', log_sum)]

    # Features: Perform aggregations.
    # The original dict literal listed 'DBD' and 'DPD' twice; Python keeps only the
    # last value for a repeated key (at the first key's position), so the short
    # ['mean', 'median'] entries never took effect. Only the effective entries are
    # kept, in the same key order, so the output columns are unchanged.
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DBD': list(days_stats),
        'DPD': list(days_stats),
        'DPBD': [('max', np.nanmax), ('mean', np.nanmean), ('sum', np.nansum), ('std', np.nanstd)],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': [('max', np.nanmax), ('mean', np.nanmean), ('median', np.nanmedian),
                         ('sum', np.nansum), ('std', np.nanstd)],
        # log_std/log_sum deliberately keep the original np.std/np.sum calls here.
        'AMT_INSTALMENT': [('max', np.nanmax), ('mean', np.nanmean), ('median', np.nanmedian),
                           ('sum', np.nansum), ('std', np.nanstd),
                           ('log_mean', log_mean),
                           ('log_std', lambda x: np.std(np.log1p(x))),
                           ('log_sum', lambda x: np.sum(np.log1p(x)))],
        # 'max' deliberately keeps the original np.max call here.
        'AMT_PAYMENT': [('max', np.max), ('mean', np.nanmean), ('median', np.nanmedian),
                        ('sum', np.nansum), ('std', np.nanstd),
                        ('log_mean', log_mean), ('log_std', log_std), ('log_sum', log_sum)],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum', 'var', 'median']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean', 'sum']
    grouped = ins.groupby('SK_ID_CURR')
    ins_agg = grouped.agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = grouped.size()
    del ins, grouped
    gc.collect()
    return ins_agg

# Preprocess credit_card_balance.csv
def credit_card_balance(num_rows = None, nan_as_category = True):
    """Aggregate credit_card_balance.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Passed as ``nrows`` to ``pd.read_csv``.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``. The original ignored this argument and
        always passed True; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by SK_ID_CURR with ``CC_<column>_<AGGREGATION>`` columns
        (min, max, mean, sum, var, median of every remaining column) plus
        ``CC_COUNT``, the number of rows per SK_ID_CURR.
    """
    cc = pd.read_csv('data/credit_card_balance.csv', nrows = num_rows)
    cc, _ = one_hot_encoder(cc, nan_as_category)
    # Negative drawing amounts are set to NaN. The original used chained indexing
    # (cc[col][mask] = nan), which writes to a temporary under Copy-on-Write;
    # Series.mask builds the cleaned column and it is assigned back.
    for amount_col in ['AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_CURRENT']:
        cc[amount_col] = cc[amount_col].mask(cc[amount_col] < 0)
    # General aggregations
    cc = cc.drop(columns=['SK_ID_PREV'])
    grouped = cc.groupby('SK_ID_CURR')
    cc_agg = grouped.agg(['min', 'max', 'mean', 'sum', 'var', 'median'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = grouped.size()
    del cc, grouped
    gc.collect()
    return cc_agg


In [None]:
# Scratch: IPython help lookup for np.nanmedian (IPython-only syntax, not plain Python).
?np.nanmedian

In [None]:
# Scratch: load the full raw installments table into a global `ins` for ad-hoc inspection.
ins = pd.read_csv('data/installments_payments.csv')
# ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']

In [None]:
# Frequency of each distinct instalment amount in the raw table.
ins['AMT_INSTALMENT'].value_counts()

In [None]:
# Per-call feature scores (Booster.get_fscore()) appended by run_xgb.
bsts = []
def run_xgb(train_X, train_y, val_X, val_y, test_X, params, model=None):
    """Train an XGBoost booster with early stopping and predict validation/test sets.

    Parameters
    ----------
    train_X, train_y : training features and labels.
    val_X, val_y : validation features and labels; listed last in ``evals``.
    test_X : features to predict for.
    params : dict
        Passed to ``xgb.train`` as the booster parameters.
    model : optional
        Passed to ``xgb.train`` as ``xgb_model`` (a model to continue training from).

    Returns
    -------
    (pred_test_y, pred_oof, model)
        Test predictions, validation predictions and the trained booster.

    Side effects
    ------------
    Appends ``model.get_fscore()`` to the module-level list ``bsts``.
    """
    start_time = time.time()
    xgb_train = xgb.DMatrix(train_X, train_y)
    xgb_val = xgb.DMatrix(val_X, val_y)

    model = xgb.train(params, xgb_train, 5000,
                      evals=[(xgb_train, 'train'), (xgb_val, 'val')],
                      early_stopping_rounds=50, verbose_eval=50, xgb_model=model)

    print('Model training done in {} seconds.'.format(time.time() - start_time))
    xgb_test = xgb.DMatrix(test_X)
    # best_iteration is a 0-based round index while ntree_limit is a tree COUNT, so
    # passing best_iteration drops the best round -- and a best_iteration of 0 means
    # "use all trees". best_ntree_limit is the matching count; fall back to
    # best_iteration + 1 if the attribute is missing.
    best_ntree_limit = getattr(model, 'best_ntree_limit', None)
    if best_ntree_limit is None:
        best_ntree_limit = model.best_iteration + 1
    pred_test_y = model.predict(xgb_test, ntree_limit=best_ntree_limit)
    pred_oof = model.predict(xgb_val, ntree_limit=best_ntree_limit)
    bsts.append(model.get_fscore())

    return pred_test_y, pred_oof, model

In [None]:
# from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
def run_extra_trees(train_X, train_y, val_X, val_y, test_X, params, model=None):
    """Fit an ExtraTreesClassifier and predict class-1 probabilities.

    Parameters
    ----------
    train_X, train_y : training features (DataFrame) and labels.
    val_X, val_y : validation features and labels (``val_y`` is not used here).
    test_X : features to predict for.
    params, model : accepted for signature compatibility with the other
        ``run_*`` functions; neither is used.

    Returns
    -------
    (pred_test_y, pred_oof, clf)
        Test probabilities, validation probabilities and the fitted classifier.
        The original returned the untouched ``model`` argument (None by default)
        in the third slot instead of the fitted estimator.
    """
    # Replace inf/-inf/NaN with 0 on copies, so the caller's frames are left
    # untouched (the original replaced in place on the passed-in slices).
    bad_values = [np.inf, -np.inf, np.nan]
    train_X = train_X.replace(bad_values, 0)
    val_X = val_X.replace(bad_values, 0)
    test_X = test_X.replace(bad_values, 0)

    start_time = time.time()
    clf = ExtraTreesClassifier(max_depth=5, min_samples_split=30, min_samples_leaf=5, class_weight='balanced')
    clf.fit(train_X, train_y)

    print('Model training done in {} seconds.'.format(time.time() - start_time))

    pred_test_y = clf.predict_proba(test_X)[:,1]
    pred_oof = clf.predict_proba(val_X)[:,1]

    return pred_test_y, pred_oof, clf

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
def run_log_reg(train_X, train_y, val_X, val_y, test_X, params, model=None):
    """Fit a standardized linear SGD classifier and predict class-1 probabilities.

    Parameters
    ----------
    train_X, train_y : training features (DataFrame) and labels.
    val_X, val_y : validation features and labels (``val_y`` is not used here).
    test_X : features to predict for.
    params, model : accepted for signature compatibility with the other
        ``run_*`` functions; neither is used.

    Returns
    -------
    (pred_test_y, pred_oof, clf)
        Test probabilities, validation probabilities and the fitted classifier.
        The original returned the untouched ``model`` argument (None by default)
        in the third slot instead of the fitted estimator.
    """
    # Replace inf/-inf/NaN with 0 on copies, so the caller's frames are left
    # untouched (the original replaced in place on the passed-in slices).
    bad_values = [np.inf, -np.inf, np.nan]
    train_X = train_X.replace(bad_values, 0)
    val_X = val_X.replace(bad_values, 0)
    test_X = test_X.replace(bad_values, 0)

    # The scaler is fitted on the training fold only and reused for val/test.
    sc = StandardScaler()
    train_X = sc.fit_transform(train_X)
    val_X = sc.transform(val_X)
    test_X = sc.transform(test_X)

    start_time = time.time()
    clf = SGDClassifier(penalty='l2',alpha=0.001, class_weight='balanced', loss='modified_huber')
    clf.fit(train_X, train_y)

    print('Model training done in {} seconds.'.format(time.time() - start_time))

    pred_test_y = clf.predict_proba(test_X)[:,1]
    pred_oof = clf.predict_proba(val_X)[:,1]

    return pred_test_y, pred_oof, clf

In [None]:
from catboost import CatBoostClassifier
probs = []
# catboost_params = {
#     'iterations': 200,
#     'learning_rate': 0.5,
#     'depth': 3,
#     'l2_leaf_reg': 40,
#     'bootstrap_type': 'Bernoulli',
#     'subsample': 0.7,
#     'scale_pos_weight': 5,
#     'eval_metric': 'AUC',
#     'od_type': 'Iter',
#     'allow_writing_files': False
# }

def run_catboost(train_X, train_y, val_X, val_y, test_X, params, model=None):
    """Fit a CatBoostClassifier with early stopping and predict class-1 probabilities.

    Parameters
    ----------
    train_X, train_y : training features (DataFrame) and labels.
    val_X, val_y : validation features and labels, used as the ``eval_set``.
    test_X : features to predict for.
    params : accepted for signature compatibility with the other ``run_*``
        functions but not used; the hyper-parameters are hard-coded below.
    model : ignored; overwritten by the newly built classifier.

    Returns
    -------
    (pred_test_y, pred_oof, model)
        Test probabilities, validation probabilities and the fitted classifier.
    """
    start_time = time.time()

    # Positional indices of the non-numeric columns, handed to CatBoost as categorical.
    categorical_columns = [i for i, col in enumerate(train_X.columns) \
                           if not pd.api.types.is_numeric_dtype(train_X[col].dtype)]

    model = CatBoostClassifier(iterations=1000, depth=6,
                               learning_rate=0.2,
                               cat_features=categorical_columns,
                               l2_leaf_reg=40,
                               bootstrap_type='Bernoulli',
                               eval_metric='AUC',
                               used_ram_limit='3gb',
                               task_type='GPU',
                               class_weights=[1, 11],
                               devices=['cuda:0'],
                               subsample=0.9,
                               early_stopping_rounds=30)

    # Fill missing values with 0 on copies, so the caller's frames are left
    # untouched (the original called fillna(inplace=True) on the passed-in slices).
    train_X = train_X.fillna(0)
    val_X = val_X.fillna(0)
    test_X = test_X.fillna(0)

    model.fit(train_X, train_y,
              use_best_model=True,
              eval_set=(val_X, val_y),
              verbose=50)

    print('Model training done in {} seconds.'.format(time.time() - start_time))

    pred_test_y = model.predict_proba(test_X)[:,1]
    pred_oof = model.predict_proba(val_X)[:,1]

    return pred_test_y, pred_oof, model

In [None]:
oofs = []

# K-fold cross-validation driver (KFold or StratifiedKFold) for any of the run_* model functions.
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, num_folds, params, func, stratified = False, debug= False,
                   stack_train=None, stack_test=None, submission_path=None):
    """Cross-validate ``func`` on the labelled rows of ``df`` and average test predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined frame; rows with non-null TARGET are training rows, rows with
        null TARGET are test rows.
    num_folds : int
        Number of CV splits.
    params : any
        Passed through to ``func`` unchanged.
    func : callable
        ``func(train_x, train_y, valid_x, valid_y, test_x, params)`` returning
        ``(test_predictions, validation_predictions, model)``.
    stratified : bool
        Use StratifiedKFold instead of KFold.
    debug : bool
        When False, a submission CSV with SK_ID_CURR and TARGET is written.
    stack_train, stack_test : 2-D array or None
        Extra feature matrices appended column by column as ``col_stack_<i>``
        to the train/test rows. When None, the module-level
        ``stack_inter_train`` / ``stack_inter_test`` are used if they exist (the
        original read those globals unconditionally and raised NameError when they
        had not been created yet); if they do not exist, no columns are added.
    submission_path : str or None
        Output CSV path when ``debug`` is False. When None, the module-level
        ``submission_file_name`` is used.

    Returns
    -------
    (oof_preds, sub_preds)
        Out-of-fold predictions for the training rows and fold-averaged
        predictions for the test rows.

    Raises
    ------
    NameError
        If ``debug`` is False and no submission path can be resolved.
    """
    # Resolve the optional inputs, falling back to the notebook globals the
    # original implementation depended on.
    if stack_train is None:
        stack_train = globals().get('stack_inter_train')
    if stack_test is None:
        stack_test = globals().get('stack_inter_test')
    if not debug:
        if submission_path is None:
            submission_path = globals().get('submission_file_name')
        # Checked before training rather than after it.
        if submission_path is None:
            raise NameError("name 'submission_file_name' is not defined; "
                            "pass submission_path=... or set submission_file_name")

    # Divide in training/validation and test data.
    # .copy() so the column assignments below act on independent frames, not on
    # slices of df.
    train_df = df[df['TARGET'].notnull()].copy()
    if stack_train is not None:
        for i in range(stack_train.shape[1]):
            train_df['col_stack_{}'.format(i)] = stack_train[:, i]

    test_df = df[df['TARGET'].isnull()].copy()
    if stack_test is not None:
        for i in range(stack_test.shape[1]):
            test_df['col_stack_{}'.format(i)] = stack_test[:, i]

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)
    # Create arrays to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        fold_test_preds, oof_fold, clf = func(train_x, train_y, valid_x, valid_y, test_df[feats], params)
        oof_preds[valid_idx] = oof_fold
        # Each fold contributes an equal share to the averaged test prediction.
        sub_preds += fold_test_preds / folds.n_splits

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_path, index= False)
    return oof_preds, sub_preds

# # Display/plot feature importance
# def display_importances(feature_importance_df_):
#     cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
#     best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
#     plt.figure(figsize=(8, 10))
#     sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
#     plt.title('LightGBM Features (avg over folds)')
#     plt.tight_layout()
#     plt.savefig('lgbm_importances01.png')


In [None]:
# Split the combined frame into labelled (train) and unlabelled (test) rows.
# NOTE(review): `df` is first assigned in later cells of this notebook, so this cell
# only works after those have been run (out-of-order execution).
train_df = df[df['TARGET'].notnull()]
test_df = df[df['TARGET'].isnull()]

In [None]:
# Write a submission from `pred_test3`.
# NOTE(review): `pred_test3` is assigned by a kfold_lightgbm call further down; the
# column assignment is made on a slice of `df`.
test_df = df[df['TARGET'].isnull()]
test_df['TARGET'] = pred_test3
test_df[['SK_ID_CURR', 'TARGET']].to_csv('stack_xgb.csv', index= False)

In [None]:
# Scratch: IPython help lookup for xgb.train (IPython-only syntax, not plain Python).
?xgb.train

In [None]:
# Build the full feature frame: start from the application table, then left-join
# each per-SK_ID_CURR aggregate table onto it. Each aggregate is deleted right
# after the join.
num_rows = None  # None -> read every row of each CSV
df = application_train_test(num_rows)
with timer("Process bureau and bureau_balance"):
    bureau = bureau_and_balance(num_rows)
    print("Bureau df shape:", bureau.shape)
    df = df.join(bureau, how='left', on='SK_ID_CURR')
    del bureau
    gc.collect()
with timer("Process previous_applications"):
    prev = previous_applications(num_rows)
    print("Previous applications df shape:", prev.shape)
    df = df.join(prev, how='left', on='SK_ID_CURR')
    del prev
    gc.collect()
with timer("Process POS-CASH balance"):
    pos = pos_cash(num_rows)
    print("Pos-cash balance df shape:", pos.shape)
    df = df.join(pos, how='left', on='SK_ID_CURR')
    del pos
    gc.collect()
with timer("Process installments payments"):
    # Rebinds the global `ins` (also used by the scratch cell that loads the raw CSV)
    # and then deletes it.
    ins = installments_payments(num_rows)
    print("Installments payments df shape:", ins.shape)
    df = df.join(ins, how='left', on='SK_ID_CURR')
    del ins
    gc.collect()
with timer("Process credit card balance"):
    cc = credit_card_balance(num_rows)
    print("Credit card balance df shape:", cc.shape)
    df = df.join(cc, how='left', on='SK_ID_CURR')
    del cc
    gc.collect()

In [None]:
# Rows/columns of the assembled feature frame.
df.shape

In [None]:
# XGBoost parameter dict.
# NOTE(review): none of the visible kfold_lightgbm calls pass `params`; they use
# params1/params3/params4/params5 or None.
# NOTE(review): "sample" does not look like an XGBoost parameter name -- "subsample"
# was probably intended; likewise 'n_estimators' is presumably not read by xgb.train
# (run_xgb passes the round count explicitly). Confirm against the XGBoost docs.
params = {'n_estimators':100,
             'max_depth':3,
             'lambda':10000,
             'eta': 0.3, 
            "colsample_bytree":0.8,
          "colsample_bylevel":0.3,
            "sample":0.7,
            "min_child_weight":50,
          'gamma':5,
#              "tweedie_variance_power":1.50,
             'objective': 'gpu:binary:logistic', 
              'tree_method':'gpu_hist',
             'eval_metric':'auc',
         }


# params2 = {'n_estimators':200,
#              'max_depth':10,
#              'lambda':1000,
#              'eta': 0.01, 
#             "colsample_bytree":0.1,
#           "colsample_bylevel":0.1,
#             "sample":0.1,
#             "min_child_weight":10,
# #           'gamma':5,
# #              "tweedie_variance_power":1.50,
#              'objective': 'gpu:binary:logistic', 
#               'tree_method':'gpu_hist',
#              'eval_metric':'auc',
#          }

In [None]:
# Replaces the `df` built by the feature pipeline with the contents of ann_data.csv.
# NOTE(review): nothing visible in this notebook writes ann_data.csv -- presumably a
# cached copy of the feature frame; confirm its provenance.
df = pd.read_csv('ann_data.csv')

In [None]:
def one_hot_encoder(data, nan_as_category = True):
    """One-hot encode every non-numeric column of ``data`` in place.

    This definition shadows the earlier ``one_hot_encoder`` in this notebook once
    the cell has been run.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is modified in place: indicator columns are added and
        the original non-numeric columns are dropped.
    nan_as_category : bool
        When True, missing values are first replaced by the string 'NaN', so they
        get their own ``<column>_NaN`` indicator.

    Returns
    -------
    (pandas.DataFrame, list)
        The same ``data`` object and the names of the columns that were added.
    """
    original_columns = list(data.columns)
    categorical_columns = [col for col in data.columns
                           if not pd.api.types.is_numeric_dtype(data[col].dtype)]
    for c in categorical_columns:
        if nan_as_category:
            # Assign back: `data[c].fillna(..., inplace=True)` is a chained call that
            # does not reach `data` under pandas Copy-on-Write.
            data[c] = data[c].fillna('NaN')
        # One uint8 indicator per distinct value, in order of first appearance.
        for v in data[c].unique():
            data[str(c) + '_' + str(v)] = (data[c] == v).astype(np.uint8)
    data.drop(categorical_columns, axis = 1, inplace = True)
    return data, [c for c in data.columns if c not in original_columns]

In [None]:
# Encode any remaining non-numeric columns of the reloaded frame.
df, cats = one_hot_encoder(df)

In [None]:
# Base model 1: CatBoost, 3 stratified folds. debug=True, so kfold_lightgbm does not
# write a submission file. run_catboost does not read `params`.
oof_train, pred_test = kfold_lightgbm(df.drop(columns=['index']), num_folds=3, 
                                      params=None, func=run_catboost,
                                     stratified= True, debug=True)

In [None]:
# Base model 2: ExtraTrees, 3 stratified folds.
oof_train1, pred_test1 = kfold_lightgbm(df.drop(columns=['index']), num_folds=3, params=None, func=run_extra_trees,
                                     stratified= True, debug=True)

In [None]:
# Base model 3: SGD linear classifier, 3 stratified folds.
oof_train2, pred_test2 = kfold_lightgbm(df.drop(columns=['index']), num_folds=3, params=None, func=run_log_reg,
                                     stratified= True, debug=True)

In [None]:
# Base model 4: XGBoost, 5 stratified folds.
# NOTE(review): "sample" does not look like an XGBoost parameter name ("subsample"
# was probably intended) -- confirm.
params1 = {'n_estimators':1000,
             'max_depth':5,
             'lambda':10,
             'eta': 0.05, 
            "colsample_bytree":0.7,
          "colsample_bylevel":0.7,
            "sample":0.5,
            "min_child_weight":150,
#           'gamma':5,
#              "tweedie_variance_power":1.50,
             'objective': 'gpu:binary:logistic', 
              'tree_method':'gpu_hist',
             'eval_metric':'auc',
         }

oof_train3, pred_test3 = kfold_lightgbm(df.drop(columns=['index']), num_folds=5, params=params1, func=run_xgb,
                                     stratified= True, debug=True)

In [None]:
# Scratch: show every column name.
list(df.columns)

In [None]:
# Top-20 features by score from the third entry of `bsts`; run_xgb appends one
# get_fscore() dict per call.
[x[0] for x in sorted(bsts[2].items(), key=lambda x: -x[1])][:20]

In [None]:
# XGBoost experiment: depth 7, eta 0.1, 5 stratified folds.
# NOTE(review): `params3` is redefined in the next cell, and `oof_train4`/`pred_test4`
# are reassigned by a later cell -- the surviving values depend on which cells ran last.
params3 = {'n_estimators':500,
             'max_depth':7,
#              'lambda':10000,
             'eta': 0.1, 
            "colsample_bytree":0.71,
          "colsample_bylevel":0.71,
            "sample":0.8,
            "min_child_weight":150,
          'gamma':5,
#              "tweedie_variance_power":1.50,
             'objective': 'gpu:binary:logistic', 
              'tree_method':'gpu_hist',
             'eval_metric':'auc',
         }
oof_train4, pred_test4 = kfold_lightgbm(df.drop(columns=['index']), num_folds=5, params=params3, func=run_xgb,
                                     stratified= True, debug=True)

In [None]:
# XGBoost experiment: depth 10, heavy L2 ('lambda'), 10 stratified folds.
# Overwrites `params3` and the `oof_train3`/`pred_test3` produced with params1.
params3 = {'max_depth':10,
             'lambda':1000,
             'eta': 0.1, 
            "colsample_bytree":0.5,
          "colsample_bylevel":0.5,
            "sample":0.8,
            "min_child_weight":250,
          'gamma':15,
#              "tweedie_variance_power":1.50,
             'objective': 'gpu:binary:logistic', 
              'tree_method':'gpu_hist',
             'eval_metric':'auc',
         }

oof_train3, pred_test3 = kfold_lightgbm(df, num_folds=10, params=params3, func=run_xgb,
                                     stratified= True, debug=True)

In [None]:
# XGBoost experiment: shallow trees with L1 ('alpha') and max_delta_step, 5 folds.
# Overwrites `oof_train4`/`pred_test4`.
params4 = {'n_estimators':500,
             'max_depth':3,
             'lambda':10,
             'eta': 0.1,
               'alpha':10,
            "colsample_bytree":0.5,
          "colsample_bylevel":0.5,
            "sample":0.9,
            "min_child_weight":150,
          'gamma':5,
#            'booster':'gblinear',
           "max_delta_step":9,
#              "tweedie_variance_power":1.50,
             'objective': 'gpu:binary:logistic', 
              'tree_method':'gpu_hist',
             'eval_metric':'auc',
         }

oof_train4, pred_test4 = kfold_lightgbm(df, num_folds=5, params=params4, func=run_xgb,
                                     stratified= True, debug=True)

In [None]:
# NOTE(review): params5 holds XGBoost-style keys (including booster 'dart') but is
# passed to run_catboost, which never reads its `params` argument -- so this run uses
# the hard-coded CatBoost settings with 7 folds. Confirm whether run_xgb was intended.
params5 = {'max_depth':3,
             
             'eta': 0.1,
               'alpha':1000,
            "colsample_bytree":0.5,
          "colsample_bylevel":0.5,
            "sample":0.7,
            "min_child_weight":150,
          'gamma':15,
           'booster':'dart',
#              "tweedie_variance_power":1.50,
             'objective': 'gpu:binary:logistic', 
              'tree_method':'gpu_hist',
             'eval_metric':'auc',
         }

oof_train5, pred_test5 = kfold_lightgbm(df, num_folds=7, params=params5, func=run_catboost,
                                     stratified= True, debug=True)

In [None]:
bsts[0]

In [None]:
stack_df = np.concatenate([oof_train.reshape(-1,1),
                          oof_train1.reshape(-1,1),
                          oof_train2.reshape(-1,1), oof_train3.reshape(-1,1), 
                          train_df[[x[0] for x in sorted(bsts[2].items(), key=lambda x: -x[1])][:10]].fillna(0).values], axis=1)

In [None]:
stack_df_test = np.concatenate([pred_test.reshape(-1,1),
                          pred_test1.reshape(-1,1),
                          pred_test2.reshape(-1,1), pred_test3.reshape(-1,1),
                               test_df[[x[0] for x in sorted(bsts[2].items(), key=lambda x: -x[1])][:10]].fillna(0).values], axis=1)

In [None]:
df.drop(columns=['index', 'TARGET']).fillna(0)

In [None]:
from sklearn.cluster import MiniBatchKMeans
# StandardScaler is used below but is not imported by the notebook's import
# cell; import it here so this cell runs on a fresh kernel.
from sklearn.preprocessing import StandardScaler

gc.collect()

# Cluster every row of df (all columns except 'index' and 'TARGET') into
# 1000 groups. NaN and +/-inf are replaced by 0 before standardising.
km = MiniBatchKMeans(n_clusters=1000, verbose=True)
data = StandardScaler().fit_transform(
    df.drop(columns=['index', 'TARGET']).replace([np.nan, np.inf, -np.inf], 0)
)
km.fit(data)

In [None]:
# Cluster assignments exposed by the fitted MiniBatchKMeans instance `km`.
labels = km.labels_

In [None]:
# Rows with a known TARGET form the training portion of df.
train_df = df.loc[df['TARGET'].notna()]

In [None]:
test_df

In [None]:
# Training labels as a plain NumPy array.
y = train_df['TARGET'].values

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_auc_score, roc_curve

In [None]:
?roc_auc_score

In [None]:
regr.predict_proba(valid_X)[:,1]

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Degree-3 polynomial expansion of the stacking matrix: learn the term layout
# from the training stack, then expand it.
pl = PolynomialFeatures(degree=3)
pl.fit(stack_df)
stack_inter_train = pl.transform(stack_df)

In [None]:
stack_inter_train.shape

In [None]:
# Expand the test stack with the already-fitted transformer `pl` (transform only, no refit).
stack_inter_test = pl.transform(stack_df_test)

In [None]:
# Hold-out check of the stacker: split the expanded stack matrix, fit a
# class-balanced logistic regression on the training part, and print the
# ROC AUC of its positive-class probabilities on the validation part.
train_X, valid_X, train_y, valid_y = train_test_split(
    stack_inter_train,
    y,
    random_state=23,
)
regr = LogisticRegression(class_weight='balanced', C=100)
regr.fit(X=train_X, y=train_y)
preds = regr.predict_proba(valid_X)[:, 1]
print(roc_auc_score(y_true=valid_y, y_score=preds))

In [None]:
# Fresh class-balanced logistic regression (same settings as the hold-out
# experiment) fitted on X, y.
# NOTE(review): `X` is not assigned anywhere in the visible part of the
# notebook — presumably one of the stacked training matrices; confirm which.
regr = LogisticRegression(C=100, class_weight='balanced')
regr.fit(X, y)

In [None]:
# Second column of predict_proba: the predicted probability of the second class.
# NOTE(review): `X_test` is not assigned in the visible part of the notebook —
# presumably the test-side counterpart of the matrix `regr` was fitted on; confirm.
sub_preds = regr.predict_proba(X_test)[:,1]

In [None]:
# Use class probabilities rather than hard 0/1 labels: the model is evaluated
# with ROC AUC elsewhere in this notebook, and `.predict` would collapse the
# ranking information into two values.
# NOTE(review): stack_df_test must have the same column layout that `regr`
# was fitted on — confirm against the fit cell.
sub_preds = regr.predict_proba(stack_df_test)[:, 1]

In [None]:
sub_preds

In [None]:
# Rows without a TARGET are the test portion of df. Take an explicit copy so
# the column assignment below writes to an independent frame instead of a
# slice of df (avoids pandas' SettingWithCopyWarning / chained assignment).
test_df = df[df['TARGET'].isnull()].copy()

# sub_preds is assigned positionally, one value per test row.
test_df['TARGET'] = sub_preds

# Write the two submission columns; the output path has no file extension.
test_df[['SK_ID_CURR', 'TARGET']].to_csv('stack_subm', index=False)

In [None]:
test_df['TARGET']

In [None]:
# Output file name for a submission.
# NOTE(review): not referenced by the neighbouring cells — confirm it is still used.
submission_file_name = 'subm_2.csv'

In [None]:
df['EXT_SOURCE_1']

In [None]:
# Total importance per feature (summed over all rows of feat_importance),
# ordered from largest to smallest.
features = (
    feat_importance
    .groupby('feature')['importance']
    .sum()
    .sort_values(ascending=False)
)

In [None]:
features

In [None]:
df

In [None]:
# Names of the features whose summed importance exceeds 100.
poly_columns = [name for name, total in features.items() if total > 100]

In [None]:
# Inspect how many columns were selected. The original referenced the
# misspelt `poly_coluns`, which is only created further down the notebook and
# therefore raises NameError on a fresh top-to-bottom run.
len(poly_columns)

In [None]:
# Sub-frame of df restricted to the selected high-importance columns.
poly_features = df[poly_columns]

In [None]:
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; its
# replacement is `sklearn.impute.SimpleImputer` (available since 0.20).
# Prefer the new class and fall back to the old one on older installations.
# The name `Imputer` stays bound either way.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
from sklearn.preprocessing import PolynomialFeatures

# Median imputation of missing values.
imputer = Imputer(strategy='median')

# Need to impute missing values before building polynomial terms.
# NOTE(review): SimpleImputer presumably rejects +/-inf input, and the inf
# clean-up in this notebook runs only after this step — confirm that no
# infinities reach this call.
poly_features = imputer.fit_transform(poly_features)

# Create the polynomial object: degree 2, interaction terms only.
poly_transformer = PolynomialFeatures(degree=2, interaction_only=True)

In [None]:
# Overwrite every +/-inf entry with 0, in place, via a boolean mask.
poly_features[np.isinf(poly_features)] = 0

In [None]:
poly_features

In [None]:
# Train the polynomial features
poly_transformer.fit(poly_features)

# Transform the features
poly_features = poly_transformer.transform(poly_features)
# poly_features_test = poly_transformer.transform(poly_features_test)
# print('Polynomial Features shape: ', poly_features.shape)

In [None]:
# Column names for the expanded matrix, derived from the input column names.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out`; use whichever the installed version
# provides.
if hasattr(poly_transformer, 'get_feature_names_out'):
    feats = list(poly_transformer.get_feature_names_out(poly_columns))
else:
    feats = poly_transformer.get_feature_names(input_features=poly_columns)

In [None]:
# Wrap the expanded array in a DataFrame labelled with the generated names.
poly_df = pd.DataFrame(data=poly_features, columns=feats)


In [None]:
# Assign by position, not by index label. poly_df was built from a bare array
# and therefore has a fresh 0..n-1 index, while df may carry a filtered,
# non-contiguous index; label alignment could then attach targets to the
# wrong rows or insert NaN. Rows of poly_df are in the same order as df.
poly_df['TARGET'] = df['TARGET'].values

In [None]:
# Names of the features whose summed importance exceeds 5.
poly_coluns = [name for name, total in features.items() if total > 5]

In [None]:
len(poly_coluns)

In [None]:
# Copy the raw columns over row-by-row in positional order. The source slice
# is first given poly_df's index so pandas' label alignment becomes an
# identity mapping. Without this, df's own index (which may have gaps after
# filtering) would be matched against poly_df's 0..n-1 index and shift or
# blank out values. A length mismatch raises instead of silently misaligning.
poly_df[poly_coluns] = df[poly_coluns].set_index(poly_df.index)

In [None]:
# Drop the name `df` so the frame's memory can be reclaimed once nothing else references it.
del df

In [None]:
gc.collect()

In [None]:
list(poly_df.columns)

In [None]:
# `df` has been deleted earlier in the notebook, so inspecting it here would
# raise NameError on a linear run; report the shape of the frame that
# replaced it.
poly_df.shape