In [None]:
import sys
import os
import numpy as np
import pandas as pd
from scipy.stats import gmean
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from tqdm import tqdm_notebook as tqdm
from toolkit.postprocessing import BlendingOptimizer

sys.path.append('../')
from src.utils import read_oof_predictions, calculate_rank

# Root of the experiment directory; the value below is a placeholder to replace before running.
PROJECT_DIR = 'PATH/TO/YOUR/EXPERIMENT'

# One out-of-fold prediction directory per model family, all under
# <PROJECT_DIR>/files/out_of_fold_predictions/.
FIRST_LEVEL_PREDICTIONS_DIR, SECOND_LEVEL_PREDICTIONS_DIR, MISC_PREDICTIONS_DIR = (
    os.path.join(PROJECT_DIR, 'files', 'out_of_fold_predictions', family)
    for family in ('first_level', 'second_level', 'misc')
)

# Training table passed to read_oof_predictions together with the id/target column names.
TRAIN_FILEPATH = os.path.join(PROJECT_DIR, 'files', 'unzipped_data', 'application_train.csv')

# Directory the final submission.csv is written to (placeholder as well).
OUTPUT_DIR = 'PATH/TO/YOUR/OUTPUT'

In [None]:
# Load the (train, test) out-of-fold prediction pair for each model family, in the
# order first level, second level, misc. Every call receives the same training file
# and the 'SK_ID_CURR' / 'TARGET' column names (presumably the id and label columns
# -- confirm against src.utils.read_oof_predictions).
(
    (train_oof, test_oof),
    (train_oof_second, test_oof_second),
    (train_oof_misc, test_oof_misc),
) = [
    read_oof_predictions(predictions_dir, TRAIN_FILEPATH, 'SK_ID_CURR', 'TARGET')
    for predictions_dir in (
        FIRST_LEVEL_PREDICTIONS_DIR,
        SECOND_LEVEL_PREDICTIONS_DIR,
        MISC_PREDICTIONS_DIR,
    )
]

In [None]:
# Preview the first rows of the first-level out-of-fold training predictions.
train_oof.head()

# Predictions
## Correlations

In [None]:
# Prediction columns are identified by the '_cv_' marker in their name.
X_cols = [name for name in train_oof.columns if '_cv_' in name]
X = train_oof[X_cols]
y = train_oof['TARGET']

# Pairwise correlation between every prediction column and the target.
X_corr = pd.concat([X, y], axis=1).corr()

# Correlation of each column with TARGET, strongest first.
display(X_corr['TARGET'].sort_values(ascending=False))

# Annotated heatmap of the full correlation matrix; the colour scale is pinned to [0, 1].
plt.figure(figsize=(16, 12))
sns.heatmap(
    X_corr,
    annot=True,
    vmin=0.0,
    vmax=1.0,
    xticklabels=X_corr.columns,
    yticklabels=X_corr.columns,
)
plt.show()

# Ranks 

In [None]:
def _rank_columns_within_folds(oof, cols, fold_ids):
    """Return a copy of ``oof`` whose ``cols`` are replaced by per-fold ranks.

    Rows are emitted fold by fold, in the order given by ``fold_ids``; rows whose
    ``fold_id`` is not listed are left out of the result.
    """
    ranked_folds = []
    for fold_id in fold_ids:
        # Explicit copy: the caller's frame is never written to and pandas does
        # not raise SettingWithCopyWarning on the assignments below.
        fold = oof[oof['fold_id'] == fold_id].copy()
        if fold.empty:
            # A fold may exist in only one of the train/test frames.
            continue
        for col in cols:
            fold[col] = calculate_rank(fold[col])
        ranked_folds.append(fold)

    if not ranked_folds:
        # pd.concat refuses an empty list; return an empty frame with the same columns.
        return oof.iloc[0:0].copy()
    return pd.concat(ranked_folds, axis=0)


def transform_to_ranks(train_oof, test_oof, fold_ids=None):
    """Replace every prediction column by its rank computed within each fold.

    Prediction columns are the columns of ``train_oof`` whose name contains
    ``'_cv_'``; the same column list is applied to ``test_oof``. For every fold,
    ``calculate_rank`` (imported from ``src.utils``) is applied to each of those
    columns separately, and the per-fold frames are concatenated in fold order.

    Args:
        train_oof: DataFrame with a ``fold_id`` column and the prediction columns.
        test_oof: DataFrame with a ``fold_id`` column and the same prediction columns.
        fold_ids: optional iterable of fold ids to process, in output order.
            Defaults to the sorted ids present in either frame, so no fold is
            dropped when the experiment uses a fold count other than 5.

    Returns:
        Tuple ``(train_oof_rank, test_oof_rank)`` of new DataFrames; the inputs
        are left unmodified.
    """
    X_cols = [col for col in train_oof.columns if '_cv_' in col]

    if fold_ids is None:
        train_folds = train_oof['fold_id'].dropna().unique()
        test_folds = test_oof['fold_id'].dropna().unique()
        fold_ids = sorted(set(train_folds).union(test_folds))
    else:
        # Materialise once: the ids are iterated for both frames.
        fold_ids = list(fold_ids)

    train_oof_rank = _rank_columns_within_folds(train_oof, X_cols, fold_ids)
    test_oof_rank = _rank_columns_within_folds(test_oof, X_cols, fold_ids)
    return train_oof_rank, test_oof_rank

# Rank-transform the first-level train and test predictions fold by fold.
train_oof_rank, test_oof_rank = transform_to_ranks(train_oof, test_oof)

In [None]:
# Same correlation view as for the raw predictions, now on the per-fold ranks.
# X and y defined here are also the inputs of the weight optimisation further down.
X_cols = [name for name in train_oof_rank.columns if '_cv_' in name]
X = train_oof_rank[X_cols]
y = train_oof_rank['TARGET']
X_corr = pd.concat([X, y], axis=1).corr()

# Correlation of each ranked column with TARGET, strongest first.
display(X_corr['TARGET'].sort_values(ascending=False))

# Annotated heatmap of the rank correlation matrix; the colour scale is pinned to [0, 1].
plt.figure(figsize=(16, 12))
sns.heatmap(
    X_corr,
    annot=True,
    vmin=0.0,
    vmax=1.0,
    xticklabels=X_corr.columns,
    yticklabels=X_corr.columns,
)
plt.show()

# Weights optimization

In [None]:
# Optimise blending weights for the first-level models so that ROC AUC is maximised.
blender = BlendingOptimizer(metric=roc_auc_score, maximize=True)

# The frame is transposed so that each row holds one model's predictions.
blender.fit(
    X=X.T,
    y=y.tolist(),
    step_size=0.25,
    init_weights=None,
    warm_start=False,
)

In [None]:
# ROC AUC of the blended first-level predictions on the ranked training rows.
# NOTE(review): these appear to be the same rows the weights were fitted on,
# so this is an in-sample score -- confirm before quoting it.
X_cols = [name for name in train_oof_rank.columns if '_cv_' in name]
y_pred = blender.transform(train_oof_rank[X_cols].T)['y_pred']
roc_auc_score(y_true=y, y_score=y_pred)

In [None]:
# Apply the fitted first-level blender to the ranked test predictions.
y_pred_test = blender.transform(test_oof_rank[X_cols].transpose())['y_pred']

def create_submission(y_pred_test, test):
    """Build a submission frame with one averaged, fold-ranked score per id.

    The predictions are attached to the ``SK_ID_CURR`` / ``fold_id`` columns of
    ``test``, re-ranked within each fold via ``calculate_rank`` (imported from
    ``src.utils``), and then averaged over all folds for every ``SK_ID_CURR``.

    Args:
        y_pred_test: predictions for the rows of ``test``; an array-like of the
            same length, or a Series that is aligned on the index of ``test``.
        test: DataFrame containing at least ``SK_ID_CURR`` and ``fold_id``.

    Returns:
        DataFrame with columns ``SK_ID_CURR`` and ``TARGET``, one row per id.
        ``test`` itself is not modified.
    """
    # .assign returns a new frame, so nothing is written into a slice of `test`
    # (avoids SettingWithCopyWarning).
    test_predictions = test[['SK_ID_CURR', 'fold_id']].assign(TARGET=y_pred_test)

    # Re-rank inside each fold so folds are on a common scale before averaging.
    ranked_folds = [
        fold_df.assign(TARGET=calculate_rank(fold_df['TARGET']))
        for _, fold_df in test_predictions.groupby('fold_id')
    ]
    submission = pd.concat(ranked_folds, axis=0)

    # Average the per-fold ranks of every id.
    return submission.groupby('SK_ID_CURR')['TARGET'].mean().reset_index()

# Per-id submission built from the blended first-level test predictions.
submission_first = create_submission(y_pred_test, test_oof_rank)

In [None]:
# Preview the first rows of the first-level submission.
submission_first.head()

# Second level models

In [None]:
# Rank-transform the second-level train and test predictions fold by fold.
train_oof_second_rank, test_oof_second_rank = transform_to_ranks(train_oof_second, test_oof_second)

In [None]:
# Feature matrix (ranked '_cv_' prediction columns) and target for the second-level blend.
X_cols = [name for name in train_oof_second_rank.columns if '_cv_' in name]
X = train_oof_second_rank[X_cols]
y = train_oof_second_rank['TARGET']

In [None]:
# Preview the ranked second-level prediction columns.
X.head()

In [None]:
# Optimise blending weights for the second-level models so that ROC AUC is maximised;
# this run uses a step size of 0.05.
blender_second = BlendingOptimizer(metric=roc_auc_score, maximize=True)

# The frame is transposed so that each row holds one model's predictions.
blender_second.fit(
    X=X.T,
    y=y.tolist(),
    step_size=0.05,
    init_weights=None,
    warm_start=False,
)

In [None]:
# ROC AUC of the blended second-level predictions on the ranked training rows.
# NOTE(review): these appear to be the same rows the weights were fitted on,
# so this is an in-sample score -- confirm before quoting it.
y_pred = blender_second.transform(train_oof_second_rank[X_cols].T)['y_pred']
roc_auc_score(y_true=y, y_score=y_pred)

In [None]:
# Apply the fitted second-level blender to the ranked test predictions ...
y_pred_test_second = blender_second.transform(test_oof_second_rank[X_cols].T)['y_pred']

# ... and collapse them into one score per id.
submission_second = create_submission(y_pred_test_second, test_oof_second_rank)

In [None]:
# Preview the first rows of the second-level submission.
submission_second.head()

# Misc Models

In [None]:
# Rank-transform the misc train and test predictions fold by fold.
train_oof_misc_rank, test_oof_misc_rank = transform_to_ranks(train_oof_misc, test_oof_misc)

In [None]:
# Preview the ranked misc test predictions.
test_oof_misc_rank.head()

In [None]:
# The misc submission uses a single ranked prediction column directly; no blending
# weights are fitted for it.
# NOTE(review): the column name is hard-coded, so this cell raises KeyError unless
# that exact column is present in the misc predictions -- confirm it matches your experiment.
submission_misc = create_submission(test_oof_misc_rank['hc_11095_cv_7957_lb_805'], test_oof_misc_rank)

In [None]:
# Preview the first rows of the misc submission.
submission_misc.head()

# Average First+Second+Misc

In [None]:
# Equal-weight average of the first-level, second-level and misc submissions.
# The three score series are keyed by SK_ID_CURR so they are combined per
# applicant id instead of per row position; adding them positionally would
# silently mix different ids if the frames ever differed in order or content.
submission = submission_first.copy()

sub_first = submission_first.set_index('SK_ID_CURR')['TARGET']
sub_second = submission_second.set_index('SK_ID_CURR')['TARGET']
sub_misc = submission_misc.set_index('SK_ID_CURR')['TARGET']

# Index-aligned arithmetic: an id missing from any of the three inputs yields NaN.
blended = (1.0 * sub_first + 1.0 * sub_second + 1.0 * sub_misc) / 3.

# Write the blended score back in the row order of the first-level submission.
submission['TARGET'] = submission['SK_ID_CURR'].map(blended)

In [None]:
# Preview the first rows of the averaged submission.
submission.head()

# Submission

In [None]:
# Write the averaged submission to OUTPUT_DIR as CSV, without the DataFrame index column.
# `index` is documented as a bool, so pass False explicitly instead of None.
submission.to_csv(os.path.join(OUTPUT_DIR, 'submission.csv'), index=False)