In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import pandas as pd
from sklearn.metrics import roc_auc_score
import numpy as np

# Stack models

In [None]:
# Number of folds used for cross-validated training of the stacking model.
# `kfold` is also used further down as the divisor when averaging per-fold test predictions.

kfold = 3
# Shuffled, stratified splitter; random_state=0 pins the shuffle so fold membership is repeatable.
skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=0)

In [None]:
# Load the predictions of every base model that goes into the stack.
# For each model there is a "cv" file and a "sub" file.
# NOTE(review): presumably "cv" = out-of-fold predictions on the training set and
# "sub" = predictions on the test set, each keyed by SK_ID_CURR — confirm against the files.

# Open solution trained with two different seeds:
# https://github.com/neptune-ml/open-solution-home-credit

# seed 0
df_cv_nept_1 = pd.read_csv(r'open_solution_cv_1.csv')
df_sub_nept_1 = pd.read_csv(r'open_solution_sub_1.csv')

# seed 90210
df_cv_nept_2 = pd.read_csv(r'open_solution_cv_2.csv')
df_sub_nept_2 = pd.read_csv(r'open_solution_sub_2.csv')

# Public Kaggle kernel: https://www.kaggle.com/aantonova/797-lgbm-and-bayesian-optimization

df_cv_kaggle = pd.read_csv(r'kaggle_cv.csv')
df_sub_kaggle = pd.read_csv(r'kaggle_sub.csv')

# Own XGBoost model. These two frames are the left side of the merges below,
# so they define which rows end up in the stacked data.

df_cv = pd.read_csv(r'model_cv.csv')
df_sub = pd.read_csv(r'model_sub.csv')

In [None]:
# Attach each base model's cross-validation predictions to the XGBoost frame:
# one left join per model, keyed on the applicant id.
df_cv = (
    df_cv
    .merge(df_cv_nept_1, how='left', on=['SK_ID_CURR'])
    .merge(df_cv_nept_2, how='left', on=['SK_ID_CURR'])
    .merge(df_cv_kaggle, how='left', on=['SK_ID_CURR'])
)

# Columns are renamed purely by position, so the names below depend on the
# join order above and on the column layout of the input files.
df_cv.columns = [
    'SK_ID_CURR',
    'TARGET',
    'MODEL',
    'OPEN_SOLUTION_1',
    'OPEN_SOLUTION_2',
    'KAGGLE',
]

In [None]:
# Attach each base model's test-set predictions to the XGBoost submission frame:
# one left join per model, keyed on the applicant id.
df_sub = (
    df_sub
    .merge(df_sub_nept_1, how='left', on=['SK_ID_CURR'])
    .merge(df_sub_nept_2, how='left', on=['SK_ID_CURR'])
    .merge(df_sub_kaggle, how='left', on=['SK_ID_CURR'])
)

# Columns are renamed purely by position, so the names below depend on the
# join order above and on the column layout of the input files.
df_sub.columns = [
    'SK_ID_CURR',
    'MODEL',
    'OPEN_SOLUTION_1',
    'OPEN_SOLUTION_2',
    'KAGGLE',
]

In [None]:
def normalize(df1, df2):
    """Rank-normalize prediction columns jointly over two frames.

    The rows of ``df1`` and ``df2`` are stacked, every column other than
    ``SK_ID_CURR`` and ``TARGET`` is replaced by its rank over the stacked rows,
    and the ranks are rescaled as ``(rank - min_rank + 1e-7) / (max_rank - min_rank)``.
    The stacked frame is then split back into its two parts.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame whose columns define the output layout (expected to include
        ``SK_ID_CURR`` and ``TARGET``).
    df2 : pandas.DataFrame
        Frame with the same prediction columns as ``df1``. It is NOT modified;
        a ``TARGET`` column filled with the placeholder 999 is added on a copy.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The normalized rows of ``df1`` and of ``df2`` (in their original order,
        each with a fresh 0..n-1 index). The second frame carries ``TARGET == 999``.

    Notes
    -----
    If a column has a single distinct rank (e.g. a constant column), the
    denominator is zero and the result for that column is not finite.
    """
    passthrough = ('SK_ID_CURR', 'TARGET')
    cols = list(df1.columns)
    n_first = len(df1)

    # Tag the second frame on a copy so the caller's object is left untouched.
    tagged = df2.assign(TARGET=999)
    stacked = pd.concat([df1[cols], tagged[cols]], ignore_index=True)

    for col in cols:
        if col in passthrough:
            continue
        ranks = stacked[col].rank()
        lowest, highest = ranks.min(), ranks.max()
        # The small offset keeps the smallest value strictly above zero.
        stacked[col] = (ranks - lowest + 1e-7) / (highest - lowest)

    # Split by position rather than by the 999 placeholder, so a row of df1
    # can never be routed to the wrong part because of its TARGET value.
    first_part = stacked.iloc[:n_first].reset_index(drop=True)
    second_part = stacked.iloc[n_first:].reset_index(drop=True)
    return first_part, second_part

# Rank-normalize the merged CV and submission predictions together:
# `df` receives the rows of df_cv, `df_test` the rows of df_sub.
df, df_test = normalize(df_cv, df_sub)

In [None]:
# Names of the base-model prediction columns: every column except the id and the label.
cols = [c for c in df.columns if c not in ['SK_ID_CURR', 'TARGET']]

In [None]:
# Map each normalized score p to log(p / (1 - p + 1e-7)), i.e. (approximately) its log-odds.
# The 1e-7 in the denominator keeps it non-zero when p equals 1.
# NOTE: df and df_test are overwritten in place, so running this cell a second time
# applies the transform to already-transformed values; re-run the normalize cell first.

for col in cols:
    df[col] = np.log(df[col] / (1 - df[col]+1e-7))
    df_test[col] = np.log(df_test[col] / (1 - df_test[col]+1e-7))

In [None]:
# Stacker inputs: one feature column per base model, and the label.
X = df[cols].values
y = df['TARGET'].values

# Accumulator for the fold-averaged prediction on the submission rows.
y_pred = 0

# Same feature columns, taken from the submission rows.
X_test = df_test[cols].values


In [None]:
# Fit the logistic-regression stacker with stratified K-fold cross-validation.
# For every fold: fit on the training part, report validation AUC, store the
# out-of-fold predictions, and add this fold's share of the test prediction.

# NOTE: n_rand is used both as the regularisation strength C and as the random_state.
n_rand = 1

# Reset both accumulators inside this cell so that re-running it starts from zero
# instead of adding to the result of a previous run.
y_pred = np.zeros(X_test.shape[0])
# Float Series: writing probabilities into an integer Series (0 * TARGET) relies on
# silent upcasting, which pandas has deprecated.
pred_train = pd.Series(0.0, index=df.index)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):

    X_train, y_train = X[train_index], y[train_index]
    X_val, y_val = X[test_index], y[test_index]

    clf = LogisticRegression(fit_intercept=True,  C = n_rand, random_state=n_rand)
    clf.fit(X_train, y_train)

    # Score the validation fold once and reuse it for the report and the OOF vector.
    val_pred = clf.predict_proba(X_val)[:, 1]

    print('\nFold {}:\n'.format(str(i+1)))
    print('AUC STACK: ', roc_auc_score(y_val, val_pred))

    # Test prediction is the average over the per-fold models.
    y_pred += clf.predict_proba(X_test)[:, 1] / (kfold*n_rand)

    # Every row is validated exactly once, so its out-of-fold prediction is stored
    # as-is (no division by the fold count). skf yields positions, hence .iloc.
    pred_train.iloc[test_index] = val_pred

print('\nExpected AUC stack perfomance: ', roc_auc_score(y, pred_train))

In [None]:
# Use the fold-averaged stacker output as the submission score
# (this replaces the placeholder TARGET values in df_test).
df_test['TARGET'] = y_pred

# Rescale by the maximum so the largest score is exactly 1, then write id + score.
df_test['TARGET'] = df_test['TARGET'] / df_test['TARGET'].max()
df_test[['SK_ID_CURR', 'TARGET']].to_csv(r'best_sub_085.csv', index = False)

# Add data leak

## Find customers with history of loans

In [None]:
# Group applications that appear to belong to the same person and, inside each
# group, look up the TARGET of the neighbouring application.
train = pd.read_csv(r'train.csv')
test = pd.read_csv(r'test.csv')

# Keep only the id, the columns used for matching, and the label.
tr = train[['SK_ID_CURR','DAYS_BIRTH',
       'DAYS_EMPLOYED','DAYS_REGISTRATION','DAYS_ID_PUBLISH','CODE_GENDER','REGION_POPULATION_RELATIVE','TARGET']].copy()

# Give the test frame a TARGET column of NaN so both frames share the same columns.
# (This adds the column to `test` itself, not only to the copy below.)
test["TARGET"] = np.nan
tst = test[['SK_ID_CURR','DAYS_BIRTH',
       'DAYS_EMPLOYED','DAYS_REGISTRATION','DAYS_ID_PUBLISH','CODE_GENDER','REGION_POPULATION_RELATIVE','TARGET']].copy()

# Re-express registration / ID-publish days relative to DAYS_BIRTH.
# NOTE(review): presumably the DAYS_* columns are offsets from the application date, so
# these differences stay the same across applications of one person — confirm.
tr['DAYS_REGISTRATION'] = tr['DAYS_REGISTRATION'] - tr['DAYS_BIRTH']
tr['DAYS_ID_PUBLISH'] = tr['DAYS_ID_PUBLISH'] - tr['DAYS_BIRTH']

tst['DAYS_REGISTRATION'] = tst['DAYS_REGISTRATION'] - tst['DAYS_BIRTH']
tst['DAYS_ID_PUBLISH'] = tst['DAYS_ID_PUBLISH'] - tst['DAYS_BIRTH']

# Train and test rows are matched against each other, so stack them.
data = pd.concat([tr, tst])

# Number of applications per matching key, largest groups first.
d = data.groupby(['DAYS_REGISTRATION','DAYS_ID_PUBLISH',
               'CODE_GENDER','REGION_POPULATION_RELATIVE'])['SK_ID_CURR'].count().sort_values(ascending = False)

# Keep only keys shared by more than one application.
d = d[d > 1]

d = d.reset_index()
d.columns = ['DAYS_REGISTRATION','DAYS_ID_PUBLISH',
               'CODE_GENDER','REGION_POPULATION_RELATIVE', 'n']

# Cluster id = position in the size-sorted table (0 is the largest group).
d['cluster'] = d.index

# Left join: applications whose key is not shared get NaN for `n` and `cluster`.
data = pd.merge(data, d, how = 'left', on = ['DAYS_REGISTRATION','DAYS_ID_PUBLISH','CODE_GENDER','REGION_POPULATION_RELATIVE'])



# Order the rows inside each cluster by DAYS_BIRTH so the shifts below look at
# the adjacent application of the same cluster.
data = data.sort_values(by = ['cluster', 'DAYS_BIRTH'])

# lag_target  = TARGET of the NEXT row in the cluster (larger DAYS_BIRTH).
# lead_target = TARGET of the PREVIOUS row in the cluster (smaller DAYS_BIRTH).
# NOTE(review): if DAYS_BIRTH is minus the age in days, the next row is the earlier
# application, which would make "lag" the earlier loan — confirm.
data['lag_target'] = data.groupby('cluster', sort = False)['TARGET'].shift(-1)
data['lead_target'] = data.groupby('cluster', sort = False)['TARGET'].shift(1)

# Rows without a cluster get -1; rows at the edge of a cluster keep NaN.
data.loc[data['cluster'].isnull(), 'lag_target'] = -1
data.loc[data['cluster'].isnull(), 'lead_target'] = -1

# Diagnostics kept for reference (counts and mean TARGET per lag/lead value):

# data.groupby('lead_target')['SK_ID_CURR'].count()

# data.groupby('lead_target')['TARGET'].mean()

# data.groupby('lag_target')['SK_ID_CURR'].count()

# data.groupby('lag_target')['TARGET'].mean()



# Ids (train or test) whose next application in the same cluster has TARGET == 1.
special_ids = data[(data['lag_target'] == 1)]['SK_ID_CURR'].unique()

## Customers who previously had a credit default have an 82.8% chance of a new default

In [None]:
# Overlay the leak on the stacked submission: for every flagged applicant, add 1 to
# the score with probability LEAK_DEFAULT_RATE, then cap all scores at 1.

# Share of flagged customers expected to default again (see the heading above).
LEAK_DEFAULT_RATE = 0.828
# Fixed seed so the generated submission file is identical on every run.
leak_rng = np.random.default_rng(0)

best_try = pd.read_csv(r'best_sub_085.csv')

is_special = best_try['SK_ID_CURR'].isin(special_ids)
# One Bernoulli(LEAK_DEFAULT_RATE) draw per flagged row, done in a single vectorised call.
bump = leak_rng.random(int(is_special.sum())) < LEAK_DEFAULT_RATE
best_try.loc[is_special, 'TARGET'] = best_try.loc[is_special, 'TARGET'] + bump
best_try['TARGET'] = best_try['TARGET'].clip(upper=1)

best_try.to_csv(r'best_sub_leaky.csv', index = False)