In [108]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
from sklearn.model_selection import KFold
from IPython.display import HTML, display
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit

train = pd.read_csv("data/application_train.csv")
test = pd.read_csv("data/application_test.csv")

In [109]:
#common fuc
def error(actual, predicted):
    actual = np.log(actual)
    predicted = np.log(predicted)
    return np.sqrt(np.sum(np.square(actual-predicted))/len(actual))

def log_transform(frame, feature):
    frame[feature] = np.log1p(frame[feature].values)

def quadratic(frame, feature):
    frame[feature+'2'] = frame[feature]**2
    
#customize func    
def func_NAME_EDUCATION_TYPE(x):
    if x in ('Higher education', 'Academic degree'):
        return 1
    else:
        return 0
    
def func_NAME_HOUSING_TYPE(x):
    if x in ('Maternity leave', 'Unemployede'):
        return 1
    else:
        return 0

In [110]:
def feature_processing(frame):
    _FLAG_SUM = frame[['FLAG_DOCUMENT_2',
                            'FLAG_DOCUMENT_3',
                            'FLAG_DOCUMENT_4',
                            'FLAG_DOCUMENT_5',
                            'FLAG_DOCUMENT_6',
                            'FLAG_DOCUMENT_7',
                            'FLAG_DOCUMENT_8',
                            'FLAG_DOCUMENT_9',
                            'FLAG_DOCUMENT_10',
                            'FLAG_DOCUMENT_11',
                            'FLAG_DOCUMENT_12',
                            'FLAG_DOCUMENT_13',
                            'FLAG_DOCUMENT_14',
                            'FLAG_DOCUMENT_15',
                            'FLAG_DOCUMENT_16',
                            'FLAG_DOCUMENT_17',
                            'FLAG_DOCUMENT_18',
                            'FLAG_DOCUMENT_19',
                            'FLAG_DOCUMENT_20',
                            'FLAG_DOCUMENT_21']]
    frame['FLAG_DOCUMENT_SUM'] = _FLAG_SUM.sum(axis=1)

    _PHONE_SUM = frame[[
        'FLAG_MOBIL',
        'FLAG_EMP_PHONE',
        'FLAG_WORK_PHONE',
        'FLAG_CONT_MOBILE',
        'FLAG_PHONE']]
    frame['PHONE_SUM'] = _PHONE_SUM.sum(axis=1)

    # frame_DAYS_COLS = [col for col in frame.columns if 'DAYS_' in col]
    # frame_DAYS_COLS
    frame['YEARS_BIRTH'] = frame['DAYS_BIRTH'] * (-1) / 365
    frame['YEARS_EMPLOYED'] = frame['DAYS_EMPLOYED'] * (-1) / 365
    frame['YEARS_REGISTRATION'] = frame['DAYS_REGISTRATION'] * (-1) / 365
    frame['YEARS_ID_PUBLISH'] = frame['DAYS_ID_PUBLISH'] * (-1) / 365
    frame['YEARS_LAST_PHONE_CHANGE'] = frame['DAYS_LAST_PHONE_CHANGE'] * (-1) / 365

    frame['AMT_INCOME_TOTAL_PER_FAM_MEMBERS'] = frame['AMT_INCOME_TOTAL'] / frame['CNT_FAM_MEMBERS']



    frame['NAME_CONTRACT_TYPE'] = frame['NAME_CONTRACT_TYPE'].apply(lambda x: 1 if x =='Cash loans' else 0)
    frame['FLAG_OWN_CAR'] = frame['FLAG_OWN_CAR'].apply(lambda x: 1 if x == 'y' else 0)
    frame['AMT_INCOME_TOTAL'] = frame['AMT_INCOME_TOTAL'].apply(lambda x: 1 if x > 13.3 else 0)


    frame['NAME_EDUCATION_TYPE'] = frame['NAME_EDUCATION_TYPE'].apply(func_NAME_EDUCATION_TYPE)


    frame['NAME_HOUSING_TYPE'] = frame['NAME_HOUSING_TYPE'].apply(func_NAME_HOUSING_TYPE)

    frame['REGION_POPULATION_RELATIVE'] = frame['REGION_POPULATION_RELATIVE'].apply(lambda x: 1 if x >= 0.02 else 0)
    frame['OWN_CAR_AGE'] = frame['OWN_CAR_AGE'].apply(lambda x: 1 if x <= 10 else 0)
    
#     if isTrain == 'T' :
#         frame['ORGANIZATION_spmean'] = frame[['ORGANIZATION_TYPE', 'TARGET']].groupby(['ORGANIZATION_TYPE']).mean()['TARGET']
#         frame['ORGANIZATION_TYPE'] = frame['ORGANIZATION_spmean'].apply(lambda x: 1 if x > 0.08 else 0)
    
feature_processing(train)
feature_processing(test)

In [111]:
log_transform(train, 'AMT_CREDIT')
log_transform(train, 'AMT_ANNUITY')
log_transform(train, 'AMT_GOODS_PRICE')
log_transform(train, 'AMT_INCOME_TOTAL')

log_transform(test, 'AMT_CREDIT')
log_transform(test, 'AMT_ANNUITY')
log_transform(test, 'AMT_GOODS_PRICE')
log_transform(test, 'AMT_INCOME_TOTAL')

In [112]:
def drop_columns(frame):   
    frame = frame.drop(columns=['APARTMENTS_MEDI',
                                'BASEMENTAREA_MEDI',
                                'YEARS_BEGINEXPLUATATION_MEDI',
                                'YEARS_BUILD_MEDI',
                                'COMMONAREA_MEDI',
                                'ELEVATORS_MEDI',
                                'ENTRANCES_MEDI',
                                'FLOORSMAX_MEDI',
                                'FLOORSMIN_MEDI',
                                'LANDAREA_MEDI',
                                'LIVINGAPARTMENTS_MEDI',
                                'LIVINGAREA_MEDI',
                                'NONLIVINGAPARTMENTS_MEDI',
                                'NONLIVINGAREA_MEDI',
                                'APARTMENTS_MODE',
                                'BASEMENTAREA_MODE',
                                'YEARS_BEGINEXPLUATATION_MODE',
                                'YEARS_BUILD_MODE',
                                'COMMONAREA_MODE',
                                'ELEVATORS_MODE',
                                'ENTRANCES_MODE',
                                'FLOORSMAX_MODE',
                                'FLOORSMIN_MODE',
                                'LANDAREA_MODE',
                                'LIVINGAPARTMENTS_MODE',
                                'LIVINGAREA_MODE',
                                'NONLIVINGAPARTMENTS_MODE',
                                'NONLIVINGAREA_MODE',
                                'FONDKAPREMONT_MODE',
                                'HOUSETYPE_MODE',
                                'TOTALAREA_MODE',
                                'WALLSMATERIAL_MODE',
                                'EMERGENCYSTATE_MODE',
                                'APARTMENTS_AVG',
                                'BASEMENTAREA_AVG',
                                'YEARS_BEGINEXPLUATATION_AVG',
                                'YEARS_BUILD_AVG',
                                'COMMONAREA_AVG',
                                'ELEVATORS_AVG',
                                'ENTRANCES_AVG',
                                'FLOORSMAX_AVG',
                                'FLOORSMIN_AVG',
                                'LANDAREA_AVG',
                                'LIVINGAPARTMENTS_AVG',
                                'LIVINGAREA_AVG',
                                'NONLIVINGAPARTMENTS_AVG',
                                'NONLIVINGAREA_AVG'
                                ,
                                'FLAG_DOCUMENT_2',
                                'FLAG_DOCUMENT_3',
                                'FLAG_DOCUMENT_4',
                                'FLAG_DOCUMENT_5',
                                'FLAG_DOCUMENT_6',
                                'FLAG_DOCUMENT_7',
                                'FLAG_DOCUMENT_8',
                                'FLAG_DOCUMENT_9',
                                'FLAG_DOCUMENT_10',
                                'FLAG_DOCUMENT_11',
                                'FLAG_DOCUMENT_12',
                                'FLAG_DOCUMENT_13',
                                'FLAG_DOCUMENT_14',
                                'FLAG_DOCUMENT_15',
                                'FLAG_DOCUMENT_16',
                                'FLAG_DOCUMENT_17',
                                'FLAG_DOCUMENT_18',
                                'FLAG_DOCUMENT_19',
                                'FLAG_DOCUMENT_20',
                                'FLAG_DOCUMENT_21',
                                'FLAG_MOBIL',
                                'FLAG_EMP_PHONE',
                                'FLAG_WORK_PHONE',
                                'FLAG_CONT_MOBILE',
                                'FLAG_PHONE'
                                ,
#                                 'ORGANIZATION_spmean'
#                                 ,
                                'SK_ID_CURR'
                                ,
                                'DAYS_BIRTH',
                                'DAYS_EMPLOYED',
                                'DAYS_REGISTRATION',
                                'DAYS_ID_PUBLISH',
                                'DAYS_LAST_PHONE_CHANGE'
                                ])

drop_columns(train)
drop_columns(test)

In [113]:
numerical = [f for f in train.columns if train.dtypes[f] != 'object']
numerical.remove('TARGET')
categorical = [f for f in train.columns if train.dtypes[f] == 'object']

In [114]:
def categorical_processing(frame):
    for c in categorical:
        frame[c] = frame[c].astype('category')
        if frame[c].isnull().any():
            frame[c] = frame[c].cat.add_categories(['MISSING'])
            frame[c] = frame[c].fillna('MISSING')

categorical_processing(train)    
categorical_processing(test)

In [115]:
def encode(frame, feature):
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    # ordering['spmean'] = frame[[feature, 'TARGET']].groupby(feature).mean()['TARGET']
    # ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0]+1)
    ordering = ordering['ordering'].to_dict()

    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature+'_E'] = o

cate_encoded = []
for q in categorical:
    encode(train, q)
    encode(test, q)
    cate_encoded.append(q+'_E')
# print(cate_encoded)

In [120]:
split = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
for train_index, test_index in split.split(train, train["NAME_INCOME_TYPE"]):
    train_set = train.loc[train_index]
    test_set = train.loc[test_index]

In [122]:
features = numerical + cate_encoded

# X_train = train[features].fillna(0.).values
# y_train = train['TARGET'].values
X_test = test[features].fillna(0.).values


X_train = train_set[features].fillna(0.).values
y_train = train_set['TARGET'].values
X_test_set = test_set[features].fillna(0.).values
y_test_set = test_set['TARGET'].values

In [123]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE().fit_sample(X_train, y_train)


In [98]:
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

logreg = LogisticRegression()
logreg.fit(X_resampled, y_resampled) 
y_pred = logreg.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred) # 0.6373158605096496

0.611076092665894

In [99]:
logreg_10 = LogisticRegression(C=10)
logreg_10.fit(X_resampled, y_resampled)
y_pred_10 = logreg_10.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_10) # 0.6221440069544937

0.6433452197102968

In [100]:
logreg_0_1 = LogisticRegression(C=0.1)
logreg_0_1.fit(X_resampled, y_resampled)
y_pred_0_1 = logreg_0_1.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_0_1) # 0.6153402506562919

0.6126248017069886

In [101]:
logreg_00_1 = LogisticRegression(C=0.01)
logreg_00_1.fit(X_resampled, y_resampled)
y_pred_00_1 = logreg_00_1.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_00_1) # 0.658615496478248


0.594616189715705

In [102]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)            # 0.7190124215159207
# rf.fit(X_resampled, y_resampled)  # 0.6841750535878548 
y_pred_rf = rf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred_rf) 


0.7128036728221188

In [20]:
# importances = rf.feature_importances_
# indices = np.argsort(importances)[::-1]
# 
# Print the feature ranking
# print("Feature ranking:")
# 
# for f in range(X_train.shape[1]):
#     print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

In [103]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

lda = LDA()
lda.fit(X_train, y_train)           # 0.7211824305608072
# lda.fit(X_resampled, y_resampled)   # 0.7209399233541647
y_pred_lda = lda.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_lda) 



0.7194741707011784

In [22]:
# qda =QDA()
# qda.fit(X_train, y_train)           # 0.5799225915230429
# qda.fit(X_resampled, y_resampled)     # 0.6290432696915695
# y_pred_qda = qda.predict_proba(X_test)[:,1]
# roc_auc_score(y_test, y_pred_qda)



0.6290432696915695

In [23]:
# from sklearn.svm import SVC
# svc = SVC(random_state=0, probability=True)
# # svc.fit(X_train, y_train)
# svc.fit(X_resampled, y_resampled)
# y_pred_svc = svc.predict_proba(X_test)[:, 1]
# roc_auc_score(y_test, y_pred_svc)


In [104]:
import sklearn
from brew.base import Ensemble, EnsembleClassifier
from brew.stacking.stacker import EnsembleStack, EnsembleStackClassifier
from brew.combination.combiner import Combiner

# Creating Ensemble
ensemble = Ensemble([logreg_0_1, rf, lda])
eclf = EnsembleClassifier(ensemble=ensemble, combiner=Combiner('mean'))

eclf.fit(X_resampled, y_resampled) # 0.720510230719374
# eclf.fit(X_train, y_train)           # 0.7082597786530709
y_pred_ensemble = eclf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred_ensemble)     



0.7130636693391459

In [105]:
# Creating Stacking
layer_1 = Ensemble([logreg_0_1, rf, lda])
layer_2 = Ensemble([sklearn.clone(logreg_0_1)])
stack = EnsembleStack(cv=3)
stack.add_layer(layer_1)
stack.add_layer(layer_2)

sclf = EnsembleStackClassifier(stack)

# sclf.fit(X_resampled, y_resampled) # 0.6557789148329445
sclf.fit(X_train, y_train)           # 0.7247774845417322
y_pred_stack = sclf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred_stack)    




0.7227254393559674

In [124]:
import lightgbm as lgb

# create dataset for lightgbm
# lgb_train = lgb.Dataset(X_resampled, y_resampled) # 0.7032034257774565
lgb_train = lgb.Dataset(X_train, y_train)           # 0.7310050369948817
lgb_eval = lgb.Dataset(X_test_set, y_test_set, reference=lgb_train)

# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                 valid_sets=lgb_eval,
                early_stopping_rounds=5)

print('Save model...')
# save model to file
gbm.save_model('model.txt')

print('Start predicting...')
# predict
y_pred_gbm = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# roc_auc_score(y_test, y_pred_gbm) 


[1]	valid_0's auc: 0.698063	valid_0's l2: 0.0742975
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.703834	valid_0's l2: 0.0739496
[3]	valid_0's auc: 0.710396	valid_0's l2: 0.0737181
[4]	valid_0's auc: 0.713534	valid_0's l2: 0.073414
[5]	valid_0's auc: 0.714685	valid_0's l2: 0.0731426
[6]	valid_0's auc: 0.715765	valid_0's l2: 0.0728919
[7]	valid_0's auc: 0.715841	valid_0's l2: 0.0726669
[8]	valid_0's auc: 0.716341	valid_0's l2: 0.0724565
[9]	valid_0's auc: 0.717728	valid_0's l2: 0.0722567
[10]	valid_0's auc: 0.722815	valid_0's l2: 0.072134
[11]	valid_0's auc: 0.72424	valid_0's l2: 0.0719841
[12]	valid_0's auc: 0.725156	valid_0's l2: 0.0718427
[13]	valid_0's auc: 0.725968	valid_0's l2: 0.0716795
[14]	valid_0's auc: 0.726247	valid_0's l2: 0.0715674
[15]	valid_0's auc: 0.726843	valid_0's l2: 0.0714255
[16]	valid_0's auc: 0.727099	valid_0's l2: 0.0712983
[17]	valid_0's auc: 0.727225	valid_0's l2: 0.0711978
[18]	valid_0's auc: 0.728034	valid_0's l2: 0.07108

In [94]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier( DecisionTreeClassifier(max_depth=1), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)
# ada_clf.fit(X_resampled, y_resampled) # 0.7121451170928985
ada_clf.fit(X_train, y_train)           # 0.7458267768231585
y_pred_ada = ada_clf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred_ada)    

In [107]:
from sklearn.ensemble import GradientBoostingClassifier

gbrt = GradientBoostingClassifier(max_depth=1, random_state=0).fit(X_train, y_train)          # 0.7257854798881231
# gbrt = GradientBoostingClassifier(max_depth=1, random_state=0).fit(X_resampled, y_resampled)    # 0.6811196557588269
y_pred_gbrt = gbrt.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred_gbrt)  




0.7256956739415097

In [125]:
my_submission = pd.DataFrame({'SK_ID_CURR': test.SK_ID_CURR, 'TARGET': y_pred_gbm})
my_submission.to_csv('submission_lightgbm.csv', index=False)