In [52]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
from sklearn.model_selection import KFold
from IPython.display import HTML, display
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit

train = pd.read_csv("4th_week/data/application_train.csv")
test = pd.read_csv("4th_week/data/application_test.csv")

In [53]:
TRAIN_FLAG_SUM = train[['FLAG_DOCUMENT_2',
                        'FLAG_DOCUMENT_3',
                        'FLAG_DOCUMENT_4',
                        'FLAG_DOCUMENT_5',
                        'FLAG_DOCUMENT_6',
                        'FLAG_DOCUMENT_7',
                        'FLAG_DOCUMENT_8',
                        'FLAG_DOCUMENT_9',
                        'FLAG_DOCUMENT_10',
                        'FLAG_DOCUMENT_11',
                        'FLAG_DOCUMENT_12',
                        'FLAG_DOCUMENT_13',
                        'FLAG_DOCUMENT_14',
                        'FLAG_DOCUMENT_15',
                        'FLAG_DOCUMENT_16',
                        'FLAG_DOCUMENT_17',
                        'FLAG_DOCUMENT_18',
                        'FLAG_DOCUMENT_19',
                        'FLAG_DOCUMENT_20',
                        'FLAG_DOCUMENT_21']]
train['FLAG_DOCUMENT_SUM'] = TRAIN_FLAG_SUM.sum(axis=1)

In [54]:
TRAIN_PHONE_SUM = train[[
    'FLAG_MOBIL',
    'FLAG_EMP_PHONE',
    'FLAG_WORK_PHONE',
    'FLAG_CONT_MOBILE',
    'FLAG_PHONE']]
train['PHONE_SUM'] = TRAIN_PHONE_SUM.sum(axis=1)

In [55]:
def error(actual, predicted):
    actual = np.log(actual)
    predicted = np.log(predicted)
    return np.sqrt(np.sum(np.square(actual-predicted))/len(actual))

def log_transform(feature):
    train[feature] = np.log1p(train[feature].values)

def quadratic(feature):
    train[feature+'2'] = train[feature]**2

In [56]:
log_transform('AMT_CREDIT')
log_transform('AMT_ANNUITY')
log_transform('AMT_GOODS_PRICE')
log_transform('AMT_INCOME_TOTAL')

train['NAME_CONTRACT_TYPE'] = train['NAME_CONTRACT_TYPE'].apply(lambda x: 1 if x =='Cash loans' else 0)
train['FLAG_OWN_CAR'] = train['FLAG_OWN_CAR'].apply(lambda x: 1 if x == 'y' else 0)
train['AMT_INCOME_TOTAL'] = train['AMT_INCOME_TOTAL'].apply(lambda x: 1 if x > 13.3 else 0)

def func_NAME_EDUCATION_TYPE(x):
    if x in ('Higher education', 'Academic degree'):
        return 1
    else:
        return 0
train['NAME_EDUCATION_TYPE'] = train['NAME_EDUCATION_TYPE'].apply(func_NAME_EDUCATION_TYPE)

def func_NAME_HOUSING_TYPE(x):
    if x in ('Maternity leave', 'Unemployede'):
        return 1
    else:
        return 0
train['NAME_HOUSING_TYPE'] = train['NAME_HOUSING_TYPE'].apply(func_NAME_HOUSING_TYPE)

train['REGION_POPULATION_RELATIVE'] = train['REGION_POPULATION_RELATIVE'].apply(lambda x: 1 if x >= 0.02 else 0)
train['OWN_CAR_AGE'] = train['OWN_CAR_AGE'].apply(lambda x: 1 if x <= 10 else 0)

train['ORGANIZATION_spmean'] = train[['ORGANIZATION_TYPE', 'TARGET']].groupby(['ORGANIZATION_TYPE']).mean()['TARGET']
train['ORGANIZATION_TYPE'] = train['ORGANIZATION_spmean'].apply(lambda x: 1 if x > 0.08 else 0)


In [57]:
train = train.drop(columns=['APARTMENTS_MEDI',
                            'BASEMENTAREA_MEDI',
                            'YEARS_BEGINEXPLUATATION_MEDI',
                            'YEARS_BUILD_MEDI',
                            'COMMONAREA_MEDI',
                            'ELEVATORS_MEDI',
                            'ENTRANCES_MEDI',
                            'FLOORSMAX_MEDI',
                            'FLOORSMIN_MEDI',
                            'LANDAREA_MEDI',
                            'LIVINGAPARTMENTS_MEDI',
                            'LIVINGAREA_MEDI',
                            'NONLIVINGAPARTMENTS_MEDI',
                            'NONLIVINGAREA_MEDI',
                            'APARTMENTS_MODE',
                            'BASEMENTAREA_MODE',
                            'YEARS_BEGINEXPLUATATION_MODE',
                            'YEARS_BUILD_MODE',
                            'COMMONAREA_MODE',
                            'ELEVATORS_MODE',
                            'ENTRANCES_MODE',
                            'FLOORSMAX_MODE',
                            'FLOORSMIN_MODE',
                            'LANDAREA_MODE',
                            'LIVINGAPARTMENTS_MODE',
                            'LIVINGAREA_MODE',
                            'NONLIVINGAPARTMENTS_MODE',
                            'NONLIVINGAREA_MODE',
                            'FONDKAPREMONT_MODE',
                            'HOUSETYPE_MODE',
                            'TOTALAREA_MODE',
                            'WALLSMATERIAL_MODE',
                            'EMERGENCYSTATE_MODE',
                            'APARTMENTS_AVG',
                            'BASEMENTAREA_AVG',
                            'YEARS_BEGINEXPLUATATION_AVG',
                            'YEARS_BUILD_AVG',
                            'COMMONAREA_AVG',
                            'ELEVATORS_AVG',
                            'ENTRANCES_AVG',
                            'FLOORSMAX_AVG',
                            'FLOORSMIN_AVG',
                            'LANDAREA_AVG',
                            'LIVINGAPARTMENTS_AVG',
                            'LIVINGAREA_AVG',
                            'NONLIVINGAPARTMENTS_AVG',
                            'NONLIVINGAREA_AVG'
                            ,
                            'FLAG_DOCUMENT_2',
                            'FLAG_DOCUMENT_3',
                            'FLAG_DOCUMENT_4',
                            'FLAG_DOCUMENT_5',
                            'FLAG_DOCUMENT_6',
                            'FLAG_DOCUMENT_7',
                            'FLAG_DOCUMENT_8',
                            'FLAG_DOCUMENT_9',
                            'FLAG_DOCUMENT_10',
                            'FLAG_DOCUMENT_11',
                            'FLAG_DOCUMENT_12',
                            'FLAG_DOCUMENT_13',
                            'FLAG_DOCUMENT_14',
                            'FLAG_DOCUMENT_15',
                            'FLAG_DOCUMENT_16',
                            'FLAG_DOCUMENT_17',
                            'FLAG_DOCUMENT_18',
                            'FLAG_DOCUMENT_19',
                            'FLAG_DOCUMENT_20',
                            'FLAG_DOCUMENT_21',
                            'FLAG_MOBIL',
                            'FLAG_EMP_PHONE',
                            'FLAG_WORK_PHONE',
                            'FLAG_CONT_MOBILE',
                            'FLAG_PHONE'
                            ,
                            'ORGANIZATION_spmean'
                            ,
                            'SK_ID_CURR'
                            ])

In [58]:
numerical = [f for f in train.columns if train.dtypes[f] != 'object']
numerical.remove('TARGET')
categorical = [f for f in train.columns if train.dtypes[f] == 'object']

In [59]:
for c in categorical:
    train[c] = train[c].astype('category')
    if train[c].isnull().any():
        train[c] = train[c].cat.add_categories(['MISSING'])
        train[c] = train[c].fillna('MISSING')

In [60]:
def encode(frame, feature):
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    # ordering['spmean'] = frame[[feature, 'TARGET']].groupby(feature).mean()['TARGET']
    # ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0]+1)
    ordering = ordering['ordering'].to_dict()

    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature+'_E'] = o

cate_encoded = []
for q in categorical:
    encode(train, q)
    encode(test, q)
    cate_encoded.append(q+'_E')
# print(cate_encoded)

In [35]:
split = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42, )
for train_index, test_index in split.split(train, train["NAME_INCOME_TYPE"]):
    train_set = train.loc[train_index]
    test_set = train.loc[test_index]

In [62]:
# features = numerical + cate_encoded
# X_train = train[features].fillna(0.).values
# y_train = train['TARGET'].values
# X_test = test[features].fillna(0.).values


X_train = train_set[features].fillna(0.).values
y_train = train_set['TARGET'].values
X_test = test_set[features].fillna(0.).values
y_test = test_set['TARGET'].values

In [37]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE().fit_sample(X_train, y_train)


In [38]:
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# logreg = LogisticRegression()
# logreg.fit(X_resampled, y_resampled) 
# y_pred = logreg.predict_proba(X_test)[:,1]
# roc_auc_score(y_test, y_pred) # 0.6373158605096496

In [15]:
# logreg_10 = LogisticRegression(C=10)
# logreg_10.fit(X_resampled, y_resampled)
# y_pred_10 = logreg_10.predict_proba(X_test)[:,1]
# roc_auc_score(y_test, y_pred_10) # 0.6221440069544937

In [45]:
logreg_0_1 = LogisticRegression(C=0.1)
logreg_0_1.fit(X_resampled, y_resampled)
y_pred_0_1 = logreg_0_1.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_0_1) # 0.6607293666286557

0.6325535127299906

In [17]:
# logreg_00_1 = LogisticRegression(C=0.01)
# logreg_00_1.fit(X_resampled, y_resampled)
# y_pred_00_1 = logreg_00_1.predict_proba(X_test)[:,1]
# roc_auc_score(y_test, y_pred_00_1) # 0.658615496478248


In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)            # 0.7190124215159207
# rf.fit(X_resampled, y_resampled)  # 0.6841750535878548 
y_pred_rf = rf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred_rf) 


0.6832749406941135

In [25]:
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

Feature ranking:
1. feature 44 (0.151524)
2. feature 30 (0.094911)
3. feature 8 (0.093183)
4. feature 10 (0.088280)
5. feature 29 (0.071331)
6. feature 25 (0.064045)
7. feature 15 (0.061227)
8. feature 31 (0.052512)
9. feature 33 (0.043026)
10. feature 18 (0.033106)
11. feature 41 (0.032314)
12. feature 47 (0.032179)
13. feature 26 (0.030855)
14. feature 19 (0.029111)
15. feature 45 (0.025792)
16. feature 43 (0.015207)
17. feature 17 (0.013287)
18. feature 50 (0.010965)
19. feature 28 (0.010716)
20. feature 3 (0.009622)
21. feature 24 (0.007029)
22. feature 32 (0.006869)
23. feature 49 (0.004867)
24. feature 42 (0.004544)
25. feature 1 (0.003758)
26. feature 48 (0.003414)
27. feature 34 (0.002043)
28. feature 7 (0.001702)
29. feature 22 (0.000591)
30. feature 23 (0.000541)
31. feature 5 (0.000440)
32. feature 35 (0.000329)
33. feature 16 (0.000220)
34. feature 20 (0.000216)
35. feature 46 (0.000145)
36. feature 40 (0.000081)
37. feature 11 (0.000022)
38. feature 14 (0.000000)
39. featu

In [40]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

lda = LDA()
lda.fit(X_train, y_train)           # 0.7211824305608072
lda.fit(X_resampled, y_resampled)   # 0.7209399233541647
y_pred_lda = lda.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_lda) 



0.7211824305608072

In [43]:
qda =QDA()
qda.fit(X_train, y_train)           # 0.5799225915230429
# qda.fit(X_resampled, y_resampled) # 0.6193209887224465
y_pred_qda = qda.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_qda)



0.5799225915230429

In [21]:
# from sklearn.svm import SVC
# svc = SVC(random_state=0, probability=True)
# # svc.fit(X_train, y_train)
# svc.fit(X_resampled, y_resampled)
# y_pred_svc = svc.predict_proba(X_test)[:, 1]
# roc_auc_score(y_test, y_pred_svc)


In [46]:
import sklearn
from brew.base import Ensemble, EnsembleClassifier
from brew.stacking.stacker import EnsembleStack, EnsembleStackClassifier
from brew.combination.combiner import Combiner

# Creating Ensemble
ensemble = Ensemble([logreg_0_1, rf, lda])
eclf = EnsembleClassifier(ensemble=ensemble, combiner=Combiner('mean'))

# eclf.fit(X_resampled, y_resampled) # 0.7175472150622211
eclf.fit(X_train, y_train)           # 0.7082597786530709
y_pred_ensemble = eclf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred_ensemble)     





0.7082597786530709

In [47]:
# Creating Stacking
layer_1 = Ensemble([logreg_0_1, rf, lda])
layer_2 = Ensemble([sklearn.clone(logreg_0_1)])
stack = EnsembleStack(cv=3)
stack.add_layer(layer_1)
stack.add_layer(layer_2)

sclf = EnsembleStackClassifier(stack)

# sclf.fit(X_resampled, y_resampled) # 0.6557789148329445
sclf.fit(X_train, y_train)           # 0.7247774845417322
y_pred_stack = sclf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred_stack)    










0.7247774845417322

In [48]:
import lightgbm as lgb

# create dataset for lightgbm
# lgb_train = lgb.Dataset(X_resampled, y_resampled) # 0.7032034257774565
lgb_train = lgb.Dataset(X_train, y_train)           # 0.7310050369948817
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)

print('Save model...')
# save model to file
gbm.save_model('model.txt')

print('Start predicting...')
# predict
y_pred_gbm = gbm.predict(X_test, num_iteration=gbm.best_iteration)
roc_auc_score(y_test, y_pred_gbm) 


[1]	valid_0's auc: 0.700379	valid_0's l2: 0.0738693
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.716268	valid_0's l2: 0.073691


[3]	valid_0's auc: 0.718226	valid_0's l2: 0.0734266
[4]	valid_0's auc: 0.722806	valid_0's l2: 0.073111
[5]	valid_0's auc: 0.72497	valid_0's l2: 0.0728113
[6]	valid_0's auc: 0.724578	valid_0's l2: 0.0725492
[7]	valid_0's auc: 0.724426	valid_0's l2: 0.0723091
[8]	valid_0's auc: 0.724563	valid_0's l2: 0.0720812


[9]	valid_0's auc: 0.724327	valid_0's l2: 0.0718823
[10]	valid_0's auc: 0.725367	valid_0's l2: 0.0717242
[11]	valid_0's auc: 0.725559	valid_0's l2: 0.07155
[12]	valid_0's auc: 0.727919	valid_0's l2: 0.0714469
[13]	valid_0's auc: 0.728134	valid_0's l2: 0.0712896
[14]	valid_0's auc: 0.728355	valid_0's l2: 0.0711481


[15]	valid_0's auc: 0.728629	valid_0's l2: 0.0710039
[16]	valid_0's auc: 0.729219	valid_0's l2: 0.0708939
[17]	valid_0's auc: 0.730231	valid_0's l2: 0.070791
[18]	valid_0's auc: 0.730428	valid_0's l2: 0.0706768
[19]	valid_0's auc: 0.730615	valid_0's l2: 0.0705705
[20]	valid_0's auc: 0.731005	valid_0's l2: 0.0704654
Did not meet early stopping. Best iteration is:
[20]	valid_0's auc: 0.731005	valid_0's l2: 0.0704654
Save model...
Start predicting...


0.7310050369948817

In [49]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier( DecisionTreeClassifier(max_depth=1), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)
# ada_clf.fit(X_resampled, y_resampled) # 0.7121451170928985
ada_clf.fit(X_train, y_train)           # 0.7458267768231585
y_pred_ada = ada_clf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred_ada)    

0.7458267768231585

In [51]:
from sklearn.ensemble import GradientBoostingClassifier

# gbrt = GradientBoostingClassifier(max_depth=1, random_state=0).fit(X_train, y_train)          # 0.7257854798881231
gbrt = GradientBoostingClassifier(max_depth=1, random_state=0).fit(X_resampled, y_resampled)    # 0.6811196557588269
y_pred_gbrt = gbrt.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred_gbrt)  




0.6811196557588269

In [25]:
my_submission = pd.DataFrame({'SK_ID_CURR': test.SK_ID_CURR, 'TARGET': y_pred})
my_submission.to_csv('submission.csv', index=False)