In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training applications, the test applications and the submission
# template, in that order.
train, test, sample_submission = map(
    pd.read_csv,
    (
        '/kaggle/input/home-credit-default-risk/application_train.csv',
        '/kaggle/input/home-credit-default-risk/application_test.csv',
        '/kaggle/input/home-credit-default-risk/sample_submission.csv',
    ),
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Per-column count of missing values in the training frame.
train.isna().sum()

In [None]:
# Missing-value handling
#
# Forward-fill gaps with the previous row's value. `DataFrame.ffill()` replaces
# the deprecated `fillna(method="ffill", inplace=True)` form and returns a new
# frame instead of mutating in place.
train = train.ffill()

# The test frame cannot lose rows (every row needs a prediction), so values
# that are still missing after the forward fill (leading rows that have no
# previous value) are back-filled from the next available row.
test = test.ffill().bfill()

# Training rows that still contain NaN after the forward fill are dropped,
# and the index is rebuilt so that labels are 0..n-1 again.
train = train.dropna().reset_index(drop=True)

In [None]:
# Per-column count of missing values in the training frame.
train.isna().sum()

In [None]:
# Per-column count of missing values in the test frame.
test.isna().sum()

In [None]:
# Categorical feature detection
#
# Collect the names of all object-dtype columns of the training frame; these
# are the columns that get label-encoded.
categorical_fields = [
    name for name, dtype in train.dtypes.items() if dtype == object
]

print(categorical_fields)

In [None]:
# Label-encode every categorical column.
#
# A single encoder per column is fitted on the union of the train and test
# values, so a given category maps to the same integer code in both frames.
# (Fitting independent encoders on each frame can assign different codes to the
# same category whenever the two frames do not contain the same set of values.)
for col in categorical_fields:
    # Cast to str so that any remaining missing value becomes the category 'nan'
    # instead of breaking the encoder's sort.
    train_values = train[col].astype(str)
    test_values = test[col].astype(str)

    encoder = preprocessing.LabelEncoder()
    encoder.fit(pd.concat([train_values, test_values], ignore_index=True))

    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)

train.head()

In [None]:
# Normalization

from sklearn.preprocessing import MinMaxScaler

# Scale every feature column to the [0, 1] range.
#
# The feature columns are selected by name (everything except TARGET) rather
# than by position, so the result does not depend on where TARGET sits in the
# frame. TARGET itself is left untouched: it is a class label, not a feature.
feature_columns = [name for name in train.columns if name != 'TARGET']

scaler_x = MinMaxScaler()

# Learn the min/max of each feature on the training data only ...
train[feature_columns] = scaler_x.fit_transform(train[feature_columns])

# ... and apply the same transformation to the test data. Re-fitting the scaler
# on the test frame would map train and test onto different scales.
test[feature_columns] = scaler_x.transform(test[feature_columns])

train.head()

In [None]:
# Feature selection with recursive feature elimination (RFE)
#
# Candidate features are all training columns except the label.
columns = train.columns.tolist()
columns.remove('TARGET')

# A logistic regression ranks the features; RFE discards 10 features per
# iteration until 10 remain.
model = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model, n_features_to_select=10, step=10, verbose=1)

fit = rfe.fit(train[columns], train["TARGET"])

In [None]:
# Names of the columns RFE kept: `fit.support_` is a boolean mask aligned
# with `columns`.
features_list = [name for name, kept in zip(columns, fit.support_) if kept]

print(features_list)

In [None]:
# For each selected feature add a '<name>_LOG' column holding log(1 + x),
# in both the training and the test frame.
for source_name in features_list:
    log_name = source_name + '_LOG'
    train[log_name] = np.log1p(train[source_name])
    test[log_name] = np.log1p(test[source_name])

In [None]:
import matplotlib.pyplot as plt

# Distribution of the log-transformed credit amount.
# NOTE(review): 'AMT_CREDIT_LOG' exists only if AMT_CREDIT is in features_list - confirm.
fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(train['AMT_CREDIT_LOG'], bins=100)
plt.show()

In [None]:
# Distribution of the log-transformed annuity amount.
# NOTE(review): 'AMT_ANNUITY_LOG' exists only if AMT_ANNUITY is in features_list - confirm.
fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(train['AMT_ANNUITY_LOG'], bins=100)
plt.show()

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# 5-fold cross-validation of a logistic regression trained on the first
# RFE-selected feature; the metric is ROC AUC on each held-out fold.
roc_auc_list = []

# Default KFold: 5 consecutive, unshuffled folds.
kf = KFold(n_splits=5)

model = LogisticRegression()
features = features_list[:1]

X_all = train[features]
y_all = train['TARGET']

for i, (train_index, test_index) in enumerate(kf.split(train)):
    # KFold yields positional indices, so select rows by position (.iloc)
    # instead of by label; this stays correct even if the index is not 0..n-1.
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    model.fit(X_train, y_train)
    predict = model.predict_proba(X_test)[:, 1]  # probability of class 1
    roc_auc = roc_auc_score(y_test, predict)
    roc_auc_list.append(roc_auc)
    print(i, roc_auc)

# The summary is over ROC AUC values (the first label used to say "rmse").
print("mean roc_auc for 5-fold: {}".format(str(np.mean(roc_auc_list))))
print("std roc_auc for 5-fold: {}".format(str(np.std(roc_auc_list))))

In [None]:
# Refit the current model on the whole training frame, then store the
# predicted probability of class 1 for every test row in the submission.
model.fit(train[features], train['TARGET'])

test_probabilities = model.predict_proba(test[features])
sample_submission['TARGET'] = test_probabilities[:, 1]

sample_submission.head()

In [None]:
# Write the submission file without the row index.
sample_submission.to_csv("submission.csv", index=False)

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# 5-fold cross-validation of a logistic regression trained on the first two
# RFE-selected features; the metric is ROC AUC on each held-out fold.
roc_auc_list = []

# Default KFold: 5 consecutive, unshuffled folds.
kf = KFold(n_splits=5)

model = LogisticRegression()
features = features_list[:2]

X_all = train[features]
y_all = train['TARGET']

for i, (train_index, test_index) in enumerate(kf.split(train)):
    # KFold yields positional indices, so select rows by position (.iloc)
    # instead of by label; this stays correct even if the index is not 0..n-1.
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    model.fit(X_train, y_train)
    predict = model.predict_proba(X_test)[:, 1]  # probability of class 1
    roc_auc = roc_auc_score(y_test, predict)
    roc_auc_list.append(roc_auc)
    print(i, roc_auc)

# The summary is over ROC AUC values (the first label used to say "rmse").
print("mean roc_auc for 5-fold: {}".format(str(np.mean(roc_auc_list))))
print("std roc_auc for 5-fold: {}".format(str(np.std(roc_auc_list))))

In [None]:
# Refit the current model on the whole training frame, then store the
# predicted probability of class 1 for every test row in the submission.
model.fit(train[features], train['TARGET'])

test_probabilities = model.predict_proba(test[features])
sample_submission['TARGET'] = test_probabilities[:, 1]

sample_submission.head()

In [None]:
# Write the submission file without the row index.
sample_submission.to_csv("submission.csv", index=False)

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# 5-fold cross-validation of a logistic regression trained on the first four
# RFE-selected features; the metric is ROC AUC on each held-out fold.
roc_auc_list = []

# Default KFold: 5 consecutive, unshuffled folds.
kf = KFold(n_splits=5)

model = LogisticRegression()
features = features_list[:4]

X_all = train[features]
y_all = train['TARGET']

for i, (train_index, test_index) in enumerate(kf.split(train)):
    # KFold yields positional indices, so select rows by position (.iloc)
    # instead of by label; this stays correct even if the index is not 0..n-1.
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    model.fit(X_train, y_train)
    predict = model.predict_proba(X_test)[:, 1]  # probability of class 1
    roc_auc = roc_auc_score(y_test, predict)
    roc_auc_list.append(roc_auc)
    print(i, roc_auc)

# The summary is over ROC AUC values (the first label used to say "rmse").
print("mean roc_auc for 5-fold: {}".format(str(np.mean(roc_auc_list))))
print("std roc_auc for 5-fold: {}".format(str(np.std(roc_auc_list))))

In [None]:
# Refit the current model on the whole training frame, then store the
# predicted probability of class 1 for every test row in the submission.
model.fit(train[features], train['TARGET'])

test_probabilities = model.predict_proba(test[features])
sample_submission['TARGET'] = test_probabilities[:, 1]

sample_submission.head()

In [None]:
# Write the submission file without the row index.
sample_submission.to_csv("submission.csv", index=False)

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# 5-fold cross-validation of a logistic regression trained on the first six
# RFE-selected features; the metric is ROC AUC on each held-out fold.
roc_auc_list = []

# Default KFold: 5 consecutive, unshuffled folds.
kf = KFold(n_splits=5)

model = LogisticRegression()
features = features_list[:6]

X_all = train[features]
y_all = train['TARGET']

for i, (train_index, test_index) in enumerate(kf.split(train)):
    # KFold yields positional indices, so select rows by position (.iloc)
    # instead of by label; this stays correct even if the index is not 0..n-1.
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    model.fit(X_train, y_train)
    predict = model.predict_proba(X_test)[:, 1]  # probability of class 1
    roc_auc = roc_auc_score(y_test, predict)
    roc_auc_list.append(roc_auc)
    print(i, roc_auc)

# The summary is over ROC AUC values (the first label used to say "rmse").
print("mean roc_auc for 5-fold: {}".format(str(np.mean(roc_auc_list))))
print("std roc_auc for 5-fold: {}".format(str(np.std(roc_auc_list))))

In [None]:
# Refit the current model on the whole training frame, then store the
# predicted probability of class 1 for every test row in the submission.
model.fit(train[features], train['TARGET'])

test_probabilities = model.predict_proba(test[features])
sample_submission['TARGET'] = test_probabilities[:, 1]

sample_submission.head()

In [None]:
# Write the submission file without the row index.
sample_submission.to_csv("submission.csv", index=False)

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# 5-fold cross-validation of a logistic regression trained on the first eight
# RFE-selected features; the metric is ROC AUC on each held-out fold.
roc_auc_list = []

# Default KFold: 5 consecutive, unshuffled folds.
kf = KFold(n_splits=5)

model = LogisticRegression(max_iter=1000)
features = features_list[:8]

X_all = train[features]
y_all = train['TARGET']

for i, (train_index, test_index) in enumerate(kf.split(train)):
    # KFold yields positional indices, so select rows by position (.iloc)
    # instead of by label; this stays correct even if the index is not 0..n-1.
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    model.fit(X_train, y_train)
    predict = model.predict_proba(X_test)[:, 1]  # probability of class 1
    roc_auc = roc_auc_score(y_test, predict)
    roc_auc_list.append(roc_auc)
    print(i, roc_auc)

# The summary is over ROC AUC values (the first label used to say "rmse").
print("mean roc_auc for 5-fold: {}".format(str(np.mean(roc_auc_list))))
print("std roc_auc for 5-fold: {}".format(str(np.std(roc_auc_list))))

In [None]:
# Refit the current model on the whole training frame, then store the
# predicted probability of class 1 for every test row in the submission.
model.fit(train[features], train['TARGET'])

test_probabilities = model.predict_proba(test[features])
sample_submission['TARGET'] = test_probabilities[:, 1]

sample_submission.head()

In [None]:
# Write the submission file without the row index.
sample_submission.to_csv("submission.csv", index=False)

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# 5-fold cross-validation of a logistic regression trained on all
# RFE-selected features; the metric is ROC AUC on each held-out fold.
roc_auc_list = []

# Default KFold: 5 consecutive, unshuffled folds.
kf = KFold(n_splits=5)

model = LogisticRegression(max_iter=1000)
features = features_list

X_all = train[features]
y_all = train['TARGET']

for i, (train_index, test_index) in enumerate(kf.split(train)):
    # KFold yields positional indices, so select rows by position (.iloc)
    # instead of by label; this stays correct even if the index is not 0..n-1.
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    model.fit(X_train, y_train)
    predict = model.predict_proba(X_test)[:, 1]  # probability of class 1
    roc_auc = roc_auc_score(y_test, predict)
    roc_auc_list.append(roc_auc)
    print(i, roc_auc)

# The summary is over ROC AUC values (the first label used to say "rmse").
print("mean roc_auc for 5-fold: {}".format(str(np.mean(roc_auc_list))))
print("std roc_auc for 5-fold: {}".format(str(np.std(roc_auc_list))))

In [None]:
# Refit the current model on the whole training frame, then store the
# predicted probability of class 1 for every test row in the submission.
model.fit(train[features], train['TARGET'])

test_probabilities = model.predict_proba(test[features])
sample_submission['TARGET'] = test_probabilities[:, 1]

sample_submission.head()

In [None]:
# Write the submission file without the row index.
sample_submission.to_csv("submission.csv", index=False)

In [None]:
# Relationship between the cross-validation score and the leaderboard score.
#
# Each pair of values is a hand-recorded score for one submission; the two
# lists are aligned element by element.
cross_validation_rates = [0.51859, 0.53020, 0.59376, 0.67591, 0.71215, 0.71230]
leaderboard_rates = [0.51045, 0.53447, 0.59210, 0.65682, 0.70142, 0.70151]

# Not used by the plot below; kept so the name stays defined.
metrics = np.arange(0.0, 1.0, 0.01)

# x: leaderboard score, y: cross-validation score. The cross-validation values
# come from roc_auc_score, so the axis is labelled as ROC AUC (it used to say
# 'RMSE').
plt.plot(leaderboard_rates, cross_validation_rates)
plt.ylabel('Cross-validation ROC AUC', fontsize=18)
plt.xlabel('Leaderboard', fontsize=18)
plt.show()