In [20]:
# import necessary modules
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
# Data location: defaults to the original local folder. Set the
# HOME_CREDIT_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = os.environ.get(
    "HOME_CREDIT_DATA_DIR",
    "C:/Users/manas/Desktop/Springboard/Capstone ideas/Home credit default risk/Data",
)
# listing files
print(os.listdir(DATA_DIR))
# training data
train = pd.read_csv(os.path.join(DATA_DIR, "application_train.csv"))
print('training dataset shape:', train.shape)
# test data
test = pd.read_csv(os.path.join(DATA_DIR, "application_test.csv"))
print('test dataset shape:', test.shape)

['application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'HomeCredit_columns_description.csv', 'installments_payments.csv', 'POS_CASH_balance.csv', 'previous_application.csv', 'sample_submission.csv']
training dataset shape: (307511, 122)
test dataset shape: (48744, 121)


In [22]:
# Preview the first five rows of the raw training data
train.head(n=5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In the earlier notebook I predicted the default risk without any feature engineering. Here I add engineered features and examine whether the score improves.

In [3]:
## Missing values in application_train
n_vars = train.shape[1]
print('Total number of variables:', n_vars)
miss_val = train.isnull().sum() # number of missing values in each column
miss_val = miss_val.sort_values(ascending=False) # sorting from max to min
miss_list = list(miss_val)
# Select by condition instead of searching for the first 0 count, which
# raised ValueError when every column had at least one missing value.
miss_var = miss_val[miss_val > 0] # only variables with missing values
ind = len(miss_var)
print('Out of', n_vars, ', number of variables with missing values:', ind)

percent_miss = miss_var/len(train) * 100 # percent missing value for each variable
percents = [10, 30, 50]
for percent in percents:
    # Vectorised count; also reports 0 when no variable exceeds the threshold
    n_above = int((percent_miss > percent).sum())
    print('Out of', ind, ', there are', n_above, 'variables with more than', percent, '% missing values')

Total number of variables: 122
Out of 122 , number of variables with missing values: 67
Out of 67 , there are 57 variables with more than 10 % missing values
Out of 67 , there are 50 variables with more than 30 % missing values
Out of 67 , there are 41 variables with more than 50 % missing values


Discussion
Among the 122 variables, 67 have missing values: 57 have more than 10% of their values missing, 50 have more than 30% missing, and 41 have more than 50% missing. So a large number of variables have missing values. How can we deal with them?

In [4]:
# Column dtypes: object columns are the categorical variables
dtype_counts = train.dtypes.value_counts() # integer, float, categorical
print('Variable types are:\n', dtype_counts)
obj = train.select_dtypes('object') # categorical columns only
obj_uni = obj.nunique() # distinct (non-missing) categories per column
print('\nUnique values in categorical variables are:\n', obj_uni.sort_values())

Variable types are:
 float64    65
int64      41
object     16
dtype: int64

Unique values in categorical variables are:
 NAME_CONTRACT_TYPE             2
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
EMERGENCYSTATE_MODE            2
CODE_GENDER                    3
HOUSETYPE_MODE                 3
FONDKAPREMONT_MODE             4
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
NAME_TYPE_SUITE                7
WEEKDAY_APPR_PROCESS_START     7
WALLSMATERIAL_MODE             7
NAME_INCOME_TYPE               8
OCCUPATION_TYPE               18
ORGANIZATION_TYPE             58
dtype: int64


Label encoding and One-hot encoding: 

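A total of 4 categorical variables have 2 unique values. For categorical variables with 2 unique categories we will use label encoding, and for those with more than 2 unique categories we will use one-hot encoding. The purpose is to keep both values in a single column when there are only 2 categories, so the number of columns stays the same for these variables. Note that the encoding cell below counts missing values (NaN) as a category, so EMERGENCYSTATE_MODE (2 categories plus missing values) is one-hot encoded and only 3 columns are label encoded.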

In [23]:
# Label-encode every object column that holds at most two distinct values
le = LabelEncoder()
le_count = 0
for col in train:
    if train[col].dtype != 'object':
        continue
    # dropna=False counts NaN as a value, exactly like len(unique())
    if train[col].nunique(dropna=False) > 2:
        continue
    # fit on train only, then reuse the same mapping for test
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])
    le_count += 1
print(le_count)

3


In [24]:
## One-hot encode the remaining object columns in both frames
train, test = (pd.get_dummies(frame) for frame in (train, test))
print(train.shape, test.shape)

(307511, 243) (48744, 239)


Through one hot coding we have 3 extra features in train data. We will get rid of them using align.

In [25]:
## Keep only the columns that train and test share
# The inner join drops TARGET (absent from test), so save it first and restore it after.
train_labels = train.loc[:, 'TARGET']
train, test = train.align(test, axis=1, join='inner')
train['TARGET'] = train_labels
print(train.shape, test.shape)

(307511, 240) (48744, 239)


In [26]:
# Preview the encoded and aligned training data
train.head(n=5)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,TARGET
0,100002,0,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0,0,0,0,0,1,0,1,0,1
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,1,0,0,0,0,0,0,1,0,0
2,100004,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0,0,0,0,0,0,0,0,0,0
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0,0,0,0,0,0,0,0,0,0
4,100007,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,0,0,0,0,0,0,0,0,0,0


We observed earlier that there are anomalies in  DAYS_EMPLOYED feature. We would replace it as before.

A large spike can be seen at 365243 days (about 1000 years); these values are actually anomalies. About 18% of the DAYS_EMPLOYED values are anomalous.

In [27]:
# Flag and neutralise the anomalous DAYS_EMPLOYED value in both datasets
DAYS_EMPLOYED_ANOMALY = 365243
for frame in (train, test):
    # create an anomaly indicator column first, in case it is useful
    frame['DAYS_EMPLOYED_ANOM'] = frame['DAYS_EMPLOYED'] == DAYS_EMPLOYED_ANOMALY
    # Replace the anomaly with 0 and assign the result back to the frame.
    # Series.replace(..., inplace=True) on frame[col] is chained assignment
    # and does not reliably update the frame.
    frame['DAYS_EMPLOYED'] = frame['DAYS_EMPLOYED'].replace({DAYS_EMPLOYED_ANOMALY: 0})

In [12]:
# Correlation of every feature with TARGET, sorted ascending
correlations = train.corr().loc[:, 'TARGET'].sort_values(ascending=True)


In [13]:
# Both ends of the sorted correlation series
most_negative = correlations.head(5)
most_positive = correlations.tail(5)
print('Most negative correlations:\n', most_negative, '\n\nMost positive correlations\n', most_positive)

Most negative correlations:
 EXT_SOURCE_3                           -0.178919
EXT_SOURCE_2                           -0.160472
EXT_SOURCE_1                           -0.155317
NAME_EDUCATION_TYPE_Higher education   -0.056593
CODE_GENDER_F                          -0.054704
Name: TARGET, dtype: float64 

Most positive correlations
 NAME_INCOME_TYPE_Working       0.057481
REGION_RATING_CLIENT           0.058899
REGION_RATING_CLIENT_W_CITY    0.060893
DAYS_BIRTH                     0.078239
TARGET                         1.000000
Name: TARGET, dtype: float64


The correlation of any individual variable with the target is not strong. DAYS_BIRTH has the strongest positive correlation and EXT_SOURCE_3 has the strongest negative correlation.

First we will predict the model using only the features available at train and test dataset. There are many missing values. We first will replace them with median irrespective of the variable and then scale them with MinMaxScaler. 

In [28]:
# Preview the training data after adding the DAYS_EMPLOYED_ANOM flag
train.head(n=5)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,TARGET,DAYS_EMPLOYED_ANOM
0,100002,0,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0,0,0,0,1,0,1,0,1,False
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,0,0,0,0,0,0,1,0,0,False
2,100004,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0,0,0,0,0,0,0,0,0,False
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0,0,0,0,0,0,0,0,0,False
4,100007,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,0,0,0,0,0,0,0,0,0,False


Since EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3, and DAYS_BIRTH have stronger correlation with the TARGET, we will use them for creating new features

## Feature Engineering

In [29]:
# Feature engineering by combining several features to have higher correlation with the target
POLY_BASE_COLS = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
poly_features = train[POLY_BASE_COLS]
poly_features_test = test[POLY_BASE_COLS]

# imputer for handling missing values
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy = 'median')

# Replace missing values: learn the medians on train only and reuse them for
# test, so both sets are imputed identically and no test statistics are used.
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

# create polynomial object
from sklearn.preprocessing import PolynomialFeatures
poly_transformer = PolynomialFeatures(degree = 3)

# train the polynomial features
poly_transformer.fit(poly_features)

# transform the features
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print(poly_features.shape) # we started with 4 variables as poly features and ended with 35
columns = poly_transformer.get_feature_names(POLY_BASE_COLS)
print(columns[:10]) # new feature names

(307511, 35)
['1', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'EXT_SOURCE_1^2', 'EXT_SOURCE_1 EXT_SOURCE_2', 'EXT_SOURCE_1 EXT_SOURCE_3', 'EXT_SOURCE_1 DAYS_BIRTH', 'EXT_SOURCE_2^2']


We started with 4 features and after transformation we have 35 features which are all possible combination between polynomial features, such as EXT_SOURCE_1^2, EXT_SOURCE_1 EXT_SOURCE_2, EXT_SOURCE_1 EXT_SOURCE_3 etc.

In [30]:
# Display the transformed training feature array (a NumPy array at this point)
poly_features

array([[ 1.00000000e+00,  8.30369674e-02,  2.62948593e-01, ...,
        -1.83785678e+02,  1.24755987e+07, -8.46859039e+11],
       [ 1.00000000e+00,  3.11267311e-01,  6.22245775e-01, ...,
        -4.80351894e+03,  1.50447540e+08, -4.71205850e+12],
       [ 1.00000000e+00,  5.05997931e-01,  5.55912083e-01, ...,
        -1.01375679e+04,  2.64650402e+08, -6.90893871e+12],
       ...,
       [ 1.00000000e+00,  7.44026400e-01,  5.35721752e-01, ...,
        -7.16860892e+02,  4.90203102e+07, -3.35210198e+12],
       [ 1.00000000e+00,  5.05997931e-01,  5.14162820e-01, ...,
        -5.22638430e+03,  9.45696770e+07, -1.71120670e+12],
       [ 1.00000000e+00,  7.34459669e-01,  7.08568896e-01, ...,
        -2.18762433e+02,  3.23681708e+07, -4.78920655e+12]])

In [31]:
# Wrap the polynomial arrays in named DataFrames and attach the loan ID
# (the ID column is aligned on the row index of train / test)
poly_features = pd.DataFrame(poly_features, columns = columns).assign(
    SK_ID_CURR = train['SK_ID_CURR'])

# same transformation for the test dataset
poly_features_test = pd.DataFrame(poly_features_test, columns = columns).assign(
    SK_ID_CURR = test['SK_ID_CURR'])
print(poly_features.shape, poly_features_test.shape)

(307511, 36) (48744, 36)


In [32]:
# Preview the polynomial feature frame
poly_features.head(n=5)

Unnamed: 0,1,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_BIRTH,EXT_SOURCE_1^2,EXT_SOURCE_1 EXT_SOURCE_2,EXT_SOURCE_1 EXT_SOURCE_3,EXT_SOURCE_1 DAYS_BIRTH,EXT_SOURCE_2^2,...,EXT_SOURCE_2^2 EXT_SOURCE_3,EXT_SOURCE_2^2 DAYS_BIRTH,EXT_SOURCE_2 EXT_SOURCE_3^2,EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH,EXT_SOURCE_2 DAYS_BIRTH^2,EXT_SOURCE_3^3,EXT_SOURCE_3^2 DAYS_BIRTH,EXT_SOURCE_3 DAYS_BIRTH^2,DAYS_BIRTH^3,SK_ID_CURR
0,1.0,0.083037,0.262949,0.139376,-9461.0,0.006895,0.021834,0.011573,-785.612748,0.069142,...,0.009637,-654.152107,0.005108,-346.733022,23536670.0,0.002707,-183.785678,12475600.0,-846859000000.0,100002
1,1.0,0.311267,0.622246,0.535276,-16765.0,0.096887,0.193685,0.166614,-5218.396475,0.38719,...,0.207254,-6491.237078,0.178286,-5583.975307,174891600.0,0.153368,-4803.518937,150447500.0,-4712058000000.0,100003
2,1.0,0.505998,0.555912,0.729567,-19046.0,0.256034,0.28129,0.369159,-9637.236584,0.309038,...,0.225464,-5885.942404,0.295894,-7724.580288,201657200.0,0.388325,-10137.567875,264650400.0,-6908939000000.0,100004
3,1.0,0.505998,0.650442,0.535276,-19005.0,0.256034,0.329122,0.270849,-9616.490669,0.423074,...,0.226462,-8040.528832,0.186365,-6616.894625,234933100.0,0.153368,-5445.325225,193336400.0,-6864416000000.0,100006
4,1.0,0.505998,0.322738,0.535276,-19932.0,0.256034,0.163305,0.270849,-10085.550751,0.10416,...,0.055754,-2076.117157,0.092471,-3443.335521,128219000.0,0.153368,-5710.929881,212657000.0,-7918677000000.0,100007


## Correlation with TARGET

In [18]:
# find correlation in train
# poly_features itself carries no TARGET column, so attach the labels to a
# temporary copy (rows are in train order); poly_features stays unchanged.
poly_with_target = poly_features.assign(TARGET = np.asarray(train_labels))
poly_corrs = poly_with_target.corr()['TARGET'].sort_values()
print(poly_corrs.head(10), '\n', poly_corrs.tail(10))

EXT_SOURCE_2 EXT_SOURCE_3                -0.193939
EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3   -0.189605
EXT_SOURCE_2^2 EXT_SOURCE_3              -0.176428
EXT_SOURCE_2 EXT_SOURCE_3^2              -0.172282
EXT_SOURCE_1 EXT_SOURCE_2                -0.166625
EXT_SOURCE_1 EXT_SOURCE_3                -0.164065
EXT_SOURCE_2                             -0.160295
EXT_SOURCE_1 EXT_SOURCE_2^2              -0.156867
EXT_SOURCE_3                             -0.155892
EXT_SOURCE_1 EXT_SOURCE_3^2              -0.150822
Name: TARGET, dtype: float64 
 EXT_SOURCE_1 DAYS_BIRTH                 0.104882
EXT_SOURCE_3^2 DAYS_BIRTH               0.141777
EXT_SOURCE_2^2 DAYS_BIRTH               0.149313
EXT_SOURCE_3 DAYS_BIRTH                 0.150109
EXT_SOURCE_1 EXT_SOURCE_3 DAYS_BIRTH    0.151816
EXT_SOURCE_1 EXT_SOURCE_2 DAYS_BIRTH    0.155891
EXT_SOURCE_2 DAYS_BIRTH                 0.156873
EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH    0.181283
TARGET                                  1.000000
1                 

The correlations are stronger than before. What would happen if we included all the variables instead of just 4? We will try that later.

## Feature engineering based on domain knowledge

There are several features which are important for understanding the financial capacity of a client, such as the credit amount relative to the client's income (CREDIT_INCOME_PERCENT), the loan annuity relative to the client's income (ANNUITY_INCOME_PERCENT), the annuity relative to the credit amount (CREDIT_TERM, a proxy for the length of the payment in months, since the annuity is the monthly amount due; the code computes AMT_ANNUITY / AMT_CREDIT, i.e. the inverse of the number of payments), and the days employed relative to the client's age (DAYS_EMPLOYED_PERCENT).

In [33]:
# If TARGET has been dropped from train, restore it first:
#train['TARGET'] = train_labels
def _with_domain_ratios(frame):
    """Return a copy of `frame` with the four domain ratio columns added."""
    return frame.assign(
        CREDIT_INCOME_PERCENT = frame['AMT_CREDIT'] / frame['AMT_INCOME_TOTAL'],    # loan to income ratio
        ANNUITY_INCOME_PERCENT = frame['AMT_ANNUITY'] / frame['AMT_INCOME_TOTAL'],  # annuity to income ratio
        CREDIT_TERM = frame['AMT_ANNUITY'] / frame['AMT_CREDIT'],                   # annuity to loan ratio
        DAYS_EMPLOYED_PERCENT = frame['DAYS_EMPLOYED'] / frame['DAYS_BIRTH'],       # employment to age ratio
    )

train_domain = _with_domain_ratios(train)
test_domain = _with_domain_ratios(test)

In [34]:
# Join the polynomial features onto the domain-feature frames by loan ID
train_domain['SK_ID_CURR'] = poly_features['SK_ID_CURR']
test_domain['SK_ID_CURR'] = poly_features_test['SK_ID_CURR']
train_eng = pd.merge(train_domain, poly_features, how = 'left', on = 'SK_ID_CURR')
test_eng = pd.merge(test_domain, poly_features_test, how = 'left', on = 'SK_ID_CURR')

# keep only the columns present in both frames
train_eng, test_eng = train_eng.align(test_eng, axis = 1, join = 'inner')
for frame in (train_eng, test_eng):
    print(frame.shape)


(307511, 279)
(48744, 279)


In [35]:
# From here on, train / test refer to the engineered feature frames
train, test = train_eng, test_eng

## Correlation with TARGET

In [250]:
# Correlation of each domain feature with TARGET in the training data
domain_cols = ['CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT', 'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT', 'TARGET']
domain_feat = train_domain.loc[:, domain_cols]
domain_corrs = domain_feat.corr().loc[:, 'TARGET'].sort_values()
print(domain_corrs)

DAYS_EMPLOYED_PERCENT    -0.027817
CREDIT_INCOME_PERCENT    -0.004019
CREDIT_TERM              -0.002918
ANNUITY_INCOME_PERCENT    0.007971
TARGET                    1.000000
Name: TARGET, dtype: float64


The correlation is not strong

The distinction is not apparent.

## Model prediction

First we will predict using only the features in the train and test datasets. Then we will include more features.

In [37]:
## Median-impute missing values (the MinMaxScaler created here is applied in the next cell)
from sklearn.preprocessing import MinMaxScaler, Imputer
train_target = train_labels
# set the loan IDs aside so they are neither imputed nor scaled
train_ID, test_ID = train['SK_ID_CURR'], test['SK_ID_CURR']
train = train.drop(columns='SK_ID_CURR')
test = test.drop(columns='SK_ID_CURR').copy()
features = train.columns.tolist()
imputer = Imputer(strategy='median')
scaler = MinMaxScaler(feature_range = (0, 1))
# medians are learned from train only and applied to both sets
train = imputer.fit(train).transform(train)
test = imputer.transform(test)
train

array([[ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
        -1.83785678e+02,  1.24755987e+07, -8.46859039e+11],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -4.80351894e+03,  1.50447540e+08, -4.71205850e+12],
       [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
        -1.01375679e+04,  2.64650402e+08, -6.90893871e+12],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
        -7.16860892e+02,  4.90203102e+07, -3.35210198e+12],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
        -5.22638430e+03,  9.45696770e+07, -1.71120670e+12],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -2.18762433e+02,  3.23681708e+07, -4.78920655e+12]])

In [38]:
# Restore the column names, then scale with the range learned from train only
train, test = (pd.DataFrame(values, columns = features) for values in (train, test))
train = scaler.fit(train).transform(train)
test = scaler.transform(test)
print(train.shape, test.shape)
train

(307511, 278) (48744, 278)


array([[0.        , 0.        , 1.        , ..., 0.99034384, 0.02307702,
        0.97270563],
       [0.        , 0.        , 0.        , ..., 0.74761873, 0.27893581,
        0.72554322],
       [1.        , 1.        , 1.        , ..., 0.46736281, 0.49071659,
        0.58506245],
       ...,
       [0.        , 0.        , 1.        , ..., 0.96233557, 0.09084649,
        0.81250643],
       [0.        , 0.        , 1.        , ..., 0.72540099, 0.17531443,
        0.91743442],
       [0.        , 0.        , 0.        , ..., 0.98850613, 0.05996632,
        0.72060995]])

In [39]:
# Wrap the scaled arrays back into DataFrames with the feature names
train, test = (pd.DataFrame(values, columns = features) for values in (train, test))
print(train.shape, test.shape)

(307511, 278) (48744, 278)


In [40]:
# Preview the scaled training features
train.head(n=5)

Unnamed: 0,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH_x,...,EXT_SOURCE_2^3,EXT_SOURCE_2^2 EXT_SOURCE_3,EXT_SOURCE_2^2 DAYS_BIRTH,EXT_SOURCE_2 EXT_SOURCE_3^2,EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH,EXT_SOURCE_2 DAYS_BIRTH^2,EXT_SOURCE_3^3,EXT_SOURCE_3^2 DAYS_BIRTH,EXT_SOURCE_3 DAYS_BIRTH^2,DAYS_BIRTH^3
0,0.0,0.0,1.0,0.0,0.001512,0.090287,0.090032,0.077441,0.256321,0.888839,...,0.029088,0.01633,0.960392,0.008212,0.979415,0.046172,0.003764,0.990344,0.023077,0.972706
1,0.0,0.0,0.0,0.0,0.002089,0.311736,0.132924,0.271605,0.045016,0.477114,...,0.385468,0.351196,0.60697,0.286645,0.668486,0.343083,0.213204,0.747619,0.278936,0.725543
2,1.0,1.0,1.0,0.0,0.000358,0.022472,0.020025,0.023569,0.134897,0.348534,...,0.274866,0.382054,0.643619,0.475732,0.5414,0.395588,0.53983,0.467363,0.490717,0.585062
3,0.0,0.0,1.0,0.0,0.000935,0.066837,0.109477,0.063973,0.107023,0.350846,...,0.440278,0.383744,0.513163,0.299633,0.607162,0.460865,0.213204,0.713898,0.35847,0.587909
4,0.0,0.0,1.0,0.0,0.000819,0.116854,0.078975,0.117845,0.39288,0.298591,...,0.053784,0.094477,0.874296,0.148673,0.795573,0.251525,0.213204,0.699943,0.394299,0.520494


In [47]:
# Put the loan IDs back on both frames and re-align their columns
for frame, ids in ((train, train_ID), (test, test_ID)):
    frame['SK_ID_CURR'] = ids
train, test = train.align(test, axis=1, join = 'inner')
#train['TARGET'] = train_target
print(train.shape, test.shape)
train.head(n=5)

(307511, 279) (48744, 279)


Unnamed: 0,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH_x,...,EXT_SOURCE_2^2 EXT_SOURCE_3,EXT_SOURCE_2^2 DAYS_BIRTH,EXT_SOURCE_2 EXT_SOURCE_3^2,EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH,EXT_SOURCE_2 DAYS_BIRTH^2,EXT_SOURCE_3^3,EXT_SOURCE_3^2 DAYS_BIRTH,EXT_SOURCE_3 DAYS_BIRTH^2,DAYS_BIRTH^3,SK_ID_CURR
0,0.0,0.0,1.0,0.0,0.001512,0.090287,0.090032,0.077441,0.256321,0.888839,...,0.01633,0.960392,0.008212,0.979415,0.046172,0.003764,0.990344,0.023077,0.972706,100002
1,0.0,0.0,0.0,0.0,0.002089,0.311736,0.132924,0.271605,0.045016,0.477114,...,0.351196,0.60697,0.286645,0.668486,0.343083,0.213204,0.747619,0.278936,0.725543,100003
2,1.0,1.0,1.0,0.0,0.000358,0.022472,0.020025,0.023569,0.134897,0.348534,...,0.382054,0.643619,0.475732,0.5414,0.395588,0.53983,0.467363,0.490717,0.585062,100004
3,0.0,0.0,1.0,0.0,0.000935,0.066837,0.109477,0.063973,0.107023,0.350846,...,0.383744,0.513163,0.299633,0.607162,0.460865,0.213204,0.713898,0.35847,0.587909,100006
4,0.0,0.0,1.0,0.0,0.000819,0.116854,0.078975,0.117845,0.39288,0.298591,...,0.094477,0.874296,0.148673,0.795573,0.251525,0.213204,0.699943,0.394299,0.520494,100007


Now the data is ready to be used for model prediction, we will be using LightGBM as that gave us best score last time

Predicting the probability of not paying a loan. The predict_proba method returns a m x 2 array where m is the number of observations. The first column is the probability of repaying the loan and second is not repaying the loan. This is a straightforward method. No feature engineering is applied so far. The result is expected to be low.

## Prediction with engineered features

We will now use Light Gradient Boosting Machine

In [48]:
# ## LightGBM
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc

def model(features, train_labels, test_features, n_folds = 5):
    """Train and evaluate a LightGBM classifier with K-fold cross validation.

    Parameters
    ----------
    features : pd.DataFrame
        Training features. Must contain an 'SK_ID_CURR' column, which is
        dropped before fitting.
    train_labels : array-like
        Binary target for every row of `features`, in the same row order.
    test_features : pd.DataFrame
        Test features. Must contain an 'SK_ID_CURR' column, which is dropped
        before predicting and reused in the submission frame.
    n_folds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    submission : pd.DataFrame
        'SK_ID_CURR' and 'TARGET', where TARGET is the predicted probability
        of class 1 averaged over the fold models.
    feature_importances : pd.DataFrame
        'feature' and 'importance' (importance averaged over the folds).
    metrics : pd.DataFrame
        'fold', 'train' and 'valid' AUC for each fold plus an 'overall' row
        (out-of-fold AUC for 'valid', mean of the fold scores for 'train').
    """
    # Keep the test ids for the submission frame
    test_id = test_features['SK_ID_CURR']

    # Use a plain array for the labels so the positional fold indices from
    # KFold select rows by position, whatever index a pandas Series carries.
    labels = np.asarray(train_labels)

    # Remove the ids
    features = features.drop(columns = ['SK_ID_CURR'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Extract feature names
    feature_names = list(features.columns)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)

    # Accumulators averaged over the folds
    feature_importance_values = np.zeros(len(feature_names))
    test_predictions = np.zeros(test_features.shape[0])

    # Out of fold validation predictions (each row is predicted exactly once)
    out_of_fold = np.zeros(features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training / validation data for the fold. Distinct local names keep
        # the `train_labels` argument and the function name `model` intact.
        fold_features, fold_labels = features[train_indices], labels[train_indices]
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Create the classifier
        clf = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary',
                                 class_weight = 'balanced', learning_rate = 0.05,
                                 reg_alpha = 0.1, reg_lambda = 0.1,
                                 subsample = 0.8, n_jobs = -1, random_state = 50)

        # Train with early stopping on the validation AUC.
        # NOTE(review): newer LightGBM releases dropped the
        # `early_stopping_rounds` / `verbose` fit arguments in favour of
        # callbacks - confirm against the installed version.
        clf.fit(fold_features, fold_labels, eval_metric = 'auc',
                eval_set = [(valid_features, valid_labels), (fold_features, fold_labels)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100, verbose = 200)

        # Record the best iteration
        best_iteration = clf.best_iteration_

        # Record the feature importances
        feature_importance_values += clf.feature_importances_ / k_fold.n_splits

        # Make predictions (column 1 is the probability of class 1)
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the best scores
        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])

        # Clean up memory before the next fold
        del clf, fold_features, valid_features
        gc.collect()

    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_id, 'TARGET': test_predictions})

    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall validation score from the out-of-fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, feature_importances, metrics

In [50]:
# Cross-validated LightGBM on the engineered features
submission, fi, metrics = model(features = train, train_labels = train_labels, test_features = test)
print(metrics)

Training Data Shape:  (307511, 278)
Testing Data Shape:  (48744, 278)
Training until validation scores don't improve for 100 rounds.
[200]	valid's binary_logloss: 0.557347	valid's auc: 0.761683	train's binary_logloss: 0.547237	train's auc: 0.804927
Early stopping, best iteration is:
[292]	valid's binary_logloss: 0.54791	valid's auc: 0.76229	train's binary_logloss: 0.533569	train's auc: 0.820156
Training until validation scores don't improve for 100 rounds.
[200]	valid's binary_logloss: 0.558371	valid's auc: 0.764242	train's binary_logloss: 0.547655	train's auc: 0.804549
[400]	valid's binary_logloss: 0.53954	valid's auc: 0.764821	train's binary_logloss: 0.519983	train's auc: 0.835212
Early stopping, best iteration is:
[361]	valid's binary_logloss: 0.542888	valid's auc: 0.764997	train's binary_logloss: 0.525011	train's auc: 0.829864
Training until validation scores don't improve for 100 rounds.
[200]	valid's binary_logloss: 0.557729	valid's auc: 0.770218	train's binary_logloss: 0.548874	

After including engineered features, the auc score has improved from 0.758 to 0.766.

In [51]:
# Rank features by mean importance; reset_index keeps the old position as an 'index' column
fi_sorted = fi.sort_values(by = 'importance', ascending = False).reset_index(drop = False)
fi_sorted.head(n=5)


Unnamed: 0,index,feature,importance
0,241,CREDIT_TERM,888.4
1,12,DAYS_ID_PUBLISH,276.2
2,6,AMT_ANNUITY,269.2
3,7,AMT_GOODS_PRICE,252.4
4,263,EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3,238.2
