In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load all data sets

app_train = pd.read_csv('application_train.csv')
app_test = pd.read_csv('application_test.csv')

In [4]:
# Function to calculate missing values by column

def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [5]:
missing_values = missing_values_table(app_train)
missing_values.head(20)

Your selected dataframe has 122 columns.
There are 67 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
COMMONAREA_MEDI,214865,69.9
COMMONAREA_AVG,214865,69.9
COMMONAREA_MODE,214865,69.9
NONLIVINGAPARTMENTS_MEDI,213514,69.4
NONLIVINGAPARTMENTS_MODE,213514,69.4
NONLIVINGAPARTMENTS_AVG,213514,69.4
FONDKAPREMONT_MODE,210295,68.4
LIVINGAPARTMENTS_MODE,210199,68.4
LIVINGAPARTMENTS_MEDI,210199,68.4
LIVINGAPARTMENTS_AVG,210199,68.4


In [6]:
app_train['TARGET'].value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64

In [7]:
app_train.dtypes.value_counts()

float64    65
int64      41
object     16
dtype: int64

In [8]:
app_train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                7
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
OCCUPATION_TYPE               18
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
WALLSMATERIAL_MODE             7
EMERGENCYSTATE_MODE            2
dtype: int64

In [9]:
# One Hot Encoding of Categorical Variables (with NaN values dealt with)

categorical_columns = [col for col in app_train.columns if app_train[col].dtype == 'object']
app_train = pd.get_dummies(app_train, columns= categorical_columns, dummy_na= True)

categorical_columns2 = [col for col in app_test.columns if app_test[col].dtype == 'object']
app_test = pd.get_dummies(app_test, columns= categorical_columns, dummy_na= True)

In [10]:
# Replacement of numerical NaN values with mean

columns = app_train.columns

for i in columns:
    current = app_train[i]
    if type(current[0])==np.int64 or type(current[0])==np.float64:
            if current.isnull().sum() != 0:
                mean = np.mean(current)
                app_train[i].replace({np.nan: mean}, inplace= True)

In [13]:
columns2 = app_test.columns

for i in columns2:
    current = app_test[i]
    if type(current[0])==np.int64 or type(current[0])==np.float64:
            if current.isnull().sum() != 0:
                mean = np.mean(current)
                app_test[i].replace({np.nan: mean}, inplace= True)

In [14]:
print(app_train.shape)
print(app_test.shape)

(307511, 262)
(48744, 258)


In [15]:
missing_values = missing_values_table(app_train)
missing_values.head(20)

Your selected dataframe has 262 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [16]:
missing_values = missing_values_table(app_test)
missing_values.head(20)

Your selected dataframe has 258 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [17]:
# Feature Engineering of Training and Test Set

app_train['YEARS_BUILD_CREDIT'] = app_train['AMT_CREDIT']/app_train['YEARS_BUILD_AVG']
app_train['Annuity_Income'] = app_train['AMT_ANNUITY']/app_train['AMT_INCOME_TOTAL']
app_train['Income_Cred'] = app_train['AMT_CREDIT']/app_train['AMT_INCOME_TOTAL']
app_train['EMP_AGE'] = app_train['DAYS_EMPLOYED']/app_train['DAYS_BIRTH']
app_train['Income_PP'] = app_train['AMT_INCOME_TOTAL']/app_train['CNT_FAM_MEMBERS']
app_train['CHILDREN_RATIO'] = (1 + app_train['CNT_CHILDREN']) / app_train['CNT_FAM_MEMBERS']
app_train['PAYMENTS'] = app_train['AMT_ANNUITY']/ app_train['AMT_CREDIT']
app_train['NEW_CREDIT_TO_GOODS_RATIO'] = app_train['AMT_CREDIT'] / app_train['AMT_GOODS_PRICE']
app_train['GOODS_INCOME'] =  app_train['AMT_GOODS_PRICE']/app_train['AMT_INCOME_TOTAL']
app_train['Ext_source_mult'] = app_train['EXT_SOURCE_1'] * app_train['EXT_SOURCE_2'] * app_train['EXT_SOURCE_3']
app_train['Ext_SOURCE_MEAN'] = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis = 1)
app_train['Ext_SOURCE_SD'] = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis = 1)

app_test['YEARS_BUILD_CREDIT'] = app_test['AMT_CREDIT']/app_test['YEARS_BUILD_AVG']
app_test['Annuity_Income'] = app_test['AMT_ANNUITY']/app_test['AMT_INCOME_TOTAL']
app_test['Income_Cred'] = app_test['AMT_CREDIT']/app_test['AMT_INCOME_TOTAL']
app_test['EMP_AGE'] = app_test['DAYS_EMPLOYED']/app_test['DAYS_BIRTH']
app_test['Income_PP'] = app_test['AMT_INCOME_TOTAL']/app_test['CNT_FAM_MEMBERS']
app_test['CHILDREN_RATIO'] = (1 + app_test['CNT_CHILDREN']) / app_test['CNT_FAM_MEMBERS']
app_test['PAYMENTS'] = app_test['AMT_ANNUITY']/ app_test['AMT_CREDIT']
app_test['NEW_CREDIT_TO_GOODS_RATIO'] = app_test['AMT_CREDIT'] / app_test['AMT_GOODS_PRICE']
app_test['GOODS_INCOME'] =  app_test['AMT_GOODS_PRICE']/app_test['AMT_INCOME_TOTAL']
app_test['Ext_source_mult'] = app_test['EXT_SOURCE_1'] * app_test['EXT_SOURCE_2'] * app_test['EXT_SOURCE_3']
app_test['Ext_SOURCE_MEAN'] = app_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis = 1)
app_test['Ext_SOURCE_SD'] = app_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis = 1)

In [18]:
# Aligning Training and Test Data

train_labels = app_train['TARGET']

app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

app_train['TARGET'] = train_labels

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

Training Features shape:  (307511, 271)
Testing Features shape:  (48744, 270)


In [19]:
print(app_train.shape)
print(app_test.shape)

(307511, 271)
(48744, 270)


In [20]:
print(app_train.head())

   SK_ID_CURR  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0      100002             0          202500.0    406597.5      24700.5   
1      100003             0          270000.0   1293502.5      35698.5   
2      100004             0           67500.0    135000.0       6750.0   
3      100006             0          135000.0    312682.5      29686.5   
4      100007             0          121500.0    513000.0      21865.5   

   AMT_GOODS_PRICE  REGION_POPULATION_RELATIVE  DAYS_BIRTH  DAYS_EMPLOYED  \
0         351000.0                    0.018801       -9461           -637   
1        1129500.0                    0.003541      -16765          -1188   
2         135000.0                    0.010032      -19046           -225   
3         297000.0                    0.008019      -19005          -3039   
4         513000.0                    0.028663      -19932          -3038   

   DAYS_REGISTRATION   ...     EMP_AGE  Income_PP  CHILDREN_RATIO  PAYMENTS  \
0            

In [21]:
X_train = app_train.iloc[:,:270]
y_train = app_train.iloc[:,270]

In [22]:
print(y_train.head())

0    1
1    0
2    0
3    0
4    0
Name: TARGET, dtype: int64


In [23]:
X_test = app_test.iloc[:,:]
print(X_test.head())

   SK_ID_CURR  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0      100001             0          135000.0    568800.0      20560.5   
1      100005             0           99000.0    222768.0      17370.0   
2      100013             0          202500.0    663264.0      69777.0   
3      100028             2          315000.0   1575000.0      49018.5   
4      100038             1          180000.0    625500.0      32067.0   

   AMT_GOODS_PRICE  REGION_POPULATION_RELATIVE  DAYS_BIRTH  DAYS_EMPLOYED  \
0         450000.0                    0.018850      -19241          -2329   
1         180000.0                    0.035792      -18064          -4469   
2         630000.0                    0.019101      -20038          -4458   
3        1575000.0                    0.026392      -13976          -1866   
4         625500.0                    0.010032      -13040          -2191   

   DAYS_REGISTRATION      ...        Income_Cred   EMP_AGE  Income_PP  \
0            -5170.

In [24]:
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from numpy import loadtxt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [25]:
clf = XGBClassifier(learning_rate =0.02,
n_estimators=1000,
max_depth=3, 
min_child_weight=4,
subsample=0.8, 
colsample_bytree=0.8,
objective= 'binary:logistic', 
nthread=4,
scale_pos_weight=2, 
seed=27)

In [26]:
clf.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.02, max_delta_step=0,
       max_depth=3, min_child_weight=4, missing=None, n_estimators=1000,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=2, seed=27, silent=True,
       subsample=0.8)

In [27]:
print(clf.predict_proba(X_test))

[[0.93283737 0.06716264]
 [0.7574953  0.24250475]
 [0.9708819  0.02911815]
 ...
 [0.95859337 0.04140661]
 [0.8960123  0.1039877 ]
 [0.67198455 0.32801548]]


In [28]:
predictions = clf.predict_proba(X_test)[:, 1]

In [29]:
submit = app_test[['SK_ID_CURR']]
submit['TARGET'] = predictions

In [30]:
print(submit)

       SK_ID_CURR    TARGET
0          100001  0.067163
1          100005  0.242505
2          100013  0.029118
3          100028  0.067662
4          100038  0.234337
5          100042  0.049934
6          100057  0.034575
7          100065  0.117648
8          100066  0.037582
9          100067  0.216538
10         100074  0.119073
11         100090  0.121634
12         100091  0.383659
13         100092  0.188466
14         100106  0.098042
15         100107  0.249998
16         100109  0.113190
17         100117  0.062757
18         100128  0.173958
19         100141  0.033599
20         100150  0.037508
21         100168  0.029148
22         100169  0.056065
23         100170  0.131509
24         100171  0.087585
25         100172  0.138239
26         100184  0.156711
27         100187  0.158317
28         100212  0.103651
29         100222  0.100236
...           ...       ...
48714      455963  0.076825
48715      455965  0.037959
48716      456007  0.453695
48717      456008  0

In [31]:
submit.to_csv('Kaggle_Nicholas_SubmissionLATEST', index = False)