In [1]:
import numpy as np
import pandas as pd
import gc

import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

### Read data

In [4]:
# Load the labeled (train) and unlabeled (test) application tables from ../input
application_train, application_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/application_train.csv", "../input/application_test.csv")
)

In [5]:
# Stack train and test into one frame so every preprocessing step below is
# applied to both; later cells tell them apart by whether TARGET is null.
# pd.concat is used instead of DataFrame.append, which was removed in pandas 2.0.
# reset_index() keeps the previous row labels in an 'index' column, which the
# later cells drop explicitly.
df = pd.concat([application_train, application_test]).reset_index()

In [10]:
# Preview the first rows of the combined train + test frame.
# NOTE: the saved output below already contains DAYS_EMPLOYED_ANOM, a column
# that is only created in a later cell, so a fresh top-to-bottom run will
# render this preview without it.
df.head()

Unnamed: 0,index,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,TOTALAREA_MODE,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,DAYS_EMPLOYED_ANOM
0,0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,...,0.0149,"Stone, brick",WEDNESDAY,0.9722,0.9722,0.9722,0.6192,0.6243,0.6341,False
1,1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,...,0.0714,Block,MONDAY,0.9851,0.9851,0.9851,0.796,0.7987,0.804,False
2,2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,...,,,MONDAY,,,,,,,False
3,3,29686.5,312682.5,297000.0,135000.0,,,,,,...,,,WEDNESDAY,,,,,,,False
4,4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,...,,,THURSDAY,,,,,,,False


### Flag anomalies and drop the 4 XNA rows noticed during EDA

In [6]:
# Keep only the rows whose gender is not the 'XNA' placeholder,
# then display how many rows remain per gender.
df = df.loc[df['CODE_GENDER'] != 'XNA']
df['CODE_GENDER'].value_counts()

F    235126
M    121125
Name: CODE_GENDER, dtype: int64

In [7]:
# Drop the rows whose gender is the 'XNA' placeholder. The explicit .copy()
# makes df an independent frame, so the column assignments below are guaranteed
# to land on df itself rather than on a temporary slice of the previous frame.
df = df[df['CODE_GENDER'] != 'XNA'].copy()

# Create an anomalous flag column (must be computed before the sentinel value
# is overwritten below)
df['DAYS_EMPLOYED_ANOM'] = df["DAYS_EMPLOYED"] == 365243
# Replace the anomalous values with nan. The result is assigned back instead of
# calling replace(inplace=True) on df['DAYS_EMPLOYED']: an in-place call on a
# selected column is a chained assignment that is not guaranteed to update df
# (and never does under pandas Copy-on-Write).
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace({365243: np.nan})

## Encoding categorical variables

The problem with label encoding is that it gives the categories an arbitrary ordering. If we only have two unique values for a categorical variable (such as Male/Female), then label encoding is fine, but for more than 2 unique categories, one-hot encoding is the safe option.

In [12]:
# label encoding
def label_encoding(data):
    """Integer-encode the object columns of ``data`` that have at most two unique values.

    A column qualifies when its dtype is ``object`` and ``Series.unique()``
    returns two entries or fewer (NaN counts as one of those entries).
    Qualifying columns are replaced by the integer codes from ``pd.factorize``:
    codes follow order of first appearance and missing values get
    ``pd.factorize``'s sentinel code -1. All other columns are left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the qualifying columns integer-encoded.
    """
    # Work on a copy of the argument. The previous version inspected `data`
    # but wrote to (and returned) the global `df`, so it only worked when the
    # caller happened to pass that exact global frame.
    encoded = data.copy()

    bin_features = [
        col for col in encoded.columns
        if encoded[col].dtype == 'object' and len(encoded[col].unique()) <= 2
    ]

    for bin_feature in bin_features:
        # factorize returns (codes, uniques); only the codes are kept
        encoded[bin_feature], _ = pd.factorize(encoded[bin_feature])
    return encoded
# Replace the object columns with at most two unique values by integer codes;
# the remaining object columns are left for the one-hot encoding step.
df = label_encoding(df)

In [14]:
# Spot-check that CODE_GENDER now holds integer codes
df['CODE_GENDER'].head()

0    0
1    1
2    0
3    1
4    0
Name: CODE_GENDER, dtype: int64

In [15]:
# one-hot encoding: get_dummies expands the categorical columns that are still
# non-numeric into indicator columns and leaves numeric columns as they are
df = pd.get_dummies(data=df)

print('Data Features shape: ', df.shape)

Data Features shape:  (356251, 243)


## Some new features

In [17]:
# Derived ratio features. Each one is computed from columns that already exist
# in df, and assign() appends them in the order listed here.
df = df.assign(
    DAYS_EMPLOYED_PERC=df['DAYS_EMPLOYED'] / df['DAYS_BIRTH'],
    INCOME_CREDIT_PERC=df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT'],
    INCOME_PER_PERSON=df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS'],
    ANNUITY_INCOME_PERC=df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL'],
    PAYMENT_RATE=df['AMT_ANNUITY'] / df['AMT_CREDIT'],
)

In [18]:
# The raw frames are no longer needed once their rows live in df:
# remove both names and trigger a garbage-collection pass.
del application_train, application_test
gc.collect()

82

### Separate train and test

In [27]:
# Separate train and test: rows with a TARGET are training rows, rows without
# one are test rows. The label, the old row index and the id are not features.
has_target = df['TARGET'].notnull()
non_feature_cols = ['TARGET','index','SK_ID_CURR']

train_labels = df.loc[has_target, 'TARGET'].values
train_df = df.loc[has_target].drop(non_feature_cols, axis=1)
test_df = df.loc[~has_target].drop(non_feature_cols, axis=1)

In [44]:
# Persist the fully preprocessed frame (without the leftover 'index' column)
df.drop(columns=['index']).to_csv('../input/application_df.csv', index = False)

## Logistic Regression 
### 1. Preprocessing - Data Scaling

In [28]:
# Work on copies so train_df / test_df stay available for the other models
train = train_df.copy()
test = test_df.copy()

features = list(train.columns)

# Median imputation of missing values: the medians are learned from the
# training rows only and then reused for the test rows.
# NOTE(review): Imputer was removed in scikit-learn 0.22; on newer versions
# the replacement is sklearn.impute.SimpleImputer - confirm the target version.
imputer = Imputer(strategy = 'median')
train = imputer.fit_transform(train)
test = imputer.transform(test)

# Scale each feature to 0-1, again fitting on the training rows only
scaler = MinMaxScaler(feature_range = (0, 1))
train = scaler.fit_transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (307507, 245)
Testing data shape:  (48744, 245)


### 2. Logistic Regression

In [29]:
# Fit a logistic regression with a small C (i.e. strong regularisation) on the
# preprocessed training matrix; fit() returns the estimator itself.
log_reg = LogisticRegression(C = 0.0001).fit(train, train_labels)

# predict_proba yields one column per class; keep the second column
log_reg_pred = log_reg.predict_proba(test)[:, 1]

In [39]:
# Submission dataframe: ids of the rows without a TARGET, paired with the
# logistic-regression probabilities computed for those same rows
submit = df.loc[df['TARGET'].isnull(), ['SK_ID_CURR']].assign(TARGET = log_reg_pred)
submit.to_csv('../submission/log_reg_baseline.csv', index = False)

Submission score 0.676 -> 0.681 with new features

## Random Forest

In [42]:
# Start again from the unscaled feature frames
train = train_df.copy()
test = test_df.copy()

features = list(train.columns)

# Median imputation of missing values, with medians learned on the training
# rows only. No scaling is applied for this model.
# NOTE(review): Imputer was removed in scikit-learn 0.22; on newer versions
# the replacement is sklearn.impute.SimpleImputer - confirm the target version.
imputer = Imputer(strategy = 'median')
train = imputer.fit_transform(train)
test = imputer.transform(test)

In [43]:
# 100 trees, fixed seed for repeatable results, all available cores
random_forest = RandomForestClassifier(
    n_estimators = 100,
    random_state = 50,
    verbose = 1,
    n_jobs = -1,
).fit(train, train_labels)

# predict_proba yields one column per class; keep the second column
predictions = random_forest.predict_proba(test)[:, 1]

# Submission dataframe: ids of the rows without a TARGET plus their predictions
submit = df.loc[df['TARGET'].isnull(), ['SK_ID_CURR']].assign(TARGET = predictions)
submit.to_csv('../submission/random_forest_unscaled_data.csv', index = False)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   52.9s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.6s finished


0.697