In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

### Read data

In [5]:
# Locations of the raw application tables, relative to the notebook
train_csv_path = "../input/application_train.csv"
test_csv_path = "../input/application_test.csv"

# Load both tables into dataframes
application_train = pd.read_csv(train_csv_path)
application_test = pd.read_csv(test_csv_path)

### Flag anomalies which are identified during EDA

In [6]:
# Sentinel value in DAYS_EMPLOYED that was identified as anomalous during EDA
DAYS_EMPLOYED_ANOMALY = 365243

# Apply the identical treatment to the training and the testing set
for frame in (application_train, application_test):
    # Create the anomalous flag column first, while the sentinel is still present
    frame['DAYS_EMPLOYED_ANOM'] = frame['DAYS_EMPLOYED'] == DAYS_EMPLOYED_ANOMALY
    # Replace the anomalous values with nan. The result is assigned back to the
    # column instead of calling replace(..., inplace=True) on `frame[col]`:
    # that form mutates a temporary Series and, with pandas Copy-on-Write
    # enabled, leaves the dataframe unchanged (the warning about it is hidden
    # by the global warnings filter).
    frame['DAYS_EMPLOYED'] = frame['DAYS_EMPLOYED'].replace({DAYS_EMPLOYED_ANOMALY: np.nan})

## Encoding categorical variables

The problem with label encoding is that it gives the categories an arbitrary ordering. If we only have two unique values for a categorical variable (such as Male/Female), then label encoding is fine, but for more than 2 unique categories, one-hot encoding is the safe option.

### 1. Label Encoding

In [7]:
def label_encoding(train, test):
    """Label-encode, in place, each object column of `train` with at most two unique values.

    For every qualifying column the encoder is fitted on the `train` column and
    then applied to that column in both `train` and `test`, so both frames use
    the mapping learned from the training data.

    Prints the number of encoded columns; nothing is returned.
    """
    encoder = LabelEncoder()
    encoded_columns = 0

    for column in train:
        # Only object-dtype columns are candidates for encoding
        is_object = train[column].dtype == 'object'
        if not is_object:
            continue

        # Columns with more than two distinct values are left for one-hot encoding
        distinct_values = list(train[column].unique())
        if len(distinct_values) > 2:
            continue

        encoder.fit(train[column])
        train[column] = encoder.transform(train[column])
        test[column] = encoder.transform(test[column])

        # Keep track of how many columns were label encoded
        encoded_columns += 1

    print('%d columns were label encoded.' % encoded_columns)

In [8]:
# Label-encode the object columns that have at most two unique values.
# Both dataframes are modified in place; the encoder is fitted on the training column.
label_encoding(application_train, application_test)

3 columns were label encoded.


### 2. One hot Encoding and alignment between train and test

In [9]:
# One-hot encode each frame separately; because the encoding is done per frame,
# the resulting column sets may differ and are aligned in a later step.
application_train, application_test = (
    pd.get_dummies(frame) for frame in (application_train, application_test)
)

In [10]:
# Report (rows, columns) of each frame after one-hot encoding
for label, frame in (('Training Features shape: ', application_train),
                     ('Testing Features shape: ', application_test)):
    print(label, frame.shape)

Training Features shape:  (307511, 244)
Testing Features shape:  (48744, 240)


The training and testing data must have the same features (columns). One-hot encoding has created more columns in the training data because some categorical variables have categories that are not represented in the testing data. To remove the columns in the training data that are not in the testing data, we need to align the dataframes.

In [11]:
# Set the labels aside before aligning: the inner join keeps only the columns
# present in both frames, so any train-only column is removed from the features.
train_labels = application_train['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
aligned_train, aligned_test = application_train.align(application_test, join = 'inner', axis = 1)
application_train, application_test = aligned_train, aligned_test

# The target is deliberately not added back; it is kept separately in `train_labels`.

for label, frame in (('Training Features shape: ', application_train),
                     ('Testing Features shape: ', application_test)):
    print(label, frame.shape)

Training Features shape:  (307511, 240)
Testing Features shape:  (48744, 240)


## Logistic Regression 
### 1. Preprocessing - Data Scaling

In [14]:
# Work on copies of the training and testing data so the aligned frames stay untouched
train = application_train.copy()
test = application_test.copy()

features = train.columns.tolist()

# Median imputation of missing values
imputer = Imputer(strategy = 'median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# The imputer is fitted on the training data only and then applied to both sets
train = imputer.fit_transform(train)
test = imputer.transform(test)

# Same pattern for the scaler: fit on train, transform train and test
train = scaler.fit_transform(train)
test = scaler.transform(test)

for label, matrix in (('Training data shape: ', train),
                      ('Testing data shape: ', test)):
    print(label, matrix.shape)

Training data shape:  (307511, 240)
Testing data shape:  (48744, 240)


### 2. Logistic Regression

In [15]:
# Build the classifier with C = 0.0001 and train it on the
# preprocessed training data (fit returns the estimator itself)
log_reg = LogisticRegression(C = 0.0001).fit(train, train_labels)

# Keep only the second column (index 1) of the predicted class probabilities
test_probabilities = log_reg.predict_proba(test)
log_reg_pred = test_probabilities[:, 1]

In [17]:
# Submission dataframe: the id column plus the logistic regression predictions.
# `.assign` builds a new frame in one step instead of adding a column to a
# slice of `application_test`, which triggers pandas' SettingWithCopyWarning
# (hidden in this notebook by the global warnings filter).
submit = application_test[['SK_ID_CURR']].assign(TARGET = log_reg_pred)
submit.to_csv('log_reg_baseline.csv', index = False)

Submission score 0.676

## Random Forest
Will data scaling make a difference for a random forest?

In [19]:
# Random forest trained on the imputed and scaled features
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)
random_forest.fit(train, train_labels)

# Second column (index 1) of the predicted class probabilities
predictions = random_forest.predict_proba(test)[:, 1]

# Submission dataframe. It must carry the random forest `predictions`:
# writing `log_reg_pred` here would just re-submit the logistic regression output.
submit = application_test[['SK_ID_CURR']].assign(TARGET = predictions)
submit.to_csv('random_forest_scaled_data.csv', index = False)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.8s finished


In [22]:
# Second copy of the aligned data: imputed but not scaled
train_2 = application_train.copy()
test_2 = application_test.copy()

features = train_2.columns.tolist()

# Median imputation of missing values, fitted on the training data only
imputer = Imputer(strategy = 'median')
train_2 = imputer.fit_transform(train_2)
test_2 = imputer.transform(test_2)

In [23]:
# Random forest trained on the imputed but unscaled features
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)
random_forest.fit(train_2, train_labels)

# Second column (index 1) of the predicted class probabilities
predictions = random_forest.predict_proba(test_2)[:, 1]

# Submission dataframe. It must carry the random forest `predictions`:
# writing `log_reg_pred` here would just re-submit the logistic regression output.
submit = application_test[['SK_ID_CURR']].assign(TARGET = predictions)
submit.to_csv('random_forest_unscaled_data.csv', index = False)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.7s finished


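Both submissions scored 0.676 — identical to the logistic regression baseline above, so verify that the random forest predictions (not `log_reg_pred`) were actually written to these files.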