In this notebook we will train Logistic Regression, Random Forest and XGBoost models. Because we just want to establish a baseline for the AUC metric, we only use the minimal dataset in application_train.csv and application_test.csv and do not tune the models' hyperparameters.

In [32]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

import os
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# NOTE: Imputer is deprecated since scikit-learn 0.20 and removed in 0.22;
# SimpleImputer (imported above) is its replacement.
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier

from _preprocessing import merge_tables
from _preprocessing import onehot_encoding
from _preprocessing import GeneralLabelEncoder
from _model_tunning import roc_auc
from _input_output import feature_importance_df
from _input_output import write_submit_csv

# Directory holding the input CSV files (application_train.csv / application_test.csv)
INP_DIR = "data/data_"
# Not referenced in the cells of this notebook — presumably the output location
# for hyperparameter tuning results; TODO confirm
OUT_DIR = "data/tuning_"
# Directory where the Kaggle submission CSV files are written
SUBMIT_DIR = "data/submit_"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [2]:
# Locations of the two main application tables
main_csv_train = os.path.join(INP_DIR, "application_train.csv")
main_csv_test = os.path.join(INP_DIR, "application_test.csv")

# Baseline run: no auxiliary tables (and therefore no column prefixes) are merged in
add_files = []
prefixes = []

other_csv_files = [os.path.join(INP_DIR, file_name) for file_name in add_files]
print(other_csv_files)

# Load the training table first, then the test table, through the project helper
X_train, X_test = (
    merge_tables(csv_path, other_csv_files=other_csv_files, prefixes=prefixes)
    for csv_path in (main_csv_train, main_csv_test)
)

# Split off the label and remove the ID column from the training features
y_train = X_train["TARGET"]
X_train = X_train.drop(columns=["SK_ID_CURR", "TARGET"])

# Keep the test IDs (as a one-column frame) for the submission files
id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(columns=["SK_ID_CURR"])

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("id_test shape:", id_test.shape)

[]
Loading data/data_/application_train.csv
Memory usage before changing types 294.90 MB
Memory usage after changing types 133.77 MB
Memory usage before changing types 133.77 MB
Memory usage after changing types 133.77 MB
Loading data/data_/application_test.csv
Memory usage before changing types 46.70 MB
Memory usage after changing types 21.16 MB
Memory usage before changing types 21.16 MB
Memory usage after changing types 21.16 MB
X_train shape: (307511, 124)
y_train shape: (307511,)
X_test shape: (48744, 124)
id_test shape: (48744, 1)


# Logistic Regression

In [3]:
# One hot encoding
X_train_lr, X_test_lr = onehot_encoding(X_train, X_test)
print("X_train_lr shape:", X_train_lr.shape)
print("X_test_lr shape:", X_test_lr.shape)


# Impute missing values
imputer = Imputer(strategy="median")
imputer.fit(X_train_lr)

X_train_lr = imputer.transform(X_train_lr)
X_test_lr = imputer.transform(X_test_lr)

# standardize features
std_scaler = StandardScaler()
std_scaler.fit(X_train_lr)

X_train_lr = std_scaler.transform(X_train_lr)
X_test_lr = std_scaler.transform(X_test_lr)

# train test split
X_train_lr_1, X_eval, y_train_1, y_eval = train_test_split(X_train_lr, y_train, test_size=0.2, 
                                                           random_state=123, stratify=y_train)

X_train_lr shape: (307511, 245)
X_test_lr shape: (48744, 245)




In [4]:
# Pin the solver explicitly: the implicit default ("liblinear" up to
# scikit-learn 0.21) became "lbfgs" in 0.22, which would silently change this
# baseline. All other hyperparameters are left at their defaults (untuned baseline).
lr = LogisticRegression(solver="liblinear")
lr.fit(X_train_lr_1, y_train_1)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
# Score the fitted Logistic Regression on the held-out evaluation split
# using the project's ROC-AUC helper
auc_lr = roc_auc(lr, X_eval, y_eval)
print("AUC of Logistic regression model on an evaluation set: %0.5f" % (auc_lr,))

AUC of Logistic regression model on an evaluation set: 0.74462


In [15]:
# fit on the whole set
lr.fit(X_train_lr, y_train)

# write the Kaggle submission file
write_submit_csv(lr, X_test_lr, id_test, os.path.join(SUBMIT_DIR, "baseline_lr.csv"))

In [16]:
# Drop the Logistic Regression arrays so their names are free for the next model
del X_train_lr, X_test_lr
del X_train_lr_1, X_eval
del y_train_1, y_eval

# Random Forest

In [17]:
# One hot encoding
X_train_rf, X_test_rf = onehot_encoding(X_train, X_test)
print("X_train_rf shape:", X_train_rf.shape)
print("X_test_rf shape:", X_test_rf.shape)


# Impute missing values
imputer = Imputer(strategy="median")
imputer.fit(X_train_rf)

X_train_rf = imputer.transform(X_train_rf)
X_test_rf = imputer.transform(X_test_rf)

# no need to standardize features

# train test split
X_train_rf_1, X_eval, y_train_1, y_eval = train_test_split(X_train_rf, y_train, test_size=0.2, 
                                                           random_state=123, stratify=y_train)

X_train_rf shape: (307511, 245)
X_test_rf shape: (48744, 245)


In [18]:
# Fix the seed so the forest (bootstrap samples and feature subsets) and the
# reported AUC are reproducible across runs; 123 matches the seed already used
# for the train/evaluation split. Hyperparameters are otherwise untuned.
rf = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=123)
rf.fit(X_train_rf_1, y_train_1)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [19]:
# Score the fitted Random Forest on the held-out evaluation split
# using the project's ROC-AUC helper
auc_rf = roc_auc(rf, X_eval, y_eval)
print("AUC of Random Forest model on an evaluation set: %0.5f" % (auc_rf,))

AUC of Random Forest model on an evaluation set: 0.72559


In [20]:
# fit on the whole set
rf.fit(X_train_rf, y_train)

# write the Kaggle submission file
write_submit_csv(rf, X_test_rf, id_test, os.path.join(SUBMIT_DIR, "baseline_rf.csv"))

In [22]:
# Drop the Random Forest arrays so their names are free for the next model
del X_train_rf, X_test_rf
del X_train_rf_1, X_eval
del y_train_1, y_eval

# XGBoost

## One-hot encoding

In [26]:
# One hot encoding
X_train_xgb, X_test_xgb = onehot_encoding(X_train, X_test)
print("X_train_xgb shape:", X_train_xgb.shape)
print("X_test_xgb shape:", X_test_xgb.shape)

features = list(X_train_xgb.columns)

# No need to impute missing values, since XGBoost can handle missing values automatically
# no need to standardize features

# train test split
X_train_xgb_1, X_eval, y_train_1, y_eval = train_test_split(X_train_xgb, y_train, test_size=0.2, 
                                                           random_state=123, stratify=y_train)

X_train_xgb shape: (307511, 245)
X_test_xgb shape: (48744, 245)


In [24]:
# Untuned baseline: library-default hyperparameters, fitted on the training split
xgb = XGBClassifier()
xgb.fit(X=X_train_xgb_1, y=y_train_1)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [25]:
# Score the fitted XGBoost model on the held-out evaluation split
# using the project's ROC-AUC helper
auc_xgb = roc_auc(xgb, X_eval, y_eval)
print("AUC of XGBoost model on an evaluation set: %0.5f" % (auc_xgb,))

AUC of XGBoost model on an evaluation set: 0.74998


In [28]:
# Build the feature-importance table for the fitted model (project helper)
# and display its first 20 rows
feature_importance = feature_importance_df(xgb, features)
feature_importance.head(n=20)

Unnamed: 0,feature,importance
29,EXT_SOURCE_3,0.077464
28,EXT_SOURCE_2,0.069141
131,NAME_EDUCATION_TYPE_Higher education,0.055505
111,CODE_GENDER_M,0.042869
7,DAYS_EMPLOYED,0.03988
134,NAME_EDUCATION_TYPE_Secondary / secondary special,0.032959
79,FLAG_DOCUMENT_3,0.032595
110,CODE_GENDER_F,0.03118
107,CREDIT_TO_GOODS,0.030561
3,AMT_ANNUITY,0.025143


In [29]:
# fit on the whole set
xgb.fit(X_train_xgb, y_train)

# write the Kaggle submission file
write_submit_csv(xgb, X_test_xgb, id_test, os.path.join(SUBMIT_DIR, "baseline_xgb_ohe.csv"))

In [30]:
# Drop the one-hot encoded XGBoost data so the names can be reused for the
# label-encoded variant
del X_train_xgb, X_test_xgb
del X_train_xgb_1, X_eval
del y_train_1, y_eval

## Label encoding

It has been suggested on Kaggle that using label encoding instead of one-hot encoding can improve the performance a little bit. Let's try that.

In [33]:
# Label encoding
lbe = GeneralLabelEncoder()
lbe.fit(X_train)
X_train_xgb = lbe.transform(X_train)
X_test_xgb = lbe.transform(X_test)

print("X_train_xgb shape:", X_train_xgb.shape)
print("X_test_xgb shape:", X_test_xgb.shape)

features = list(X_train_xgb.columns)

# No need to impute missing values, since XGBoost can handle missing values automatically
# no need to standardize features

# train test split
X_train_xgb_1, X_eval, y_train_1, y_eval = train_test_split(X_train_xgb, y_train, test_size=0.2, 
                                                           random_state=123, stratify=y_train)

X_train_xgb shape: (307511, 124)
X_test_xgb shape: (48744, 124)


In [34]:
# Untuned baseline: library-default hyperparameters, fitted on the
# label-encoded training split
xgb = XGBClassifier()
xgb.fit(X=X_train_xgb_1, y=y_train_1)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [35]:
# Score the label-encoded XGBoost model on the held-out evaluation split
# using the project's ROC-AUC helper
auc_xgb = roc_auc(xgb, X_eval, y_eval)
print("AUC of XGBoost model on an evaluation set: %0.5f" % (auc_xgb,))

AUC of XGBoost model on an evaluation set: 0.75001


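The AUC increases only very slightly (from 0.74998 with one-hot encoding to 0.75001 with label encoding), so on this single evaluation split the two encodings perform essentially the same.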

In [36]:
# Build the feature-importance table for the label-encoded model (project helper)
# and display its first 20 rows
feature_importance = feature_importance_df(xgb, features)
feature_importance.head(n=20)

Unnamed: 0,feature,importance
41,EXT_SOURCE_3,0.091609
40,EXT_SOURCE_2,0.082087
11,NAME_EDUCATION_TYPE,0.058733
16,DAYS_EMPLOYED,0.049408
86,TOTALAREA_MODE,0.047653
1,CODE_GENDER,0.046117
123,CREDIT_TO_GOODS,0.040074
8,AMT_GOODS_PRICE,0.037391
95,FLAG_DOCUMENT_3,0.032057
39,EXT_SOURCE_1,0.031904


In [37]:
# fit on the whole set
xgb.fit(X_train_xgb, y_train)

# write the Kaggle submission file
write_submit_csv(xgb, X_test_xgb, id_test, os.path.join(SUBMIT_DIR, "baseline_xgb_lbe.csv"))

Submitting this result to Kaggle gives an AUC of 0.73778.