In [1]:
# display inline plots
%matplotlib inline

# import libraries for numerical and scientific computing
import numpy as np
import scipy as sp

# import matplotlib for plotting
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# import pandas for data wrangling and munging
import pandas as pd

# set some options for better view
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

# import plotting library built on top of matplotlib
import seaborn as sns

# set some settings related to style of plots that will render
sns.set_style("whitegrid")
sns.set_context("poster")

import warnings
warnings.filterwarnings('ignore')



In [233]:
# load train and test dataset
loan_train = pd.read_csv('./data/train_u6lujuX.csv', index_col='Loan_ID')
loan_test = pd.read_csv('./data/test_Y3wMUE5.csv', index_col='Loan_ID')

In [234]:
%run scripts/data.py

In [235]:
features_cols = loan_train.columns.drop('Loan_Status')
loan_train_features_df = loan_train[features_cols]
target_feature = loan_train['Loan_Status']

In [236]:
obj_cols = loan_train_features_df.select_dtypes(include=['object']).columns
non_obj_cols = loan_train_features_df.select_dtypes(exclude=['object']).columns

In [237]:
# label encoded data
train_df_enc, test_df_enc = get_label_encoded_data(loan_train_features_df, loan_test, obj_cols)

In [238]:
# one hot encoded
train_df_hot, test_df_hot = get_dummy_variable_data(loan_train_features_df, loan_test, non_obj_cols, obj_cols)

## Missing Values

In [253]:
def fill_missing_values_enc(train_df, test_df, func):
    train_df_cpy = train_df.copy()
    test_df_cpy = test_df.copy()
    
    train_df_cpy['LoanAmount'] = train_df_cpy.LoanAmount.fillna(func(train_df.LoanAmount))
    test_df_cpy['LoanAmount'] = test_df_cpy.LoanAmount.fillna(func(test_df.LoanAmount))
    
    train_df_cpy['Loan_Amount_Term'] = train_df_cpy.Loan_Amount_Term.fillna(func(train_df.Loan_Amount_Term))
    test_df_cpy['Loan_Amount_Term'] = test_df_cpy.Loan_Amount_Term.fillna(func(test_df.Loan_Amount_Term))
    
    train_df_cpy['Credit_History'] = train_df_cpy.Credit_History.fillna(1.0)
    test_df_cpy['Credit_History'] = test_df_cpy.Credit_History.fillna(1.0)
    
    return train_df_cpy, test_df_cpy

In [254]:
train_df_enc_mean, test_df_enc_mean = fill_missing_values_enc(train_df_enc, test_df_enc, np.mean)

In [256]:
train_df_hot_mean, test_df_hot_mean = fill_missing_values_enc(train_df_hot, train_df_hot, np.mean)

In [323]:
from sklearn.cross_validation import train_test_split
Xt, Xv, yt, yv = train_test_split(train_df_enc_mean, target_feature, test_size=0.2, random_state=42)

In [324]:
from sklearn.ensemble import RandomForestClassifier

etr = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=4, class_weight='auto')
etr.fit(Xt, yt)

RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [325]:
print 'Accuracy score ', etr.score(Xv, yv)

Accuracy score  0.780487804878
