## Loan Prediction

** Objectives **

* Feature Selection

In [188]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.cross_validation import StratifiedKFold, cross_val_score, LeaveOneOut

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier


%matplotlib inline

In [189]:
train = pd.read_csv('./data/synthesized/train_mean.csv')
test = pd.read_csv('./data/synthesized/test_mean.csv')
sub = pd.read_csv('./data/Sample_Submission_ZAuTl8O.csv')

train_original = pd.read_csv('./data/train_u6lujuX.csv', index_col='Loan_ID')
test_original = pd.read_csv('./data/test_Y3wMUE5.csv', index_col='Loan_ID')
feature_names = train_original.columns[:-1]

In [190]:
def get_data(df, dataset_type='train'):
    if dataset_type == 'train':
        features = get_train_features(df)
        X = df[features]
        y = df.Loan_Status
        return X, y
    else:
        features = get_test_features(df)
        X = df[features]
        return X

def get_train_features(df):
    features = df.columns[1:-1]
    return features

def get_test_features(df):
    features = df.columns[1:]
    return features

In [191]:
# X, y = get_data(train, dataset_type='train')
# X_test = get_data(test, dataset_type='test')

X = train_original[train_original.columns.drop('Loan_Status')]
y = train_original.Loan_Status

X_test = test_original

In [192]:
obj_cols = X.select_dtypes(include=['object'])

for col in obj_cols:
    feature = pd.concat([X[col], X_test[col]], axis=0)
    lbl = LabelEncoder()
    lbl.fit(feature)
    
    X[col] = lbl.transform(X[col])
    X_test[col] = lbl.transform(X_test[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [193]:
assert (X.shape[1] == X_test.shape[1]), 'Mismatch in number of features'

In [194]:
X = X.fillna(-9999)
X_test = X_test.fillna(-9999)

## Cross validation

In [195]:
def skfold_scorer(scoring = 'accuracy', n_folds = 5):
    def score(model, X, y):
        return np.mean(cross_val_score(model, X, y, cv = n_folds, scoring = scoring, n_jobs = -1))
    return score

def loo_scorer(scoring = 'accuracy'):
    def score(model, X, y):
        return np.mean(cross_val_score(model, X, y, cv = LeaveOneOut(X.shape[0]), scoring = scoring, n_jobs = -1))
    return score

## Models

In [214]:
logreg = LogisticRegression()
rf = RandomForestClassifier(n_jobs=-1)
gbm = GradientBoostingClassifier()
xgb_clf = XGBClassifier(n_estimators=300, seed=123)

## Feature Selection

In [197]:
def exhaustive_search(X_train, y_train, model, scorer, d = 0.1):
    q_max = 0
    n_features = X_train.shape[1] 
    best_features = []
    
    for j in range(1, n_features + 1):
        feature_indices = itertools.combinations(range(n_features), j)
        for features_list in feature_indices:
            sub_data = X_train.iloc[:, features_list]
            q = scorer(model, sub_data, y_train)
            if q > q_max:
                if abs(q - q_max) < d*q_max:
                    q_max = q
                    j_min = j
                    best_features = features_list
                    return best_features, q_max
                else:
                    q_max = q
                    j_min = j
                    best_features = features_list
                    
    return best_features, q_max

In [59]:
def show_results(algorithm, data, target, model, scorer):
    best_features, best_Q = algorithm(data, target, model, scorer)
    print 'Best score = ' + str(best_Q)
    print 'Best features:'  + str(best_features)
    print '----------'

In [60]:
show_results(exhaustive_search, X, y, logreg, skfold_scorer())

Best score = 0.688921205711
Best features:(1, 9, 12, 13)
----------


In [61]:
show_results(exhaustive_search, X, y, rf, skfold_scorer())

Best score = 0.687308302485
Best features:(1,)
----------


In [62]:
show_results(exhaustive_search, X, y, gbm, skfold_scorer())

Best score = 0.687308302485
Best features:(1,)
----------


In [215]:
show_results(exhaustive_search, X, y, xgb_clf, skfold_scorer())

Best score = 0.811184558435
Best features:(4, 9)
----------


## Training

In [216]:
model = XGBClassifier(n_estimators=300, seed=123)

In [217]:
X_sub = X[[4, 9]]
test_sub = X_test[[4, 9]]

In [218]:
assert ( X_sub.shape[1] == test_sub.shape[1] ), 'Mismatch in number of features'

In [219]:
model.fit(X_sub, y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=300, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=123, silent=True, subsample=1)

In [225]:
prediction = model.predict(test_sub)

In [207]:
prediction_num = map(lambda x: 1 if x == 'Y' else 0, prediction)

In [227]:
sub['Loan_ID'] = test_original.index.values
sub['Loan_Status'] = prediction

In [228]:
sub.to_csv('./submissions/xgb_submission300_label_encoding.csv', index=False)