In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas_profiling
import seaborn as sns
import matplotlib.pyplot as plt

from  xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.grid_search import GridSearchCV

df_train = pd.read_csv('train.csv')

#############################################   Few Outlier Conditions ###############################

df_train = df_train[df_train['ApplicantIncome']<=60000]

######################################################################################################


df_test = pd.read_csv('test.csv')
y = df_train['Loan_Status']

df_test_loanID = df_test['Loan_ID'].to_frame()

df_train.drop('Loan_ID', 1, inplace = True)
df_test.drop('Loan_ID', 1, inplace = True)

df_train['Loan_Status'] = [1 if val == 'Y' 
                           else 0
                     for val in df_train['Loan_Status']]


def data_explore(df_train):
    ser_gender = df_train['Gender'].value_counts()
    df_train['Gender'].fillna(ser_gender[ser_gender == max(ser_gender)].index[0], inplace = True)


    ser_married = df_train['Married'].value_counts()
    df_train['Married'].fillna(ser_married[ser_married == max(ser_married)].index[0], inplace = True)

    ser_dependents = df_train['Dependents'].value_counts()
    df_train['Dependents'].fillna(ser_dependents[ser_dependents == max(ser_dependents)].index[0], inplace = True)


    ser_self_employed = df_train['Self_Employed'].value_counts()
    df_train['Self_Employed'].fillna(ser_self_employed[ser_self_employed == max(ser_self_employed)].index[0], inplace = True)

    ser_loanamount = (df_train['LoanAmount'].median() + df_train['LoanAmount'].mean())/2
    df_train['LoanAmount'].fillna(ser_loanamount, inplace = True)

    ser_loanterm_amount = (df_train['Loan_Amount_Term'].median() + df_train['Loan_Amount_Term'].mean())/2
    df_train['Loan_Amount_Term'].fillna(ser_loanterm_amount, inplace = True)

    ser_credit_history = df_train['Credit_History'].mean()
    df_train['Credit_History'].fillna(ser_credit_history, inplace = True)

    for column in df_train.columns:
        text_val_dict = {}
        if df_train[column].dtypes not in ('int64', 'float64'):
            column_contents = list(df_train[column].unique().astype(str))
            column_contents.sort()
            i = 0
            for content in column_contents:
                text_val_dict[content] = i
                i+=1
            df_train[column] = df_train[column].map(text_val_dict)
    return df_train

df_train_final_1 = data_explore(df_train)
df_train_final = df_train_final_1.drop('Loan_Status', 1)
df_test_final = data_explore(df_test)


X_train, X_test, y_train, y_test = train_test_split(df_train_final, y, random_state = 0)


# # Random Forests Classifier
# model_RFC = RandomForestClassifier(n_estimators = 100,n_jobs = -1,  
#                                    min_samples_leaf = 10, random_state = 1).fit(X_train, y_train)
# print('RFC Train score: ', model_RFC.score(X_train, y_train))
# print('RFC Test score: ', model_RFC.score(X_test, y_test))
# # print('\n')

# # Logistic Regression Classifier
# model_log = LogisticRegression().fit(X_train, y_train)
# print('Train score: ', model_log.score(X_train, y_train))
# print('Test score: ', model_log.score(X_test, y_test))
# # print('\n')
min_samples = list(range(1,50))
n_estimator = list(range(10, 500, 50))
# XGB Classifier
param_list = [ {'max_depth' : [1, 2, 3, 4, 5],
               'n_estimators' : n_estimator, 'learning_rate': [0.001, 0.01, 0.1, 1, 10, 100], }]
# XGBClassifier().get_params().keys()

clf = GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_list, n_jobs = -1)
clf.fit(X_train, y_train)

# model_xgb = XGBClassifier(param_grid = param_list).fit(X_train, y_train)
print('GBC Training score: ', clf.score(X_train, y_train))
print('GBC Testing score: ', clf.score(X_test, y_test))
print(clf)

GBC Training score:  0.814814814815
GBC Testing score:  0.830065359477
GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'max_depth': [1, 2, 3, 4, 5], 'n_estimators': [10, 60, 110, 160, 210, 260, 310, 360, 410, 460], 'learning_rate': [0.001, 0.01, 0.1, 1, 10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)


In [51]:
print('Best gamma:', clf.best_estimator_.gamma)
print('Best Max depth:',clf.best_estimator_.max_depth) 
print('Best estimator size:',clf.best_estimator_.n_estimators)
print('Best learning rate:',clf.best_estimator_.learning_rate)
print('Best score for data1:', clf.best_score_)
print('Training score: ', clf.score(X_train, y_train))
print('Testing score: ', clf.score(X_test, y_test))

Best gamma: 1
Best Max depth: 3
Best estimator size: 410
Best learning rate: 0.01
Best score for data1: 0.803921568627451
Training score:  0.806100217865
Testing score:  0.843137254902
