##### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

##### Importing dataset

In [2]:
train = pd.read_csv("C:\\AADeloitte\\learning\\Boom\\Python Practise files\\Python\\Loan Prediction\\train.txt", sep=',')
test = pd.read_csv("C:\\AADeloitte\\learning\\Boom\\Python Practise files\\Python\\Loan Prediction\\test.txt", sep=',')

In [None]:
print(train.shape, test.shape)

In [3]:
data = train.append(test, ignore_index=True, sort=False)

##### Data Handelling

###### Missing data

In [None]:
data.isna().any()

In [4]:
missing_values = [*data.columns[data.isna().any()]]

In [5]:
missing_values.pop()

'Loan_Status'

In [6]:
missing_values.remove('LoanAmount') # removing to fillna with  more complex methods than mean or median
missing_values.remove('Loan_Amount_Term') #removing to fillna possibly with  more complex methods than mean or median
missing_values

['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']

In [7]:
for i in missing_values: # filling missing values with mode 
    
    data[i].fillna(data[i].mode()[0], inplace=True)

In [8]:
data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mode()[0], inplace=True)

In [None]:
data.Loan_Amount_Term.value_counts(dropna=False)

In [None]:
for i in missing_values:
    print(f'Values in column {i} are: ')
    print(data[i].value_counts(dropna=False))
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

In [9]:
data['LoanAmount'].interpolate(limit_direction='both', inplace=True) # interplating using linear method

###### Feature Engineering

In [10]:
data['TotalIncome'] = data['ApplicantIncome'] + data['CoapplicantIncome'] # Creating total incomes column

data['Amount_Category'] = pd.qcut(data['LoanAmount'], q= 3, labels= ['Low', 'Medium', 'High']) #Categorizing amount as Low, Medium or High

data['Income_Category'] = pd.qcut(data['TotalIncome'], q=3, labels=['Low', 'Medium', 'High']) #Categorizing income as Low, Medium or High

data['DoubleIncome'] = data['CoapplicantIncome'].apply(lambda x: 0 if x == 0 else 1) #Checking if there are two income streams

data['EMI'] = data['LoanAmount'].div(data['Loan_Amount_Term']) #checking emi for loan
 
data['Loan_to_income_ratio'] = data['LoanAmount'].div(data['TotalIncome'])*100 # checking the ration of loan to total income

In [None]:
data.head()

In [None]:
data.info()

In [11]:
numeric_cols = ['ApplicantIncome','CoapplicantIncome','LoanAmount', 'TotalIncome','EMI','Loan_to_income_ratio']

In [12]:
obj_list = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area', 'Income_Category'
            , 'Amount_Category', 'DoubleIncome']

In [None]:
data[obj_list].head()

In [13]:
le = LabelEncoder()

for i in obj_list:
    data[i] = le.fit_transform(data[i])

In [14]:
le_tenure = LabelEncoder()

data['Loan_Amount_Term'] = le_tenure.fit_transform(data['Loan_Amount_Term'])

In [15]:
mmscale = MinMaxScaler()

In [16]:
data[numeric_cols] = mmscale.fit_transform(data[numeric_cols])

  return self.partial_fit(X, y)


In [None]:
data.head()

In [17]:
df = data.drop('Loan_ID', axis=1)

In [None]:
df.head()

###### Splitting data to train and test

In [18]:
df_train = df[df.Loan_Status.notna()]
df_test = df[df.Loan_Status.isna()]

In [19]:
df_train['Loan_Status'] = df_train.Loan_Status.apply(lambda x:1 if x == 'N' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [None]:
df_train.head()

In [70]:
df_test.drop('Loan_Status', axis=1, inplace=True)
df_test.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,TotalIncome,Amount_Category,Income_Category,DoubleIncome,EMI,Loan_to_income_ratio
614,1,1,0,0,0,0.070617,0.0,0.146165,10,1,2,0.053772,1,2,0,0.012964,0.172625
615,1,1,1,0,0,0.037975,0.036,0.16932,10,1,2,0.039393,2,2,1,0.015017,0.255362
616,1,1,2,0,0,0.061728,0.0432,0.287988,10,1,2,0.067347,0,0,1,0.025542,0.285783
617,1,1,2,0,0,0.028889,0.061104,0.131693,10,1,2,0.043289,1,2,1,0.01168,0.184938
618,1,0,0,1,0,0.040444,0.0,0.099855,10,1,2,0.023052,1,1,0,0.008856,0.218245


In [73]:
df_test = df_test.values

###### Splitting to X and y, training, cross-validation

In [20]:
X, y = df_train.drop('Loan_Status', axis=1).values, df_train.Loan_Status.values

In [None]:
skf = StratifiedKFold(n_splits=10)

#### Random Forest

In [None]:
rf_clf_1 = RandomForestClassifier()

In [None]:
[*range(1, 11)]

In [None]:
params = {'n_estimators': [1000],
    'criterion': ['gini'],
    'max_depth': [1, None],
    'min_samples_split': [*range(2, 11)],
    'min_samples_leaf': [*range(11, 21)],
    'max_features': ['auto', 'sqrt']}

###### Grid search

In [None]:
gs_cv_rf = GridSearchCV(estimator=rf_clf_1, param_grid=params, scoring='accuracy', n_jobs=-1, cv=skf) 
# grid search with 10 fold stratified cross-validatiob

In [None]:
gs_cv_rf.fit(X, y)

In [None]:
gs_cv_rf.best_params_ # checking the best parameters for training

In [None]:
rf_clf_bst = RandomForestClassifier(min_samples_leaf = 11, min_samples_split = 4, n_estimators= 1000) # Instantiating a model with best params from grid search

In [None]:
rf_bst_cv = cross_val_score(rf_clf_bst, X, y, cv = skf, n_jobs=-1, scoring='accuracy', verbose=1)

In [None]:
print(rf_bst_cv.min(), rf_bst_cv.max(), rf_bst_cv.mean(), rf_bst_cv.std())
# mean accuracy score 80 max; 88, min 75, deviation seems to be high

In [None]:
rf_clf_bst.fit(X, y)

In [None]:
pred = rf_clf_bst.predict(df_test)

In [82]:
sample = pd.read_csv('C:\\AADeloitte\\learning\\Boom\\Python Practise files\\Python\\Loan Prediction\\sample.csv')

In [85]:
df_test.shape

(367, 17)

In [83]:
sample.shape

(367, 2)

In [None]:
sample['Loan_Status'] = pred

In [None]:
sample['Loan_Status'] = sample.Loan_Status.apply(lambda x: 'Y' if x == 0 else 'N')

In [None]:
sample.to_csv('C:\\AADeloitte\\learning\\Boom\\Python Practise files\\Python\\Loan Prediction\\submission_rf.csv', index=False)

In [None]:
sample.Loan_Status.value_counts()

#### Xtreme Gradient Boosting Classifiers

In [21]:
xgb_clf = xgb.XGBClassifier(n_jobs=-1)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [26]:
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [27]:
xgb_pred = xgb_clf.predict(X_test)

In [48]:
xgb_prb_prd = xgb_clf.predict_proba(X_test)[:, 1] > 0.4

In [50]:
confusion_matrix(y_test, xgb_prb_prd)

array([[77,  8],
       [18, 20]], dtype=int64)

In [51]:
accuracy_score(y_test, xgb_prb_prd)

0.7886178861788617

In [56]:
f1_score(y_test, xgb_prb_prd, average='macro')

0.7308080808080808

###### Grid search

In [57]:
params = {'max_depth': [3, 6, 9, 12],
         'colsample_bytree': [0.7, 1]}

In [58]:
xgb_clf_grid = xgb.XGBClassifier(n_estimators=500)

In [99]:
skf = StratifiedKFold(n_splits=3)

In [60]:
xgb_g_cv = GridSearchCV(estimator=xgb_clf_grid, param_grid=params, scoring='accuracy', n_jobs=-1, cv= skf)

In [86]:
xgb_g_cv.fit(X, y)



GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [3, 6, 9, 12], 'colsample_bytree': [0.7, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [64]:
print(xgb_g_cv.best_params_, xgb_g_cv.best_score_)

{'colsample_bytree': 0.7, 'max_depth': 9} 0.7817589576547231


In [123]:
xgb_bst = xgb.XGBClassifier(n_estimators=1000, colsample_bytree=0.7, max_depth=9, base_score=0.1)

In [124]:
cross_val_score(estimator=xgb_bst, X=X, y=y, cv=skf, n_jobs=-1).mean()

0.7686991869918699

In [125]:
xgb_bst.fit(X, y)

XGBClassifier(base_score=0.1, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [126]:
xgb_pred = xgb_bst.predict(df_test)

In [127]:
sample['Loan_Status'] = xgb_pred

In [128]:
sample.Loan_Status.value_counts(dropna=False)

0    274
1     93
Name: Loan_Status, dtype: int64

In [129]:
sample['Loan_Status'] = sample['Loan_Status'].apply(lambda x: 'N' if x == 0 else 'Y')

In [130]:
sample.to_csv('C:\\AADeloitte\\learning\\Boom\\Python Practise files\\Python\\Loan Prediction\\submission_xgb.csv', index=False)