##### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

##### Importing dataset

In [2]:
train = pd.read_csv("C:\\AADeloitte\\learning\\Boom\\Python Practise files\\Python\\Loan Prediction\\train.txt", sep=',')
test = pd.read_csv("C:\\AADeloitte\\learning\\Boom\\Python Practise files\\Python\\Loan Prediction\\test.txt", sep=',')

In [None]:
print(train.shape, test.shape)

In [3]:
data = train.append(test, ignore_index=True, sort=False)

##### Data Handelling

###### Missing data

In [None]:
data.isna().any()

In [4]:
missing_values = [*data.columns[data.isna().any()]]

In [5]:
missing_values.pop()

'Loan_Status'

In [6]:
missing_values.remove('LoanAmount') # removing to fillna with  more complex methods than mean or median
missing_values

['Gender',
 'Married',
 'Dependents',
 'Self_Employed',
 'Loan_Amount_Term',
 'Credit_History']

In [7]:
for i in missing_values: # filling missing values with mode 
    
    data[i].fillna(data[i].mode()[0], inplace=True)

In [None]:
data.head()

In [None]:
data.isnull().any()

In [None]:
for i in missing_values:
    print(f'Values in column {i} are: ')
    print(data[i].value_counts(dropna=False))
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

In [8]:
data['LoanAmount'].interpolate(limit_direction='both', inplace=True) # interplating using linear method

###### Feature Engineering

In [9]:
data['TotalIncome'] = data['ApplicantIncome'] + data['CoapplicantIncome'] # Creating total incomes column

data['Amount_Category'] = pd.qcut(data['LoanAmount'], q= 3, labels= ['Low', 'Medium', 'High']) #Categorizing amount as Low, Medium or High

data['Income_Category'] = pd.qcut(data['TotalIncome'], q=3, labels=['Low', 'Medium', 'High']) #Categorizing income as Low, Medium or High

data['DoubleIncome'] = data['CoapplicantIncome'].apply(lambda x: 0 if x == 0 else 1) #Checking if there are two income streams

data['EMI'] = data['LoanAmount'].div(data['Loan_Amount_Term']) #checking emi for loan
 
data['Loan_to_income_ratio'] = data['LoanAmount'].div(data['TotalIncome'])*100 # checking the ration of loan to total income

In [18]:
data.groupby(['Education', 'Self_Employed', 'Dependents'])['TotalIncome'].mean()

Education     Self_Employed  Dependents
Graduate      No             0              6393.368845
                             1              7428.878505
                             2              6668.694949
                             3+            11999.089286
              Yes            0              8869.187500
                             1              8312.285714
                             2              9843.736842
                             3+             7898.333333
Not Graduate  No             0              4699.485149
                             1              5287.785714
                             2              5029.625000
                             3+             4292.375000
              Yes            0              7234.785714
                             1              4338.000000
                             2              5566.500000
                             3+             6467.600000
Name: TotalIncome, dtype: float64

In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981 entries, 0 to 980
Data columns (total 19 columns):
Loan_ID                 981 non-null object
Gender                  981 non-null object
Married                 981 non-null object
Dependents              981 non-null object
Education               981 non-null object
Self_Employed           981 non-null object
ApplicantIncome         981 non-null int64
CoapplicantIncome       981 non-null float64
LoanAmount              981 non-null float64
Loan_Amount_Term        981 non-null float64
Credit_History          981 non-null float64
Property_Area           981 non-null object
Loan_Status             614 non-null object
TotalIncome             981 non-null float64
Amount_Category         981 non-null category
Income_Category         981 non-null category
DoubleIncome            981 non-null int64
EMI                     981 non-null float64
Loan_to_income_ratio    981 non-null float64
dtypes: category(2), float64(7), int64(2), object(

In [25]:
numeric_cols = ['ApplicantIncome','CoapplicantIncome','LoanAmount', 'TotalIncome','EMI','Loan_to_income_ratio']

In [26]:
obj_list = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area', 'Income_Category'
            , 'Amount_Category', 'DoubleIncome']

In [None]:
data[obj_list].head()

In [27]:
le = LabelEncoder()

for i in obj_list:
    data[i] = le.fit_transform(data[i])

In [28]:
le_tenure = LabelEncoder()

data['Loan_Amount_Term'] = le_tenure.fit_transform(data['Loan_Amount_Term'])

In [29]:
mmscale = MinMaxScaler()

In [30]:
data[numeric_cols] = mmscale.fit_transform(data[numeric_cols])

  return self.partial_fit(X, y)


In [31]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome,Amount_Category,Income_Category,DoubleIncome,EMI,Loan_to_income_ratio
0,LP001002,1,0,0,0,0,0.07221,0.0,0.172214,10,1,2,Y,0.055394,2,2,0,0.015274,0.199061
1,LP001003,1,1,1,0,0,0.05658,0.036192,0.172214,10,1,0,N,0.058435,2,2,1,0.015274,0.190398
2,LP001005,1,1,0,0,1,0.037037,0.0,0.082489,10,1,2,Y,0.019583,1,1,0,0.007316,0.200216
3,LP001006,1,1,0,1,0,0.031889,0.056592,0.160637,10,1,2,Y,0.04398,2,2,1,0.014247,0.222998
4,LP001008,1,0,0,0,0,0.074074,0.0,0.191027,10,1,2,Y,0.057292,2,2,0,0.016943,0.215161


In [32]:
df = data.drop('Loan_ID', axis=1)

In [33]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome,Amount_Category,Income_Category,DoubleIncome,EMI,Loan_to_income_ratio
0,1,0,0,0,0,0.07221,0.0,0.172214,10,1,2,Y,0.055394,2,2,0,0.015274,0.199061
1,1,1,1,0,0,0.05658,0.036192,0.172214,10,1,0,N,0.058435,2,2,1,0.015274,0.190398
2,1,1,0,0,1,0.037037,0.0,0.082489,10,1,2,Y,0.019583,1,1,0,0.007316,0.200216
3,1,1,0,1,0,0.031889,0.056592,0.160637,10,1,2,Y,0.04398,2,2,1,0.014247,0.222998
4,1,0,0,0,0,0.074074,0.0,0.191027,10,1,2,Y,0.057292,2,2,0,0.016943,0.215161


###### Splitting data to train and test

In [34]:
df_train = df[df.Loan_Status.notna()]
df_test = df[df.Loan_Status.isna()]

In [35]:
df_train['Loan_Status'] = df_train.Loan_Status.apply(lambda x:1 if x == 'N' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [36]:
df_train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome,Amount_Category,Income_Category,DoubleIncome,EMI,Loan_to_income_ratio
0,1,0,0,0,0,0.07221,0.0,0.172214,10,1,2,0,0.055394,2,2,0,0.015274,0.199061
1,1,1,1,0,0,0.05658,0.036192,0.172214,10,1,0,1,0.058435,2,2,1,0.015274,0.190398
2,1,1,0,0,1,0.037037,0.0,0.082489,10,1,2,0,0.019583,1,1,0,0.007316,0.200216
3,1,1,0,1,0,0.031889,0.056592,0.160637,10,1,2,0,0.04398,2,2,1,0.014247,0.222998
4,1,0,0,0,0,0.074074,0.0,0.191027,10,1,2,0,0.057292,2,2,0,0.016943,0.215161


In [37]:
df_test.drop('Loan_Status', axis=1, inplace=True)
df_test.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,TotalIncome,Amount_Category,Income_Category,DoubleIncome,EMI,Loan_to_income_ratio
614,1,1,0,0,0,0.070617,0.0,0.146165,10,1,2,0.053772,1,2,0,0.012964,0.172625
615,1,1,1,0,0,0.037975,0.036,0.16932,10,1,2,0.039393,2,2,1,0.015017,0.255362
616,1,1,2,0,0,0.061728,0.0432,0.287988,10,1,2,0.067347,0,0,1,0.025542,0.285783
617,1,1,2,0,0,0.028889,0.061104,0.131693,10,1,2,0.043289,1,2,1,0.01168,0.184938
618,1,0,0,1,0,0.040444,0.0,0.099855,10,1,2,0.023052,1,1,0,0.008856,0.218245


In [38]:
df_test = df_test.values

###### Splitting to X and y, training, cross-validation

In [40]:
X, y = df_train.drop('Loan_Status', axis=1).values, df_train.Loan_Status.values

In [53]:
rf_clf_1 = RandomForestClassifier(n_estimators= 1000)
rf_clf_2 = RandomForestClassifier(min_samples_leaf = 3, min_samples_split = 3, n_estimators= 1000)
rf_clf_3 = RandomForestClassifier(min_samples_leaf = 4, min_samples_split = 4, n_estimators= 1000)
rf_clf_4 = RandomForestClassifier(min_samples_leaf = 1, min_samples_split = 5, n_estimators= 1000)
rf_clf_5 = RandomForestClassifier(min_samples_leaf = 6, min_samples_split = 2, n_estimators= 1000)
rf_clf_6 = RandomForestClassifier(min_samples_leaf = 12, min_samples_split = 8, n_estimators= 1000)
rf_clf_7 = RandomForestClassifier(min_samples_leaf = 10, min_samples_split = 6, n_estimators= 1000)
gbc_clf_1 = GradientBoostingClassifier(learning_rate=0.05, min_samples_leaf=2, min_samples_split=3, max_depth= 3,n_estimators=500)
gbc_clf_2 = GradientBoostingClassifier(learning_rate=0.06, min_samples_leaf=3, min_samples_split=6, max_depth= 7,n_estimators=500)
gbc_clf_3 = GradientBoostingClassifier(learning_rate=0.02, min_samples_leaf=6, min_samples_split=3, max_depth= 4,n_estimators=500)
gbc_clf_4 = GradientBoostingClassifier(learning_rate=0.01, min_samples_leaf=4, min_samples_split=8, max_depth= 11,n_estimators=500)
gbc_clf_5 = GradientBoostingClassifier(learning_rate=0.03, min_samples_leaf=5, min_samples_split=7, max_depth= 6,n_estimators=500)
xt_clf_1 = ExtraTreesClassifier(min_samples_leaf=2, min_samples_split=2, max_depth=1, n_estimators=500)
xt_clf_2 = ExtraTreesClassifier(min_samples_leaf=3, min_samples_split=6, max_depth=5, n_estimators=500)
xt_clf_3 = ExtraTreesClassifier(min_samples_leaf=5, min_samples_split=3, max_depth=3, n_estimators=500)
xt_clf_4 = ExtraTreesClassifier(min_samples_leaf=6, min_samples_split=7, max_depth=9, n_estimators=500)
xt_clf_5 = ExtraTreesClassifier(min_samples_leaf=4, min_samples_split=2, max_depth=11, n_estimators=500)

In [54]:
estimators = [rf_clf_1, rf_clf_2, rf_clf_3, rf_clf_4, rf_clf_5, rf_clf_6, rf_clf_7, gbc_clf_1, gbc_clf_2, gbc_clf_3, gbc_clf_4, gbc_clf_5
              ,xt_clf_1, xt_clf_2, xt_clf_3, xt_clf_4, xt_clf_5]

In [58]:
len(estimators)

17

In [50]:
new_train_dF = pd.DataFrame()
new_test_dF = pd.DataFrame()

In [55]:
def stacked_models(estimators, X, y, new_train_df, new_test_df, test_df):
    
    for ind, est in enumerate(estimators):
        
        est.fit(X, y)
        
        new_train_df[ind] = est.predict(X)
        new_test_df[ind] = est.predict(test_df)
        
        print(accuracy_score( y, new_train_df[ind]))

In [56]:
stacked_models(estimators, X, y, new_train_dF, new_test_dF, df_test)

1.0
0.8941368078175895
0.8713355048859935
0.9820846905537459
0.8452768729641694
0.8306188925081434
0.8289902280130294
0.9837133550488599
1.0
0.9429967426710097
1.0
1.0
0.6889250814332247
0.8110749185667753
0.8094462540716613
0.8094462540716613
0.8224755700325733


In [57]:
new_test_dF.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
7,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [68]:
stacked_train = pd.concat([new_train_dF, df_train], axis=1, sort=False,)
stacked_test = pd.concat([new_test_dF, pd.DataFrame(df_test)], axis=1, sort=False, ignore_index=True)

In [69]:
stacked_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,0,0,0,0,0,0,0,0,0,0,...,0.146165,10.0,1.0,2.0,0.053772,1.0,2.0,0.0,0.012964,0.172625
1,0,0,0,0,0,0,0,0,0,0,...,0.16932,10.0,1.0,2.0,0.039393,2.0,2.0,1.0,0.015017,0.255362
2,0,0,0,0,0,0,0,0,0,0,...,0.287988,10.0,1.0,2.0,0.067347,0.0,0.0,1.0,0.025542,0.285783
3,0,0,0,0,0,0,0,0,0,0,...,0.131693,10.0,1.0,2.0,0.043289,1.0,2.0,1.0,0.01168,0.184938
4,0,0,0,0,0,0,0,0,0,0,...,0.099855,10.0,1.0,2.0,0.023052,1.0,1.0,0.0,0.008856,0.218245


In [71]:
stacked_X, stacked_y = stacked_train.drop('Loan_Status', axis=1).values, stacked_train.Loan_Status.values
stacked_test = stacked_test.values

In [72]:
stacked_train_results = pd.DataFrame()
stacked_test_results = pd.DataFrame()

In [73]:
stacked_models(estimators, stacked_X, stacked_y, stacked_train_results, stacked_test_results, stacked_test)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [74]:
dbl_stacked_test = pd.concat([stacked_test_results, pd.DataFrame(df_test)], axis=1, ignore_index=True)

In [75]:
dbl_stacked_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,0,0,0,0,0,0,0,0,0,0,...,0.146165,10.0,1.0,2.0,0.053772,1.0,2.0,0.0,0.012964,0.172625
1,0,0,0,0,0,0,0,0,0,0,...,0.16932,10.0,1.0,2.0,0.039393,2.0,2.0,1.0,0.015017,0.255362
2,0,0,0,0,0,0,0,0,0,0,...,0.287988,10.0,1.0,2.0,0.067347,0.0,0.0,1.0,0.025542,0.285783
3,0,0,0,0,0,0,0,0,0,0,...,0.131693,10.0,1.0,2.0,0.043289,1.0,2.0,1.0,0.01168,0.184938
4,0,0,0,0,0,0,0,0,0,0,...,0.099855,10.0,1.0,2.0,0.023052,1.0,1.0,0.0,0.008856,0.218245


In [76]:
dbl_stacked_train = pd.concat([stacked_train_results, df_train], axis=1)

In [77]:
dbl_stacked_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome,Amount_Category,Income_Category,DoubleIncome,EMI,Loan_to_income_ratio
0,0,0,0,0,0,0,0,0,0,0,...,10,1,2,0,0.055394,2,2,0,0.015274,0.199061
1,1,1,1,1,1,1,1,1,1,1,...,10,1,0,1,0.058435,2,2,1,0.015274,0.190398
2,0,0,0,0,0,0,0,0,0,0,...,10,1,2,0,0.019583,1,1,0,0.007316,0.200216
3,0,0,0,0,0,0,0,0,0,0,...,10,1,2,0,0.04398,2,2,1,0.014247,0.222998
4,0,0,0,0,0,0,0,0,0,0,...,10,1,2,0,0.057292,2,2,0,0.016943,0.215161


In [79]:
res_X, res_y = dbl_stacked_train.drop('Loan_Status', axis=1).values, dbl_stacked_train.Loan_Status.values

In [80]:
dbl_stacked_test = dbl_stacked_test.values

In [82]:
gbc_clf_1.fit(res_X, res_y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=2, min_samples_split=3,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [84]:
pred = gbc_clf_1.predict(dbl_stacked_test)

In [85]:
sample = pd.read_csv('C:\\AADeloitte\\learning\\Boom\\Python Practise files\\Python\\Loan Prediction\\sample.csv')

In [86]:
sample['Loan_Status'] = pred

In [87]:
sample['Loan_Status'] = sample.Loan_Status.apply(lambda x: 'N' if x == 1 else 'Y')

In [88]:
sample.to_csv('C:\\AADeloitte\\learning\\Boom\\Python Practise files\\Python\\Loan Prediction\\sample_stacked_gbc.csv', index=False)

In [None]:
skf = StratifiedKFold(n_splits=10)

#### Random Forest

In [None]:
rf_clf_1 = RandomForestClassifier()

In [None]:
[*range(1, 11)]

In [None]:
params = {'n_estimators': [1000],
    'criterion': ['gini'],
    'max_depth': [1, None],
    'min_samples_split': [*range(2, 11)],
    'min_samples_leaf': [*range(11, 21)],
    'max_features': ['auto', 'sqrt']}

###### Grid search

In [None]:
gs_cv_rf = GridSearchCV(estimator=rf_clf_1, param_grid=params, scoring='accuracy', n_jobs=-1, cv=skf) 
# grid search with 10 fold stratified cross-validatiob

In [None]:
gs_cv_rf.fit(X, y)

In [None]:
gs_cv_rf.best_params_ # checking the best parameters for training

In [None]:
rf_clf_bst = RandomForestClassifier(min_samples_leaf = 11, min_samples_split = 4, n_estimators= 1000) # Instantiating a model with best params from grid search

In [None]:
rf_bst_cv = cross_val_score(rf_clf_bst, X, y, cv = skf, n_jobs=-1, scoring='accuracy', verbose=1)

In [None]:
print(rf_bst_cv.min(), rf_bst_cv.max(), rf_bst_cv.mean(), rf_bst_cv.std())
# mean accuracy score 80 max; 88, min 75, deviation seems to be high

In [None]:
rf_clf_bst.fit(X, y)

In [None]:
pred = rf_clf_bst.predict(df_test)

In [None]:
sample = pd.read_csv('C:\\AADeloitte\\learning\\Boom\\Python Practise files\\Python\\Loan Prediction\\sample.csv')

In [None]:
df_test.shape

In [None]:
sample.shape

In [None]:
sample['Loan_Status'] = pred

In [None]:
sample['Loan_Status'] = sample.Loan_Status.apply(lambda x: 'Y' if x == 0 else 'N')

In [None]:
sample.to_csv('C:\\AADeloitte\\learning\\Boom\\Python Practise files\\Python\\Loan Prediction\\submission_rf.csv', index=False)

In [None]:
sample.Loan_Status.value_counts()

#### Xtreme Gradient Boosting Classifiers

In [None]:
xgb_clf = xgb.XGBClassifier(n_jobs=-1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [None]:
xgb_clf.fit(X_train, y_train)

In [None]:
xgb_pred = xgb_clf.predict(X_test)

In [None]:
xgb_prb_prd = xgb_clf.predict_proba(X_test)[:, 1] > 0.4

In [None]:
confusion_matrix(y_test, xgb_prb_prd)

In [None]:
accuracy_score(y_test, xgb_prb_prd)

In [None]:
f1_score(y_test, xgb_prb_prd, average='macro')

###### Grid search

In [None]:
params = {'max_depth': [3, 6, 9, 12],
         'colsample_bytree': [0.7, 1]}

In [None]:
xgb_clf_grid = xgb.XGBClassifier(n_estimators=500)

In [None]:
skf = StratifiedKFold(n_splits=3)

In [None]:
xgb_g_cv = GridSearchCV(estimator=xgb_clf_grid, param_grid=params, scoring='accuracy', n_jobs=-1, cv= skf)

In [None]:
xgb_g_cv.fit(X, y)

In [None]:
print(xgb_g_cv.best_params_, xgb_g_cv.best_score_)

In [None]:
xgb_bst = xgb.XGBClassifier(n_estimators=1000, colsample_bytree=0.7, max_depth=9, base_score=0.1)

In [None]:
cross_val_score(estimator=xgb_bst, X=X, y=y, cv=skf, n_jobs=-1).mean()

In [None]:
xgb_bst.fit(X, y)

In [None]:
xgb_pred = xgb_bst.predict(df_test)

In [None]:
sample['Loan_Status'] = xgb_pred

In [None]:
sample.Loan_Status.value_counts(dropna=False)

In [None]:
sample['Loan_Status'] = sample['Loan_Status'].apply(lambda x: 'N' if x == 0 else 'Y')

In [None]:
sample.to_csv('C:\\AADeloitte\\learning\\Boom\\Python Practise files\\Python\\Loan Prediction\\submission_xgb.csv', index=False)