# Loan Prediction Data



### Importing necessary packages 

In [101]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

###  Importing train and test data

In [2]:
train = pd.read_csv('https://datahack-prod.s3.ap-south-1.amazonaws.com/train_file/train_u6lujuX_CVtuZ9i.csv')

In [3]:
test= pd.read_csv('https://datahack-prod.s3.ap-south-1.amazonaws.com/test_file/test_Y3wMUE5_7gLdaTN.csv')

In [4]:
fulldata = [train, test]

In [5]:
train.head(n=2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N


In [6]:
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [7]:
test.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban
363,LP002975,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban
364,LP002980,Male,No,0,Graduate,No,3250,1993,126.0,360.0,,Semiurban
365,LP002986,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural
366,LP002989,Male,No,0,Graduate,Yes,9200,0,98.0,180.0,1.0,Rural


In [8]:
train.shape

(614, 13)

In [9]:
test.shape

(367, 12)

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB


In [11]:
train.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [12]:
test.describe(include=['O'])

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area
count,367,356,367,357,367,344,367
unique,367,2,2,4,2,2,3
top,LP001622,Male,Yes,0,Graduate,No,Urban
freq,1,286,233,200,283,307,140


In [13]:
train.describe(include=['O'])

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
count,614,601,611,599,614,582,614,614
unique,614,2,2,4,2,2,3,2
top,LP002928,Male,Yes,0,Graduate,No,Semiurban,Y
freq,1,489,398,345,480,500,233,422


In [14]:
train['Gender'].unique()

array(['Male', 'Female', nan], dtype=object)

In [15]:
train['Married'].unique()

array(['No', 'Yes', nan], dtype=object)

In [16]:
train['Dependents'].unique()

array(['0', '1', '2', '3+', nan], dtype=object)

In [17]:
train['Self_Employed'].unique()

array(['No', 'Yes', nan], dtype=object)

In [18]:
train['Property_Area'].unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [19]:
train['Loan_Status'].unique()

array(['Y', 'N'], dtype=object)

### Imputing missing values

In [20]:
train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [21]:
for dataset in fulldata:
    dataset['Gender']= dataset['Gender'].fillna('Male')

In [22]:
for dataset in fulldata:
    dataset['Married']= dataset['Married'].fillna('Yes')

In [23]:
for dataset in fulldata:
    dataset['Dependents']= dataset['Dependents'].fillna('0')

In [24]:
for dataset in fulldata:
    dataset['Self_Employed']= dataset['Self_Employed'].fillna('No')

In [25]:
for dataset in fulldata:
    dataset['LoanAmount']= dataset['LoanAmount'].fillna(dataset['LoanAmount'].mean())

In [26]:
for dataset in fulldata:
    dataset['Loan_Amount_Term']= dataset['Loan_Amount_Term'].fillna(dataset['Loan_Amount_Term'].mean())

In [27]:
for dataset in fulldata:
    dataset['Credit_History']= dataset['Credit_History'].fillna(dataset['Credit_History'].mean())

In [28]:
train.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

###  Data Cleaning 

In [29]:
for dataset in fulldata:
    dataset['Gender']=dataset['Gender'].map({'Male':1, 'Female':0}).astype(int)

In [30]:
for dataset in fulldata:
    dataset['Married']=dataset['Married'].map({'Yes':1, 'No':0}).astype(int)

In [31]:
train['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [32]:
train['Property_Area'].unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [33]:
for dataset in fulldata:
    dataset['Dependents']= dataset['Dependents'].map({'0':0,'1':1,'2':2,'3+':3}).astype(int)

In [34]:
for dataset in fulldata:
    dataset['Education']=dataset['Education'].map({'Graduate':1, 'Not Graduate':0}).astype(int)

In [35]:
for dataset in fulldata:
    dataset['Self_Employed']=dataset['Self_Employed'].map({'Yes':1, 'No':0}).astype(int)

In [36]:
for dataset in fulldata:
    dataset['Property_Area']=dataset['Property_Area'].map({'Urban':0, 'Rural':1, 'Semiurban':2}).astype(int)

In [37]:
train['Loan_Status']= train['Loan_Status'].map({'Y':1, 'N':0})

In [38]:
train['Credit_History'].unique()

array([ 1.        ,  0.        ,  0.84219858])

###  Feature Engineering 

In [39]:
train[[ 'Gender', 'Loan_Status']].groupby('Gender',as_index=False).mean()

Unnamed: 0,Gender,Loan_Status
0,0,0.669643
1,1,0.691235


In [40]:
train[[ 'Married', 'Loan_Status']].groupby('Married',as_index=False).mean()

Unnamed: 0,Married,Loan_Status
0,0,0.629108
1,1,0.718204


In [41]:
train[[ 'Dependents', 'Loan_Status']].groupby('Dependents',as_index=False).mean()

Unnamed: 0,Dependents,Loan_Status
0,0,0.686111
1,1,0.647059
2,2,0.752475
3,3,0.647059


In [42]:
train[[ 'Education', 'Loan_Status']].groupby('Education',as_index=False).mean()

Unnamed: 0,Education,Loan_Status
0,0,0.61194
1,1,0.708333


In [43]:
train[[ 'Self_Employed', 'Loan_Status']].groupby('Self_Employed',as_index=False).mean()

Unnamed: 0,Self_Employed,Loan_Status
0,0,0.68797
1,1,0.682927


In [44]:
train[[ 'Credit_History', 'Loan_Status']].groupby('Credit_History',as_index=False).mean()

Unnamed: 0,Credit_History,Loan_Status
0,0.0,0.078652
1,0.842199,0.74
2,1.0,0.795789


In [45]:
train[[ 'Property_Area', 'Loan_Status']].groupby('Property_Area',as_index=False).mean()

Unnamed: 0,Property_Area,Loan_Status
0,0,0.658416
1,1,0.614525
2,2,0.76824


In [46]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,0,0,1,0,5849,0.0,146.412162,360.0,1.0,0,1
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,1,0
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,0,1
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,0,1
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,0,1


In [47]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               614 non-null int64
Married              614 non-null int64
Dependents           614 non-null int64
Education            614 non-null int64
Self_Employed        614 non-null int64
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           614 non-null float64
Loan_Amount_Term     614 non-null float64
Credit_History       614 non-null float64
Property_Area        614 non-null int64
Loan_Status          614 non-null int64
dtypes: float64(4), int64(8), object(1)
memory usage: 62.4+ KB


In [48]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
Loan_ID              367 non-null object
Gender               367 non-null int64
Married              367 non-null int64
Dependents           367 non-null int64
Education            367 non-null int64
Self_Employed        367 non-null int64
ApplicantIncome      367 non-null int64
CoapplicantIncome    367 non-null int64
LoanAmount           367 non-null float64
Loan_Amount_Term     367 non-null float64
Credit_History       367 non-null float64
Property_Area        367 non-null int64
dtypes: float64(3), int64(8), object(1)
memory usage: 34.5+ KB


###  lets split the train data into Target column as 'y' and independent columns as 'X'

In [49]:
y= train['Loan_Status'].values 

In [50]:
y.shape

(614,)

In [106]:
y

array([1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0,

In [51]:
X= train.drop(['Loan_Status', 'Loan_ID'], axis=1).values

In [52]:
test_new= test.drop('Loan_ID', axis=1)

In [103]:
X

array([[   1.,    0.,    0., ...,  360.,    1.,    0.],
       [   1.,    1.,    1., ...,  360.,    1.,    1.],
       [   1.,    1.,    0., ...,  360.,    1.,    0.],
       ..., 
       [   1.,    1.,    1., ...,  360.,    1.,    0.],
       [   1.,    1.,    2., ...,  360.,    1.,    0.],
       [   0.,    0.,    0., ...,  360.,    0.,    2.]])

In [102]:
scale = StandardScaler()

In [104]:
X= scale.fit_transform(X)

In [107]:
X_train, X_test, y_train,y_test= train_test_split(X,y, test_size=0.25, random_state=42)

## Applying Machine learning Algorithms for Prediction

### 1. knn (KNeighborsClassifier)

In [215]:
knn= KNeighborsClassifier()

In [216]:
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [217]:
y_pred = knn.predict(X_test)

** Metrics for evaluation**

In [218]:
knn.score(X_test, y_test)

0.77272727272727271

In [219]:
confusion_matrix(y_test, y_pred)

array([[24, 30],
       [ 5, 95]])

In [220]:
classification_report(y_test, y_pred)

'             precision    recall  f1-score   support\n\n          0       0.83      0.44      0.58        54\n          1       0.76      0.95      0.84       100\n\navg / total       0.78      0.77      0.75       154\n'

** lets see by tuning the parameters if we can get some better accuracy**

In [221]:
param_grid = {'n_neighbors':np.arange(1,15),
               'p': [2,3]}

In [222]:
knn_cv = GridSearchCV(knn, param_grid)

In [223]:
knn_cv.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]), 'p': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [224]:
y_predcv= knn_cv.predict(X_test)

In [264]:
knnscore=knn_cv.score(X_test, y_test)

In [226]:
knn_cv.best_params_

{'n_neighbors': 13, 'p': 3}

### 2. LogisticRegression

In [227]:
logreg= LogisticRegression()

In [228]:
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [229]:
y_pred= logreg.predict(X_test)

** Metrics for Evaluation**

In [230]:
logreg.score(X_test, y_test)

0.77922077922077926

In [231]:
confusion_matrix(y_test, y_pred)

array([[22, 32],
       [ 2, 98]])

In [232]:
classification_report(y_test, y_pred)

'             precision    recall  f1-score   support\n\n          0       0.92      0.41      0.56        54\n          1       0.75      0.98      0.85       100\n\navg / total       0.81      0.78      0.75       154\n'

** Tuning hyperparameters of Logistic model**

In [233]:
param_grid1 = {'penalty':['l1', 'l2'],
               'C': [0.001,0.01,0.1,1,10]}

In [234]:
logreg_cv= GridSearchCV(logreg, param_grid1)

In [235]:
logreg_cv.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [236]:
y_pred1= logreg.predict(X_test)

In [263]:
logregscore=logreg_cv.score(X_test, y_test)

In [238]:
logreg_cv.best_params_

{'C': 0.001, 'penalty': 'l2'}

### 3. RandomForest

In [239]:
random=RandomForestClassifier()

In [240]:
random.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [241]:
y_pred2= random.predict(X_test)

** Metrics for Evaluation**

In [262]:
randomscore=random.score(X_test, y_test)

In [243]:
confusion_matrix(y_test, y_pred2)

array([[33, 21],
       [12, 88]])

** Parameters tuning on RandomForest**

In [244]:
param_grid={'n_estimators':[120,160,300],
             'max_depth':[5,8,15],
              'min_samples_leaf':[1,2],
           'max_features':['sqrt', 'auto']}

In [245]:
random_cv= GridSearchCV(random, param_grid)

In [246]:
random_cv.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [120, 160, 300], 'max_depth': [5, 8, 15], 'min_samples_leaf': [1, 2], 'max_features': ['sqrt', 'auto']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [178]:
y_pred3= random_cv.predict(X_test)

** Metrics for Evaluation**

In [247]:
random_cv.score(X_test, y_test)

0.77272727272727271

In [249]:
random_cv.best_params_

{'max_depth': 8,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'n_estimators': 120}

### 4. SVM 

In [250]:
svm = LinearSVC()

In [251]:
svm.fit(X_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [252]:
y_pred4= svm.predict(X_test)

In [253]:
svm.score(X_test, y_test)

0.77272727272727271

In [254]:
param_grid={'C':[0.001,0.01,0.1,1,10] }

In [255]:
svm_cv =GridSearchCV(svm, param_grid)

In [256]:
svm_cv.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [257]:
y_pred5= svm_cv.predict(X_test)

In [260]:
svmscore=svm_cv.score(X_test, y_test)

In [259]:
svm_cv.best_params_

{'C': 0.001}

In [265]:
models = pd.DataFrame({'Models':['Knn', 'Logistic','RandomForest','SVM'],
                        'Scores':[knnscore, logregscore, randomscore, svmscore]}
                      )

In [266]:
models

Unnamed: 0,Models,Scores
0,Knn,0.772727
1,Logistic,0.779221
2,RandomForest,0.785714
3,SVM,0.779221


###  Creating a submission dataframe with id and predicted results

In [267]:
submission = pd.DataFrame({'Loan_ID': test['Loan_ID'],
                            'Loan_Status': y_new_pred})

In [268]:
submission

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,1
1,LP001022,1
2,LP001031,1
3,LP001035,1
4,LP001051,1
5,LP001054,1
6,LP001055,1
7,LP001056,0
8,LP001059,1
9,LP001067,1


In [269]:
submission.to_csv('loanpred.csv', index=False)