# ***LOAN PREDICTION USING VOTINGCLASSIFIER***

## ***Problem Statement***

Dream Housing Finance company deals in all kinds of home loans. They have a presence across all urban, semi-urban and rural areas. A customer first applies for a home loan; after that, the company validates the customer's eligibility for the loan.

The company wants to automate the loan eligibility process (in real time) based on the customer details provided while filling in the online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, they have posed the problem of identifying the customer segments that are eligible for a loan, so that they can specifically target these customers. Here they have provided a partial data set.

In [1]:
#Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the loan applications from 'loan.csv' into a DataFrame.
# The path is relative, so the file must sit in the current working directory.
dataset = pd.read_csv('loan.csv')

In [3]:
# Preview the first rows to check the column layout and spot missing values.
dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Recode the dependents count as letter categories: 0 -> A, 1 -> B, 2 -> C, 3+ -> D.
# Values that are not keys of the mapping (missing values included) are left unchanged.
dependents_codes = {'0': 'A', '1': 'B', '2': 'C', '3+': 'D'}
dataset['Dependents'] = dataset['Dependents'].replace(dependents_codes)

In [5]:
# Count the rows in each recoded Dependents category (missing values are not counted).
dataset['Dependents'].value_counts()

A    345
B    102
C    101
D     51
Name: Dependents, dtype: int64

In [6]:
# Features: every column except the last one. Target: the last column (Loan_Status).
# `x` is an explicit copy because the following cells impute and encode it in place;
# without the copy those writes target a slice of `dataset`, which triggers pandas'
# SettingWithCopyWarning and may or may not write through to `dataset`.
x = dataset.iloc[:, :-1].copy()
# Select the target relative to the end, consistent with the feature slice above,
# instead of hard-coding its position.
y = dataset.iloc[:, -1]

In [7]:
# Count the missing values in every feature column to see which ones need imputing.
x.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
dtype: int64

In [8]:
# Fill the missing values of the numerical columns LoanAmount (position 8) and
# Loan_Amount_Term (position 9) with the mean of the respective column.
# The categorical columns are filled with their most frequent value in the next cell.
#
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; `SimpleImputer`
# is its replacement. It always imputes column by column (the old `axis=0`), and the
# missing-value marker is `np.nan` rather than the string 'NaN'.
#
# NOTE(review): the means are computed on the full data set, i.e. before the
# train/test split further down, so test rows influence the fill values.
from sklearn.impute import SimpleImputer
numeric_cols = [8, 9]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Each column gets its own mean, so imputing both columns in one call is
# equivalent to fitting and transforming them one after the other.
x.iloc[:, numeric_cols] = imputer.fit_transform(x.iloc[:, numeric_cols])



In [9]:
# Fill the gaps in the categorical columns with each column's most frequent value.
# Positions: 1 Gender, 2 Married, 3 Dependents, 5 Self_Employed, 10 Credit_History.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy="most_frequent")
for col in (1, 2, 3, 5, 10):
    column = x.iloc[:, [col]]
    # fit() hands back the imputer itself, so `imputer` and `imp` are one object
    # that is simply refitted for every column.
    imputer = imp.fit(column)
    x.iloc[:, [col]] = imputer.transform(column)

In [10]:
# Replace the text-valued columns by integer codes.
# Positions: 0 Loan_ID, 1 Gender, 2 Married, 3 Dependents, 4 Education,
# 5 Self_Employed, 11 Property_Area.
from sklearn.preprocessing import LabelEncoder
labelencoder_x = LabelEncoder()
for col in (0, 1, 2, 3, 4, 5, 11):
    # One encoder is refitted per column, so once the loop is done it only
    # holds the classes of the last column it saw.
    x.iloc[:, col] = labelencoder_x.fit_transform(x.iloc[:, col])

# Keep working on the encoded features under the name `X` from here on.
X = pd.DataFrame(x)

In [11]:
# Standardise every feature column with StandardScaler's default settings.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# fit_transform returns a bare array, so wrap the result in a DataFrame again;
# the column labels are lost at this point.
X = pd.DataFrame(sc_X.fit_transform(X))

In [12]:
# Give the scaled frame the original feature names back; it was rebuilt from the
# scaler's output and therefore carries only positional column labels.
X.columns=x.columns

In [13]:
# Preview the encoded and standardised features.
X.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,-1.729232,0.472343,-1.372089,-0.737806,-0.528362,-0.392601,0.072991,-0.554487,0.0,0.279851,0.411733,1.223298
1,-1.72359,0.472343,0.728816,0.25347,-0.528362,-0.392601,-0.134412,-0.038732,-0.219273,0.279851,0.411733,-1.318513
2,-1.717948,0.472343,0.728816,-0.737806,-0.528362,2.547117,-0.393747,-0.554487,-0.957641,0.279851,0.411733,1.223298
3,-1.712307,0.472343,0.728816,-0.737806,1.892641,-0.392601,-0.462062,0.25198,-0.314547,0.279851,0.411733,1.223298
4,-1.706665,0.472343,-1.372089,-0.737806,-0.528362,-0.392601,0.097728,-0.554487,-0.064454,0.279851,0.411733,1.223298


In [14]:
from sklearn.model_selection import train_test_split
# Hold out part of the data for the final evaluation. No test_size is passed, so
# scikit-learn's default split ratio applies; random_state=5 makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X,y,random_state =5)

## ***SVC***

In [15]:
from sklearn.svm import SVC
# Untuned SVC; it serves as the base estimator for the hyper-parameter search that follows.
classifier = SVC()

In [19]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate settings for an RBF-kernel SVC: 4 values of C combined with 9 values of gamma.
parameters = {
    'C': [1, 10, 100, 1000],
    'kernel': ['rbf'],
    'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
}
# Evaluate 20 randomly drawn combinations by 10-fold cross-validated accuracy,
# using all available cores. fit() returns the search object itself, so building
# and fitting can be chained into a single assignment.
random_search = RandomizedSearchCV(
    estimator=classifier,
    param_distributions=parameters,
    n_iter=20,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
).fit(x_train, y_train)



In [22]:
# Show the SVC configuration that achieved the best cross-validated accuracy.
random_search.best_estimator_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [25]:
# Unfitted SVC with the configuration reported by the search. Only kernel, C and
# gamma are spelled out: the search varied nothing else, so every other value in
# the printed estimator is SVC's default.
clf1 = SVC(kernel='rbf', C=1, gamma=0.1)


## ***RANDOMFOREST***

In [26]:
from sklearn.ensemble import RandomForestClassifier

# Untuned random forest; base estimator for the hyper-parameter search that follows.
classifier=RandomForestClassifier()

In [27]:
# `randint` is imported here because no other visible cell brings it into scope;
# without this import the cell fails with NameError on a fresh kernel.
from scipy.stats import randint

# Search space for the random forest. Lists are sampled uniformly; `max_features`
# is drawn from a discrete uniform distribution.
parameters={'n_estimators':[40,60,80,90,100,140,220,250,300],
            'criterion':['gini','entropy'],
            'max_depth':[3,4,5,6,7],
            # scipy's randint excludes the upper bound, so this yields 1 or 2.
            'max_features':randint(1,3)}
# Evaluate 20 random configurations by 10-fold cross-validated accuracy on all cores.
randomsearch=RandomizedSearchCV(estimator=classifier,
                                param_distributions=parameters,
                                n_iter=20,cv=10,n_jobs=-1,
                                scoring = 'accuracy')
randomsearch=randomsearch.fit(x_train,y_train)



In [28]:
# Show the random-forest configuration that achieved the best cross-validated accuracy.
randomsearch.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=6, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [29]:
# Unfitted random forest with the configuration reported by the search.
# Only the four tuned hyper-parameters are passed. The search varied nothing else,
# so the remaining values in the printed estimator are defaults. Passing them
# explicitly is fragile: `min_impurity_split` was removed in scikit-learn 1.0,
# and supplying it there raises TypeError.
clf2 = RandomForestClassifier(n_estimators=300, criterion='entropy',
                              max_depth=6, max_features=2)

## ***KNN***

In [36]:
from sklearn.neighbors import KNeighborsClassifier
# Untuned k-nearest-neighbours model; base estimator for the search that follows.
classifier = KNeighborsClassifier()

In [39]:
# Candidate neighbourhood sizes for KNN.
parameters = {'n_neighbors': [3, 4, 5, 6, 7, 8, 9]}
# The grid holds only 7 candidates, so requesting 8 random draws exceeds the search
# space and scikit-learn complains about it. Derive n_iter from the grid instead;
# every candidate is then evaluated exactly once by 10-fold cross-validated accuracy.
randomsearch = RandomizedSearchCV(estimator=classifier,
                                  param_distributions=parameters,
                                  n_iter=len(parameters['n_neighbors']),
                                  cv=10, n_jobs=-1,
                                  scoring='accuracy')
randomsearch = randomsearch.fit(x_train, y_train)



In [40]:
# Show the KNN configuration that achieved the best cross-validated accuracy.
randomsearch.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')

In [41]:
# Unfitted KNN model with the neighbourhood size reported by the search.
# n_neighbors is the only value the search varied; everything else in the
# printed estimator is KNeighborsClassifier's default.
clf3 = KNeighborsClassifier(n_neighbors=9)

## ***VOTING CLASSIFIER***

In [48]:
from sklearn.ensemble import VotingClassifier
# Combine the three tuned models; with voting='hard' the ensemble predicts the
# class label chosen by the majority of its members.
voters = [('svc', clf1), ('rf', clf2), ('knn', clf3)]
# fit() returns the ensemble itself, so construction and training are chained.
eclf1 = VotingClassifier(estimators=voters, voting='hard').fit(x_train, y_train)

In [49]:
# Predict the loan status of the held-out test rows with the voting ensemble.
y_pred=eclf1.predict(x_test)

In [50]:
from sklearn.metrics import confusion_matrix
# Cross-tabulate true labels (rows) against predicted labels (columns) on the test set.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 20  26]
 [  2 106]]


In [51]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted loan status matches the true one.
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.8181818181818182