# Import the libraries

In [1]:
import numpy as np
import pandas as pd

#### Load the data

In [2]:
# Read the train, test and validation CSVs in one pass; the validation file
# is used later as the source of the true labels for the test rows.
crTrain, crTest, crValidate = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Python_Module_Day_15.2_Credit_Risk_Train_data.csv',
        'Python_Module_Day_15.3_Credit_Risk_Test_data.csv',
        'Python_Module_Day_15.4_Credit_Risk_Validate_data.csv',
    )
)

#### Check Training Data

In [3]:
# Summarise dtypes and non-null counts to spot columns with missing values.
crTrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB


In [4]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.
crTrain.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


#### Remove Non-relevant Variable

In [5]:
# Loan_ID is a row identifier, not a predictor, so it is removed.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
crTrain.drop(columns='Loan_ID', inplace=True, errors='ignore')
crTrain.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


#### Perform Imputations & Check if any Missing Values

In [6]:
# Impute missing values by assigning the filled column back to the frame.
# The previous form, crTrain.<col>.fillna(..., inplace=True), is a chained
# in-place call on an intermediate Series; with pandas Copy-on-Write it does
# not update crTrain at all.
# Continuous columns are filled with the column mean.
crTrain['LoanAmount'] = crTrain['LoanAmount'].fillna(crTrain['LoanAmount'].mean())
crTrain['Loan_Amount_Term'] = crTrain['Loan_Amount_Term'].fillna(crTrain['Loan_Amount_Term'].mean())
# Categorical / flag columns are filled with the most frequent value.
crTrain['Credit_History'] = crTrain['Credit_History'].fillna(crTrain['Credit_History'].mode()[0])
crTrain['Self_Employed'] = crTrain['Self_Employed'].fillna(crTrain['Self_Employed'].mode()[0])
crTrain['Gender'] = crTrain['Gender'].fillna(crTrain['Gender'].mode()[0])
crTrain['Married'] = crTrain['Married'].fillna(crTrain['Married'].mode()[0])
crTrain['Dependents'] = crTrain['Dependents'].fillna(crTrain['Dependents'].mode()[0])


In [7]:
# Re-check non-null counts to confirm the imputations left no missing values.
crTrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
Gender               614 non-null object
Married              614 non-null object
Dependents           614 non-null object
Education            614 non-null object
Self_Employed        614 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           614 non-null float64
Loan_Amount_Term     614 non-null float64
Credit_History       614 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(7)
memory usage: 57.6+ KB


#### Is the data balanced ?

In [8]:
# Count rows per Loan_Status class; the two classes are not equally represented.
crTrain['Loan_Status'].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

#### Which of my non numeric variables are independent of label?

In [9]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between Gender and the label.
contigency_table = pd.crosstab(crTrain['Gender'], crTrain['Loan_Status'])
stat, p, dof, expected = chi2_contingency(contigency_table)

# A p-value at or below 0.05 is treated as evidence of a relationship.
print(
    "Alternate Hypothesis (H1): Gender and Loan_Status have some form of relationship."
    if p <= 0.05
    else "Null Hypothesis(H0): Gender and Loan_Status are independent of each other."
)
# Reported figure is (1 - p) expressed as a percentage.
print("Confidence Level : {} %".format((1 - p) * 100))

Null Hypothesis(H0): Gender and Loan_Status are independent of each other.
Confidence Level : 26.08538689130362 %


In [10]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between Married and the label.
contigency_table = pd.crosstab(crTrain['Married'], crTrain['Loan_Status'])
stat, p, dof, expected = chi2_contingency(contigency_table)

# A p-value at or below 0.05 is treated as evidence of a relationship.
print(
    "Alternate Hypothesis (H1): Married and Loan_Status have some form of relationship."
    if p <= 0.05
    else "Null Hypothesis(H0): Married and Loan_Status are independent of each other."
)
# Reported figure is (1 - p) expressed as a percentage.
print("Confidence Level : {} %".format((1 - p) * 100))

Alternate Hypothesis (H1): Married and Loan_Status have some form of relationship.
Confidence Level : 97.03914191374179 %


In [11]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between Property_Area and the label.
contigency_table = pd.crosstab(crTrain['Property_Area'], crTrain['Loan_Status'])
stat, p, dof, expected = chi2_contingency(contigency_table)

# A p-value at or below 0.05 is treated as evidence of a relationship.
print(
    "Alternate Hypothesis (H1): Property_Area and Loan_Status have some form of relationship."
    if p <= 0.05
    else "Null Hypothesis(H0): Property_Area and Loan_Status are independent of each other."
)
# Reported figure is (1 - p) expressed as a percentage.
print("Confidence Level : {} %".format((1 - p) * 100))

Alternate Hypothesis (H1): Property_Area and Loan_Status have some form of relationship.
Confidence Level : 99.78639812188355 %


In [12]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between Dependents and the label.
contigency_table = pd.crosstab(crTrain['Dependents'], crTrain['Loan_Status'])
stat, p, dof, expected = chi2_contingency(contigency_table)

# A p-value at or below 0.05 is treated as evidence of a relationship.
print(
    "Alternate Hypothesis (H1): Dependents and Loan_Status have some form of relationship."
    if p <= 0.05
    else "Null Hypothesis(H0): Dependents and Loan_Status are independent of each other."
)
# Reported figure is (1 - p) expressed as a percentage.
print("Confidence Level : {} %".format((1 - p) * 100))

Null Hypothesis(H0): Dependents and Loan_Status are independent of each other.
Confidence Level : 63.11336918633995 %


In [12]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between Education and the label.
contigency_table = pd.crosstab(crTrain['Education'], crTrain['Loan_Status'])
stat, p, dof, expected = chi2_contingency(contigency_table)

# A p-value at or below 0.05 is treated as evidence of a relationship.
print(
    "Alternate Hypothesis (H1): Education and Loan_Status have some form of relationship."
    if p <= 0.05
    else "Null Hypothesis(H0): Education and Loan_Status are independent of each other."
)
# Reported figure is (1 - p) expressed as a percentage.
print("Confidence Level : {} %".format((1 - p) * 100))

Alternate Hypothesis (H1): Education and Loan_Status have some form of relationship.
Confidence Level : 95.69003787064264 %


In [13]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between Self_Employed and the label.
contigency_table = pd.crosstab(crTrain['Self_Employed'], crTrain['Loan_Status'])
stat, p, dof, expected = chi2_contingency(contigency_table)

# A p-value at or below 0.05 is treated as evidence of a relationship.
print(
    "Alternate Hypothesis (H1): Self_Employed and Loan_Status have some form of relationship."
    if p <= 0.05
    else "Null Hypothesis(H0): Self_Employed and Loan_Status are independent of each other."
)
# Reported figure is (1 - p) expressed as a percentage.
print("Confidence Level : {} %".format((1 - p) * 100))

Null Hypothesis(H0): Self_Employed and Loan_Status are independent of each other.
Confidence Level : 2.892535885966907 %


### Using Chi Square Test for Hypothesis we get 
#### Significant Variables
Married  
Education  
Property_Area
#### Insignificant Variables 
Gender  
Dependents   
Self_Employed  
#### Therefore we exclude Gender, Dependents and Self_Employed from the features.

In [14]:
# Remove the three categorical columns the chi-square tests found insignificant.
crTrain.drop(columns=['Gender', 'Dependents', 'Self_Employed'], inplace=True)

In [15]:
# Preview the remaining columns after dropping the insignificant ones.
crTrain.head()

Unnamed: 0,Married,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,No,Graduate,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,Yes,Graduate,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Yes,Graduate,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Yes,Not Graduate,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,No,Graduate,6000,0.0,141.0,360.0,1.0,Urban,Y


#### Married and Education column can be converted to numeric by assigning 1 & 0 to their values as below

In [16]:
# Encode the two binary categoricals as 0/1 and assign the result back.
# The previous crTrain[col].replace(..., inplace=True) form mutates an
# intermediate Series and is a no-op under pandas Copy-on-Write.
# replace() matches whole values, so 'Graduate' does not touch 'Not Graduate'.
# astype('int64') pins the dtype and fails loudly on any unexpected category.
crTrain['Married'] = crTrain['Married'].replace({'No': 0, 'Yes': 1}).astype('int64')
crTrain['Education'] = crTrain['Education'].replace({'Not Graduate': 0, 'Graduate': 1}).astype('int64')
crTrain.head()

Unnamed: 0,Married,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,1,1,4583,1508.0,128.0,360.0,1.0,Rural,N
2,1,1,3000,0.0,66.0,360.0,1.0,Urban,Y
3,1,0,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,0,1,6000,0.0,141.0,360.0,1.0,Urban,Y


#### Let's one-hot encode the Property_Area variable

In [17]:
# One-hot encode Property_Area. dtype='uint8' keeps the 0/1 integer columns
# shown in the output regardless of the pandas version's default dummy dtype.
dummyDFProperty_Area = pd.get_dummies(crTrain['Property_Area'], dtype='uint8')
# Keep every other column by name rather than by hard-coded position, so the
# cell does not silently pick the wrong columns if the frame layout changes.
remainDFProperty_Area = crTrain.drop(columns='Property_Area')
# Dummy columns first, then the remaining features, with Loan_Status last.
finalTrain = pd.concat([dummyDFProperty_Area, remainDFProperty_Area], axis=1)
finalTrain.head()

Unnamed: 0,Rural,Semiurban,Urban,Married,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,0,0,1,0,1,5849,0.0,146.412162,360.0,1.0,Y
1,1,0,0,1,1,4583,1508.0,128.0,360.0,1.0,N
2,0,0,1,1,1,3000,0.0,66.0,360.0,1.0,Y
3,0,0,1,1,0,2583,2358.0,120.0,360.0,1.0,Y
4,0,0,1,0,1,6000,0.0,141.0,360.0,1.0,Y


#### Let us prepare test data on the same lines as we preprocessed training data

In [18]:
# Drop the identifier and the three columns excluded from the training features.
crTest.drop(columns=['Loan_ID', 'Gender', 'Dependents', 'Self_Employed'], inplace=True)

In [19]:
# Check dtypes and non-null counts of the test set before imputing.
crTest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 8 columns):
Married              367 non-null object
Education            367 non-null object
ApplicantIncome      367 non-null int64
CoapplicantIncome    367 non-null int64
LoanAmount           362 non-null float64
Loan_Amount_Term     361 non-null float64
Credit_History       338 non-null float64
Property_Area        367 non-null object
dtypes: float64(3), int64(2), object(3)
memory usage: 23.0+ KB


# Perform Imputations for test data

In [20]:
# Impute the test set by assigning the filled column back to the frame.
# crTest.<col>.fillna(..., inplace=True) is a chained in-place call that does
# not update crTest under pandas Copy-on-Write.
# Continuous columns use the column mean, flag/categorical columns the mode.
crTest['LoanAmount'] = crTest['LoanAmount'].fillna(crTest['LoanAmount'].mean())
crTest['Loan_Amount_Term'] = crTest['Loan_Amount_Term'].fillna(crTest['Loan_Amount_Term'].mean())
crTest['Credit_History'] = crTest['Credit_History'].fillna(crTest['Credit_History'].mode()[0])
crTest['Married'] = crTest['Married'].fillna(crTest['Married'].mode()[0])
crTest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 8 columns):
Married              367 non-null object
Education            367 non-null object
ApplicantIncome      367 non-null int64
CoapplicantIncome    367 non-null int64
LoanAmount           367 non-null float64
Loan_Amount_Term     367 non-null float64
Credit_History       367 non-null float64
Property_Area        367 non-null object
dtypes: float64(3), int64(2), object(3)
memory usage: 23.0+ KB


In [21]:
# Encode the binary categoricals as 0/1, mirroring the training encoding.
# The result is assigned back because crTest[col].replace(..., inplace=True)
# mutates an intermediate Series and is a no-op under pandas Copy-on-Write.
# astype('int64') pins the dtype and fails loudly on any unexpected category.
crTest['Married'] = crTest['Married'].replace({'No': 0, 'Yes': 1}).astype('int64')
crTest['Education'] = crTest['Education'].replace({'Not Graduate': 0, 'Graduate': 1}).astype('int64')
crTest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 8 columns):
Married              367 non-null int64
Education            367 non-null int64
ApplicantIncome      367 non-null int64
CoapplicantIncome    367 non-null int64
LoanAmount           367 non-null float64
Loan_Amount_Term     367 non-null float64
Credit_History       367 non-null float64
Property_Area        367 non-null object
dtypes: float64(3), int64(4), object(1)
memory usage: 23.0+ KB


In [22]:
# Preview the encoded test rows.
crTest.head()

Unnamed: 0,Married,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,5720,0,110.0,360.0,1.0,Urban
1,1,1,3076,1500,126.0,360.0,1.0,Urban
2,1,1,5000,1800,208.0,360.0,1.0,Urban
3,1,1,2340,2546,100.0,360.0,1.0,Urban
4,0,0,3276,0,78.0,360.0,1.0,Urban


In [23]:
# One-hot encode Property_Area for the test set, then force the same dummy
# columns in the same order as the training dummies: the model is fed
# positional features, so a category missing from the test data must still
# get an all-zero column.
dummyDFProperty_Area_test = (
    pd.get_dummies(crTest['Property_Area'], dtype='uint8')
    .reindex(columns=dummyDFProperty_Area.columns, fill_value=0)
)
# Keep the remaining columns by name instead of by hard-coded position.
remainDFProperty_Area_test = crTest.drop(columns='Property_Area')
finalTest = pd.concat([dummyDFProperty_Area_test, remainDFProperty_Area_test], axis=1)
finalTest.head()

Unnamed: 0,Rural,Semiurban,Urban,Married,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,0,0,1,1,1,5720,0,110.0,360.0,1.0
1,0,0,1,1,1,3076,1500,126.0,360.0,1.0
2,0,0,1,1,1,5000,1800,208.0,360.0,1.0
3,0,0,1,1,1,2340,2546,100.0,360.0,1.0
4,0,0,1,0,0,3276,0,78.0,360.0,1.0


In [24]:
# Show the training frame again to compare its column layout with finalTest.
finalTrain.head()

Unnamed: 0,Rural,Semiurban,Urban,Married,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,0,0,1,0,1,5849,0.0,146.412162,360.0,1.0,Y
1,1,0,0,1,1,4583,1508.0,128.0,360.0,1.0,N
2,0,0,1,1,1,3000,0.0,66.0,360.0,1.0,Y
3,0,0,1,1,0,2583,2358.0,120.0,360.0,1.0,Y
4,0,0,1,0,1,6000,0.0,141.0,360.0,1.0,Y


#### Training Data is now ready to run Logistic Regression
#### Let's run Logistic Regression

In [25]:
# First ten columns are the predictors; Loan_Status (the last column) is the target.
features = finalTrain.iloc[:, :10].values
label = finalTrain['Loan_Status'].values

In [26]:
import warnings
warnings.filterwarnings('ignore')

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Fit a logistic regression with default settings on the full training matrix.
logistic_model = LogisticRegression().fit(features, label)

# Confusion matrix on the same rows the model was fitted on, i.e. the
# in-sample fit rather than generalisation.
cm_train = confusion_matrix(y_true=label, y_pred=logistic_model.predict(features))
cm_train


array([[ 84, 108],
       [  8, 414]], dtype=int64)

In [28]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the training rows.
print(classification_report(y_true=label, y_pred=logistic_model.predict(features)))

              precision    recall  f1-score   support

           N       0.91      0.44      0.59       192
           Y       0.79      0.98      0.88       422

   micro avg       0.81      0.81      0.81       614
   macro avg       0.85      0.71      0.73       614
weighted avg       0.83      0.81      0.79       614



Let us apply the fitted model to the test data to predict the label Loan_Status, and use the outcome column from the validation data set as the true labels to create the confusion matrix.

In [29]:
# Compare predictions for the test rows against the true labels stored in
# crValidate.outcome.
# NOTE(review): assumes crValidate rows are in the same order as crTest — confirm.
cm_test = confusion_matrix(y_true=crValidate.outcome, y_pred=logistic_model.predict(finalTest))
cm_test

array([[ 58,  19],
       [  1, 289]], dtype=int64)

In [30]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the logistic model on the test rows.
print(classification_report(y_true=crValidate.outcome, y_pred=logistic_model.predict(finalTest)))

              precision    recall  f1-score   support

           N       0.98      0.75      0.85        77
           Y       0.94      1.00      0.97       290

   micro avg       0.95      0.95      0.95       367
   macro avg       0.96      0.87      0.91       367
weighted avg       0.95      0.95      0.94       367



# Using KNN 

In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# Try the odd neighbour counts 3, 5, 7 and 9, and report only the settings
# whose accuracy on the test rows exceeds the accuracy on the training rows.
# NOTE(review): this selects k using the test set itself, so the reported test
# score is optimistic.
for j in (3, 5, 7, 9):
    knn_model = KNeighborsClassifier(n_neighbors=j).fit(features, label)

    train_score = knn_model.score(features, label)
    test_score = knn_model.score(finalTest, crValidate.outcome)

    if train_score < test_score:
        print("Test Score: {} Train Score: {} Neighbour: {}".format(test_score, train_score, j))


Test Score: 0.7275204359673024 Train Score: 0.7263843648208469 Neighbour: 7
Test Score: 0.7520435967302452 Train Score: 0.7166123778501629 Neighbour: 9


In [32]:
# Refit KNN with k=9, the setting with the highest test score in the sweep above.
knn_model = KNeighborsClassifier(n_neighbors=9).fit(features, label)
knn_model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=9, p=2,
           weights='uniform')

In [33]:
# Confusion matrix of the k=9 model on the test rows.
# Despite the variable name, this is computed on finalTest, not on training data.
knn_cm_train = confusion_matrix(y_true=crValidate.outcome, y_pred=knn_model.predict(finalTest))
knn_cm_train


array([[ 10,  67],
       [ 24, 266]], dtype=int64)

In [34]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the KNN model on the test rows.
print(classification_report(y_true=crValidate.outcome, y_pred=knn_model.predict(finalTest)))


              precision    recall  f1-score   support

           N       0.29      0.13      0.18        77
           Y       0.80      0.92      0.85       290

   micro avg       0.75      0.75      0.75       367
   macro avg       0.55      0.52      0.52       367
weighted avg       0.69      0.75      0.71       367



# Using SVM

In [41]:
from sklearn.svm import SVC

# Fit a support vector classifier with default hyper-parameters.
svc_model = SVC().fit(features, label)
svc_model

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [43]:
from sklearn.metrics import classification_report,confusion_matrix

# Predict once and reuse the result; previously `predictions` was computed
# and then ignored while predict() ran a second time for the matrix.
predictions = svc_model.predict(finalTest)
print(confusion_matrix(crValidate.outcome, predictions))

[[  1  76]
 [  0 290]]


Since the number of misclassifications in our model is high, we will try to adjust the parameters.
We will create a 'grid' of parameters and try out all possible combinations, which is called a grid search. The CV in GridSearchCV stands for cross-validation.

In [46]:
from sklearn.model_selection import GridSearchCV

# Search 5 values of C x 5 values of gamma for an RBF-kernel SVC.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}
# refit=True retrains the best combination on all training rows, so `grid`
# can be used directly for prediction; verbose=3 logs every fold.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(features, label)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.6878048780487804, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.6878048780487804, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.6862745098039216, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.6878048780487804, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.6878048780487804, total=   0.0s

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.6862745098039216, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.1, gamma=0.01, kernel=rbf, score=0.6878048780487804, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.1, gamma=0.01, kernel=rbf, score=0.6878048780487804, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.1, gamma=0.01, kernel=rbf, score=0.6862745098039216, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV]  C=0.1, gamma=0.001, kernel=rbf, score=0.6878048780487804, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV]  C=0.1, gamma=0.001, kernel=rbf, score=0.6878048780487804, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV]  C=0.1, gamma=0.001, kernel=rbf, s

[CV]  C=1000, gamma=1, kernel=rbf, score=0.6911764705882353, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV]  C=1000, gamma=0.1, kernel=rbf, score=0.6878048780487804, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV]  C=1000, gamma=0.1, kernel=rbf, score=0.6926829268292682, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV]  C=1000, gamma=0.1, kernel=rbf, score=0.6911764705882353, total=   0.0s
[CV] C=1000, gamma=0.01, kernel=rbf ..................................
[CV]  C=1000, gamma=0.01, kernel=rbf, score=0.6829268292682927, total=   0.0s
[CV] C=1000, gamma=0.01, kernel=rbf ..................................
[CV]  C=1000, gamma=0.01, kernel=rbf, score=0.6878048780487804, total=   0.0s
[CV] C=1000, gamma=0.01, kernel=rbf ..................................
[CV]  C=1000, gamma=0.01, kernel=rbf, score=0.6813725490196079, total=   0.0s
[CV] C=1000, gamma=0.001, kernel=r

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    2.8s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [47]:
# Hyper-parameter combination with the best cross-validated score.
grid.best_params_

{'C': 1, 'gamma': 1, 'kernel': 'rbf'}

In [48]:
# The SVC refitted with the best parameters found by the search.
grid.best_estimator_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

Now we can re-run the predictions using the tuned model.

In [50]:
# Predict the test rows with the refitted best estimator and tabulate the
# predictions against the true labels.
grid_predictions = grid.predict(finalTest)
print(confusion_matrix(y_true=crValidate.outcome, y_pred=grid_predictions))

[[  1  76]
 [  0 290]]


# We will now compare the accuracy of all the models

In [55]:
from sklearn.metrics import accuracy_score

For logistic model

In [56]:
# Fraction of test rows the logistic regression classifies correctly.
print(accuracy_score(y_true=crValidate.outcome, y_pred=logistic_model.predict(finalTest)))

0.9455040871934605


For KNN  model

In [57]:
# Fraction of test rows the k=9 KNN model classifies correctly.
print(accuracy_score(y_true=crValidate.outcome, y_pred=knn_model.predict(finalTest)))

0.7520435967302452


For SVM model

In [58]:
# Fraction of test rows the default SVC classifies correctly.
print(accuracy_score(y_true=crValidate.outcome, y_pred=svc_model.predict(finalTest)))

0.7929155313351499


For SVM model using gridsearch

In [59]:
# Fraction of test rows the grid-searched SVC classifies correctly.
print(accuracy_score(y_true=crValidate.outcome, y_pred=grid_predictions))

0.7929155313351499


OBSERVATIONS:
1. Even after hyper-parameter tuning, the performance of the SVM model did not improve, so we won't use this approach.
2. The Logistic Regression model gives better results than the kNN and SVM models.

In [67]:
# List the feature columns and dtypes the model expects, in order.
finalTest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 10 columns):
Rural                367 non-null uint8
Semiurban            367 non-null uint8
Urban                367 non-null uint8
Married              367 non-null int64
Education            367 non-null int64
ApplicantIncome      367 non-null int64
CoapplicantIncome    367 non-null int64
LoanAmount           367 non-null float64
Loan_Amount_Term     367 non-null float64
Credit_History       367 non-null float64
dtypes: float64(3), int64(4), uint8(3)
memory usage: 21.2 KB


In [40]:
# Interactively collect one applicant's details and score them with the
# fitted logistic regression model.

# One-hot encoding of the property area as (Rural, Semiurban, Urban), the
# same column order as the training features.
area_encoding = {
    'RURAL': (1, 0, 0),
    'SEMIURBAN': (0, 1, 0),
    'URBAN': (0, 0, 1),
}

Property_Area = input('Enter Property_Area (rural OR urban OR semiurban: ')
# Normalise case and surrounding whitespace so input like " Urban " is accepted.
Property_Area = Property_Area.strip().upper()
# Reject unknown areas up front. Previously an unrecognised value left
# Rural/Semiurban/Urban undefined (NameError) or, worse, silently reused the
# values from an earlier run of this cell.
if Property_Area not in area_encoding:
    raise ValueError(
        "Unknown Property_Area {!r}; expected rural, urban or semiurban.".format(Property_Area)
    )
Rural, Semiurban, Urban = area_encoding[Property_Area]

Married = float(input('Enter Marital Status, 1 = YES, 0=No: '))
Education = float(input('Enter 1 if Graduate, 0 if Non Graduate: '))
ApplicantIncome = float(input('Enter the Applicant Income: '))
CoapplicantIncome = float(input('Enter the Coapplicants Income: '))
LoanAmount = float(input('Enter Loan Amount: '))
Loan_Amount_Term = float(input('Enter Loan Amount Term: '))
Credit_History = float(input('Enter the Credit History (1 or 0): '))

# predict() returns an array with one label per input row; take the single
# element so the comparisons below operate on a scalar, not an array.
Loan_Status = logistic_model.predict([[Rural, Semiurban, Urban, Married, Education,
                                       ApplicantIncome, CoapplicantIncome, LoanAmount,
                                       Loan_Amount_Term, Credit_History]])[0]
if Loan_Status == 'Y':
    Loan_Status = "Approved"
elif Loan_Status == 'N':
    Loan_Status = "Rejected"
print("Your Loan is {}.".format(Loan_Status))

Enter Property_Area (rural OR urban OR semiurban: urban
Enter Marital Status, 1 = YES, 0=No: 1
Enter 1 if Graduate, 0 if Non Graduate: 1
Enter the Applicant Income: 5678
Enter the Coapplicants Income: 400
Enter Loan Amount: 200
Enter Loan Amount Term: 360
Enter the Credit History (1 or 0): 1
Your Loan is Approved.
