In [1]:
import pickle
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

try:
    # scatter_matrix lives in pandas.plotting on current pandas releases
    from pandas.plotting import scatter_matrix
except ImportError:
    # fall back to the legacy module path used by old pandas versions
    from pandas.tools.plotting import scatter_matrix

In [2]:
## Load the training CSV from the working directory and preview the first rows
data_row = pd.read_csv('train.csv')
data_row.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
## Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data_row.describe()


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [4]:
## Column dtypes and non-null counts; a non-null count below the row count means missing values
data_row.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB


In [5]:
## Keep every column except the first one (Loan_ID, a row identifier).
## .copy() gives `data` its own storage, so the cleaning steps that follow
## modify `data` itself instead of a slice of `data_row`.
data = data_row.iloc[:, 1:13].copy()
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
## Frequency of each Gender category; value_counts() leaves NaN out by default,
## so rows with a missing Gender are not listed here
data['Gender'].value_counts()

Male      489
Female    112
Name: Gender, dtype: int64

In [7]:
## Impute missing Gender with the most frequent category, "Male".
## The result is assigned back instead of calling fillna(inplace=True) on the
## selected column: the in-place call acts on an intermediate object and is
## not guaranteed to update `data` (it is a no-op under pandas Copy-on-Write).
data['Gender'] = data['Gender'].fillna("Male")

In [8]:
## Frequency of each Married category (NaN rows are excluded by value_counts() by default)
data['Married'].value_counts()

Yes    398
No     213
Name: Married, dtype: int64

In [9]:

## Impute missing Married with the most frequent category, "Yes".
## Assign the filled column back: fillna(inplace=True) on a selected column
## is not guaranteed to update `data` (no-op under pandas Copy-on-Write).
data['Married'] = data['Married'].fillna("Yes")

In [10]:
## Frequency of each Dependents category (NaN rows are excluded by value_counts() by default)
data['Dependents'].value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [11]:
## Impute missing Dependents with the most frequent category, "0"
## (a string, because the column also holds the "3+" label).
## Assign the filled column back: fillna(inplace=True) on a selected column
## is not guaranteed to update `data` (no-op under pandas Copy-on-Write).
data['Dependents'] = data['Dependents'].fillna("0")

In [12]:
## Frequency of each Self_Employed category (NaN rows are excluded by value_counts() by default)
data['Self_Employed'].value_counts()

No     500
Yes     82
Name: Self_Employed, dtype: int64

In [13]:
## Impute missing Self_Employed with the most frequent category, "No".
## Assign the filled column back: fillna(inplace=True) on a selected column
## is not guaranteed to update `data` (no-op under pandas Copy-on-Write).
data['Self_Employed'] = data['Self_Employed'].fillna("No")

In [14]:
## Impute missing LoanAmount with the column mean (mean() skips NaN).
## The filled column is assigned back rather than filled in place, because
## fillna(inplace=True) on a selected column is not guaranteed to update `data`.
loanAmounMean = data['LoanAmount'].mean()
data['LoanAmount'] = data['LoanAmount'].fillna(loanAmounMean)

In [15]:
## Impute missing Loan_Amount_Term with the column mean (mean() skips NaN).
## The filled column is assigned back rather than filled in place, because
## fillna(inplace=True) on a selected column is not guaranteed to update `data`.
loanAmountTermMean = data['Loan_Amount_Term'].mean()
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(loanAmountTermMean)

In [16]:
## Frequency of each Credit_History value (NaN rows are excluded by value_counts() by default)
data['Credit_History'].value_counts()

1.0    475
0.0     89
Name: Credit_History, dtype: int64

In [26]:
## Impute missing Credit_History with the most frequent value, 1.0.
## Assign the filled column back: fillna(inplace=True) on a selected column
## is not guaranteed to update `data` (no-op under pandas Copy-on-Write).
data['Credit_History'] = data['Credit_History'].fillna(1.0)

In [18]:
## Per-column count of values that are still missing
data.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [19]:
## Convert each categorical column to integer codes.
## One LabelEncoder is fitted per column and kept in `label_encoders`, so the
## string -> code mapping can be inspected (`classes_`) or reused on new data
## instead of being discarded right after the transform.
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education',
                       'Self_Employed', 'Property_Area', 'Loan_Status']
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [20]:
## Preview the first rows of `data` in its current state
data.head()


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2,1
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


## Split Data into Training & Test


In [21]:
## Hold-out configuration: 70:30 train/validation split with a fixed seed
seed = 0
validation_size = 0.3
scoring = 'accuracy'

## The first 11 columns are the features; the last column is the label
feature_positions = list(range(11))
features = data.iloc[:, feature_positions].values
labels = data.iloc[:, -1].values

x_train, x_validation, y_train, y_validation = model_selection.train_test_split(
    features,
    labels,
    test_size=validation_size,
    random_state=seed,
)
print("Train Size is: ",len(x_train))
print("Test Size is: ",len(x_validation))

Train Size is:  429
Test Size is:  185


## Cross Validation

In [22]:
## Compare several classifiers with 10-fold cross-validation on the training split.
## Estimators with internal randomness get random_state=seed so the scores are
## repeatable from run to run.
## NOTE(review): the features are not scaled here although StandardScaler is
## imported; distance/margin based models (KNN, SVC) may be affected — confirm.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('RF', RandomForestClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

## The splitter is identical for every model, so build it once.
## random_state is not passed: without shuffle=True it has no effect on the
## folds, and recent scikit-learn versions raise a ValueError for that combination.
kfold = model_selection.KFold(n_splits=10)

## evaluate each model in turn and report mean (std) accuracy across the folds
results = []
names = []
for name, model in models:
    cv_results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.801827 (0.051350)
LDA: 0.801827 (0.051350)
KNN: 0.645570 (0.047207)
CART: 0.680509 (0.067395)
RF: 0.722425 (0.062932)
NB: 0.792580 (0.050197)
SVM: 0.671096 (0.047891)


## Model Building


In [23]:

## Fit a logistic regression classifier on the training split
## (fit() returns the estimator itself, so it can be bound directly)
L_R = LogisticRegression().fit(x_train, y_train)
L_R

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

## Model Evaluation


In [24]:
## Score the fitted model on the held-out split: accuracy, confusion matrix,
## then the per-class precision/recall/F1 report, printed in that order
y_pred = L_R.predict(x_validation)
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_validation, y_pred))

0.8324324324324325
[[ 22  29]
 [  2 132]]
              precision    recall  f1-score   support

           0       0.92      0.43      0.59        51
           1       0.82      0.99      0.89       134

   micro avg       0.83      0.83      0.83       185
   macro avg       0.87      0.71      0.74       185
weighted avg       0.85      0.83      0.81       185



## Model Serialization

In [25]:
## Persist the trained model to the file 'finalModel'. The context manager
## closes the file handle (flushing the pickle to disk) even if dump() raises.
with open('finalModel', 'wb') as model_file:
    pickle.dump(L_R, model_file)