In [1]:
import numpy as np
import pandas as pd

In [2]:
#Read the data
datafile= "C:/Users/malat/Desktop/IView/analyticsvidya/course_datascience/loanprediction_train.csv"
data=pd.read_csv(datafile)

In [3]:
# Data set has 614 rows and 13 columns
print(data.shape)

(614, 13)


In [4]:
#print first few rows
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
# data types of each feature. You will come to know the missing values in each column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB


In [6]:
# the summary of numerical variables
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [7]:
## Let us do some data cleaning
## Impute the Missing Values

In [8]:
print(data['Gender'].value_counts())
print(data['Married'].value_counts())
print(data['Dependents'].value_counts())
print(data['Education'].value_counts())
print(data['Self_Employed'].value_counts())
print(data['Property_Area'].value_counts())

Male      489
Female    112
Name: Gender, dtype: int64
Yes    398
No     213
Name: Married, dtype: int64
0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64
Graduate        480
Not Graduate    134
Name: Education, dtype: int64
No     500
Yes     82
Name: Self_Employed, dtype: int64
Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64


In [9]:
#fill in the missing values with mode 
data['Gender'].fillna('Male',inplace=True)
data['Married'].fillna('Yes',inplace=True)
data['Dependents'].fillna('0',inplace=True)
data['Education'].fillna('Graduate',inplace=True)
data['Self_Employed'].fillna('No',inplace=True)
data['Property_Area'].fillna('Semiurban',inplace=True)
data['Loan_Amount_Term'].fillna(360,inplace=True)

In [10]:
# Imputing the missing value with mean.
data['LoanAmount'].fillna(data['LoanAmount'].mean(), inplace=True)

In [11]:
data['Credit_History'].value_counts()

1.0    475
0.0     89
Name: Credit_History, dtype: int64

In [12]:
pd.crosstab(data["Credit_History"],data["Loan_Status"],margins=True)

Loan_Status,N,Y,All
Credit_History,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,82,7,89
1.0,97,378,475
All,179,385,564


In [None]:
 people with a credit history have higher chances of getting a loan, as 
 most of the people with credit history got a loan as compared to people without credit history.

In [13]:
# Credit history is important. From the observation above, we fill the missing values of Credit History column
# with 1.0 if the row has a Loan_Status of 'Y', else 0.0
conditions = [data['Loan_Status'] == 'Y', data['Loan_Status'] == 'N']
values = [1.0, 0.0]
data['Credit_History'] = np.where(data['Credit_History'].isnull(),
                              np.select(conditions, values),
                              data['Credit_History'])

In [15]:
data.isnull().sum() 

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [None]:
# Great, no missing values
# Now Some more data munging
# The variables loan amount and Applicant Income have outliers and in this problem it is normal to have higher income
#and so applying for higher amount of loan is possible.Take log transformation to have a normal distribution

In [16]:
data['LoanAmount_log'] = np.log(data['LoanAmount'])

In [17]:
# Let us combine the incomes of Applicant and Coapplicant and apply log transformation
data['TotalIncome'] = data['ApplicantIncome'] + data['CoapplicantIncome']
data['TotalIncome_log'] = np.log(data['TotalIncome'])

In [None]:
sklearn requires all inputs to be numeric, we should convert all our categorical variables into numeric 
by encoding the categories. This can be done using the following code:

In [18]:
var_mod=['Gender','Married','Education','Self_Employed','Property_Area','Loan_Status']
for i in var_mod:
    data[i] = data[i].astype('category')

In [19]:
for i in var_mod:
    data[i] =data[i].cat.codes

In [20]:
#Import models from scikit learn module:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold   #For K-fold cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics
from sklearn.metrics import confusion_matrix


#Generic function for making a classification model and accessing performance:
def classification_model(model, data, predictors, outcome):
  #Fit the model:
  model.fit(data[predictors],data[outcome])
  
  #Make predictions on training set:
  predictions = model.predict(data[predictors])
  
  #Print accuracy
  accuracy = metrics.accuracy_score(predictions,data[outcome])
  print("Accuracy : %s" % "{0:.3%}".format(accuracy))
  
  #Perform k-fold cross-validation with 5 folds
  kf = KFold(data.shape[0], n_folds=5)
  error = []
  for train, test in kf:
    # Filter training data
    train_predictors = (data[predictors].iloc[train,:])
    
    # The target we're using to train the algorithm.
    train_target = data[outcome].iloc[train]
    
    # Training the algorithm using the predictors and target.
    model.fit(train_predictors, train_target)
    
    #Record error from each cross-validation run
    error.append(model.score(data[predictors].iloc[test,:], data[outcome].iloc[test]))
 
  print("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(error)))

  #Fit the model again so that it can be refered outside the function:
  model.fit(data[predictors],data[outcome]) 
  cm=confusion_matrix(predictions,data[outcome])
  print(cm) 



In [21]:
outcome_var = 'Loan_Status'
model = LogisticRegression()
predictor_var = ['Credit_History','LoanAmount_log','TotalIncome_log','Gender','Married','Education','Self_Employed','Property_Area']
classification_model(model,data,predictor_var,outcome_var)

Accuracy : 83.062%
Cross-Validation Score : 83.065%
[[ 95   7]
 [ 97 415]]


In [22]:
model = DecisionTreeClassifier()
classification_model(model,data,predictor_var,outcome_var)

Accuracy : 100.000%
Cross-Validation Score : 70.844%
[[192   0]
 [  0 422]]


In [23]:
model = RandomForestClassifier(n_estimators=100)
classification_model(model,data,predictor_var,outcome_var)

Accuracy : 100.000%
Cross-Validation Score : 79.815%
[[192   0]
 [  0 422]]
