In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [44]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

train.info()

#filling blank values: typically I would find a way to fill them in, but I suspect that leaving things blank might
#actually play a role in approval. Let's take a look and perhaps assign specific values to blanks

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB


In [45]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
Loan_ID              367 non-null object
Gender               356 non-null object
Married              367 non-null object
Dependents           357 non-null object
Education            367 non-null object
Self_Employed        344 non-null object
ApplicantIncome      367 non-null int64
CoapplicantIncome    367 non-null int64
LoanAmount           362 non-null float64
Loan_Amount_Term     361 non-null float64
Credit_History       338 non-null float64
Property_Area        367 non-null object
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [46]:
data=pd.concat([train,test])
data[:100]

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents,Education,Gender,LoanAmount,Loan_Amount_Term,Loan_ID,Loan_Status,Married,Property_Area,Self_Employed
0,5849,0.0,1.0,0,Graduate,Male,,360.0,LP001002,Y,No,Urban,No
1,4583,1508.0,1.0,1,Graduate,Male,128.0,360.0,LP001003,N,Yes,Rural,No
2,3000,0.0,1.0,0,Graduate,Male,66.0,360.0,LP001005,Y,Yes,Urban,Yes
3,2583,2358.0,1.0,0,Not Graduate,Male,120.0,360.0,LP001006,Y,Yes,Urban,No
4,6000,0.0,1.0,0,Graduate,Male,141.0,360.0,LP001008,Y,No,Urban,No
5,5417,4196.0,1.0,2,Graduate,Male,267.0,360.0,LP001011,Y,Yes,Urban,Yes
6,2333,1516.0,1.0,0,Not Graduate,Male,95.0,360.0,LP001013,Y,Yes,Urban,No
7,3036,2504.0,0.0,3+,Graduate,Male,158.0,360.0,LP001014,N,Yes,Semiurban,No
8,4006,1526.0,1.0,2,Graduate,Male,168.0,360.0,LP001018,Y,Yes,Urban,No
9,12841,10968.0,1.0,1,Graduate,Male,349.0,360.0,LP001020,N,Yes,Semiurban,No


In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 981 entries, 0 to 366
Data columns (total 13 columns):
ApplicantIncome      981 non-null int64
CoapplicantIncome    981 non-null float64
Credit_History       902 non-null float64
Dependents           956 non-null object
Education            981 non-null object
Gender               957 non-null object
LoanAmount           954 non-null float64
Loan_Amount_Term     961 non-null float64
Loan_ID              981 non-null object
Loan_Status          614 non-null object
Married              978 non-null object
Property_Area        981 non-null object
Self_Employed        926 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 107.3+ KB


In [48]:
data.Self_Employed.value_counts()

No     807
Yes    119
Name: Self_Employed, dtype: int64

In [49]:
data.Property_Area.value_counts()

Semiurban    349
Urban        342
Rural        290
Name: Property_Area, dtype: int64

In [50]:
data.Gender.value_counts()

Male      775
Female    182
Name: Gender, dtype: int64

In [51]:
data.Dependents.value_counts()

0     545
2     160
1     160
3+     91
Name: Dependents, dtype: int64

In [52]:
data.Married.value_counts()

Yes    631
No     347
Name: Married, dtype: int64

In [53]:
data.Education.value_counts()

Graduate        763
Not Graduate    218
Name: Education, dtype: int64

In [54]:
gender={'Male':1,'Female':0,np.nan:-1}
status={'Y':int(1),'N':int(0)}
yn={'Yes':1,'No':0}
Dep={'0':int(0),'1':int(1),'2':int(2),'3+':int(3)}
Binary={True:1,False:0}
Ed={'Graduate':1,'Not Graduate':0}
Prop={'Semiurban':1,'Urban':2,'Rural':3}

In [55]:
#let's convert the train set to binary for some quick info
trainB=pd.DataFrame()
trainB['Gender']=train['Gender'].map(gender)
trainB['Loan_Status']=train['Loan_Status'].map(status)
trainB['Credit_History']=train['Credit_History'].fillna(-1)
trainB['Dependents']=train['Dependents'].map(Dep).fillna(-1)
trainB['containsNaN']=train.isnull().any(axis=1)
trainB['NoLoanAmount']=train['LoanAmount'].isnull().map(Binary)
trainB['LoanAmount']=train['LoanAmount'].fillna(train.LoanAmount.max())
trainB['Self_Employed']=train['Self_Employed'].fillna('No').map(yn)
trainB['Education']=train['Education'].map(Ed)
trainB['PropertyArea']=train['Property_Area'].map(Prop)
trainB['NoLoanTerm']=train['Loan_Amount_Term'].isnull().map(Binary)


#estimate Marriage: If dependents blank or less than 1, assume not married
bins=[-2,0,4]
MarrEst=pd.cut(list(trainB.Dependents),bins)
trainB['MarrEst']=list(MarrEst.codes)
trainB['Married']=train['Married'].map(yn).fillna(trainB.MarrEst)

trainB.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Gender            614 non-null int64
Loan_Status       614 non-null int64
Credit_History    614 non-null float64
Dependents        614 non-null float64
containsNaN       614 non-null bool
NoLoanAmount      614 non-null int64
LoanAmount        614 non-null float64
Self_Employed     614 non-null int64
Education         614 non-null int64
PropertyArea      614 non-null int64
NoLoanTerm        614 non-null int64
MarrEst           614 non-null int64
Married           614 non-null float64
dtypes: bool(1), float64(4), int64(8)
memory usage: 58.2 KB


In [56]:
#let's convert the train set to binary for some quick info
testB=pd.DataFrame()
testB['Gender']=test['Gender'].map(gender)
testB['Credit_History']=test['Credit_History'].fillna(-1)
testB['Dependents']=test['Dependents'].map(Dep).fillna(-1)
testB['containsNaN']=test.isnull().any(axis=1)
testB['NoLoanAmount']=test['LoanAmount'].isnull().map(Binary)
testB['LoanAmount']=test['LoanAmount'].fillna(test.LoanAmount.max())
testB['Self_Employed']=test['Self_Employed'].fillna('No').map(yn)
testB['Education']=test['Education'].map(Ed)
testB['PropertyArea']=test['Property_Area'].map(Prop)
testB['NoLoanTerm']=test['Loan_Amount_Term'].isnull().map(Binary)


#estimate Marriage: If dependents blank or less than 1, assume not married
bins=[-2,0,4]
MarrEstT=pd.cut(list(testB.Dependents),bins)
testB['MarrEst']=list(MarrEstT.codes)
testB['Married']=test['Married'].map(yn).fillna(testB.MarrEst)

testB.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
Gender            367 non-null int64
Credit_History    367 non-null float64
Dependents        367 non-null float64
containsNaN       367 non-null bool
NoLoanAmount      367 non-null int64
LoanAmount        367 non-null float64
Self_Employed     367 non-null int64
Education         367 non-null int64
PropertyArea      367 non-null int64
NoLoanTerm        367 non-null int64
MarrEst           367 non-null int64
Married           367 non-null int64
dtypes: bool(1), float64(3), int64(8)
memory usage: 32.0 KB


In [57]:
trainB.Married.value_counts()

1.0    398
0.0    216
Name: Married, dtype: int64

In [58]:
trainB[['NoLoanTerm','Loan_Status']].groupby(['NoLoanTerm']).mean()

Unnamed: 0_level_0,Loan_Status
NoLoanTerm,Unnamed: 1_level_1
0,0.69
1,0.571429


In [59]:
trainB[['Dependents','Married']].groupby(['Dependents']).mean()

Unnamed: 0_level_0,Married
Dependents,Unnamed: 1_level_1
-1.0,0.533333
0.0,0.504348
1.0,0.77451
2.0,0.920792
3.0,0.862745


In [60]:
trainB[['containsNaN','Loan_Status']].groupby(['containsNaN']).mean()

Unnamed: 0_level_0,Loan_Status
containsNaN,Unnamed: 1_level_1
False,0.691667
True,0.671642


In [61]:
trainB[['NoLoanAmount','Loan_Status']].groupby(['NoLoanAmount']).mean()

Unnamed: 0_level_0,Loan_Status
NoLoanAmount,Unnamed: 1_level_1
0,0.694257
1,0.5


In [62]:
trainB[['Married','Loan_Status']].groupby(['Married']).mean()

Unnamed: 0_level_0,Loan_Status
Married,Unnamed: 1_level_1
0.0,0.634259
1.0,0.71608


In [63]:
trainB[['Dependents','Loan_Status']].groupby(['Dependents']).mean() #-1 represents no dependents reported

Unnamed: 0_level_0,Loan_Status
Dependents,Unnamed: 1_level_1
-1.0,0.6
0.0,0.689855
1.0,0.647059
2.0,0.752475
3.0,0.647059


In [64]:
trainB[['Gender','Loan_Status']].groupby(['Gender']).mean() #-1 represents no gender reported

Unnamed: 0_level_0,Loan_Status
Gender,Unnamed: 1_level_1
-1,0.615385
0,0.669643
1,0.693252


In [65]:
trainB[['Credit_History','Loan_Status']].groupby(['Credit_History']).mean() #-1 represents balnk credit history

Unnamed: 0_level_0,Loan_Status
Credit_History,Unnamed: 1_level_1
-1.0,0.74
0.0,0.078652
1.0,0.795789


In [66]:
#let's OneHot the categorical data in trainC for formal analysis
trainC=pd.DataFrame()
trainC[['No_gender','Female','Male']]=pd.get_dummies(train.Gender.fillna(-1))
#trainC[['Blank_Credit','No_Credit','Credit_History']]=pd.get_dummies(train.Credit_History.fillna(-1))
trainC[['blank','NoCredit','Credit']]=pd.get_dummies(trainB.Credit_History)
trainC['Dependents']=trainB.Dependents
trainC['LoanAmount']=trainB.LoanAmount
trainC['NoLoanAmount']=trainB.NoLoanAmount
trainC['Married']=trainB['Married']
trainC['ApplicantIncome']=train['ApplicantIncome']
trainC['CoapplicantIncome']=train['CoapplicantIncome']
trainC[['Semiurban','Urban','Rural']]=pd.get_dummies(trainB.PropertyArea)
trainC['Education']=trainB['Education']
trainC['NoLoanTerm']=trainB['NoLoanTerm']
trainC['LoanTerm']=train['Loan_Amount_Term'].fillna(train.Loan_Amount_Term.max())
trainC['IncomeSum']=train['ApplicantIncome']+train['CoapplicantIncome']

trainC['Loan_Status']=train['Loan_Status']

trainC.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 20 columns):
No_gender            614 non-null uint8
Female               614 non-null uint8
Male                 614 non-null uint8
blank                614 non-null uint8
NoCredit             614 non-null uint8
Credit               614 non-null uint8
Dependents           614 non-null float64
LoanAmount           614 non-null float64
NoLoanAmount         614 non-null int64
Married              614 non-null float64
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
Semiurban            614 non-null uint8
Urban                614 non-null uint8
Rural                614 non-null uint8
Education            614 non-null int64
NoLoanTerm           614 non-null int64
LoanTerm             614 non-null float64
IncomeSum            614 non-null float64
Loan_Status          614 non-null object
dtypes: float64(6), int64(4), object(1), uint8(9)
memory usage: 58.2+ KB


In [67]:
#let's OneHot the categorical data in trainC for formal analysis
testC=pd.DataFrame()
testC[['No_gender','Female','Male']]=pd.get_dummies(test.Gender.fillna(-1))
testC[['blank','NoCredit','Credit']]=pd.get_dummies(testB.Credit_History)
testC['Dependents']=testB.Dependents
testC['LoanAmount']=testB.LoanAmount
testC['NoLoanAmount']=testB.NoLoanAmount
testC['Married']=testB['Married']
testC['ApplicantIncome']=test['ApplicantIncome']
testC['CoapplicantIncome']=test['CoapplicantIncome']
testC[['Semiurban','Urban','Rural']]=pd.get_dummies(testB.PropertyArea)
testC['Education']=testB['Education']
testC['NoLoanTerm']=testB['NoLoanTerm']
testC['LoanTerm']=test['Loan_Amount_Term'].fillna(test.Loan_Amount_Term.max())
testC['IncomeSum']=test['ApplicantIncome']+test['CoapplicantIncome']

testC.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 19 columns):
No_gender            367 non-null uint8
Female               367 non-null uint8
Male                 367 non-null uint8
blank                367 non-null uint8
NoCredit             367 non-null uint8
Credit               367 non-null uint8
Dependents           367 non-null float64
LoanAmount           367 non-null float64
NoLoanAmount         367 non-null int64
Married              367 non-null int64
ApplicantIncome      367 non-null int64
CoapplicantIncome    367 non-null int64
Semiurban            367 non-null uint8
Urban                367 non-null uint8
Rural                367 non-null uint8
Education            367 non-null int64
NoLoanTerm           367 non-null int64
LoanTerm             367 non-null float64
IncomeSum            367 non-null int64
dtypes: float64(3), int64(7), uint8(9)
memory usage: 32.0 KB


In [68]:
trainC

Unnamed: 0,No_gender,Female,Male,blank,NoCredit,Credit,Dependents,LoanAmount,NoLoanAmount,Married,ApplicantIncome,CoapplicantIncome,Semiurban,Urban,Rural,Education,NoLoanTerm,LoanTerm,IncomeSum,Loan_Status
0,0,0,1,0,0,1,0.0,700.0,1,0.0,5849,0.0,0,1,0,1,0,360.0,5849.0,Y
1,0,0,1,0,0,1,1.0,128.0,0,1.0,4583,1508.0,0,0,1,1,0,360.0,6091.0,N
2,0,0,1,0,0,1,0.0,66.0,0,1.0,3000,0.0,0,1,0,1,0,360.0,3000.0,Y
3,0,0,1,0,0,1,0.0,120.0,0,1.0,2583,2358.0,0,1,0,0,0,360.0,4941.0,Y
4,0,0,1,0,0,1,0.0,141.0,0,0.0,6000,0.0,0,1,0,1,0,360.0,6000.0,Y
5,0,0,1,0,0,1,2.0,267.0,0,1.0,5417,4196.0,0,1,0,1,0,360.0,9613.0,Y
6,0,0,1,0,0,1,0.0,95.0,0,1.0,2333,1516.0,0,1,0,0,0,360.0,3849.0,Y
7,0,0,1,0,1,0,3.0,158.0,0,1.0,3036,2504.0,1,0,0,1,0,360.0,5540.0,N
8,0,0,1,0,0,1,2.0,168.0,0,1.0,4006,1526.0,0,1,0,1,0,360.0,5532.0,Y
9,0,0,1,0,0,1,1.0,349.0,0,1.0,12841,10968.0,1,0,0,1,0,360.0,23809.0,N


In [69]:
X_train,X_test,y_train,y_test=train_test_split(trainC.drop(['Loan_Status'],axis=1),trainC['Loan_Status'])
X=trainC.drop(['Loan_Status'],axis=1)
y=trainC['Loan_Status']

In [70]:
lr=LogisticRegression()
lr.fit(X_train,y_train)
print('train score: {}'.format(lr.score(X_train,y_train)))
print('test score: {}'.format(lr.score(X_test,y_test)))

train score: 0.8152173913043478
test score: 0.7922077922077922


In [71]:
svm=LinearSVC()
svm.fit(X_train,y_train)
print('train score: {}'.format(svm.score(X_train,y_train)))
print('test score: {}'.format(svm.score(X_test,y_test)))

train score: 0.6760869565217391
test score: 0.7207792207792207


In [72]:
scaler=MinMaxScaler()
Xs_train=scaler.fit_transform(X_train)
Xs_test=scaler.transform(X_test)

poly=PolynomialFeatures(degree=2).fit(Xs_train)
Xs_train_poly=poly.transform(Xs_train)
Xs_test_poly=poly.transform(Xs_test)

svm=SVC()
svm.fit(Xs_train_poly,y_train)
print('train score: {}'.format(svm.score(Xs_train_poly,y_train)))
print('test score: {}'.format(svm.score(Xs_test_poly,y_test)))

train score: 0.8173913043478261
test score: 0.7857142857142857


In [73]:
gbc=GradientBoostingClassifier()
gbc.fit(X_train,y_train)
print('train score: {}'.format(gbc.score(X_train,y_train)))
print('test score: {}'.format(gbc.score(X_test,y_test)))

train score: 0.9108695652173913
test score: 0.7662337662337663


In [74]:
param_grid={'max_features':[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17],'min_samples_split':[27,28,29,30,31,32,33]}
grid_search=GridSearchCV(GradientBoostingClassifier(),param_grid,cv=5)
grid_search.fit(X,y)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'max_features': 1, 'min_samples_split': 32}
0.799674267101


In [75]:
param_grid={'n_neighbors':[1,5,10,15,20,25,30,35,40,45,50,55,60]}
grid_search=GridSearchCV(KNeighborsClassifier(),param_grid,cv=5)
grid_search.fit(X,y)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'n_neighbors': 25}
0.690553745928


In [76]:
scaler=MinMaxScaler()
Xs_train=scaler.fit_transform(X_train)
Xs_test=scaler.transform(X_test)

poly=PolynomialFeatures(degree=2).fit(Xs_train)
Xs_train_poly=poly.transform(Xs_train)
Xs_test_poly=poly.transform(Xs_test)

In [77]:
gbc=GradientBoostingClassifier()
gbc.fit(Xs_train_poly,y_train)
print('train score: {}'.format(gbc.score(Xs_train_poly,y_train)))
print('test score: {}'.format(gbc.score(Xs_test_poly,y_test)))

train score: 0.9391304347826087
test score: 0.7532467532467533


In [78]:
svm=LinearSVC()
svm.fit(Xs_train,y_train)
print('train score: {}'.format(svm.score(Xs_train,y_train)))
print('test score: {}'.format(svm.score(Xs_test,y_test)))

train score: 0.8195652173913044
test score: 0.7857142857142857


In [79]:
gbc=GradientBoostingClassifier()
gbc.fit(X_train,y_train)
print('train score: {}'.format(gbc.score(X_train,y_train)))
print('test score: {}'.format(gbc.score(X_test,y_test)))

train score: 0.9108695652173913
test score: 0.7662337662337663


In [80]:
gbc=GradientBoostingClassifier(max_features=6,min_samples_split=27)
gbc.fit(X,y)
pred=gbc.predict(testC)
#pred=pred.map({1:'Y',0:'N'})
solution=pd.DataFrame(data={'Loan_ID':list(test.Loan_ID),'Loan_Status':pred})
solution.to_csv('solutions.csv',index=False)

In [81]:
scaler=MinMaxScaler()
Xs_train=scaler.fit_transform(X)
Xs_test=scaler.transform(testC)

poly=PolynomialFeatures(degree=2).fit(Xs_train)
Xs_train_poly=poly.transform(Xs_train)
Xs_test_poly=poly.transform(Xs_test)

param_grid={'C':[0.001,0.01,0.1,1,10,100]}
grid_search=GridSearchCV(SVC(),param_grid,cv=5)
grid_search.fit(Xs_train_poly,y)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'C': 1}
0.809446254072


In [82]:
svm=SVC()
svm.fit(Xs_train_poly,y)
pred=svm.predict(Xs_test_poly)
#pred=pred.map({1:'Y',0:'N'})
solution=pd.DataFrame(data={'Loan_ID':list(test.Loan_ID),'Loan_Status':pred})
solution.to_csv('solutions.csv',index=False)