## Loan Prediction

### Objectives:
* Predict loan approval status using Logistic Regression
* Show how the Newton–Raphson method or gradient descent can be used to optimize the model's parameters
* Plot ROC curves for different splits of the training set

In [45]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score, StratifiedKFold

%matplotlib inline

In [42]:
# Seed NumPy's global random number generator so repeated runs are reproducible
np.random.seed(seed=144)

In [28]:
# Load the training set, the test set and the sample submission file
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train_u6lujuX.csv',
        './data/test_Y3wMUE5.csv',
        './data/Sample_Submission_ZAuTl8O.csv',
    )
)

In [29]:
# Preview the first five training rows
train.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0,,360,1,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508,128.0,360,1,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0,66.0,360,1,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358,120.0,360,1,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0,141.0,360,1,Urban,Y


In [30]:
# Preview the first five test rows
test.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110,360,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126,360,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208,360,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100,360,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78,360,1.0,Urban


In [31]:
# Sanity check: count the loan ids that appear in both the training and test sets
len(set(train.Loan_ID).intersection(test.Loan_ID))

0

In [32]:
# Index both frames by their loan id
train, test = (frame.set_index('Loan_ID') for frame in (train, test))

## One Hot Encoding

In [33]:
# Separate the label column from the predictor columns
features = train.columns.drop('Loan_Status')
target = train.Loan_Status

# Turn every row into a {column: value} dict, the input format DictVectorizer
# consumes. orient='records' emits one dict per row in frame order, so the
# encoded training rows stay aligned with `target`. The previous
# `.T.to_dict().values()` keyed rows by Loan_ID, which silently collapses
# duplicate ids and, on Python versions without ordered dicts, returns the
# rows in arbitrary order.
train = train[features].to_dict(orient='records')
test = test.to_dict(orient='records')

In [34]:
# Encode the row dicts as a dense numeric matrix: string-valued fields are
# one-hot encoded, numeric fields are passed through as-is.
transformer = DictVectorizer(sparse=False)

# Learn the feature vocabulary from the training rows only.
train = transformer.fit_transform(train)

# Re-use the fitted vocabulary for the test rows. Calling fit_transform here
# would rebuild the vocabulary from the test set, so the test columns could
# differ in number and order from the columns the model is trained on.
test = transformer.transform(test)

In [36]:
# Wrap the encoded matrices in DataFrames and the labels in a Series
X, test = pd.DataFrame(data=train), pd.DataFrame(data=test)

y = pd.Series(data=target)

In [38]:
# Replace missing values with the sentinel -1 in both feature matrices
X, test = X.fillna(value=-1), test.fillna(value=-1)

In [40]:
# Encode the label as 1 for 'Y' and 0 otherwise.
# Built from `target` rather than from `y` itself so that re-running this cell
# gives the same result (re-encoding an already-encoded `y` would yield all 0).
# The builtin `int` replaces `np.int`, an alias that newer NumPy releases removed.
y = (target == 'Y').astype(int)

## Cross validation scores

In [47]:
# Five stratified folds built from the binary labels.
# NOTE(review): random_state presumably only takes effect when shuffle=True,
# which is not set here — confirm against the installed scikit-learn version.
# NOTE(review): this is the legacy sklearn.cross_validation signature; moving
# to sklearn.model_selection would also change how the folds are iterated.
skf = StratifiedKFold(
    y.values,
    n_folds=5,
    random_state=44,
)

# Six candidate values, each a factor of ten apart
# (presumably LogisticRegression's C parameter — verify where it is used)
C_grid = [
    0.001, 0.01, 0.1,
    1.0, 10.0, 100.0,
]