In [1]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

%matplotlib inline

In [2]:
loan = pd.read_csv("Train_Data.csv")
loan.set_index("ID", inplace = True)
loan.reset_index(drop=True, inplace = True)
loan.head()

Unnamed: 0,Income_of_Applicant,Income_of_Joint_Applicant,Loan_Amount_Requirement,Loan_Amount_Term,Credit_History,Gender,Is_Married,No_of_Dependents,Level_of_Education,IS_Self_Employed,Area_of_Property,Loan_Status
0,5849,0.0,,360.0,1.0,Male,No,0,Graduate,No,Urban,Y
1,4583,1508.0,128.0,360.0,1.0,Male,Yes,1,Graduate,No,Rural,N
2,3000,0.0,66.0,360.0,1.0,Male,Yes,0,Graduate,Yes,Urban,Y
3,2583,2358.0,120.0,360.0,1.0,Male,Yes,0,Not Graduate,No,Urban,Y
4,6000,0.0,141.0,360.0,1.0,Male,No,0,Graduate,No,Urban,Y


In [3]:
loan.dropna(inplace = True)
loan.isnull().sum()

Income_of_Applicant          0
Income_of_Joint_Applicant    0
Loan_Amount_Requirement      0
Loan_Amount_Term             0
Credit_History               0
Gender                       0
Is_Married                   0
No_of_Dependents             0
Level_of_Education           0
IS_Self_Employed             0
Area_of_Property             0
Loan_Status                  0
dtype: int64

In [4]:
# # Changed
# # This is stratified sampling based on "Credit History".
# split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.3, random_state = 42)
# for train_index_s, test_index_s in split.split(loan, loan["Loan_Status"]):
#     strat_train_set = loan.iloc[train_index_s]
#     strat_test_set = loan.iloc[test_index_s]

In [6]:
# This is stratified sampling based on "Credit History".
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.3, random_state = 42)
for train_index_s, test_index_s in split.split(loan, loan["Credit_History"]):
    strat_train_set = loan.iloc[train_index_s]
    strat_test_set = loan.iloc[test_index_s]

In [6]:
strat_train_set.head()

Unnamed: 0,Income_of_Applicant,Income_of_Joint_Applicant,Loan_Amount_Requirement,Loan_Amount_Term,Credit_History,Gender,Is_Married,No_of_Dependents,Level_of_Education,IS_Self_Employed,Area_of_Property,Loan_Status
391,9504,0.0,275.0,360.0,1.0,Male,Yes,3+,Graduate,No,Rural,Y
476,6700,1750.0,230.0,300.0,1.0,Male,Yes,2,Graduate,No,Semiurban,Y
404,7441,0.0,194.0,360.0,1.0,Female,No,0,Graduate,Yes,Rural,N
282,2301,985.799988,78.0,180.0,1.0,Male,Yes,2,Graduate,No,Urban,Y
350,9083,0.0,228.0,360.0,1.0,Male,Yes,0,Graduate,No,Semiurban,Y


In [7]:
# # seperate the numerical & categorical features
# train_num_col  = list(strat_train_set.select_dtypes(["int64","float"]).columns)
# train_cat_col = list(strat_train_set.select_dtypes("object").columns)


In [8]:
# print(train_num_col)
# print(train_cat_col)

In [9]:
# train_num = strat_train_set[train_num_col]
# train_cat = strat_train_set[train_cat_col]

In [10]:
# strat_train_set["Income_of_Applicant"] = pd.to_numeric(strat_train_set["Income_of_Applicant"], downcast = "float")

In [11]:
# train_num.head(10)

In [12]:
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy="median")

In [13]:
# imputer.fit(train_num)
# print(imputer.statistics_)

In [14]:
# train_num_imp = pd.DataFrame(imputer.transform(train_num), columns = train_num.columns, index = train_num.index)
# train_num_imp.head(5)

In [15]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

In [16]:
# scaler.fit(train_num_imp)
# train_num_tr = scaler.transform(train_num_imp)

In [17]:
# train_num_tr = pd.DataFrame(scaler.transform(train_num_imp), columns = train_num.columns, index = train_num.index)
# train_num_tr.head(5)

In [18]:
# import sys
# sys.exit()

In [19]:
def mm_scaler(dataset):
    mmscaler = MinMaxScaler()
    data_scaled = mmscaler.fit_transform(dataset)
    return data_scaled

In [20]:
# # This is simple random sampling.
# np.random.seed(42)
# shuffles = np.random.permutation(len(loan))

# test_ratio = 0.7
# train_set_size = int(len(loan) * test_ratio)

# train_index = shuffles[:train_set_size]
# test_index = shuffles[train_set_size:]

# strat_train_set = loan.iloc[train_index]
# strat_test_set = loan.iloc[test_index]

In [21]:
strat_train_set.head()

Unnamed: 0,Income_of_Applicant,Income_of_Joint_Applicant,Loan_Amount_Requirement,Loan_Amount_Term,Credit_History,Gender,Is_Married,No_of_Dependents,Level_of_Education,IS_Self_Employed,Area_of_Property,Loan_Status
391,9504,0.0,275.0,360.0,1.0,Male,Yes,3+,Graduate,No,Rural,Y
476,6700,1750.0,230.0,300.0,1.0,Male,Yes,2,Graduate,No,Semiurban,Y
404,7441,0.0,194.0,360.0,1.0,Female,No,0,Graduate,Yes,Rural,N
282,2301,985.799988,78.0,180.0,1.0,Male,Yes,2,Graduate,No,Urban,Y
350,9083,0.0,228.0,360.0,1.0,Male,Yes,0,Graduate,No,Semiurban,Y


In [22]:
train_loan_x = strat_train_set.drop("Loan_Status", axis = 1)
train_loan_y = strat_train_set["Loan_Status"]

In [23]:
test_loan_x = strat_test_set.drop("Loan_Status", axis = 1)
test_loan_y = strat_test_set["Loan_Status"]

In [24]:
# Label encoding & 1-hot encoding
def binarize_1hot(dataset, to_binarize, to_one_hot):
    for i in to_binarize:
        dataset[i] = LabelBinarizer().fit_transform(dataset[i])
    dataset = pd.get_dummies(data = dataset, columns = to_one_hot)
    return dataset

In [25]:
# Min-Max Scaler function
def mm_scaler(dataset):
    mmscaler = MinMaxScaler()
    data_scaled = mmscaler.fit_transform(dataset)
    return data_scaled

In [26]:
# label encoding
def label_encode(dataset):
    label_enc = LabelBinarizer()
    dataset_encoded = label_enc.fit_transform(dataset)
    return dataset_encoded

In [27]:
# label encoding & one-hot encoding train_X data
to_binarize = ["Gender", "Is_Married", "Level_of_Education", "IS_Self_Employed"]
to_one_hot = ["No_of_Dependents", "Area_of_Property"]

train_x_encoded = binarize_1hot(train_loan_x, to_binarize, to_one_hot)
test_x_encoded = binarize_1hot(test_loan_x, to_binarize, to_one_hot)

In [28]:
train_x_encoded_scaled = mm_scaler(train_x_encoded)
test_x_encoded_scaled = mm_scaler(test_x_encoded)

In [29]:
train_y_encoded_scaled = label_encode(train_loan_y)
test_y_encoded_scaled = label_encode(test_loan_y)

In [30]:
# Training a Logistic Regression model
LR_model = LogisticRegression(C=0.5).fit(train_x_encoded_scaled,train_y_encoded_scaled)

  return f(**kwargs)


In [31]:
# Testing on test set
yhat = LR_model.predict(test_x_encoded_scaled)
print("LR Jaccard index: %f" % jaccard_score(test_y_encoded_scaled, yhat, pos_label = 1))
print("LR F1-score: %f" % f1_score(test_y_encoded_scaled, yhat, average='weighted', pos_label = 1))
confusion_matrix(test_y_encoded_scaled, yhat)

LR Jaccard index: 0.801587
LR F1-score: 0.806617


array([[ 18,  22],
       [  3, 101]], dtype=int64)

In [32]:
# Random sampling at 70:30 split.

# LR Jaccard index: 0.784000
# LR F1-score: 0.786952
# array([[19, 26],
#        [ 1, 98]], dtype=int64)

In [33]:
# Stratified sampling using "Credit_History".

# LR Jaccard index: 0.801587
# LR F1-score: 0.806617
# array([[ 18,  22],
#        [  3, 101]], dtype=int64)