In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df=pd.read_csv('train_ctrUa4K.csv')
df.shape

(614, 13)

# Handling Nan data


In [3]:
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [4]:
df.Gender.unique()

array(['Male', 'Female', nan], dtype=object)

In [5]:
df.Married.unique()

array(['No', 'Yes', nan], dtype=object)

In [6]:
df.Dependents.unique()

array(['0', '1', '2', '3+', nan], dtype=object)

In [7]:
df.Self_Employed.unique()

array(['No', 'Yes', nan], dtype=object)

In [8]:
df.LoanAmount.unique()

array([ nan, 128.,  66., 120., 141., 267.,  95., 158., 168., 349.,  70.,
       109., 200., 114.,  17., 125., 100.,  76., 133., 115., 104., 315.,
       116., 112., 151., 191., 122., 110.,  35., 201.,  74., 106., 320.,
       144., 184.,  80.,  47.,  75., 134.,  96.,  88.,  44., 286.,  97.,
       135., 180.,  99., 165., 258., 126., 312., 136., 172.,  81., 187.,
       113., 176., 130., 111., 167., 265.,  50., 210., 175., 131., 188.,
        25., 137., 160., 225., 216.,  94., 139., 152., 118., 185., 154.,
        85., 259., 194.,  93., 370., 182., 650., 102., 290.,  84., 242.,
       129.,  30., 244., 600., 255.,  98., 275., 121.,  63., 700.,  87.,
       101., 495.,  67.,  73., 260., 108.,  58.,  48., 164., 170.,  83.,
        90., 166., 124.,  55.,  59., 127., 214., 240.,  72.,  60., 138.,
        42., 280., 140., 155., 123., 279., 192., 304., 330., 150., 207.,
       436.,  78.,  54.,  89., 143., 105., 132., 480.,  56., 159., 300.,
       376., 117.,  71., 490., 173.,  46., 228., 30

In [9]:
df.Loan_Amount_Term.describe()

count    600.00000
mean     342.00000
std       65.12041
min       12.00000
25%      360.00000
50%      360.00000
75%      360.00000
max      480.00000
Name: Loan_Amount_Term, dtype: float64

In [10]:
df.Credit_History.unique()

array([ 1.,  0., nan])

In [11]:
df.Gender.fillna("Male",inplace=True)
df.Married.fillna("Yes",inplace=True)
df.Dependents.interpolate(method='pad',inplace=True)
df.Self_Employed.fillna("No",inplace=True)
df.LoanAmount.fillna(146.412162,inplace=True)
df.Loan_Amount_Term.fillna(342.00000,inplace=True)

In [12]:
df.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [24]:
df.dropna(axis=0, how='any',inplace=True)

In [25]:
df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [31]:
df.shape

(564, 13)

# Normalizing data

In [32]:
df.ApplicantIncome=(df.ApplicantIncome-df.ApplicantIncome.mean())/(df.ApplicantIncome.max()-df.ApplicantIncome.min())
df.CoapplicantIncome=(df.CoapplicantIncome-df.CoapplicantIncome.mean())/(df.CoapplicantIncome.max()-df.CoapplicantIncome.min())
df.LoanAmount=(df.LoanAmount-df.LoanAmount.mean())/(df.LoanAmount.max()-df.LoanAmount.min())
df.Loan_Amount_Term=(df.Loan_Amount_Term-df.Loan_Amount_Term.mean())/(df.Loan_Amount_Term.max()-df.Loan_Amount_Term.min())

# Handling Categorical data

In [33]:

categories3 = pd.Categorical(df['Gender'], categories=['Female', 'Male'], ordered=True)
labels3, unique3 = pd.factorize(categories3, sort=True)
df['Gender-e'] = labels3
categories7 = pd.Categorical(df['Married'], categories=['No', 'Yes'], ordered=True)
labels7, unique7 = pd.factorize(categories7, sort=True)
df['Married-e'] = labels7
categories4 = pd.Categorical(df['Dependents'], categories=['0', '1', '2', '3+'], ordered=True)
labels4, unique4 = pd.factorize(categories4, sort=True)
df['Dependents-e'] = labels4
categories1 = pd.Categorical(df['Education'], categories=['Not Graduate', 'Graduate'], ordered=True)
labels1, unique1 = pd.factorize(categories1, sort=True)
df['Eduacation-e'] = labels1
categories5 = pd.Categorical(df['Self_Employed'], categories=['Yes','No'], ordered=True)
labels5, unique5 = pd.factorize(categories5, sort=True)
df['Self_Employed-e'] = labels5
categories2 = pd.Categorical(df['Property_Area'], categories=['Rural', 'Semiurban','Urban'], ordered=True)
labels2, unique2 = pd.factorize(categories2, sort=True)
df['Property_Area-e'] = labels2
categories6 = pd.Categorical(df['Loan_Status'], categories=['N','Y'], ordered=True)
labels6, unique6 = pd.factorize(categories6, sort=True)
df['Loan_Status-e'] = labels6

In [35]:
df.isna().any()

Loan_ID              False
Gender               False
Married              False
Dependents           False
Education            False
Self_Employed        False
ApplicantIncome      False
CoapplicantIncome    False
LoanAmount           False
Loan_Amount_Term     False
Credit_History       False
Property_Area        False
Loan_Status          False
Gender-e             False
Married-e            False
Dependents-e         False
Eduacation-e         False
Self_Employed-e      False
Property_Area-e      False
Loan_Status-e        False
dtype: bool

# Converting continuous data to discrete bins

In [36]:
df1=df[['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']]

In [38]:
from sklearn.preprocessing import KBinsDiscretizer
kbins = KBinsDiscretizer(n_bins=[5,5,5,3], encode='ordinal', strategy='uniform').fit(df1)
df2=kbins.transform(df1)
df2.astype('object')
df21=pd.DataFrame(data=df2,columns=('ApplicantIncome-e','CoapplicantIncome-e','LoanAmount-e','Loan_Amount_Term-e'))
df21.isna().any()

ApplicantIncome-e      False
CoapplicantIncome-e    False
LoanAmount-e           False
Loan_Amount_Term-e     False
dtype: bool

# Concate data frame

In [None]:
df11=df[['Gender-e','Married-e','Dependents-e','Eduacation-e','Self_Employed-e','Property_Area-e']]

In [39]:
Y=df[['Loan_Status-e']]
Y.shape

(564, 1)

In [44]:
df['ApplicantIncome-e']=df21['ApplicantIncome-e']
df['CoapplicantIncome-e']=df21['CoapplicantIncome-e']
df['LoanAmount-e']=df21['LoanAmount-e']
df['Loan_Amount_Term-e']=df21['Loan_Amount_Term-e']
df.dropna()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,...,Married-e,Dependents-e,Eduacation-e,Self_Employed-e,Property_Area-e,Loan_Status-e,ApplicantIncome-e,CoapplicantIncome-e,LoanAmount-e,Loan_Amount_Term-e
0,LP001002,Male,No,0,Graduate,No,0.004732,-0.045230,0.001844,0.040205,...,0,0,1,1,2,1,0.0,0.0,0.0,2.0
1,LP001003,Male,Yes,1,Graduate,No,-0.010927,-0.000663,-0.024801,0.040205,...,1,1,1,1,0,0,0.0,0.0,0.0,2.0
2,LP001005,Male,Yes,0,Graduate,Yes,-0.030506,-0.045230,-0.114526,0.040205,...,1,0,1,0,2,1,0.0,0.0,0.0,2.0
3,LP001006,Male,Yes,0,Not Graduate,No,-0.035664,0.024457,-0.036379,0.040205,...,1,0,0,1,2,1,0.0,0.0,0.0,2.0
4,LP001008,Male,No,0,Graduate,No,0.006599,-0.045230,-0.005988,0.040205,...,0,0,1,1,2,1,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,LP002804,Female,Yes,0,Graduate,No,-0.015911,0.022920,0.053346,0.040205,...,1,0,1,1,1,1,0.0,0.0,0.0,2.0
560,LP002807,Male,Yes,2,Not Graduate,No,-0.022158,-0.038078,-0.053745,0.040205,...,1,2,0,1,1,1,0.0,0.0,0.0,0.0
561,LP002813,Female,Yes,1,Graduate,Yes,0.173377,-0.045230,0.658267,0.040205,...,1,1,1,0,1,1,0.0,0.0,1.0,2.0
562,LP002820,Male,Yes,0,Graduate,No,0.005647,0.015473,0.095314,0.040205,...,1,0,1,1,0,1,0.0,0.0,1.0,2.0


In [46]:
df.dropna(axis=0, how='any',inplace=True)

In [47]:
X=df[['Gender-e','Married-e','Dependents-e','Eduacation-e','Self_Employed-e','Property_Area-e','ApplicantIncome-e','CoapplicantIncome-e','LoanAmount-e','Loan_Amount_Term-e']]

In [48]:
X.isna().sum()

Gender-e               0
Married-e              0
Dependents-e           0
Eduacation-e           0
Self_Employed-e        0
Property_Area-e        0
ApplicantIncome-e      0
CoapplicantIncome-e    0
LoanAmount-e           0
Loan_Amount_Term-e     0
dtype: int64

In [49]:
X.shape

(517, 10)