In [10]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [11]:
df=pd.read_csv('train_ctrUa4K.csv')
df.shape

(614, 13)

# Handling Nan data


In [12]:
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [13]:
df.Gender.unique()

array(['Male', 'Female', nan], dtype=object)

In [14]:
df.Married.unique()

array(['No', 'Yes', nan], dtype=object)

In [15]:
df.Dependents.unique()

array(['0', '1', '2', '3+', nan], dtype=object)

In [16]:
df.Self_Employed.unique()

array(['No', 'Yes', nan], dtype=object)

In [17]:
df.Loan_Amount_Term.describe()

count    600.00000
mean     342.00000
std       65.12041
min       12.00000
25%      360.00000
50%      360.00000
75%      360.00000
max      480.00000
Name: Loan_Amount_Term, dtype: float64

In [18]:
df.Credit_History.unique()

array([ 1.,  0., nan])

In [19]:
df.Gender.fillna("Male",inplace=True)
df.Married.fillna("Yes",inplace=True)
df.Dependents.interpolate(method='pad',inplace=True)
df.Self_Employed.fillna("No",inplace=True)
df.LoanAmount.fillna(146.412162,inplace=True)
df.Loan_Amount_Term.fillna(342.00000,inplace=True)

In [12]:
df.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [22]:
df.dropna(axis=0, how='any',inplace=True)

In [23]:
df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [43]:
df.shape

(564, 20)

# Normalizing data

In [25]:
df.ApplicantIncome=(df.ApplicantIncome-df.ApplicantIncome.mean())/(df.ApplicantIncome.max()-df.ApplicantIncome.min())
df.CoapplicantIncome=(df.CoapplicantIncome-df.CoapplicantIncome.mean())/(df.CoapplicantIncome.max()-df.CoapplicantIncome.min())
df.LoanAmount=(df.LoanAmount-df.LoanAmount.mean())/(df.LoanAmount.max()-df.LoanAmount.min())
df.Loan_Amount_Term=(df.Loan_Amount_Term-df.Loan_Amount_Term.mean())/(df.Loan_Amount_Term.max()-df.Loan_Amount_Term.min())

# Handling Categorical data

In [26]:

categories3 = pd.Categorical(df['Gender'], categories=['Female', 'Male'], ordered=True)
labels3, unique3 = pd.factorize(categories3, sort=True)
df['Gender-e'] = labels3
categories7 = pd.Categorical(df['Married'], categories=['No', 'Yes'], ordered=True)
labels7, unique7 = pd.factorize(categories7, sort=True)
df['Married-e'] = labels7
categories4 = pd.Categorical(df['Dependents'], categories=['0', '1', '2', '3+'], ordered=True)
labels4, unique4 = pd.factorize(categories4, sort=True)
df['Dependents-e'] = labels4
categories1 = pd.Categorical(df['Education'], categories=['Not Graduate', 'Graduate'], ordered=True)
labels1, unique1 = pd.factorize(categories1, sort=True)
df['Eduacation-e'] = labels1
categories5 = pd.Categorical(df['Self_Employed'], categories=['Yes','No'], ordered=True)
labels5, unique5 = pd.factorize(categories5, sort=True)
df['Self_Employed-e'] = labels5
categories2 = pd.Categorical(df['Property_Area'], categories=['Rural', 'Semiurban','Urban'], ordered=True)
labels2, unique2 = pd.factorize(categories2, sort=True)
df['Property_Area-e'] = labels2
categories6 = pd.Categorical(df['Loan_Status'], categories=['N','Y'], ordered=True)
labels6, unique6 = pd.factorize(categories6, sort=True)
df['Loan_Status-e'] = labels6

In [38]:
df.shape

(564, 20)

# Converting continuous data to discrete bins

In [29]:
df1=df[['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']]

In [30]:
from sklearn.preprocessing import KBinsDiscretizer
kbins = KBinsDiscretizer(n_bins=[5,5,5,3], encode='ordinal', strategy='uniform').fit(df1)
df2=kbins.transform(df1)
df2.astype('object')
df21=pd.DataFrame(data=df2,columns=('ApplicantIncome-e','CoapplicantIncome-e','LoanAmount-e','Loan_Amount_Term-e'))
df21.isna().any()

ApplicantIncome-e      False
CoapplicantIncome-e    False
LoanAmount-e           False
Loan_Amount_Term-e     False
dtype: bool

# Concate data frame

In [36]:
df11=df[['Gender-e','Married-e','Dependents-e','Eduacation-e','Self_Employed-e','Property_Area-e']]

In [52]:
df11.drop_duplicates()

Unnamed: 0,Gender-e,Married-e,Dependents-e,Eduacation-e,Self_Employed-e,Property_Area-e
0,1,0,0,1,1,2
1,1,1,1,1,1,0
2,1,1,0,1,0,2
3,1,1,0,0,1,2
5,1,1,2,1,0,2
...,...,...,...,...,...,...
534,0,0,0,0,0,2
561,0,1,1,1,0,1
563,1,0,0,0,0,1
592,1,0,3,1,0,1


In [31]:
Y=df[['Loan_Status-e']]
Y.shape

(564, 1)

In [35]:
df21.index

RangeIndex(start=0, stop=564, step=1)

In [37]:
df11.index

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            604, 605, 606, 607, 608, 609, 610, 611, 612, 613],
           dtype='int64', length=564)

In [47]:
X=df[['Gender-e','Married-e','Dependents-e','Eduacation-e','Self_Employed-e','Property_Area-e','ApplicantIncome-e','CoapplicantIncome-e','LoanAmount-e','Loan_Amount_Term-e']]

In [48]:
X.isna().sum()

Gender-e               0
Married-e              0
Dependents-e           0
Eduacation-e           0
Self_Employed-e        0
Property_Area-e        0
ApplicantIncome-e      0
CoapplicantIncome-e    0
LoanAmount-e           0
Loan_Amount_Term-e     0
dtype: int64

In [49]:
X.shape

(517, 10)