In [1]:
# load important libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Load the loan-approval records from the CSV file into a DataFrame and preview them
csv_path = 'loan_approval_dataset.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [3]:
# (number of rows, number of columns) of the loaded frame
df.shape

(4269, 13)

In [4]:
# Inspect the column labels exactly as they were read from the CSV header
df.columns

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [5]:
# Normalise the column labels: lower-case each one and drop every space character
# (most raw labels carry a leading space, as the previous output shows)

df.columns = [label.lower().replace(' ', '') for label in df.columns]

In [6]:
# Confirm the cleaned column labels
df.columns

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')

In [7]:
# Check for missing values: count the null entries in every column

df.isnull().sum()

loan_id                     0
no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64

In [8]:
# Check the data type pandas inferred for each column; the object columns in the
# output below are the text fields, the rest are integers
df.dtypes

loan_id                      int64
no_of_dependents             int64
education                   object
self_employed               object
income_annum                 int64
loan_amount                  int64
loan_term                    int64
cibil_score                  int64
residential_assets_value     int64
commercial_assets_value      int64
luxury_assets_value          int64
bank_asset_value             int64
loan_status                 object
dtype: object

In [9]:
# Peek at the last rows: in the rows shown, loan_id is just the row index + 1,
# so it is redundant and can be deleted
df.tail()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
4264,4265,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,Rejected
4265,4266,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved
4266,4267,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,Rejected
4267,4268,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved
4268,4269,1,Graduate,No,9200000,29700000,10,607,17800000,11800000,35700000,12000000,Approved


In [10]:
# loan_id is a redundant column, so we can delete it.
# The membership check keeps this cell safe to re-run: a bare `del` raises
# KeyError once the column has already been removed.

if 'loan_id' in df.columns:
    del df['loan_id']


In [11]:
# Confirm that loan_id is no longer among the columns
df.tail()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
4264,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,Rejected
4265,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved
4266,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,Rejected
4267,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved
4268,1,Graduate,No,9200000,29700000,10,607,17800000,11800000,35700000,12000000,Approved


In [12]:
# Get the names of the categorical features, i.e. every column that is not numeric.
# Testing for "not numeric" rather than `dtype == 'object'` also catches text
# columns that pandas stores with its dedicated string dtype; with the object
# comparison such columns would be missed and this list would come back empty.

categorical = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]
categorical

['education', 'self_employed', 'loan_status']

In [13]:
# Get the names of the numerical features.
# A positive "is numeric" test is used instead of `dtype != 'object'`, so text
# columns held in a non-object string dtype cannot slip into this list.
numerical = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
numerical

['no_of_dependents',
 'income_annum',
 'loan_amount',
 'loan_term',
 'cibil_score',
 'residential_assets_value',
 'commercial_assets_value',
 'luxury_assets_value',
 'bank_asset_value']

In [14]:
# List the distinct labels found in each categorical column

for col in categorical:
    labels = df[col].value_counts().index
    print(labels)

Index([' Graduate', ' Not Graduate'], dtype='object', name='education')
Index([' Yes', ' No'], dtype='object', name='self_employed')
Index([' Approved', ' Rejected'], dtype='object', name='loan_status')


In [15]:
# First join the two words of ' Not Graduate' with an underscore, so that the
# blanket space removal applied afterwards does not glue them into one word.
# regex=False makes this a literal substring replacement instead of relying on
# the `regex` default, which differs between pandas versions.
df['education'] = df['education'].str.replace(' Not ', ' Not_', regex=False)
df['education'].head()

0         Graduate
1     Not_Graduate
2         Graduate
3         Graduate
4     Not_Graduate
Name: education, dtype: object

In [16]:
# Finish cleaning the categorical values: lower-case them and remove the
# remaining spaces. regex=False makes the replacement a literal match instead of
# relying on the `regex` default, which differs between pandas versions.
for col in categorical:
    df[col] = df[col].str.lower().str.replace(' ', '', regex=False)

# Show the cleaned labels of every categorical column
for col in categorical:
    print(df[col].value_counts().index)

Index(['graduate', 'not_graduate'], dtype='object', name='education')
Index(['yes', 'no'], dtype='object', name='self_employed')
Index(['approved', 'rejected'], dtype='object', name='loan_status')


In [17]:
# Count the distinct values in each categorical column
# (the list still includes the loan_status target at this point)

df[categorical].nunique()

education        2
self_employed    2
loan_status      2
dtype: int64

In [18]:
# Summary statistics of the numerical columns (i.e. features).
# NOTE(review): the output below shows a negative minimum for
# residential_assets_value (-100000) -- worth checking whether that is valid data.

df.describe()

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


In [19]:
# Convert the target variable to numerical: 1 for 'approved', 0 for anything else.
# Only encode while the column still holds text labels. Re-running the comparison
# on the already-encoded integers would silently turn every row into 0.

if not pd.api.types.is_numeric_dtype(df['loan_status']):
    df['loan_status'] = (df['loan_status'] == 'approved').astype(int)
df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,graduate,no,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,0,not_graduate,yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,graduate,no,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,3,graduate,no,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,not_graduate,yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


In [29]:
# Update the categorical list to exclude the target variable.
# Filtering by name (instead of slicing off the last element) does not depend on
# the target's position and is safe to re-run: a second `[:-1]` would also drop
# 'self_employed'.
categorical = [col for col in categorical if col != 'loan_status']
categorical

['education', 'self_employed']

In [31]:
# Plot the histogram of the features to see their distribution

In [32]:
# TODO: transform any heavily skewed features so their distributions are closer to normal

In [33]:
# Compute feature importance (correlation or mutual information)