In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [65]:
ds = pd.read_csv('../loan_approval_dataset.csv')
ds.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [5]:
ds.shape

(4269, 13)

In [6]:
ds.nunique()

loan_id                      4269
 no_of_dependents               6
 education                      2
 self_employed                  2
 income_annum                  98
 loan_amount                  378
 loan_term                     10
 cibil_score                  601
 residential_assets_value     278
 commercial_assets_value      188
 luxury_assets_value          379
 bank_asset_value             146
 loan_status                    2
dtype: int64

In [8]:
ds.duplicated().sum()

0

In [9]:
ds.columns

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

As you can see the with the exception of `loan_id` this columns have a leading white space

In [66]:
ds.columns = [c.strip() for c in ds.columns]
ds.columns

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')

In [67]:
y = ds['loan_status']
X = ds.drop(columns=['loan_status'])

In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   loan_id                   4269 non-null   int64 
 1   no_of_dependents          4269 non-null   int64 
 2   education                 4269 non-null   object
 3   self_employed             4269 non-null   object
 4   income_annum              4269 non-null   int64 
 5   loan_amount               4269 non-null   int64 
 6   loan_term                 4269 non-null   int64 
 7   cibil_score               4269 non-null   int64 
 8   residential_assets_value  4269 non-null   int64 
 9   commercial_assets_value   4269 non-null   int64 
 10  luxury_assets_value       4269 non-null   int64 
 11  bank_asset_value          4269 non-null   int64 
dtypes: int64(10), object(2)
memory usage: 400.3+ KB


### Apply domain knowledge as part of preprocessing

In [40]:
total_assets = X.iloc[:, 8:11]

In [52]:
X["total_assets"] = total_assets.sum(axis=1)

In [53]:
X.columns

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'total_sasets', 'total_assets'],
      dtype='object')

In [55]:
X.drop(
    columns=[
        "residential_assets_value",
        "commercial_assets_value",
        "luxury_assets_value",
        "bank_asset_value",
        "Total_Assets",
        "total_sasets",
    ],
    inplace=True,
)

In [56]:
X.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets
0,1,2,0,0,9600000,29900000,12,778,42700000
1,2,0,1,1,4100000,12200000,8,417,13700000
2,3,3,0,0,9100000,29700000,20,506,44900000
3,4,3,0,0,8200000,30700000,8,467,44800000
4,5,5,1,1,9800000,24200000,20,382,50000000


### Label Encoder was used to convert text based binary values to integers

In [57]:
le = LabelEncoder()
X["self_employed"] = le.fit_transform(X['self_employed'])
X.head()


Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets
0,1,2,0,0,9600000,29900000,12,778,42700000
1,2,0,1,1,4100000,12200000,8,417,13700000
2,3,3,0,0,9100000,29700000,20,506,44900000
3,4,3,0,0,8200000,30700000,8,467,44800000
4,5,5,1,1,9800000,24200000,20,382,50000000


In [58]:
X["education"] = le.fit_transform(X["education"])
X.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets
0,1,2,0,0,9600000,29900000,12,778,42700000
1,2,0,1,1,4100000,12200000,8,417,13700000
2,3,3,0,0,9100000,29700000,20,506,44900000
3,4,3,0,0,8200000,30700000,8,467,44800000
4,5,5,1,1,9800000,24200000,20,382,50000000


In [68]:
y = y.str.strip()
y.unique()

array(['Approved', 'Rejected'], dtype=object)

### Scale integer values 

In [60]:
# log transformation
log_cols= ["income_annum", "loan_amount", "total_assets"]
X[log_cols] = np.log(X[log_cols])


In [70]:
X.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets
0,1,2,0,0,16.077274,17.213369,12,778,17.569709
1,2,0,1,1,15.226498,16.316947,8,417,16.432906
2,3,3,0,0,16.023785,17.206658,20,506,17.619948
3,4,3,0,0,15.919645,17.239773,8,467,17.617719
4,5,5,1,1,16.097893,17.001863,20,382,17.727534


In [69]:
y = y.map({'Approved': 1, 'Rejected': 0})
y.unique()

array([1, 0], dtype=int64)

In [71]:
X.drop(columns=['loan_id'], inplace=True)

In [72]:
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets
0,2,0,0,16.077274,17.213369,12,778,17.569709
1,0,1,1,15.226498,16.316947,8,417,16.432906
2,3,0,0,16.023785,17.206658,20,506,17.619948
3,3,0,0,15.919645,17.239773,8,467,17.617719
4,5,1,1,16.097893,17.001863,20,382,17.727534


## Split the data

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [74]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()
lg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [75]:
y_pred = lg.predict(X_test)

In [76]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_pred, y_test)
print(acc)

0.9063231850117096
