Split the data into:

X_train, X_test, y_train, y_test


LabelEncoder from sklearn.preprocessing

train_test_split from sklearn.model_selection


In [95]:
import pandas as pd
import matplotlib as plt 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report


loans = pd.read_csv('../data/loan_approval_dataset.csv')
# First check to see if any NAs
loans.isna().any()

loan_id                      False
 no_of_dependents            False
 education                   False
 self_employed               False
 income_annum                False
 loan_amount                 False
 loan_term                   False
 cibil_score                 False
 residential_assets_value    False
 commercial_assets_value     False
 luxury_assets_value         False
 bank_asset_value            False
 loan_status                 False
dtype: bool

In [74]:
loans.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [75]:
# Convert the Loan Status into 0s and 1s
loans.columns 

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [76]:
loans.columns = loans.columns.str.strip()

In [77]:
loans['loan_status'].value_counts()

loan_status
Approved    2656
Rejected    1613
Name: count, dtype: int64

In [78]:
loans['loan_status'].unique()

array([' Approved', ' Rejected'], dtype=object)

In [79]:
loans['loan_status'] = loans['loan_status'].str.strip()

In [80]:
loans['loan_status'].unique()

array(['Approved', 'Rejected'], dtype=object)

In [81]:
loans['loan_status'] = loans['loan_status'].map({'Approved':1,'Rejected':0})

In [82]:
loans['loan_status'].value_counts()

loan_status
1    2656
0    1613
Name: count, dtype: int64

In [83]:
loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   loan_id                   4269 non-null   int64 
 1   no_of_dependents          4269 non-null   int64 
 2   education                 4269 non-null   object
 3   self_employed             4269 non-null   object
 4   income_annum              4269 non-null   int64 
 5   loan_amount               4269 non-null   int64 
 6   loan_term                 4269 non-null   int64 
 7   cibil_score               4269 non-null   int64 
 8   residential_assets_value  4269 non-null   int64 
 9   commercial_assets_value   4269 non-null   int64 
 10  luxury_assets_value       4269 non-null   int64 
 11  bank_asset_value          4269 non-null   int64 
 12  loan_status               4269 non-null   int64 
dtypes: int64(11), object(2)
memory usage: 433.7+ KB


LabelEncoding now

In [84]:
from sklearn.preprocessing import LabelEncoder
label_cols = ['self_employed','education'] 

encoders = {}

for col in label_cols:
    encoder = LabelEncoder()
    loans[col] = loans[col].astype(str).str.strip()
    loans[col] = le.fit_transform(loans[col])
    encoders[col] = encoder



In [85]:
loans.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,2,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,4,3,0,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,5,1,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


In [90]:
X = loans.drop(['loan_id','loan_status'],axis=1)
y = loans['loan_status']

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [97]:

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(f"--- {name} ---")
    print(classification_report(y_test, preds))


--- Logistic Regression ---
              precision    recall  f1-score   support

           0       0.78      0.59      0.67       390
           1       0.79      0.90      0.84       678

    accuracy                           0.79      1068
   macro avg       0.79      0.75      0.76      1068
weighted avg       0.79      0.79      0.78      1068

--- Random Forest ---
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       390
           1       0.98      0.99      0.98       678

    accuracy                           0.98      1068
   macro avg       0.98      0.98      0.98      1068
weighted avg       0.98      0.98      0.98      1068

