In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Raw loan-approval data, read from a path relative to the working directory.
dataset_path = "loan_approval_dataset.csv"
df = pd.read_csv(dataset_path)

# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [3]:
# Count missing values per column (isna is the canonical name of isnull).
df.isna().sum()

loan_id                     0
no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64

In [4]:
# loan_id is not used as a feature further down, so remove it here.
# errors='ignore' makes the cell safe to re-run: a second execution would
# otherwise raise KeyError because the column is already gone.
df = df.drop(columns=['loan_id'], errors='ignore')

In [5]:
# Show the column labels that remain in the frame.
print(df.columns)

Index(['no_of_dependents', 'education', 'self_employed', 'income_annum',
       'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'loan_status'],
      dtype='object')


In [6]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   no_of_dependents          4269 non-null   int64 
 1   education                 4269 non-null   object
 2   self_employed             4269 non-null   object
 3   income_annum              4269 non-null   int64 
 4   loan_amount               4269 non-null   int64 
 5   loan_term                 4269 non-null   int64 
 6   cibil_score               4269 non-null   int64 
 7   residential_assets_value  4269 non-null   int64 
 8   commercial_assets_value   4269 non-null   int64 
 9   luxury_assets_value       4269 non-null   int64 
 10  bank_asset_value          4269 non-null   int64 
 11  loan_status               4269 non-null   object
dtypes: int64(9), object(3)
memory usage: 400.3+ KB


In [7]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


In [8]:
# Per-column minimum and maximum. The reductions are named by string because
# passing the Python builtins min/max to agg is deprecated in recent pandas
# (it emits a FutureWarning and will change meaning in a future release).
min_max_summary = df.agg(['min', 'max'])

print(min_max_summary)

     no_of_dependents      education self_employed  income_annum  loan_amount  \
min                 0       Graduate            No        200000       300000   
max                 5   Not Graduate           Yes       9900000     39500000   

     loan_term  cibil_score  residential_assets_value  \
min          2          300                   -100000   
max         20          900                  29100000   

     commercial_assets_value  luxury_assets_value  bank_asset_value  \
min                        0               300000                 0   
max                 19400000             39200000          14700000   

    loan_status  
min    Approved  
max    Rejected  


In [9]:
# Remove leading/trailing whitespace from every column label.
df = df.rename(columns=str.strip)

print(df.columns)

Index(['no_of_dependents', 'education', 'self_employed', 'income_annum',
       'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'loan_status'],
      dtype='object')


In [10]:
from sklearn.preprocessing import LabelEncoder

# Text columns that are converted to integer codes.
categorical_columns = ['education', 'self_employed', 'loan_status']

# Keep one fitted encoder per column. Re-fitting a single shared encoder would
# discard the earlier mappings, making it impossible to translate the codes of
# 'education' and 'self_employed' back to their original labels.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Backward-compatible name: as before, this is the encoder fitted on loan_status.
label_encoder = label_encoders['loan_status']

print(df[['education', 'self_employed','loan_status']])

      education  self_employed  loan_status
0             0              0            0
1             1              1            1
2             0              0            1
3             0              0            1
4             1              1            1
...         ...            ...          ...
4264          0              1            1
4265          1              1            0
4266          1              0            1
4267          1              0            0
4268          0              0            0

[4269 rows x 3 columns]


In [11]:
# Columns whose distinct values are inspected, in reporting order.
inspected_columns = ('loan_status', 'no_of_dependents', 'education', 'self_employed')

# Each array is bound to a name matching its column.
loan_status, no_of_dependents, education, self_employed = (
    df[column].unique() for column in inspected_columns
)

# Report the distinct values column by column.
for column, values in zip(
    inspected_columns,
    (loan_status, no_of_dependents, education, self_employed),
):
    print(f"Nilai dalam kolom '{column}':", values)

Nilai dalam kolom 'loan_status': [0 1]
Nilai dalam kolom 'no_of_dependents': [2 0 3 5 4 1]
Nilai dalam kolom 'education': [0 1]
Nilai dalam kolom 'self_employed': [0 1]


In [12]:
# Predictor columns, in the order the model is trained on.
feature_columns = [
    'no_of_dependents',
    'education',
    'self_employed',
    'income_annum',
    'loan_amount',
    'loan_term',
    'cibil_score',
    'residential_assets_value',
    'commercial_assets_value',
    'luxury_assets_value',
    'bank_asset_value',
]
# Column the model learns to predict.
target_column = 'loan_status'

X = df[feature_columns]
y = df[target_column]

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=123,
)

In [14]:
from sklearn.tree import DecisionTreeClassifier

# The tree permutes features at every split, so ties between equally good
# splits are broken randomly. Fixing random_state makes the fitted model, and
# therefore the reported accuracy, reproducible across runs. The seed matches
# the one used for the train/test split.
tree_model = DecisionTreeClassifier(random_state=123).fit(X_train, y_train)

In [15]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out rows.
y_pred = tree_model.predict(X_test)

# accuracy_score is declared as (y_true, y_pred). Accuracy itself is symmetric,
# but keeping the documented order avoids silently wrong numbers if this call is
# later swapped for an asymmetric metric such as precision or recall.
acc_score = round(accuracy_score(y_test, y_pred), 3)

print('Accuracy: ', acc_score)

Accuracy:  0.972


In [16]:
# One hand-written applicant. Wrapping the values in a DataFrame that carries
# the training column names ties every value to its feature explicitly and
# avoids scikit-learn's "X does not have valid feature names" warning, which is
# raised when a model fitted on a DataFrame is given a bare list.
sample = pd.DataFrame(
    [[0, 1, 1, 4100000, 12200000, 8, 417, 2700000, 2200000, 8800000, 3300000]],
    columns=X.columns,
)
prediction = tree_model.predict(sample)
print(prediction)

# Class 0 is reported as "Approved"; any other class as "Rejected".
if prediction[0] == 0:
    print("Approved")
else:
    print("Rejected")

[1]
Rejected




In [17]:
import pickle

# Serialise the fitted tree to disk. The context manager guarantees the file
# is flushed and closed even if pickling fails; a bare open() left the handle
# open until garbage collection.
filename = "dt-hutang-v1.sav"
with open(filename, 'wb') as model_file:
    pickle.dump(tree_model, model_file)

In [18]:
# Write the frame in its current state to CSV, without the row index.
encoded_csv_path = 'knn-loan.csv'
df.to_csv(encoded_csv_path, index=False)