In [676]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [677]:
# Load the raw credit-scoring table from the CSV next to the notebook and preview it.
df = pd.read_csv(filepath_or_buffer='./CreditScoring.csv')
df.head(n=5)

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [678]:
# Number of rows in the raw frame.
len(df)

4455

In [679]:
# Show the dtype pandas inferred for every column.
df.dtypes

Status       int64
Seniority    int64
Home         int64
Time         int64
Age          int64
Marital      int64
Records      int64
Job          int64
Expenses     int64
Income       int64
Assets       int64
Debt         int64
Amount       int64
Price        int64
dtype: object

In [680]:
# Normalise every column label to lower case, then display the resulting labels.
df.columns = [label.lower() for label in df.columns]
df.columns

Index(['status', 'seniority', 'home', 'time', 'age', 'marital', 'records',
       'job', 'expenses', 'income', 'assets', 'debt', 'amount', 'price'],
      dtype='object')

In [681]:
# df['status'].values[1]

In [682]:
# Index labels of the rows whose `status` code is 0.
is_status_zero = df['status'] == 0
df.loc[is_status_zero].index.values

array([3309], dtype=int64)

In [683]:
# Count the missing values in each column.
df.isna().sum()

status       0
seniority    0
home         0
time         0
age          0
marital      0
records      0
job          0
expenses     0
income       0
assets       0
debt         0
amount       0
price        0
dtype: int64

In [684]:
# Number of distinct values per column.
df.nunique()

status          3
seniority      47
home            7
time           11
age            50
marital         6
records         2
job             5
expenses       94
income        353
assets        160
debt          183
amount        285
price        1419
dtype: int64

In [685]:
# Frequency of each raw `status` code.
df['status'].value_counts()

1    3200
2    1254
0       1
Name: status, dtype: int64

In [686]:
# Spot-check the `status` code stored under row label 4.
df.loc[4, 'status']

1

In [687]:
# Lookup table for decoding the numeric `status` codes:
# `0` -> `unk` (unknown), `1` -> `ok`, `2` -> `default`.
status_value = dict([
    (0, 'unk'),
    (1, 'ok'),
    (2, 'default'),
])

In [688]:
# Decode the numeric `status` codes into their string labels.
# NOTE: `Series.map` turns every value that is not a key of `status_value` into NaN, so
# running this cell a second time (after the codes are already decoded) blanks the column.
df['status'] = df['status'].map(status_value)

# Spot-check one decoded value with a single label-based lookup (no chained indexing).
df.loc[10, 'status']

'ok'

In [689]:
# Count the rows whose status is unknown with one vectorised comparison. Unlike a
# `for i in range(len(df))` loop over `df['status'][i]`, this does not depend on the
# index labels being exactly 0..n-1.
k = int((df['status'] == 'unk').sum())

print(k)

1


In [690]:
# Lookup table for decoding the numeric `home` codes; `0` decodes to `unk`.
home_values = dict([
    (1, 'rent'),
    (2, 'owner'),
    (3, 'private'),
    (4, 'ignore'),
    (5, 'parents'),
    (6, 'other'),
    (0, 'unk'),
])

# Swap the `home` codes for their labels and preview the frame.
df['home'] = df['home'].map(home_values)
df.head(n=5)

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,2,1,3,73,129,0,0,800,846
1,ok,17,rent,60,58,3,1,1,48,131,0,0,1000,1658
2,default,10,owner,36,46,2,2,3,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,1,1,1,63,182,2500,0,900,1325
4,ok,0,rent,36,26,1,1,1,46,107,0,0,310,910


In [691]:

# Lookup tables for the remaining coded columns; in each table `0` decodes to `unk`.
marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk',
}

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk',
}

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk',
}

# Decode each column with its own table.
for column_name, code_labels in (
    ('marital', marital_values),
    ('records', records_values),
    ('job', job_values),
):
    df[column_name] = df[column_name].map(code_labels)

In [692]:
# Preview the frame after the categorical codes have been mapped to labels.
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [693]:
# Summary statistics of the numeric columns, rounded to whole numbers.
df.describe().round(decimals=0)

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [694]:
# 99999999 shows up as the maximum of `income`, `assets` and `debt`, which is suspicious:
# it looks like a placeholder written in place of a missing value. Turn it back into NaN.
MISSING_SENTINEL = 99999999

for column in ('income', 'assets', 'debt'):
    df[column] = df[column].replace(to_replace=MISSING_SENTINEL, value=np.nan)

In [695]:
# Re-check the summary statistics now that the 99999999 placeholders are NaN.
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,131.0,5403.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11573.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


In [696]:
# Frequency of each decoded `status` label.
df['status'].value_counts()

ok         3200
default    1254
unk           1
Name: status, dtype: int64

In [697]:
# Exactly one row has status `unk`: we do not know whether that customer repaid,
# so the row is of no use as a training example. Keep only the rows with a known status.
has_known_status = df['status'] != 'unk'
df = df[has_known_status]
df['status'].value_counts()

ok         3200
default    1254
Name: status, dtype: int64

Splitting the data into training, validation and test datasets

In [698]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
df_full_train, df_test = train_test_split(df, random_state=11, test_size=0.2)

In [699]:
# Take 25% of the remaining 80% for validation (20% of all rows); the rest (60%) is for training.
df_train, df_val = train_test_split(df_full_train, random_state=11, test_size=0.25)

In [700]:
# Give the training frames a fresh 0..n-1 index, discarding the shuffled original labels.
df_train, df_full_train = (
    df_train.reset_index(drop=True),
    df_full_train.reset_index(drop=True),
)

In [701]:
# Row counts of the train / validation / test partitions.
len(df_train), len(df_val), len(df_test)

(2672, 891, 891)

In [702]:
# Boolean targets: True where the customer defaulted.
y_train = df_train['status'].eq('default').values
y_val = df_val['status'].eq('default').values

In [703]:
# Remove the target column from both feature frames so it cannot be used as a feature.
for frame in (df_train, df_val):
    del frame['status']

In [704]:
# Missing values per column in the training features.
df_train.isna().sum()

seniority     0
home          0
time          0
age           0
marital       0
records       0
job           0
expenses      0
income       25
assets       30
debt         11
amount        0
price         0
dtype: int64

Because we earlier replaced the placeholder value `99999999` in `income`, `assets` and `debt` with `NaN`, these columns now contain some null values.

In [705]:
# Replace every missing value with 0 in both feature frames.
df_train, df_val = df_train.fillna(0), df_val.fillna(0)

In [706]:
# Confirm that no missing values remain in the training features.
df_train.isna().sum()

seniority    0
home         0
time         0
age          0
marital      0
records      0
job          0
expenses     0
income       0
assets       0
debt         0
amount       0
price        0
dtype: int64

`DictVectorizer` performs the one-hot encoding, but it expects a list of dictionaries as input, so we first convert each dataframe into a list of row dictionaries.

In [707]:
# One dictionary per row ({column: value}).
dict_train = df_train.to_dict('records')
dict_val = df_val.to_dict('records')

In [708]:
# Inspect the first training record to see the dictionary layout.
dict_train[0]

{'seniority': 10,
 'home': 'owner',
 'time': 36,
 'age': 36,
 'marital': 'married',
 'records': 'no',
 'job': 'freelance',
 'expenses': 75,
 'income': 0.0,
 'assets': 10000.0,
 'debt': 0.0,
 'amount': 1000,
 'price': 1400}

In [709]:
from sklearn.feature_extraction import DictVectorizer

# Learn the feature vocabulary from the training records only, then encode both sets
# with it so the validation matrix has the same columns as the training matrix.
dv = DictVectorizer(sparse=False)
dv.fit(dict_train)

X_train = dv.transform(dict_train)
X_val = dv.transform(dict_val)

In [710]:
# List the encoded feature names. `DictVectorizer.get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2; the fitted `feature_names_` attribute holds the same
# list and is available in both old and new versions.
dv.feature_names_



['age',
 'amount',
 'assets',
 'debt',
 'expenses',
 'home=ignore',
 'home=other',
 'home=owner',
 'home=parents',
 'home=private',
 'home=rent',
 'home=unk',
 'income',
 'job=fixed',
 'job=freelance',
 'job=others',
 'job=partime',
 'job=unk',
 'marital=divorced',
 'marital=married',
 'marital=separated',
 'marital=single',
 'marital=unk',
 'marital=widow',
 'price',
 'records=no',
 'records=yes',
 'seniority',
 'time']

In [711]:
# Number of columns in the encoded training matrix.
X_train.shape[1]

29

A decision tree is a sequence of if-else conditions that leads to a prediction at the end. Such a tree could even be written by hand as plain Python if-else statements after studying the whole dataset.

In [712]:
from sklearn.tree import DecisionTreeClassifier

# Unconstrained tree (no depth limit). `random_state` is pinned so that refitting gives the
# same tree: scikit-learn permutes the features at every split, so ties between equally
# good splits are otherwise broken differently from run to run.
dt = DecisionTreeClassifier(random_state=11)

dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [713]:
from sklearn.metrics import roc_auc_score

# This is a binary classification problem, so the model is scored with ROC AUC, computed
# from the predicted probability of the positive class. First: the training data.
y_pred = dt.predict_proba(X_train)[:, 1]
roc_auc_score(y_true=y_train, y_score=y_pred)

1.0

In [714]:
# Same score on the validation data, which the tree has not seen during fitting.
y_pred = dt.predict_proba(X_val)[:, 1]
roc_auc_score(y_true=y_val, y_score=y_pred).round(3)

0.649

We can see that the model is overfitted: it memorized the training dataset so well that it did not work effectively on unseen data.

Overfitting happens when we have a complex model with enough power to remember all 
the training data. If we force the model to be simpler, we can `make it less powerful`, and 
improve the model’s ability to generalize.

In [751]:
# Constrain the tree with `max_depth` to reduce overfitting. The default, `max_depth=None`,
# puts no limit on the depth, so the tree keeps splitting for as long as it can.
# `random_state` is pinned so that refitting reproduces the same tree.
dt = DecisionTreeClassifier(max_depth=2, random_state=11)
dt.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=2)

In [752]:
from sklearn.tree import export_text

# Render the fitted tree's split rules as indented text, labelled with the encoded feature names.
tree_text = export_text(decision_tree=dt, feature_names=dv.feature_names_)
print(tree_text)

|--- records=no <= 0.50
|   |--- seniority <= 6.50
|   |   |--- class: True
|   |--- seniority >  6.50
|   |   |--- class: False
|--- records=no >  0.50
|   |--- job=partime <= 0.50
|   |   |--- class: False
|   |--- job=partime >  0.50
|   |   |--- class: True



In [753]:
# ROC AUC of the current `dt` on the training and validation sets.
# Despite their names, `acc1` / `acc2` hold AUC values, not accuracies.
train_scores = dt.predict_proba(X_train)[:, 1]
acc1 = roc_auc_score(y_train, train_scores).round(3)

y_pred = dt.predict_proba(X_val)[:, 1]
acc2 = roc_auc_score(y_val, y_pred).round(3)

(acc1, acc2)

(0.705, 0.669)

In [762]:
# Two-column slice of the cleaned frame: `assets` next to the `status` label.
df_new = df.loc[:, ['assets', 'status']]
df_new

Unnamed: 0,assets,status
0,0.0,ok
1,0.0,ok
2,3000.0,default
3,2500.0,ok
4,0.0,ok
...,...,...
4450,0.0,default
4451,3000.0,ok
4452,3500.0,default
4453,0.0,ok
