In [47]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [48]:

!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv

--2021-10-04 17:33:33--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 182489 (178K) [text/plain]
Saving to: 'CreditScoring.csv.3'

     0K .......... .......... .......... .......... .......... 28% 1,13M 0s
    50K .......... .......... .......... .......... .......... 56% 1,38M 0s
   100K .......... .......... .......... .......... .......... 84% 3,76M 0s
   150K .......... .......... ........                        100%  491K=0,1s

2021-10-04 17:33:34 (1,17 MB/s) - 'CreditScoring.csv.3' saved [182489/182489]



In [49]:
df = pd.read_csv('CreditScoring.csv')
df.columns = df.columns.str.lower()

In [50]:
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [51]:
status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
}

df.status = df.status.map(status_values)


home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}

df.home = df.home.map(home_values)

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}

df.marital = df.marital.map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}

df.records = df.records.map(records_values)

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

df.job = df.job.map(job_values)

In [52]:
for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace=99999999, value=0)

In [53]:
df

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4450,default,1,rent,60,39,married,no,fixed,69,92,0,0,900,1020
4451,ok,22,owner,60,46,married,no,fixed,60,75,3000,600,950,1263
4452,default,0,owner,24,37,married,no,partime,60,90,3500,0,500,963
4453,ok,0,rent,48,23,single,no,freelance,49,140,0,0,550,550


In [54]:
df=df[df.status!='unk'].reset_index(drop=True)

In [55]:
df['default'] = (df.status == 'default').astype(int)
del df['status']

In [56]:
df.head()

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price,default
0,9,rent,60,30,married,no,freelance,73,129,0,0,800,846,0
1,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658,0
2,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985,1
3,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325,0
4,0,rent,36,26,single,no,fixed,46,107,0,0,310,910,0


What are the categorical variables? What are the numerical?

In [57]:
numerical=['seniority','time','age','expenses','income','assets','debt','amount','price']
categorical=['home','marital','records','job']


Split the data into 3 parts: train/validation/test with 60%/20%/20% distribution. Use train_test_split funciton for that with random_state=1

In [58]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [63]:
len(df_train),len(df_val),len(df_test)

(2672, 891, 891)

In [65]:
df_train=df_train.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)
df_val=df_val.reset_index(drop=True)

In [72]:
y_train=df_train.default
y_val=df_val.default
y_test=df_test.default

del df_train['default']
del df_val['default']
del df_test['default']

In [96]:
y_train

0       1
1       0
2       0
3       0
4       0
       ..
2667    1
2668    0
2669    1
2670    0
2671    0
Name: default, Length: 2672, dtype: int32

## Question 1
ROC AUC could also be used to evaluate feature importance of numerical variables.

Let's do that

- For each numerical variable, use it as score and compute AUC with the "default" variable
- Use the training dataset for that
- If your AUC is < 0.5, invert this variable by putting "-" in front

(e.g. -df_train['expenses'])

AUC can go below 0.5 if the variable is negatively correlated with the target varialble. You can change the direction of the correlation by negating this variable - then negative correlation becomes positive.

In [76]:
y_train

0       1
1       0
2       0
3       0
4       0
       ..
2667    1
2668    0
2669    1
2670    0
2671    0
Name: default, Length: 2672, dtype: int32

In [106]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score

score=[]
param=['seniority','time','debt','income']
for n in param:
    clf=RandomForestClassifier(n_estimators=100,random_state=0)
    clf.fit(df_train[n].to_frame(),y_train)
    y_pred=clf.predict(df_val[n].to_frame())
    auc_score=roc_auc_score(y_val,y_pred)
    if auc_score<0.5:
        clf.fit(-df_train[n].to_frame(),y_train)
        y_pred=clf.predict(df_val[n].to_frame())
        auc_score=roc_auc_score(y_val,y_pred)

    print(f'{n}: {auc_score:.4f}')

seniority: 0.5771
time: 0.5000
debt: 0.5000
income: 0.5704
