In [1]:
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics

In [2]:
# Load the German credit data and discard the leftover 'Unnamed: 0' column.
df = pd.read_parquet('german_credit_data.parquet').drop(columns='Unnamed: 0')
df

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [3]:
# Give the columns shorter, space-free names so they can be used as attributes.
df = df.rename(
    {
        'Sex': 'Gender',
        'Saving accounts': 'Saving_acc',
        'Checking account': 'Checking_acc',
        'Credit amount': 'Credit_mnt',
    },
    axis='columns',
)

In [4]:
# Column dtypes and non-null counts after the rename.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Age           1000 non-null   int64 
 1   Gender        1000 non-null   object
 2   Job           1000 non-null   int64 
 3   Housing       1000 non-null   object
 4   Saving_acc    817 non-null    object
 5   Checking_acc  606 non-null    object
 6   Credit_mnt    1000 non-null   int64 
 7   Duration      1000 non-null   int64 
 8   Purpose       1000 non-null   object
 9   Risk          1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 78.3+ KB


In [5]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Age,Job,Credit_mnt,Duration
count,1000.0,1000.0,1000.0,1000.0
mean,35.546,1.904,3271.258,20.903
std,11.375469,0.653614,2822.736876,12.058814
min,19.0,0.0,250.0,4.0
25%,27.0,2.0,1365.5,12.0
50%,33.0,2.0,2319.5,18.0
75%,42.0,2.0,3972.25,24.0
max,75.0,3.0,18424.0,72.0


In [6]:
# Number of missing values in each column.
df.isna().sum()

Age               0
Gender            0
Job               0
Housing           0
Saving_acc      183
Checking_acc    394
Credit_mnt        0
Duration          0
Purpose           0
Risk              0
dtype: int64

# Feature Engineering

### Binary Features
- Gender
- Risk

Convert these to numerical values: Gender becomes 1 for `male` and 0 otherwise; Risk becomes 1 for `good` and 0 otherwise.

In [7]:
# Encode the two binary columns as 0/1 (male -> 1, good -> 1, anything else -> 0).
# Matching the already-encoded value 1 as well as the raw label makes the cell
# idempotent: re-running it leaves the columns unchanged, whereas comparing only
# against the label would turn every value into 0 on a second run.
df['Gender'] = df['Gender'].isin(['male', 1]).astype(int)
df['Risk'] = df['Risk'].isin(['good', 1]).astype(int)

### Continuous Features
- Age
- Duration
- Credit_mnt

Bucket these continuous features into quantile-based bins.

In [8]:
# Display the frame after Gender and Risk have been encoded as 0/1.
df

Unnamed: 0,Age,Gender,Job,Housing,Saving_acc,Checking_acc,Credit_mnt,Duration,Purpose,Risk
0,67,1,2,own,,little,1169,6,radio/TV,1
1,22,0,2,own,little,moderate,5951,48,radio/TV,0
2,49,1,1,own,little,,2096,12,education,1
3,45,1,2,free,little,little,7882,42,furniture/equipment,1
4,53,1,2,free,little,little,4870,24,car,0
...,...,...,...,...,...,...,...,...,...,...
995,31,0,1,own,little,,1736,12,furniture/equipment,1
996,40,1,3,own,little,little,3857,30,car,1
997,38,1,2,own,little,,804,12,radio/TV,1
998,23,1,2,free,little,little,1845,45,radio/TV,0


In [10]:
def categorize(df, feature, q=3, labels=('low', 'medium', 'high')):
    """Return a copy of `df` with `feature` replaced by quantile-based bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    feature : str
        Name of the column to bin with `pd.qcut`.
    q : int, default 3
        Number of quantile bins, forwarded to `pd.qcut`.
    labels : sequence, default ('low', 'medium', 'high')
        One label per bin, forwarded to `pd.qcut`; its length must match
        the number of bins.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` whose `feature` column holds the bin labels.

    Raises
    ------
    ValueError
        Propagated from `pd.qcut`, e.g. when the quantile edges are not
        unique or the number of labels does not match the number of bins.
    """
    # Tuples are turned into a list so the default can stay immutable.
    bin_labels = list(labels) if isinstance(labels, tuple) else labels
    binned = df.copy()
    binned[feature] = pd.qcut(df[feature], q=q, labels=bin_labels)
    return binned

In [None]:
def one_hot_encode(df, feature):
    """Swap `feature` for one indicator column per distinct value.

    The indicator columns are named `<feature>_<value>` and are appended
    after the remaining columns. The input frame is left untouched; a new
    frame is returned.
    """
    dummies = pd.get_dummies(df[feature], prefix=feature)
    remaining = df.drop(columns=feature)
    return remaining.join(dummies)

In [17]:
# Calculate information value
def calc_iv(df, feature, target, pr=False):
    """Compute the per-value Weight of Evidence table and the Information Value.

    Missing values of `feature` are treated as their own group labelled
    "NULL". The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both `feature` and `target`.
    feature : str
        Column whose distinct values form the groups.
    target : str
        Column compared against 0 and 1.
    pr : bool, default False
        When true, additionally print the full table and its IV.

    Returns
    -------
    tuple
        `(iv, data)`, where `iv` is the sum of the 'IV' column and `data` is a
        DataFrame sorted by 'Value' with the columns 'Variable', 'Value',
        'All', 'Good', 'Bad', 'Share', 'Bad Rate', 'Distribution Good',
        'Distribution Bad', 'WoE' and 'IV'.

    Notes
    -----
    'Good' counts rows where `target == 0` and 'Bad' counts rows where
    `target == 1`. This notebook encodes Risk as 1 for 'good', so with
    target='Risk' the 'Bad' / 'Bad Rate' columns describe good-risk rows.
    For a 0/1 target the total IV should not depend on which class is
    called "bad", since swapping them flips the sign of both factors.

    The IV line and the value counts of `feature` are always printed.
    """
    values = df[feature]
    if values.isna().any():
        # A categorical column rejects a fill value that is not one of its
        # categories, so register "NULL" first.
        if isinstance(values.dtype, pd.CategoricalDtype) and "NULL" not in values.cat.categories:
            values = values.cat.add_categories(["NULL"])
        # fillna returns a new Series; the caller's frame keeps its NaNs.
        values = values.fillna("NULL")

    # Target comparisons do not depend on the group, so compute them once.
    target_is_zero = df[target] == 0
    target_is_one = df[target] == 1

    rows = []
    for val in values.unique():
        in_group = values == val
        rows.append([feature,                                   # Variable
                     val,                                       # Value
                     int(in_group.sum()),                       # All
                     int((in_group & target_is_zero).sum()),    # Good (target == 0)
                     int((in_group & target_is_one).sum())])    # Bad  (target == 1)

    data = pd.DataFrame(rows, columns=['Variable', 'Value', 'All', 'Good', 'Bad'])
    data['Share'] = data['All'] / data['All'].sum()
    data['Bad Rate'] = data['Bad'] / data['All']
    data['Distribution Good'] = (data['All'] - data['Bad']) / (data['All'].sum() - data['Bad'].sum())
    data['Distribution Bad'] = data['Bad'] / data['Bad'].sum()

    # log(0) is expected for groups with no good rows; the resulting
    # infinities are zeroed below, so silence the divide warning.
    with np.errstate(divide='ignore'):
        woe = np.log(data['Distribution Good'] / data['Distribution Bad'])
    data['WoE'] = woe.replace([np.inf, -np.inf], 0)

    data['IV'] = data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])

    data = data.sort_values(by=['Variable', 'Value'], ascending=[True, True])
    data = data.reset_index(drop=True)

    if pr:
        print(data)
        print('IV = ', data['IV'].sum())

    iv = data['IV'].sum()
    print("This variable's IV is:", iv)
    print(values.value_counts())
    return iv, data

In [11]:
# Preview the binned Age column; the result is only displayed, not assigned,
# and categorize works on a copy, so df keeps its numeric Age.
categorize(df, 'Age')

Unnamed: 0,Age,Gender,Job,Housing,Saving_acc,Checking_acc,Credit_mnt,Duration,Purpose,Risk
0,high,1,2,own,,little,1169,6,radio/TV,1
1,low,0,2,own,little,moderate,5951,48,radio/TV,0
2,high,1,1,own,little,,2096,12,education,1
3,high,1,2,free,little,little,7882,42,furniture/equipment,1
4,high,1,2,free,little,little,4870,24,car,0
...,...,...,...,...,...,...,...,...,...,...
995,medium,0,1,own,little,,1736,12,furniture/equipment,1
996,high,1,3,own,little,little,3857,30,car,1
997,medium,1,2,own,little,,804,12,radio/TV,1
998,low,1,2,free,little,little,1845,45,radio/TV,0


In [17]:
# df0 is otherwise first assigned further down the notebook, so build the
# binned frame here to keep this cell runnable on a fresh kernel.
df0 = categorize(df, 'Age')
iv, data = calc_iv(df0, 'Age', 'Risk')

This variable's IV is: 0.04768749644232659
medium    346
low       334
high      320
Name: Age, dtype: int64


In [19]:
# Per-bin breakdown (counts, distributions, WoE, IV) returned by calc_iv.
data

Unnamed: 0,Variable,Value,All,Good,Bad,Share,Bad Rate,Distribution Good,Distribution Bad,WoE,IV
0,Age,high,320,84,236,0.32,0.7375,0.28,0.337143,-0.185717,0.010612
1,Age,low,334,122,212,0.334,0.634731,0.406667,0.302857,0.294733,0.030596
2,Age,medium,346,94,252,0.346,0.728324,0.313333,0.36,-0.138836,0.006479


In [12]:
# Display df again; Age is still numeric here because categorize returned a copy.
df

Unnamed: 0,Age,Gender,Job,Housing,Saving_acc,Checking_acc,Credit_mnt,Duration,Purpose,Risk
0,67,1,2,own,,little,1169,6,radio/TV,1
1,22,0,2,own,little,moderate,5951,48,radio/TV,0
2,49,1,1,own,little,,2096,12,education,1
3,45,1,2,free,little,little,7882,42,furniture/equipment,1
4,53,1,2,free,little,little,4870,24,car,0
...,...,...,...,...,...,...,...,...,...,...
995,31,0,1,own,little,,1736,12,furniture/equipment,1
996,40,1,3,own,little,little,3857,30,car,1
997,38,1,2,own,little,,804,12,radio/TV,1
998,23,1,2,free,little,little,1845,45,radio/TV,0


In [26]:
def categorize(df, feature, q=10, labels=None):
    """Return a copy of `df` with `feature` replaced by quantile-bin numbers.

    This redefinition supersedes the earlier three-bin `categorize` in this
    notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    feature : str
        Name of the column to bin with `pd.qcut`.
    q : int, default 10
        Number of quantile bins, forwarded to `pd.qcut`.
    labels : sequence, optional
        One label per bin. Defaults to the integers 1..q.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` whose `feature` column holds the bin labels.

    Raises
    ------
    ValueError
        Propagated from `pd.qcut`, e.g. when the quantile edges are not
        unique or the number of labels does not match the number of bins.
    """
    if labels is None:
        labels = list(range(1, q + 1))
    binned = df.copy()
    binned[feature] = pd.qcut(df[feature], q=q, labels=labels)
    return binned

In [27]:
# Bin Age with the categorize defined in the cell above (10 quantile bins labelled 1-10).
df0 = categorize(df, 'Age')

In [28]:
# Information value of the binned Age column in df0 against Risk.
iv, data = calc_iv(df0, 'Age', 'Risk')

This variable's IV is: 0.10062227273111428
2     135
8     113
6     111
1     105
5     105
10     96
3      94
9      90
4      77
7      74
Name: Age, dtype: int64


In [29]:
# Per-bin breakdown returned by the calc_iv call above.
data

Unnamed: 0,Variable,Value,All,Good,Bad,Share,Bad Rate,Distribution Good,Distribution Bad,WoE,IV
0,Age,1,105,42,63,0.105,0.6,0.14,0.09,0.441833,0.022092
1,Age,2,135,52,83,0.135,0.614815,0.173333,0.118571,0.379701,0.020793
2,Age,3,94,28,66,0.094,0.702128,0.093333,0.094286,-0.010152,1e-05
3,Age,4,77,26,51,0.077,0.662338,0.086667,0.072857,0.173569,0.002397
4,Age,5,105,33,72,0.105,0.685714,0.11,0.102857,0.067139,0.00048
5,Age,6,111,23,88,0.111,0.792793,0.076667,0.125714,-0.494545,0.024256
6,Age,7,74,18,56,0.074,0.756757,0.06,0.08,-0.287682,0.005754
7,Age,8,113,31,82,0.113,0.725664,0.103333,0.117143,-0.125434,0.001732
8,Age,9,90,18,72,0.09,0.8,0.06,0.102857,-0.538997,0.0231
9,Age,10,96,29,67,0.096,0.697917,0.096667,0.095714,0.009901,9e-06


In [22]:
# Without labels, qcut reports the interval edges of the 10 quantile bins;
# count how many rows fall in each.
pd.qcut(df.Age, 10).value_counts()

(23.0, 26.0]      135
(39.0, 45.0]      113
(33.0, 36.0]      111
(18.999, 23.0]    105
(30.0, 33.0]      105
(52.0, 75.0]       96
(26.0, 28.0]       94
(45.0, 52.0]       90
(28.0, 30.0]       77
(36.0, 39.0]       74
Name: Age, dtype: int64

In [None]:
# df0

# iv, data = calc_iv(df0, 'Age', 'Risk')

# data

# def categorize(df, feature):
#     cutted = pd.qcut(df[feature], 5, [1,2,3,4,5])
#     categorized = pd.DataFrame(cutted)
#     df[feature] = categorized
#     return df

# df.Age

# df1 = categorize(df, 'Age')

# iv, data = calc_iv(df0, 'Age', 'Risk')

# data

# # df = categorize(df, 'Age')
# # df = one_hot_encode(df, 'Age')

# # df = categorize(df, 'Duration')
# # df = one_hot_encode(df, 'Duration')

# # df = categorize(df, 'Credit_mnt')
# # df = one_hot_encode(df, 'Credit_mnt')

# # df