In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
cmap = plt.cm.tab20
from sklearn.model_selection import train_test_split
import tensorflow as tf
layers = tf.keras.layers

In [2]:
# Path of the German credit CSV; its first column is used as the row index.
DATA_PATH = "dataset/german_credit_data_withrisk.csv"
data = pd.read_csv(DATA_PATH, index_col=0)

In [3]:
# Optional (currently disabled): uncomment to keep only this subset of columns.
# data = data[["Credit amount", "Age", "Duration", "Purpose", "Risk"]]

In [4]:
# Preview the first rows of the freshly loaded data.
data.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [5]:
# Names of the imputation strategies; both are resolved by get_impute_function.
# num_impute is used for the numeric columns, cat_impute for the categorical ones.
num_impute = "mean" # One of ["mean", "zero", "infinity"]
cat_impute = "none" # One of ["mode", "none"]

In [6]:
# Lookup table from the integer job code (0-3) to a readable label.
job_index2word = dict(enumerate([
    "unskilled and non-resident",
    "unskilled and resident",
    "skilled",
    "highly skilled",
]))


def assign_job_type(col):
    """Return the text label for a single job code.

    Raises KeyError when ``col`` is not a key of ``job_index2word``.
    """
    label = job_index2word[col]
    return label

In [7]:
# Swap the integer job codes for their text labels ("Job" is listed in cat_cols).
# NOTE: not re-runnable on the same frame; the labels are not keys of job_index2word.
data["Job"] = data["Job"].map(assign_job_type)

In [8]:
# Column groups read by the imputation, one-hot and normalization helpers.
# The target column "Risk" is in neither list.
# Categorical feature columns:
cat_cols = ['Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']
# Numeric feature columns:
num_cols = ['Age', 'Credit amount', 'Duration']

In [9]:
def impute_with_mean(df):
    """Fill missing values of the numeric columns with each column's mean.

    Only columns of ``df`` that are listed in ``num_cols`` are touched.
    The result is created with ``pd.DataFrame(df)``, not an explicit copy.
    NOTE(review): whether that shares data with ``df`` depends on the pandas
    version -- confirm before relying on ``df`` staying unchanged.
    """
    out = pd.DataFrame(df)
    numeric_names = [name for name in df.columns if name in num_cols]
    for name in numeric_names:
        column_mean = df[name].mean()
        missing = out[name].isna()
        out.loc[missing, name] = column_mean
    return out
def impute_with_zero(df):
    """Fill missing values of the numeric columns with ``0.0``.

    Only columns of ``df`` that are listed in ``num_cols`` are touched.
    The result is created with ``pd.DataFrame(df)``, not an explicit copy.
    """
    out = pd.DataFrame(df)
    numeric_names = [name for name in df.columns if name in num_cols]
    for name in numeric_names:
        missing = out[name].isna()
        out.loc[missing, name] = 0.0
    return out


def impute_with_infinity(df):
    """Fill missing values of the numeric columns with positive infinity.

    Only columns of ``df`` that are listed in ``num_cols`` are touched.
    The result is created with ``pd.DataFrame(df)``, not an explicit copy.
    """
    out = pd.DataFrame(df)
    numeric_names = [name for name in df.columns if name in num_cols]
    for name in numeric_names:
        missing = out[name].isna()
        out.loc[missing, name] = float("inf")
    return out
def impute_with_mode(df):
    """Fill missing values of the categorical columns with the column's mode.

    Only columns of ``df`` that are listed in ``cat_cols`` are touched. When a
    column has several modes, the first entry returned by ``Series.mode()``
    is used. The result is created with ``pd.DataFrame(df)``, not an explicit
    copy.
    """
    out = pd.DataFrame(df)
    categorical_names = [name for name in df.columns if name in cat_cols]
    for name in categorical_names:
        most_common = df[name].mode().iat[0]
        missing = out[name].isna()
        out.loc[missing, name] = most_common
    return out
def impute_with_none(df):
    """Fill missing values of the categorical columns with the string "None".

    Only columns of ``df`` that are listed in ``cat_cols`` are touched, so a
    missing entry becomes a category of its own. The result is created with
    ``pd.DataFrame(df)``, not an explicit copy.
    """
    out = pd.DataFrame(df)
    categorical_names = [name for name in df.columns if name in cat_cols]
    for name in categorical_names:
        missing = out[name].isna()
        out.loc[missing, name] = "None"
    return out

In [10]:
def get_impute_function(name):
    """Return the imputation function registered under ``name``.

    ``name`` must be one of "mean", "zero", "infinity", "mode" or "none";
    any other value fails the assertion.
    """
    assert name in ["mean", "zero", "infinity", "mode", "none"]
    named_strategies = (
        ("mean", impute_with_mean),
        ("zero", impute_with_zero),
        ("infinity", impute_with_infinity),
        ("mode", impute_with_mode),
    )
    for key, strategy in named_strategies:
        if name == key:
            return strategy
    # "none" is the only name left once the assertion has passed.
    return impute_with_none

In [11]:
def impute_missing_values(df, num_impute, cat_impute):
    """Fill missing values, numeric columns first and categorical columns second.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is copied first, so the caller's frame is left as is.
    num_impute : str
        Strategy name for the numeric step, resolved by ``get_impute_function``.
    cat_impute : str
        Strategy name for the categorical step, resolved by ``get_impute_function``.

    Returns
    -------
    pandas.DataFrame
        The frame after both imputation steps.
    """
    num_impute_function = get_impute_function(num_impute)
    cat_impute_function = get_impute_function(cat_impute)
    # Start from a copy so neither step can write into the caller's frame.
    new_df = num_impute_function(df.copy())
    # The categorical step must consume the numeric step's result. Running it
    # on `df` again would discard the numeric imputation unless the helper
    # happened to modify `df` in place.
    new_df = cat_impute_function(new_df)
    return new_df

In [12]:
# Missing values per column before imputation.
data.isna().sum()

Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [13]:
# Impute with the strategies named in num_impute / cat_impute.
data = impute_missing_values(data, num_impute, cat_impute)

In [14]:
# Missing values per column after imputation.
data.isna().sum()

Age                 0
Sex                 0
Job                 0
Housing             0
Saving accounts     0
Checking account    0
Credit amount       0
Duration            0
Purpose             0
Risk                0
dtype: int64

In [15]:
# Number of rows labelled "good".
data["Risk"].eq("good").sum()

700

In [16]:
# Number of rows labelled "bad".
data["Risk"].eq("bad").sum()

300

In [17]:
def cat2onehot(df):
    """One-hot encode the categorical columns of ``df``.

    Every column named in ``cat_cols`` that is present in ``df`` is replaced
    by indicator columns named ``"<column>_<value>"``. Each new group of
    indicator columns is placed in front of the columns already in the
    result, so the last processed categorical column ends up leftmost and the
    untouched columns keep their relative order at the end.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    output = df.copy()
    for col in cat_cols:
        if col not in df.columns:
            continue
        # Request an explicit integer dtype: the default indicator dtype of
        # get_dummies differs between pandas versions (uint8 vs bool), and
        # boolean indicators next to float columns do not combine into a
        # single numeric array.
        dummies = pd.get_dummies(df[col], prefix=col, prefix_sep="_", dtype="uint8")
        output = pd.concat([dummies, output.drop(columns=col)], axis=1)
    return output

In [18]:
# Replace the categorical columns with one-hot indicator columns.
data = cat2onehot(data)

In [19]:
# Column names after one-hot encoding.
data.columns

Index(['Purpose_business', 'Purpose_car', 'Purpose_domestic appliances',
       'Purpose_education', 'Purpose_furniture/equipment', 'Purpose_radio/TV',
       'Purpose_repairs', 'Purpose_vacation/others', 'Checking account_None',
       'Checking account_little', 'Checking account_moderate',
       'Checking account_rich', 'Saving accounts_None',
       'Saving accounts_little', 'Saving accounts_moderate',
       'Saving accounts_quite rich', 'Saving accounts_rich', 'Housing_free',
       'Housing_own', 'Housing_rent', 'Job_highly skilled', 'Job_skilled',
       'Job_unskilled and non-resident', 'Job_unskilled and resident',
       'Sex_female', 'Sex_male', 'Age', 'Credit amount', 'Duration', 'Risk'],
      dtype='object')

In [20]:
def normalize_numerical(df, norm_params=None):
    """Standardize the numeric columns of ``df`` to zero mean and unit std.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``num_cols``.
    norm_params : dict, optional
        Statistics returned by an earlier call, as
        ``{column: {"mean": ..., "std": ...}}``. Columns found in it are
        scaled with those values instead of statistics computed from ``df``,
        which lets one frame be scaled with another frame's statistics.

    Returns
    -------
    (pandas.DataFrame, dict)
        A normalized copy of ``df`` and the statistics used for each column.
    """
    output = df.copy()
    used_params = {}
    for col in num_cols:
        if norm_params is not None and col in norm_params:
            mean = norm_params[col]["mean"]
            std = norm_params[col]["std"]
        else:
            mean = df[col].mean()
            std = df[col].std()
        # A constant column has std 0 and a single-row column has std NaN;
        # dividing by either would fill the column with NaN/inf. Scale by 1
        # instead so such a column is only centred.
        if not std > 0:
            std = 1.0

        output[col] = (df[col] - mean) / std
        used_params[col] = {"mean": mean, "std": std}
    return output, used_params

In [21]:
# Preview after one-hot encoding, before the numeric columns are normalized.
data.head()

Unnamed: 0,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Checking account_None,Checking account_little,...,Job_highly skilled,Job_skilled,Job_unskilled and non-resident,Job_unskilled and resident,Sex_female,Sex_male,Age,Credit amount,Duration,Risk
0,0,0,0,0,0,1,0,0,0,1,...,0,1,0,0,0,1,67.0,1169.0,6.0,good
1,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,22.0,5951.0,48.0,bad
2,0,0,0,1,0,0,0,0,1,0,...,0,0,0,1,0,1,49.0,2096.0,12.0,good
3,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,1,45.0,7882.0,42.0,good
4,0,1,0,0,0,0,0,0,0,1,...,0,1,0,0,0,1,53.0,4870.0,24.0,bad


In [22]:
# Standardize the numeric columns; norm_params keeps each column's mean and std.
data, norm_params = normalize_numerical(data)

In [23]:
# Preview after normalization of the numeric columns.
data.head()

Unnamed: 0,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Checking account_None,Checking account_little,...,Job_highly skilled,Job_skilled,Job_unskilled and non-resident,Job_unskilled and resident,Sex_female,Sex_male,Age,Credit amount,Duration,Risk
0,0,0,0,0,0,1,0,0,0,1,...,0,1,0,0,0,1,2.765073,-0.744759,-1.235859,good
1,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,-1.190808,0.949342,2.24707,bad
2,0,0,0,1,0,0,0,0,1,0,...,0,0,0,1,0,1,1.182721,-0.416354,-0.738298,good
3,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,1,0.831087,1.63343,1.749509,good
4,0,1,0,0,0,0,0,0,0,1,...,0,1,0,0,0,1,1.534354,0.56638,0.256825,bad


In [24]:
# Pick the features by name rather than by position, so the selection does not
# depend on the target being the last column.
TARGET_COL = "Risk"
# Cast to float explicitly: `.values` on a frame with mixed column dtypes can
# come back as an object array, which is not a usable numeric feature matrix.
X = data.drop(columns=TARGET_COL).values.astype(float)
# Binary target: 1.0 for "good", 0.0 otherwise.
y = (data[TARGET_COL] == "good").astype(float)

In [71]:
# Hold out 20% of the rows as the test split, stratified on the label,
# with a fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

In [107]:
# Fully connected binary classifier: 32 -> dropout -> 64 -> 128 -> 256 -> 1.
# All hidden layers use ReLU with an L2 kernel regularizer; the output is a sigmoid.
model = tf.keras.Sequential()
model.add(layers.Dense(32, activation="relu", kernel_regularizer="l2"))
model.add(layers.Dropout(0.25))
for units in (64, 128, 256):
    model.add(layers.Dense(units, activation="relu", kernel_regularizer="l2"))
model.add(layers.Dense(1, activation="sigmoid"))

In [108]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as a metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [109]:
# Train for 100 epochs. validation_data is the test split, so the per-epoch
# validation metrics are computed on the test set.
model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    validation_data=(X_test, y_test),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f8a44702b00>