In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
cmap = plt.cm.tab10

from sklearn.model_selection import train_test_split


from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import tensorflow as tf
tf.keras.backend.set_floatx('float64')

In [5]:
# Load the credit data set; the first CSV column is used as the row index.
data = pd.read_csv(
    "../dataset/german_credit_data_withrisk.csv",
    index_col=0,
)

In [6]:
# Replace spaces in column names with underscores so that every column can be
# reached with attribute access (e.g. data.Saving_accounts). Each renamed
# column is removed and re-inserted, so it ends up at the end of the frame.
for col in [name for name in data.columns if " " in name]:
    data[col.replace(" ", "_")] = data.pop(col)
    

In [7]:
# Strategy used to fill missing values in the numeric columns.
num_impute = "mean" # One of ["mean", "zero", "infinity"]
# Strategy used to fill missing values in the categorical columns.
cat_impute = "mode" # One of ["mode", "none"]
# Seed handed to train_test_split so the train/validation split is repeatable.
random_state = 42

In [8]:
# Descriptive label for each integer job code; the list position is the code.
job_index2word = dict(enumerate([
    "unskilled and non-resident",
    "unskilled and resident",
    "skilled",
    "highly skilled",
]))


def assign_job_type(col):
    """Translate one integer job code into its descriptive label.

    Raises KeyError when the code is not a key of ``job_index2word``.
    """
    label = job_index2word[col]
    return label

In [9]:
# Swap the integer job codes for their descriptive labels.
# NOTE: not idempotent - a second run looks the labels themselves up in
# job_index2word and raises KeyError.
data["Job"] = data["Job"].map(assign_job_type)

In [10]:
# Columns treated as categorical: filled by the categorical imputers and later
# turned into vocabulary-based indicator feature columns.
cat_cols = ['Sex', 'Job', 'Housing', 'Saving_accounts', 'Checking_account', 'Purpose']
# Columns treated as numeric: filled by the numeric imputers, min-max scaled,
# and later turned into numeric feature columns.
num_cols = ['Age', 'Credit_amount', 'Duration']

In [11]:
def impute_with_mean(df):
    """Fill missing entries of each numeric column with that column's mean.

    Only the columns of ``df`` that are listed in ``num_cols`` are touched.

    NOTE(review): ``pd.DataFrame(df)`` is not a deep copy, so depending on the
    pandas version the fills may also be visible through ``df`` itself.
    """
    imputed = pd.DataFrame(df)
    numeric = [name for name in df.columns if name in num_cols]
    for name in numeric:
        fill_value = df[name].mean()
        imputed.loc[imputed[name].isna(), name] = fill_value
    return imputed
def impute_with_zero(df):
    """Fill missing entries of each numeric column with 0.0.

    Only the columns of ``df`` that are listed in ``num_cols`` are touched.

    NOTE(review): ``pd.DataFrame(df)`` is not a deep copy, so depending on the
    pandas version the fills may also be visible through ``df`` itself.
    """
    imputed = pd.DataFrame(df)
    numeric = [name for name in df.columns if name in num_cols]
    for name in numeric:
        missing = imputed[name].isna()
        imputed.loc[missing, name] = 0.0
    return imputed


def impute_with_infinity(df):
    """Fill missing entries of each numeric column with positive infinity.

    Only the columns of ``df`` that are listed in ``num_cols`` are touched.

    NOTE(review): ``pd.DataFrame(df)`` is not a deep copy, so depending on the
    pandas version the fills may also be visible through ``df`` itself.
    """
    imputed = pd.DataFrame(df)
    numeric = [name for name in df.columns if name in num_cols]
    for name in numeric:
        missing = imputed[name].isna()
        imputed.loc[missing, name] = float("inf")
    return imputed
def impute_with_mode(df):
    """Fill missing entries of each categorical column with its most frequent value.

    Only the columns of ``df`` that are listed in ``cat_cols`` are touched.
    The mode is looked up for every such column, so a column with no
    non-missing value at all raises IndexError.

    NOTE(review): ``pd.DataFrame(df)`` is not a deep copy, so depending on the
    pandas version the fills may also be visible through ``df`` itself.
    """
    imputed = pd.DataFrame(df)
    categorical = [name for name in df.columns if name in cat_cols]
    for name in categorical:
        most_frequent = df[name].mode().iat[0]
        imputed.loc[imputed[name].isna(), name] = most_frequent
    return imputed
def impute_with_none(df):
    """Fill missing entries of each categorical column with the string "None".

    Only the columns of ``df`` that are listed in ``cat_cols`` are touched, so
    missingness becomes a category of its own.

    NOTE(review): ``pd.DataFrame(df)`` is not a deep copy, so depending on the
    pandas version the fills may also be visible through ``df`` itself.
    """
    imputed = pd.DataFrame(df)
    categorical = [name for name in df.columns if name in cat_cols]
    for name in categorical:
        missing = imputed[name].isna()
        imputed.loc[missing, name] = "None"
    return imputed

In [12]:
def get_impute_function(name):
    """Return the imputation function registered under ``name``.

    ``name`` must be one of "mean", "zero", "infinity", "mode" or "none";
    anything else fails the assertion below.
    """
    assert name in ["mean", "zero", "infinity", "mode", "none"]
    registry = (
        ("mean", impute_with_mean),
        ("zero", impute_with_zero),
        ("infinity", impute_with_infinity),
        ("mode", impute_with_mode),
    )
    for key, function in registry:
        if name == key:
            return function
    # Only "none" is left once the assertion has passed.
    return impute_with_none

In [13]:
def impute_missing_values(df, num_impute, cat_impute):
    """Impute missing numeric and categorical values and return a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is copied first, so the caller's frame is not
        modified by this function.
    num_impute : str
        Name of the numeric strategy, resolved with ``get_impute_function``.
    cat_impute : str
        Name of the categorical strategy, resolved with ``get_impute_function``.

    Returns
    -------
    pandas.DataFrame
        The frame after the numeric step followed by the categorical step.
    """
    num_impute_function = get_impute_function(num_impute)
    cat_impute_function = get_impute_function(cat_impute)
    # Hand the imputers a private copy: they may write into the frame they
    # receive, and the caller's data should stay untouched.
    new_df = num_impute_function(df.copy())
    # Chain on the numerically-imputed frame. Passing `df` again here would
    # throw the numeric step's result away.
    new_df = cat_impute_function(new_df)
    return new_df

In [14]:
# Count the missing values per column before imputation.
data.isna().sum()

Age                   0
Sex                   0
Job                   0
Housing               0
Duration              0
Purpose               0
Risk                  0
Saving_accounts     183
Checking_account    394
Credit_amount         0
dtype: int64

In [15]:
data = impute_missing_values(data, num_impute, cat_impute)

In [16]:
# Re-count the missing values per column after imputation.
data.isna().sum()

Age                 0
Sex                 0
Job                 0
Housing             0
Duration            0
Purpose             0
Risk                0
Saving_accounts     0
Checking_account    0
Credit_amount       0
dtype: int64

In [17]:
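# Disabled: integer-encoding the categorical columns is not needed, because the
# feature columns built further down one-hot encode them from their string values.
# for col in cat_cols:
#     data[col] = pd.Categorical(data[col])
#     data[col] = data[col].cat.codes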

In [18]:
# Encode the target as small integers: one code per distinct Risk value,
# numbered in the order pandas infers for the categories.
data["Risk"] = pd.Categorical(data["Risk"]).codes

In [None]:
# Hold out 20% of the rows for validation. The split is stratified on the
# target and seeded so it is repeatable.
train_df, val_df = train_test_split(
    data,
    test_size=0.2,
    stratify=data["Risk"],
    random_state=random_state,
)

In [17]:
# Row and column count of the training split.
train_df.shape

(800, 10)

In [18]:
# Column names of the full frame.
data.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Duration', 'Purpose', 'Risk',
       'Saving_accounts', 'Checking_account', 'Credit_amount'],
      dtype='object')

In [19]:
# Min-max scale the numeric columns to [0, 1].
# train_df and val_df were split off before this cell and hold their own copies
# of the raw values, so scaling `data` alone would leave the model inputs
# unscaled. The same transform (min and range taken from `data`) is therefore
# applied to all three frames.
train_df, val_df = train_df.copy(), val_df.copy()  # own the frames before writing to them
for col in num_cols:
    min_ = data[col].min()
    range_ = data[col].max() - min_
    if range_ == 0:
        range_ = 1  # constant column: map it to 0 instead of dividing by zero
    for frame in (data, train_df, val_df):
        frame[col] = (frame[col] - min_) / range_
    

In [20]:
# Preview the first rows after scaling the numeric columns.
data.head()

Unnamed: 0,Age,Sex,Job,Housing,Duration,Purpose,Risk,Saving_accounts,Checking_account,Credit_amount
0,0.857143,male,skilled,own,0.029412,radio/TV,1,little,little,0.050567
1,0.053571,female,skilled,own,0.647059,radio/TV,0,little,moderate,0.31369
2,0.535714,male,unskilled and resident,own,0.117647,education,1,little,little,0.101574
3,0.464286,male,skilled,free,0.558824,furniture/equipment,1,little,little,0.419941
4,0.607143,male,skilled,free,0.294118,car,0,little,little,0.254209


In [21]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of (features, label) pairs from a frame.

    The 'Risk' column is split off as the label and the remaining columns
    become a dict of per-column features. The frame is copied first, so the
    caller's frame keeps its 'Risk' column. When ``shuffle`` is true the rows
    are shuffled with a buffer as large as the frame before batching.
    """
    features = dataframe.copy()
    target = features.pop('Risk')
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(features))
    return dataset.batch(batch_size)

In [22]:
# Training batches are shuffled; validation batches keep the frame's row order.
train_ds = df_to_dataset(train_df, shuffle=True, batch_size=32)
val_ds = df_to_dataset(val_df, shuffle=False, batch_size=32)

In [23]:
# Column names available for building the feature columns.
data.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Duration', 'Purpose', 'Risk',
       'Saving_accounts', 'Checking_account', 'Credit_amount'],
      dtype='object')

In [24]:
# Start the feature specification with one numeric column per entry of num_cols.
feature_columns = [
    tf.feature_column.numeric_column(feature)
    for feature in num_cols
]

In [25]:
# Categorical columns that are turned into indicator feature columns below.
cat_cols

['Sex', 'Job', 'Housing', 'Saving_accounts', 'Checking_account', 'Purpose']

In [26]:
# Add one indicator (one-hot) column per categorical feature. The vocabulary is
# the set of distinct values found in `data`.
# NOTE: running this cell again appends the same columns a second time.
feature_columns.extend(
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            col_name, data[col_name].unique()
        )
    )
    for col_name in cat_cols
)

In [28]:
# dir(feature_columns[0])

In [29]:
# Input layer that turns a dict of raw column tensors into one dense tensor
# according to feature_columns.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)


In [30]:
# Short alias for the Keras layers namespace used in the model definition.
layers = tf.keras.layers

In [31]:
# Binary classifier: feature layer -> one L2-regularised hidden layer of 256
# ReLU units -> a single sigmoid output unit.
model = tf.keras.Sequential()
model.add(feature_layer)
model.add(layers.Dense(256, activation='relu', kernel_regularizer="l2"))
model.add(layers.Dense(1, activation="sigmoid"))



In [32]:
# Binary cross-entropy matches the single sigmoid output; accuracy is reported
# alongside the loss during training.
model.compile(optimizer='adam',
              loss="binary_crossentropy",
              metrics=['accuracy'])



In [33]:
# Train for 100 epochs, evaluating on the validation set after each one.
# The returned History object is kept so the per-epoch metrics can be inspected
# or plotted afterwards, instead of being discarded as the cell's repr output.
history = model.fit(train_ds,
                    validation_data=val_ds,
                    epochs=100)

Epoch 1/100


To change all layers to have dtype float32 by default, call `tf.keras.backend.set_floatx('float32')`. To change just this layer, pass dtype='float32' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100


Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f6d181a59b0>

In [33]:
# Inspect the assembled feature column definitions.
feature_columns

[NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Credit_amount', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Duration', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Job', vocabulary_list=('skilled', 'unskilled and resident', 'highly skilled', 'unskilled and non-resident'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Housing', vocabulary_list=('own', 'free', 'rent'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Saving_accounts', vocabulary_lis