In [123]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
cmap = plt.cm.tab10

from sklearn.model_selection import train_test_split


from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import tensorflow as tf
tf.keras.backend.set_floatx('float64')
import pickle

In [124]:
# Load the credit data; the first CSV column becomes the row index.
data = pd.read_csv(
    "../dataset/german_credit_data_withrisk.csv",
    index_col=0,
)

In [125]:
# Normalise column names: every space becomes an underscore. A renamed
# column is removed and re-inserted under its new name, so it ends up as
# the last column of the frame.
for old_name in list(data.columns):
    new_name = old_name.replace(" ", "_")
    if new_name == old_name:
        continue
    data[new_name] = data.pop(old_name)
    

In [126]:
# Imputation strategy names; they are resolved by get_impute_function.
num_impute = "mean" # One of ["mean", "zero", "infinity"]
cat_impute = "mode" # One of ["mode", "none"]
# Seed handed to train_test_split.
random_state = 42

In [127]:
# Descriptive label for each integer job code, keyed by the code itself.
job_index2word = dict(enumerate([
    "unskilled and non-resident",
    "unskilled and resident",
    "skilled",
    "highly skilled",
]))


def assign_job_type(col):
    """Translate a single job code into its descriptive label.

    Raises KeyError when ``col`` is not a key of ``job_index2word``.
    """
    label = job_index2word[col]
    return label

In [128]:
# Swap the job codes for their labels. Running this cell a second time
# raises KeyError, because the column then holds labels instead of codes.
data["Job"] = data["Job"].apply(assign_job_type)

In [129]:
# Column groups: cat_cols is read by the categorical imputers and when the
# indicator feature columns are built; num_cols is read by the numeric
# imputers and when the numeric feature columns are built.
cat_cols = ['Sex', 'Job', 'Housing', 'Saving_accounts', 'Checking_account', 'Purpose']
num_cols = ['Age', 'Credit_amount', 'Duration']

In [130]:
def impute_with_mean(df):
    """Fill missing values in the numeric columns with the column mean.

    Only columns listed in the module-level ``num_cols`` are touched.
    The result is created with ``pd.DataFrame(df)``, which does not force a
    copy, so depending on the pandas version the writes may also appear in
    ``df`` itself.
    """
    filled = pd.DataFrame(df)
    numeric_names = [name for name in df.columns if name in num_cols]
    for name in numeric_names:
        gaps = filled[name].isna()
        filled.loc[gaps, name] = df[name].mean()
    return filled
def impute_with_zero(df):
    """Replace missing values in the numeric columns with 0.0.

    Only columns listed in the module-level ``num_cols`` are touched.
    The result is created with ``pd.DataFrame(df)``, which does not force a
    copy, so depending on the pandas version the writes may also appear in
    ``df`` itself.
    """
    result = pd.DataFrame(df)
    for name in [c for c in df.columns if c in num_cols]:
        gaps = result[name].isna()
        result.loc[gaps, name] = 0.0
    return result


def impute_with_infinity(df):
    """Replace missing values in the numeric columns with positive infinity.

    Only columns listed in the module-level ``num_cols`` are touched.
    The result is created with ``pd.DataFrame(df)``, which does not force a
    copy, so depending on the pandas version the writes may also appear in
    ``df`` itself.
    """
    result = pd.DataFrame(df)
    positive_inf = float("inf")
    for name in df.columns:
        if name not in num_cols:
            continue
        gaps = result[name].isna()
        result.loc[gaps, name] = positive_inf
    return result
def impute_with_mode(df):
    """Fill missing values in the categorical columns with the column mode.

    Only columns listed in the module-level ``cat_cols`` are touched. When
    several values tie for the mode, the first one returned by
    ``Series.mode`` is used. A column with no observed value at all (every
    entry missing, or an empty frame) has no mode and is left unchanged
    instead of raising IndexError.

    The result is created with ``pd.DataFrame(df)``, which does not force a
    copy, so depending on the pandas version the writes may also appear in
    ``df`` itself.
    """
    out = pd.DataFrame(df)
    for col in df.columns:
        if col not in cat_cols:
            continue
        missing = out[col].isna()
        if not missing.any():
            # Nothing to fill, so the mode is never needed.
            continue
        modes = df[col].mode()
        if modes.empty:
            # No observed value to impute with; keep the gaps as they are.
            continue
        out.loc[missing, col] = modes.iat[0]
    return out
def impute_with_none(df):
    """Replace missing values in the categorical columns with the text "None".

    Only columns listed in the module-level ``cat_cols`` are touched.
    The result is created with ``pd.DataFrame(df)``, which does not force a
    copy, so depending on the pandas version the writes may also appear in
    ``df`` itself.
    """
    result = pd.DataFrame(df)
    categorical_names = [name for name in df.columns if name in cat_cols]
    for name in categorical_names:
        gaps = result[name].isna()
        result.loc[gaps, name] = "None"
    return result

In [131]:
def get_impute_function(name):
    """Return the imputation function registered under ``name``.

    ``name`` must be one of "mean", "zero", "infinity", "mode" or "none";
    any other value fails the assertion.
    """
    assert name in ["mean", "zero", "infinity", "mode", "none"]
    imputers = {
        "mean": impute_with_mean,
        "zero": impute_with_zero,
        "infinity": impute_with_infinity,
        "mode": impute_with_mode,
        "none": impute_with_none,
    }
    return imputers[name]

In [132]:
def impute_missing_values(df, num_impute, cat_impute):
    """Impute the numeric columns of ``df``, then its categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. A copy is imputed, so the caller's frame is left
        unchanged.
    num_impute : str
        Name of the numeric strategy, resolved by ``get_impute_function``.
    cat_impute : str
        Name of the categorical strategy, resolved the same way.

    Returns
    -------
    pandas.DataFrame
        The frame after both imputation passes.
    """
    num_impute_function = get_impute_function(num_impute)
    cat_impute_function = get_impute_function(cat_impute)
    new_df = num_impute_function(df.copy())
    # Feed the numeric result into the categorical pass. Passing ``df``
    # again would drop the numeric imputation whenever the helpers return a
    # new frame instead of writing into their argument.
    new_df = cat_impute_function(new_df)
    return new_df

In [133]:
data.isna().sum()

Age                   0
Sex                   0
Job                   0
Housing               0
Duration              0
Purpose               0
Risk                  0
Saving_accounts     183
Checking_account    394
Credit_amount         0
dtype: int64

In [134]:
# Fill the gaps using the strategies chosen in the configuration cell.
data = impute_missing_values(
    data,
    num_impute=num_impute,
    cat_impute=cat_impute,
)

In [135]:
data.isna().sum()

Age                 0
Sex                 0
Job                 0
Housing             0
Duration            0
Purpose             0
Risk                0
Saving_accounts     0
Checking_account    0
Credit_amount       0
dtype: int64

In [136]:
data.Risk

0      good
1       bad
2      good
3      good
4       bad
       ... 
995    good
996    good
997    good
998     bad
999    good
Name: Risk, Length: 1000, dtype: object

In [62]:
# for col in cat_cols:
#     data[col] = pd.Categorical(data[col])
#     data[col] = data[col].cat.codes

In [122]:
# Encode the target as integer category codes. The categories are inferred
# in sorted order, so with the labels shown above "bad" becomes 0 and
# "good" becomes 1.
data["Risk"] = pd.Categorical(data["Risk"]).codes

0      1
1      0
2      1
3      1
4      0
      ..
995    1
996    1
997    1
998    0
999    1
Length: 1000, dtype: int8


In [65]:
# NOTE(review): despite the .json name this file is read as a pickle, and
# unpickling can execute arbitrary code -- only load it from a trusted source.
with open("scalers.json", "rb") as scaler_file:
    scalers = pickle.load(scaler_file)
    

In [70]:
# Apply each stored scaler to its column. Scaler keys may contain spaces,
# while the frame's column names use underscores.
# NOTE(review): the columns are overwritten in place, so running this cell
# again scales already-scaled values; the Age values shown by data.head()
# (all close to -3) look as if that happened -- verify.
for scaler_key in scalers:
    column = scaler_key.replace(" ", "_")
    column_values = data[column].values.reshape(-1, 1)
    data[column] = scalers[scaler_key].transform(column_values)

In [71]:
# Hold out 20% of the rows for validation, stratified on the Risk column.
train_df, val_df = train_test_split(
    data,
    test_size=0.2,
    stratify=data["Risk"],
    random_state=random_state,
)

In [72]:
train_df.shape

(800, 10)

In [73]:
data.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Duration', 'Purpose', 'Risk',
       'Saving_accounts', 'Checking_account', 'Credit_amount'],
      dtype='object')

In [74]:
data.head()

Unnamed: 0,Age,Sex,Job,Housing,Duration,Purpose,Risk,Saving_accounts,Checking_account,Credit_amount
0,-2.883041,male,skilled,own,-1.236478,radio/TV,1,little,little,-0.745131
1,-3.231145,female,skilled,own,2.248194,radio/TV,0,little,moderate,0.949817
2,-3.022283,male,unskilled and resident,own,-0.738668,education,1,little,little,-0.416562
3,-3.053225,male,skilled,free,1.750384,furniture/equipment,1,little,little,1.634247
4,-2.99134,male,skilled,free,0.256953,car,0,little,little,0.566664


In [75]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32, label_col='Risk', seed=None):
    """Build a batched ``tf.data.Dataset`` of ``(features, labels)`` pairs.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source rows; must contain ``label_col``. The frame is copied first,
        so the caller's frame keeps its label column.
    shuffle : bool
        When true, shuffle with a buffer as large as the frame.
    batch_size : int
        Number of rows per batch.
    label_col : str
        Column split off as the label. Defaults to 'Risk', the value that
        used to be hard-coded.
    seed : int or None
        Seed forwarded to ``Dataset.shuffle``. ``None`` keeps the previous
        unseeded behaviour; pass an int for a repeatable order.

    Returns
    -------
    tf.data.Dataset
        Batches of ``(dict mapping column name to values, labels)``.
    """
    features = dataframe.copy()
    labels = features.pop(label_col)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # An empty frame would request a zero-sized shuffle buffer, so skip it.
    if shuffle and len(features) > 0:
        ds = ds.shuffle(buffer_size=len(features), seed=seed)
    ds = ds.batch(batch_size)
    return ds

In [76]:
# Training batches are shuffled; validation batches keep their row order.
train_ds = df_to_dataset(train_df, shuffle=True, batch_size=32)
val_ds = df_to_dataset(val_df, shuffle=False, batch_size=32)

In [77]:
data.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Duration', 'Purpose', 'Risk',
       'Saving_accounts', 'Checking_account', 'Credit_amount'],
      dtype='object')

In [113]:
num_cols

['Age', 'Credit_amount', 'Duration']

In [78]:
# Start the feature list with one numeric column per entry of num_cols.
feature_columns = [
    tf.feature_column.numeric_column(numeric_name)
    for numeric_name in num_cols
]

In [79]:
cat_cols

['Sex', 'Job', 'Housing', 'Saving_accounts', 'Checking_account', 'Purpose']

In [80]:
# Add one indicator column per categorical feature; its vocabulary is the
# set of distinct values found in the full frame. Re-running this cell
# appends the same columns to feature_columns again.
for categorical_name in cat_cols:
    vocabulary = data[categorical_name].unique()
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_name, vocabulary
            )
        )
    )

In [81]:
# dir(feature_columns[0])

In [82]:
# Input layer that turns a batch of column values into one dense tensor.
feature_layer = tf.keras.layers.DenseFeatures(
    feature_columns=feature_columns,
)


In [83]:
# Short alias for the Keras layers namespace, used when the model is built.
layers = tf.keras.layers

In [89]:
# The feature layer feeds a single sigmoid output unit.
model = tf.keras.Sequential()
model.add(feature_layer)
model.add(layers.Dense(1, activation="sigmoid"))



In [90]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)



In [91]:
# Checkpoint callback: writes weights only, and only when val_accuracy improves.
mcp_save = tf.keras.callbacks.ModelCheckpoint(
    filepath='keras-best-best-model.h5',
    monitor='val_accuracy',
    mode="auto",
    save_best_only=True,
    save_weights_only=True,
)


In [92]:
# Train for 100 epochs, validating each epoch and checkpointing via mcp_save.
model.fit(
    x=train_ds,
    epochs=100,
    validation_data=val_ds,
    callbacks=[mcp_save],
)

Epoch 1/100


To change all layers to have dtype float32 by default, call `tf.keras.backend.set_floatx('float32')`. To change just this layer, pass dtype='float32' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100


Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7fc1d8b8ab70>

In [36]:
feature_columns

[NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Credit_amount', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Duration', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Job', vocabulary_list=('skilled', 'unskilled and resident', 'highly skilled', 'unskilled and non-resident'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Housing', vocabulary_list=('own', 'free', 'rent'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Saving_accounts', vocabulary_lis

In [93]:
# Restore the weights written by the ModelCheckpoint callback (mcp_save).
model.load_weights("keras-best-best-model.h5")

In [111]:
# Save the model to an HDF5 file.
# NOTE(review): the recorded tf.keras.models.load_model call on this file
# raised ValueError -- confirm this format round-trips for this model.
model.save("keras-best-model.hdf5")

In [112]:
# Write only the weights, as the file name says. Model.save would store the
# whole model in this file instead.
model.save_weights("keras-best-model-weights.h5")

In [101]:
# Take the first training batch as a (features, labels) tuple. A plain list
# is used because np.stack cannot combine the feature dict and the label
# tensor into a regular array (NumPy warns about it, and newer releases
# raise). inputs[0][0] is still the feature dict of that batch.
inputs = list(train_ds.take(1))

  return array(a, dtype, copy=False, order=order, subok=True)


In [118]:
model.predict(inputs[0][0]).shape

(32, 1)

In [120]:
inputs[0][0]

{'Age': <tf.Tensor: shape=(32,), dtype=float64, numpy=
 array([-3.03775397, -3.20793792, -3.18473102, -3.10737468, -3.18473102,
        -3.15378848, -2.89077692, -2.93719073, -3.20793792, -3.11511031,
        -3.13831722, -3.21567356, -3.06096088, -2.91398383, -3.04548961,
        -3.13058158, -3.14605285, -3.13831722, -3.18473102, -2.92171946,
        -3.11511031, -3.20020229, -2.94492636, -3.23114483, -3.10737468,
        -3.16152412, -3.13058158, -3.15378848, -2.94492636, -3.06869651,
        -3.22340919, -3.20793792])>,
 'Sex': <tf.Tensor: shape=(32,), dtype=string, numpy=
 array([b'male', b'male', b'female', b'male', b'female', b'male', b'male',
        b'male', b'female', b'female', b'male', b'female', b'male',
        b'male', b'male', b'male', b'female', b'male', b'female', b'male',
        b'male', b'male', b'female', b'female', b'male', b'female',
        b'female', b'male', b'female', b'male', b'male', b'male'],
       dtype=object)>,
 'Job': <tf.Tensor: shape=(32,), dtype=s

In [119]:
feature_columns

[NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Credit_amount', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Duration', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Job', vocabulary_list=('skilled', 'unskilled and resident', 'highly skilled', 'unskilled and non-resident'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Housing', vocabulary_list=('own', 'free', 'rent'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Saving_accounts', vocabulary_lis

In [116]:
# NOTE(review): in the recorded run this call raised ValueError ("... a
# weight file containing 1 layers into a model with 0 layers"), so
# new_model was never created -- TODO confirm how the saved model should be
# reloaded.
new_model = tf.keras.models.load_model("keras-best-model.hdf5")

ValueError: You are trying to load a weight file containing 1 layers into a model with 0 layers.

In [117]:
model.input_shape

AttributeError: The layer has never been called and thus has no defined input shape.

In [139]:
data["Saving_accounts"].unique()

array(['little', 'quite rich', 'rich', 'moderate'], dtype=object)

In [140]:
data["Checking_account"].unique()

array(['little', 'moderate', 'rich'], dtype=object)

In [141]:
data.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Duration', 'Purpose', 'Risk',
       'Saving_accounts', 'Checking_account', 'Credit_amount'],
      dtype='object')

In [142]:
data.Housing.unique()

array(['own', 'free', 'rent'], dtype=object)