https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#class_weights

https://medium.com/@zergtant/use-weighted-loss-function-to-solve-imbalanced-data-classification-problems-749237f38b75

In [1]:
import sys
import tensorflow as tf
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

sys.path.append("../tf_ifenet")

In [2]:
print(tf.__version__)

2.14.1


In [3]:
from model import IFENetClassifier
from config import DataConfig, ModelConfig
from utility import dataframe_to_dataset

In [4]:
filepath = 'heloc.data.csv'
df = pd.read_csv(filepath)

columns = df.columns
target_columns = ['RiskPerformance']

num_col_names = ['ExternalRiskEstimate', 'MSinceOldestTradeOpen', 'MSinceMostRecentTradeOpen',
                 'AverageMInFile', 'NumSatisfactoryTrades', 'NumTrades60Ever2DerogPubRec', 
                 'NumTrades90Ever2DerogPubRec', 'PercentTradesNeverDelq', 'MSinceMostRecentDelq',
                 'MaxDelq2PublicRecLast12M', 'MaxDelqEver', 'NumTotalTrades',
                 'NumTradesOpeninLast12M', 'PercentInstallTrades', 'MSinceMostRecentInqexcl7days', 
                 'NumInqLast6M', 'NumInqLast6Mexcl7days', 'NetFractionRevolvingBurden', 
                 'NetFractionInstallBurden', 'NumRevolvingTradesWBalance', 'NumInstallTradesWBalance',
                 'NumBank2NatlTradesWHighUtilization', 'PercentTradesWBalance']
cat_col_names = []
df.shape

(10459, 24)

In [5]:
from sklearn.model_selection import train_test_split

# This split is according to Tab Survey (Borisov et al., 2022)
train_size = 8359
tmp, test = train_test_split(df, train_size=train_size, random_state=0)
train, vald = train_test_split(tmp, train_size=7000, random_state=0)

print(f'Training set: {train.shape}')
print(f'Validation set: {vald.shape}')
print(f'Test set: {test.shape}')

batch_size = 256
train_ds = dataframe_to_dataset(train, target_columns, batch_size=batch_size)
vald_ds = dataframe_to_dataset(vald, target_columns, shuffle=False, batch_size=batch_size)
test_ds = dataframe_to_dataset(test, target_columns, shuffle=False, batch_size=batch_size)

Training set: (7000, 24)
Validation set: (1359, 24)
Test set: (2100, 24)


In [6]:
data_config = DataConfig(categorical_column_names=cat_col_names, 
                         numerical_column_names=num_col_names,
                         category_output_mode='one_hot',
                         is_normalization=False)
model_config = ModelConfig(num_att=16,
                           r=3.5,
                           clf_num_layers=1,
                           clf_hidden_units=[32],
                           reduction_layer='flatten')

model = IFENetClassifier(data_config, model_config)
model.build_model(train_ds)

In [7]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

lr = 0.01
lr_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=lr, 
                                                              decay_steps=2000,
                                                              decay_rate=0.95,
                                                              staircase=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

checkpoint_path = 'checkpoints/ifeNet_heloc.h5'
patience = 500
callbacks = [tf.keras.callbacks.EarlyStopping(patience=patience, monitor='val_loss'),
             tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, monitor='val_accuracy')]

epochs = 5
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [8]:
saved_model_path = 'saved_model/ifeNet_heloc.keras'
model.fit(train_ds, validation_data=vald_ds, epochs=epochs, callbacks=callbacks)
model.load_weights(checkpoint_path)
model.save(saved_model_path)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [9]:
ifenet = tf.keras.models.load_model(saved_model_path, safe_mode=False)

In [10]:
y_pred = np.empty((0,))
y_test = np.empty((0,))

for data,label in test_ds.take(2):
    y_hat = ifenet(data)
    y_hat = np.argmax(y_hat, axis=-1)
    y_pred = np.append(y_pred, y_hat.ravel())

    label = label.numpy()
    y_test = np.append(y_test, label.ravel())

In [11]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.646484375
[[110 153]
 [ 28 221]]
              precision    recall  f1-score   support

         0.0       0.80      0.42      0.55       263
         1.0       0.59      0.89      0.71       249

    accuracy                           0.65       512
   macro avg       0.69      0.65      0.63       512
weighted avg       0.70      0.65      0.63       512



In [12]:
ifenet(next(iter(test_ds.map(lambda x,y: x))))
df = ifenet.get_feature_importance()
df

Unnamed: 0,Feature,Score
7,PercentTradesNeverDelq[0],0.077441
9,MaxDelq2PublicRecLast12M[0],0.056392
0,ExternalRiskEstimate[0],0.056269
21,NumBank2NatlTradesWHighUtilization[0],0.04789
14,MSinceMostRecentInqexcl7days[0],0.045895
3,AverageMInFile[0],0.045642
22,PercentTradesWBalance[0],0.045475
20,NumInstallTradesWBalance[0],0.044967
16,NumInqLast6Mexcl7days[0],0.044369
19,NumRevolvingTradesWBalance[0],0.04256
