https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#class_weights

https://medium.com/@zergtant/use-weighted-loss-function-to-solve-imbalanced-data-classification-problems-749237f38b75

In [2]:
import sys
import tensorflow as tf
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

sys.path.append("../")

In [3]:
print(tf.__version__)

2.14.1


In [10]:
from sklearn.preprocessing import LabelEncoder

filepath = 'heloc.data.csv'
df = pd.read_csv(filepath)

columns = df.columns
target = 'RiskPerformance'

enc = LabelEncoder()
df[target] = enc.fit_transform(df[target])

print(df.shape)
#columns


(10459, 24)


In [18]:
from sklearn.model_selection import train_test_split

y = df[target].values.astype(np.float32)
X = df.drop(columns=[target]).values

# This split is according to Tab Survey (Borisov et al., 2022)
train_size = 8359
X_tmp, X_test, y_tmp, y_test = train_test_split(X, y, train_size=train_size, random_state=0)
X_train, X_vald, y_train, y_vald = train_test_split(X_tmp, y_tmp, train_size=7000, random_state=0)

print(f'Training set: {X_train.shape}')
print(f'Validation set: {X_vald.shape}')
print(f'Test set: {X_test.shape}')

Training set: (7000, 23)
Validation set: (1359, 23)
Test set: (2100, 23)


In [19]:
print(np.unique(y_train, return_counts=True))
print(np.unique(y_vald, return_counts=True))
print(np.unique(y_test, return_counts=True))

(array([0., 1.], dtype=float32), array([3651, 3349], dtype=int64))
(array([0., 1.], dtype=float32), array([709, 650], dtype=int64))
(array([0., 1.], dtype=float32), array([1099, 1001], dtype=int64))


In [20]:
def array_to_dataset(data, target, shuffle=True, batch_size=128):
    ds = tf.data.Dataset.from_tensor_slices((data, target))
    if shuffle:
        ds = ds.shuffle(batch_size*2).batch(batch_size).prefetch(batch_size)
    else:
        ds = ds.batch(batch_size)
    return ds

batch_size = 512
train_ds = array_to_dataset(X_train, y_train, batch_size=batch_size)
vald_ds = array_to_dataset(X_vald, y_vald, shuffle=False, batch_size=batch_size)
test_ds = array_to_dataset(X_test, y_test, shuffle=False, batch_size=batch_size)

In [32]:
from ife import IFENetClassifier

n_features = X_train.shape[1]
_, counts = np.unique(y_train, return_counts=True)
n_classes = len(counts)
ife_num_layers = 1
clf_num_layers = 1
clf_hidden_units = [84]
reduction_layer = 'flatten'
num_att = 64
r = 4.12

print(f'n_classes: {n_classes}')
print(f'n_features: {n_features}')

ife_params = {'n_features': n_features,
              'n_outputs': n_classes,
              'num_att': num_att,
              'r': r,
              'ife_num_layers': ife_num_layers, 
              'clf_num_layers': clf_num_layers,
              'clf_hidden_units': clf_hidden_units,
              'reduction_layer': reduction_layer
             }
model = IFENetClassifier(**ife_params)
# model = model.build(input_shape=(n_features,))

n_classes: 2
n_features: 23


In [33]:
x = tf.keras.Input(shape=(n_features,))
# x = tf.keras.layers.Input(shape=input_shape)
model = tf.keras.models.Model(inputs=[x], outputs=model.call(x))
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 23)]                 0         []                            
                                                                                                  
 tf.compat.v1.shape_1 (TFOp  (2,)                         0         ['input_2[0][0]']             
 Lambda)                                                                                          
                                                                                                  
 preprocess_batch_norm (Bat  (None, 23)                   92        ['input_2[0][0]']             
 chNormalization)                                                                                 
                                                                                            

In [34]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

lr = 0.01
lr_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=lr, 
                                                              decay_steps=2000,
                                                              decay_rate=0.95,
                                                              staircase=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

checkpoint_path = 'checkpoints/ifeNet_heloc.h5'
patience = 500
callbacks = [tf.keras.callbacks.EarlyStopping(patience=patience, monitor='val_loss'),
             tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, monitor='val_accuracy')]

epochs = 500
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [35]:
saved_model_path = 'saved_model/ifeNet_heloc.h5'
model.fit(train_ds, validation_data=vald_ds, epochs=epochs, callbacks=callbacks)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500

KeyboardInterrupt: 

In [None]:
model.load_weights(checkpoint_path)
model.save_weights(saved_model_path)

In [8]:
# Best model

from ife import IFENetClassifier

n_features = X_train.shape[1]
_, counts = np.unique(y_train, return_counts=True)
n_classes = len(counts)
ife_num_layers = 1
clf_num_layers = 1
clf_hidden_units = [115]
reduction_layer = 'flatten'
num_att = 75
r = 4.15

print(f'n_classes: {n_classes}')
print(f'n_features: {n_features}')

ife_params = {'n_features': n_features,
              'n_outputs': n_classes,
              'num_att': num_att,
              'r': r,
              'ife_num_layers': ife_num_layers, 
              'clf_num_layers': clf_num_layers,
              'clf_hidden_units': clf_hidden_units,
              'reduction_layer': reduction_layer
             }
model = IFENetClassifier(**ife_params)

model.build(input_shape=(None,n_features,))
#model.summary()

path_saved_model = 'saved_model/ifeNet_cover_24.h5'
model.load_weights(path_saved_model)

n_classes: 7
n_features: 54


In [29]:
y_pred = np.empty((0,))
y_test = np.empty((0,))

for data,label in test_ds.take(2):
    y_hat = model(data)
    y_hat = np.argmax(y_hat, axis=-1)
    y_pred = np.append(y_pred, y_hat.ravel())

    label = label.numpy()
    y_test = np.append(y_test, label.ravel())

In [30]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.6767578125
[[407 122]
 [209 286]]
              precision    recall  f1-score   support

         0.0       0.66      0.77      0.71       529
         1.0       0.70      0.58      0.63       495

    accuracy                           0.68      1024
   macro avg       0.68      0.67      0.67      1024
weighted avg       0.68      0.68      0.67      1024



In [55]:
first_batch = None
for i, batch in enumerate(test_ds):
    if i == 0:  # Indexing starts at 0, so the second batch has index 1
        first_batch = batch
        break  # Once the second batch is found, break the loop

data, label = first_batch
model(data)

feat_scores = model.input_scores
feat_scores = np.mean(feat_scores, axis=(0,1))

feat_rank = {}
for col,score in zip(columns,feat_scores):
    #print(f'{col}: {score}')
    feat_rank[col] = score

df_feat_rank = pd.DataFrame(list(feat_rank.items()), columns=['Feature', 'Score'])
df_feat_rank.sort_values(by='Score', ascending=False)

Unnamed: 0,Feature,Score
0,Elevation,0.12722
20,Soil_Type7,0.060129
50,Soil_Type37,0.057631
28,Soil_Type15,0.044712
21,Soil_Type8,0.039093
10,Wilderness_Area1,0.035264
5,Horizontal_Distance_To_Roadways,0.02963
7,Hillshade_Noon,0.029422
6,Hillshade_9am,0.026024
49,Soil_Type36,0.024337


In [56]:
second_batch = None
for i, batch in enumerate(test_ds):
    if i == 1:  # Indexing starts at 0, so the second batch has index 1
        second_batch = batch
        break  # Once the second batch is found, break the loop

data,label = second_batch
model(data)

feat_scores = model.input_scores
feat_scores = np.mean(feat_scores, axis=(0,1))

feat_rank = {}
for col,score in zip(columns,feat_scores):
    #print(f'{col}: {score}')
    feat_rank[col] = score

df_feat_rank = pd.DataFrame(list(feat_rank.items()), columns=['Feature', 'Score'])
df_feat_rank.sort_values(by='Score', ascending=False)

Unnamed: 0,Feature,Score
0,Elevation,0.131297
20,Soil_Type7,0.060262
50,Soil_Type37,0.057748
28,Soil_Type15,0.044375
21,Soil_Type8,0.037752
10,Wilderness_Area1,0.03608
5,Horizontal_Distance_To_Roadways,0.029566
7,Hillshade_Noon,0.029403
49,Soil_Type36,0.024609
6,Hillshade_9am,0.023704
