https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#class_weights

https://medium.com/@zergtant/use-weighted-loss-function-to-solve-imbalanced-data-classification-problems-749237f38b75

In [1]:
import sys
import tensorflow as tf
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# sys.path.append("../")

In [2]:
print(tf.__version__)

2.14.1


In [3]:
# Utility functions
def dataframe_to_dataset(dataframe, target_columns, batch_size=128, shuffle=True):
    """Convert a pandas DataFrame into a batched ``tf.data.Dataset``.

    The dataset yields ``(features, labels)`` pairs where ``features`` is a
    dict mapping each non-target column name to an array with a trailing
    axis of size 1, and ``labels`` holds the target columns.

    Args:
        dataframe: Source ``pandas.DataFrame``. It is NOT modified; all
            work happens on a copy.
        target_columns: List of label column names. A single column name
            given as a string is also accepted and treated as ``[name]``.
        batch_size: Number of rows per batch.
        shuffle: When True, shuffle with a buffer of ``2 * batch_size`` rows,
            then batch and prefetch; otherwise only batch.

    Returns:
        A batched ``tf.data.Dataset`` of ``(feature_dict, labels)``.

    Raises:
        TypeError: If ``dataframe`` is not a ``pandas.DataFrame``.
        KeyError: If any target column is missing from ``dataframe``.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError(f'Expected a DataFrame, got {type(dataframe)}.')

    # A bare string would otherwise be iterated character by character.
    if isinstance(target_columns, str):
        target_columns = [target_columns]
    else:
        target_columns = list(target_columns)

    missing = [col for col in target_columns if col not in dataframe.columns]
    if missing:
        raise KeyError(f'Target columns not found in the DataFrame: {missing}.')

    # Copy first so that label encoding never leaks back into the caller's
    # frame (which is frequently a slice produced by train_test_split).
    df_copy = dataframe.copy()
    for target in target_columns:
        if df_copy[target].dtypes == 'object':
            # String labels become integer category codes.
            df_copy[target] = df_copy[target].astype('category').cat.codes

    labels = df_copy.loc[:, target_columns]
    df_copy.drop(columns=target_columns, inplace=True)

    # Each feature becomes an (n_rows, 1) array keyed by its column name.
    feature_arrays = {key: value.to_numpy()[:, tf.newaxis] for key, value in df_copy.items()}
    dataset = tf.data.Dataset.from_tensor_slices((feature_arrays, labels))
    if shuffle:
        dataset = dataset.shuffle(batch_size*2).batch(batch_size).prefetch(batch_size)
    else:
        dataset = dataset.batch(batch_size)
    return dataset

In [4]:
class DataConfig:
    """Plain container for the data-related settings of the IFE model.

    Attributes:
        n_outputs: Number of model outputs (classes/responses).
        categorical_column_names: Names of the columns encoded as categories.
        numerical_column_names: Names of the columns treated as numeric.
        category_output_mode: Output mode forwarded to the category encoder.
        is_normalization: Whether numeric columns are normalized.
        batch_size: Batch size stored alongside the other settings.
    """

    def __init__(
        self,
        n_outputs,
        categorical_column_names,
        numerical_column_names,
        category_output_mode='one_hot',
        is_normalization=False,
        batch_size=256,
    ):
        # Target description.
        self.n_outputs = n_outputs

        # Which input columns exist and how each group is interpreted.
        self.categorical_column_names = categorical_column_names
        self.numerical_column_names = numerical_column_names

        # Preprocessing / batching options.
        self.batch_size = batch_size
        self.category_output_mode = category_output_mode
        self.is_normalization = is_normalization
            
class ModelConfig():
    """Hyper-parameters of the IFE attention block and the classifier head.

    Args:
        num_att: Number of attention heads (leading axis of the attention kernel).
        r: Scaling factor applied to the attention kernel before exponentiation.
        clf_num_layers: Number of hidden dense layers in the classifier head.
        clf_hidden_units: Units of each hidden layer; defaults to ``[64]``.
            The sequence is copied, so instances never share (or mutate) the
            caller's list or a common default.
        reduction_layer: Name of the reduction applied before the classifier
            head (the visible classifier handles 'flatten' and 'average').
        ife_num_layers: Number of IFE layers; defaults to 1, the value that
            was previously hard-coded.
    """

    def __init__(self, num_att=16, r=1.5, clf_num_layers=1, clf_hidden_units=None,
                 reduction_layer='flatten', ife_num_layers=1):
        self.num_att = num_att
        self.r = r
        self.ife_num_layers = ife_num_layers
        self.clf_num_layers = clf_num_layers
        # None sentinel instead of a mutable default argument.
        self.clf_hidden_units = [64] if clf_hidden_units is None else list(clf_hidden_units)
        self.reduction_layer = reduction_layer

In [5]:
class Attention(tf.keras.layers.Layer):
    """Multi-head attention that scores input features through the outputs.

    The inputs are projected onto ``units`` outputs with a per-head kernel,
    normalized (sigmoid or softmax), and projected back onto the feature axis
    with the exponentiated, ``r``-scaled kernel.
    """

    def __init__(self, units, attn_norm_fn, num_att, r=2, initializer="glorot_uniform"):
        super().__init__()
        self.units = units  # number of classes/responses
        self.num_att = num_att
        self.r = r
        self.initializer = initializer
        # Anything other than 'sigmoid' falls back to a softmax normalization.
        use_sigmoid = attn_norm_fn == 'sigmoid'
        self.norm_function = (
            tf.keras.layers.Activation(activation='sigmoid')
            if use_sigmoid
            else tf.keras.layers.Softmax()
        )

    def build(self, input_shape):
        # One (n_features, units) projection per attention head.
        n_features = input_shape[-1]
        self.kernel = self.add_weight(
            shape=(self.num_att, n_features, self.units),
            initializer=self.initializer,
            trainable=True,
            name='kernel',
        )

    def call(self, inputs):
        # inputs: (batch, n_features); kernel: (num_att, n_features, units)
        # -> normalized projection of shape (num_att, batch, units).
        normalized = self.norm_function(tf.matmul(inputs, self.kernel))

        # Amplify the weights, then map back to the feature axis by
        # multiplying with the transposed (units, n_features) kernel.
        amplified = tf.math.exp(self.kernel * self.r)
        return tf.matmul(normalized, amplified, transpose_b=True)  # (num_att, batch, n_features)

class IterativeFeatureExclusion(tf.keras.layers.Layer):
    """Scores features by attending to the input with one feature zeroed at a time.

    For every feature index ``j`` there is a dedicated ``Attention`` layer and
    a mask that zeroes feature ``j``. The per-mask attention outputs are
    averaged and passed through a softmax over the feature axis.
    """

    def __init__(self, n_features, n_outputs, attn_norm_fn, num_att=8, r=2):
        super().__init__()
        self.attentions = [
            Attention(n_outputs, attn_norm_fn, num_att, r=r) for _ in range(n_features)
        ]
        # Row j of (1 - identity) is all ones except a zero at position j.
        leave_one_out = 1 - np.eye(n_features, dtype=np.int8)
        self.masks = [tf.constant(row, dtype=tf.float32) for row in leave_one_out]

    def call(self, inputs):
        # inputs: (batch, n_features); each masked copy keeps that shape and
        # each attention output is (num_att, batch, n_features).
        per_exclusion = [
            attention(inputs * mask)
            for mask, attention in zip(self.masks, self.attentions)
        ]

        # Stack on a new trailing axis -> (num_att, batch, n_features, n_features),
        # then average over the exclusion axis.
        stacked = tf.stack(per_exclusion, axis=-1)
        averaged = tf.reduce_mean(stacked, axis=[-1])  # (num_att, batch, n_features)
        return tf.nn.softmax(averaged, axis=-1)        # (num_att, batch, n_features)

class IFEModule(tf.keras.Model):
    """Base model: per-column input encoders plus the IFE attention block.

    The constructor only stores configuration. ``build_model`` must be called
    with a dataset before the model is used; it adapts the encoders and
    creates the batch-normalization and attention layers.

    NOTE(review): the lookup/encoding/normalization layers are captured inside
    lambdas stored in a plain dict, so Keras may not track their state
    (vocabularies, normalization statistics) when saving or loading weights —
    confirm before relying on checkpoints in a fresh process.
    """

    def __init__(self, data_config, model_config):
        super(IFEModule, self).__init__()
        self._attn_norm_fn = 'softmax'

        self._n_outputs = data_config.n_outputs
        self._categorical_column_names = data_config.categorical_column_names
        self._numerical_column_names = data_config.numerical_column_names
        self._category_output_mode = data_config.category_output_mode
        self._is_normalization = data_config.is_normalization

        self._num_att = model_config.num_att
        self._r = model_config.r
        self._ife_num_layers = model_config.ife_num_layers

        # Total width of the encoded feature vector; set by build_model.
        self._n_features = 0
        # Column name -> callable that encodes that column.
        self._encoder_layers = {}

        # Column name -> [start, end) slice of that column in the encoded vector.
        self.feature_indices = {}
        # Attention scores of the most recent forward pass.
        self.input_scores = None

    def _get_category_encoding_layer(self, name, dataset, dtype, max_tokens=None):
        """Return a callable that looks up and category-encodes column ``name``.

        Raises:
            TypeError: If ``dtype`` is neither ``tf.string`` nor ``tf.int64``.
        """
        feature_ds = dataset.map(lambda x, y: x[name])
        if dtype == tf.string:
            index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
        elif dtype == tf.int64:
            index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)
        else:
            # Previously fell through and failed later on an unbound `index`.
            raise TypeError(
                f"Categorical column '{name}' has unsupported dtype {dtype}; "
                f"expected tf.string or tf.int64."
            )

        # Learn the vocabulary from the data, then size the encoder to it.
        index.adapt(feature_ds)
        encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size(), output_mode=self._category_output_mode, name=name)
        return lambda feature: encoder(index(feature))

    def _get_numerical_encoding_layer(self, name, dataset):
        """Return a callable that normalizes column ``name`` or casts it to float32."""
        if not self._is_normalization:
            return lambda feature: tf.cast(feature, dtype=tf.float32)

        feature_ds = dataset.map(lambda x, y: x[name])
        encoder = tf.keras.layers.Normalization(axis=None)
        encoder.adapt(feature_ds)
        return lambda feature: encoder(feature)

    def _create_encoder_layers(self, dataset, feature_names, feature_dtypes):
        """Create one encoder per configured column and record its output slice.

        Columns that are in neither the categorical nor the numerical name list
        get no encoder and are therefore ignored by the model.
        """
        for name in feature_names:
            if name in self._categorical_column_names:
                self._encoder_layers[name] = self._get_category_encoding_layer(
                    name, dataset, feature_dtypes[name])
            elif name in self._numerical_column_names:
                self._encoder_layers[name] = self._get_numerical_encoding_layer(name, dataset)

        if not self._encoder_layers:
            return

        # A single example batch is enough to probe every encoder's output
        # width; there is no need to build one dataset iterator per feature.
        example_features, _ = next(iter(dataset))
        st = 0
        for name, layer in self._encoder_layers.items():
            example_output = layer(example_features[name])
            feature_size = example_output.shape[-1]  # width of the encoded column
            ed = st + feature_size
            self.feature_indices[name] = [st, ed]
            st = ed
        self._n_features = st

    def build_model(self, dataset):
        """Adapt the encoders to ``dataset`` and create the IFE layers.

        Args:
            dataset: A ``tf.data.Dataset`` yielding ``(feature_dict, labels)``.

        Raises:
            TypeError: If ``dataset`` is not a ``tf.data.Dataset``.
        """
        if not isinstance(dataset, tf.data.Dataset):
            # Fail loudly: continuing would leave the model without its layers.
            raise TypeError(f'Expected a tf.data.Dataset, got {type(dataset)}.')

        feature_dtypes = {key: spec.dtype for key, spec in dataset.element_spec[0].items()}
        feature_names = list(feature_dtypes.keys())

        self._create_encoder_layers(dataset, feature_names, feature_dtypes)

        self._preprocess = tf.keras.layers.BatchNormalization(name='preprocess_batch_norm')
        self._ife_attn = IterativeFeatureExclusion(self._n_features, self._n_outputs, self._attn_norm_fn, self._num_att, self._r)

class IFENetClassifier(IFEModule):
    """IFE-based classifier: encoded inputs -> attention re-weighting -> dense head.

    ``build_model(dataset)`` (inherited) must be called before the first
    forward pass so that the encoders and the attention block exist.
    """

    def __init__(self, data_config, model_config):
        """Create the classifier head.

        Raises:
            ValueError: If ``model_config.reduction_layer`` is not 'flatten' or
                'average', or if fewer hidden-unit entries than hidden layers
                are configured.
        """
        super(IFENetClassifier, self).__init__(data_config, model_config)

        self.target_activation = 'softmax'
        self.data_config = data_config
        self.model_config = model_config

        self._clf_num_layers = self.model_config.clf_num_layers
        self._clf_hidden_units = self.model_config.clf_hidden_units
        self._reduction = self.model_config.reduction_layer

        # Validate up front; an unknown reduction used to surface only at call
        # time as a missing attribute.
        if self._reduction not in ('flatten', 'average'):
            raise ValueError(
                f"Unknown reduction_layer '{self._reduction}'; expected 'flatten' or 'average'."
            )
        if len(self._clf_hidden_units) < self._clf_num_layers:
            raise ValueError(
                f'clf_hidden_units has {len(self._clf_hidden_units)} entries but '
                f'clf_num_layers is {self._clf_num_layers}.'
            )

        # build the predictive layers: Dense(relu) + BatchNormalization per hidden layer
        clf_hidden_layers = []
        for units in self._clf_hidden_units[:self._clf_num_layers]:
            clf_hidden_layers.append(tf.keras.layers.Dense(units=units, activation='relu'))
            clf_hidden_layers.append(tf.keras.layers.BatchNormalization())

        if self._reduction == 'flatten':
            self._reduction_layer = tf.keras.layers.Flatten()
        else:  # 'average'
            self._reduction_layer = tf.keras.layers.GlobalAveragePooling1D()

        self.clf_hidden_layers = tf.keras.Sequential(clf_hidden_layers, name='fc_hidden_layers')
        self.fc_out = tf.keras.layers.Dense(units=self._n_outputs, activation=self.target_activation, name='fc_out')

    def get_feature_importance(self):
        """Rank the input columns by the attention scores of the last forward pass.

        Scores are averaged over attention heads and batch rows. Because a
        categorical column occupies several positions of the encoded feature
        vector, the scores inside each column's slice are summed so that every
        original column gets exactly one score.

        Returns:
            A DataFrame with columns ``['Feature', 'Score']`` sorted by score in
            descending order.

        Raises:
            RuntimeError: If the model has not been called on a batch yet.

        NOTE(review): the scores presumably need to come from an eager call
        (``model(batch)``); after ``fit`` alone they may be symbolic — confirm.
        """
        if self.input_scores is None:
            raise RuntimeError(
                'No attention scores available; call the model on a batch first.'
            )

        feat_scores = np.mean(self.input_scores, axis=(0,1))  # (n_features,)

        # Map encoded positions back to column names via the recorded slices.
        feat_rank = [
            (name, float(np.sum(feat_scores[st:ed])))
            for name, (st, ed) in self.feature_indices.items()
        ]

        df_feat_rank = pd.DataFrame(feat_rank, columns=['Feature', 'Score'])
        # sort_values is not in-place: return its result.
        return df_feat_rank.sort_values(by='Score', ascending=False)

    def call(self, inputs):
        """Forward pass.

        Args:
            inputs: Dict mapping column names to batched feature tensors.

        Returns:
            Output tensor of shape (batch, n_outputs) with softmax activation.
        """
        # preprocessing the inputs: encode each column and concatenate
        features = [self._encoder_layers[name](inputs[name]) for name in self._encoder_layers]
        features = tf.concat(features, axis=-1)

        x = self._preprocess(features)  # (batch, n_features)

        # Keep the scores around for get_feature_importance().
        self.input_scores = self._ife_attn(x)  # (num_att, batch, n_features)

        # (batch, n_features) broadcasts against (num_att, batch, n_features).
        x = x * self.input_scores

        x = tf.transpose(x, perm=(1,0,2))  # (batch, num_att, n_features)
        x = self._reduction_layer(x)

        x = self.clf_hidden_layers(x)
        outputs = self.fc_out(x)
        return outputs


In [6]:
# Column layout of the Adult data file; the file is read with `names=columns`,
# so these names are supplied here rather than taken from the file.
features = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
cat_col_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
num_col_names = ['age', 'fnlwgt', 'education_num', 'capital-gain', 'capital-loss', 'hours-per-week']
target_columns = 'income'
columns = features + [target_columns]
filepath = 'adult.data'
df = pd.read_csv(filepath, names=columns)

print(df.shape)
# Replace all ' ?' with NaN, and remove rows with missing values (NaN).
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.x.
df.replace(' ?', np.nan, inplace=True)
df.dropna(inplace=True)

(32561, 15)


Check for missing values

In [7]:
#print(np.sum(df.isna(), axis=0))

In [8]:
from sklearn.model_selection import train_test_split

#y = np_data[:, 104:106]
#X = np_data[:,:106]
# Two-stage split with a fixed seed: 70% of the rows go to `tmp` and the rest
# to `test`; `tmp` is then split 90/10 into `train` and `vald`.
train_size = 0.7
tmp, test = train_test_split(df, train_size=train_size, random_state=0)
train, vald = train_test_split(tmp, train_size=0.9, random_state=0)

print(f'Training set: {train.shape}')
print(f'Validation set: {vald.shape}')
print(f'Test set: {test.shape}')

# Only the training dataset is shuffled; validation and test keep row order.
# The target is passed as a one-element list of column names.
batch_size = 1024
train_ds = dataframe_to_dataset(train, [target_columns], batch_size=batch_size)
vald_ds = dataframe_to_dataset(vald, [target_columns], shuffle=False, batch_size=batch_size)
test_ds = dataframe_to_dataset(test, [target_columns], shuffle=False, batch_size=batch_size)

Training set: (19001, 15)
Validation set: (2112, 15)
Test set: (9049, 15)


In [9]:
# Two output classes; all other DataConfig / ModelConfig options keep their defaults.
data_config = DataConfig(n_outputs=2, 
                         categorical_column_names=cat_col_names, 
                         numerical_column_names=num_col_names)
model_config = ModelConfig()

model = IFENetClassifier(data_config, model_config)
# Adapts the input encoders on the training data and creates the IFE layers.
model.build_model(train_ds)

In [10]:
# Smoke test: run one training batch through the model before compiling it.
[(data, label)] = train_ds.take(1)
model(data)

<tf.Tensor: shape=(1024, 2), dtype=float32, numpy=
array([[1.0044001e-08, 1.0000000e+00],
       [7.5824543e-15, 1.0000000e+00],
       [1.1501485e-18, 1.0000000e+00],
       ...,
       [1.3705589e-10, 1.0000000e+00],
       [7.3535076e-09, 1.0000000e+00],
       [5.5717450e-08, 1.0000000e+00]], dtype=float32)>

In [11]:
# x = tf.keras.Input(shape=(n_features,))
# x = tf.keras.layers.Input(shape=input_shape)
# model = tf.keras.models.Model(inputs=[x], outputs=model.call(x))
# model.summary()

In [34]:
# Integer class labels against the model's softmax probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

# Staircase decay: the learning rate is multiplied by 0.95 every 2000 steps.
lr = 0.01
lr_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=lr,
                                                              decay_steps=2000,
                                                              decay_rate=0.95,
                                                              staircase=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

checkpoint_path = 'checkpoints/ifeNet_adult.h5'
patience = 2
# `save_best_only=True` is what makes `monitor` take effect: without it the
# checkpoint is overwritten after every epoch and the weights reloaded later
# would be those of the last epoch rather than the best one.
callbacks = [tf.keras.callbacks.EarlyStopping(patience=patience, monitor='val_loss'),
             tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True,
                                                monitor='val_accuracy', mode='max',
                                                save_best_only=True)]

epochs = 10
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [35]:
# Destination for the final weights (written in a later cell).
saved_model_path = 'saved_model/ifeNet_adult.h5'
# Train with the early-stopping and checkpoint callbacks configured above.
model.fit(train_ds, validation_data=vald_ds, epochs=epochs, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10


<keras.src.callbacks.History at 0x267a5499520>

In [None]:
# Restore the weights written by the checkpoint callback, then store a copy
# under `saved_model_path`.
# NOTE(review): presumably the 'saved_model/' directory must already exist — confirm.
model.load_weights(checkpoint_path)
model.save_weights(saved_model_path)

In [None]:
# NOTE(review): this cell has no execution count and looks stale:
#   * `n_features` is only assigned in a later cell of this file;
#   * `fe_train` is not defined anywhere in the visible code;
#   * the IFENetClassifier defined in this file indexes its inputs by column
#     name, so a plain random tensor is presumably not a valid input — confirm.
inputs = tf.random.normal((4, n_features))
tmp = model(inputs)

fe_train.get_feature_scores(model.input_scores)

In [14]:
# Best model
#
# NOTE(review): this cell appears to belong to a different experiment:
#   * the import below rebinds `IFENetClassifier`, replacing the class defined
#     earlier in this file; the imported class is constructed from keyword
#     hyper-parameters instead of (data_config, model_config);
#   * `X_train` and `y_train` are not defined in any visible cell;
#   * `path_saved_model` is assigned but not used in the visible code.

from ife import IFENetClassifier

n_features = X_train.shape[1]
# Number of classes = number of distinct label values.
_, counts = np.unique(y_train, return_counts=True)
n_classes = len(counts)
ife_num_layers = 1
clf_num_layers = 1
clf_hidden_units = [128]
reduction_layer = 'flatten'
num_att = 8
r = 5.6498

print(f'n_classes: {n_classes}')
print(f'n_features: {n_features}')

ife_params = {'n_features': n_features,
              'n_outputs': n_classes,
              'num_att': num_att,
              'r': r,
              'ife_num_layers': ife_num_layers, 
              'clf_num_layers': clf_num_layers,
              'clf_hidden_units': clf_hidden_units,
              'reduction_layer': reduction_layer
             }
model = IFENetClassifier(**ife_params)

# Build with a symbolic batch dimension so the weights exist before loading.
model.build(input_shape=(None,n_features,))
#model.summary()

path_saved_model = 'saved_model/ifeNet_cover_24.h5'
checkpoint_path = 'checkpoints/weights.hdf5'
model.load_weights(checkpoint_path)

n_classes: 2
n_features: 106


In [24]:
# Collect hard predictions and ground-truth labels batch by batch, then join
# them once at the end (cheaper than growing arrays with np.append).
pred_batches = []
true_batches = []

# NOTE: only the first two batches of `test_ds` are evaluated.
for data,label in test_ds.take(2):
    y_hat = model(data)
    y_hat = np.argmax(y_hat, axis=-1)
    pred_batches.append(y_hat.ravel())

    label = label.numpy()
    # One-hot labels are reduced to class ids; labels that are already class
    # ids (e.g. a single integer column of shape (batch, 1)) must be kept as
    # they are, because argmax over a single column is always 0.
    if label.ndim > 1 and label.shape[-1] > 1:
        label = np.argmax(label, axis=-1)
    true_batches.append(label.ravel())

y_pred = np.concatenate(pred_batches) if pred_batches else np.empty((0,))
y_test = np.concatenate(true_batches) if true_batches else np.empty((0,))


In [25]:
# Accuracy, confusion matrix and per-class report for the collected
# `y_test` / `y_pred` arrays.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

1.0
[[750   0]
 [  0 250]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       750
         1.0       1.00      1.00      1.00       250

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



In [55]:
# Grab the first batch of the test set.
first_batch = None
for i, batch in enumerate(test_ds):
    if i == 0:  # Indexing starts at 0, so the first batch has index 0
        first_batch = batch
        break  # Once the first batch is found, break the loop

# Forward pass to refresh `model.input_scores` for this batch.
data, label = first_batch
model(data)

# Average the attention scores over the first two axes (heads and batch rows).
feat_scores = model.input_scores
feat_scores = np.mean(feat_scores, axis=(0,1))

# NOTE(review): zip() pairs names and scores positionally and stops at the
# shorter sequence; this assumes `columns` lines up one-to-one with the
# model's feature axis — confirm for the dataset in use.
feat_rank = {}
for col,score in zip(columns,feat_scores):
    #print(f'{col}: {score}')
    feat_rank[col] = score

df_feat_rank = pd.DataFrame(list(feat_rank.items()), columns=['Feature', 'Score'])
# The sorted frame is the cell's displayed value; `df_feat_rank` itself stays unsorted.
df_feat_rank.sort_values(by='Score', ascending=False)

Unnamed: 0,Feature,Score
0,Elevation,0.12722
20,Soil_Type7,0.060129
50,Soil_Type37,0.057631
28,Soil_Type15,0.044712
21,Soil_Type8,0.039093
10,Wilderness_Area1,0.035264
5,Horizontal_Distance_To_Roadways,0.02963
7,Hillshade_Noon,0.029422
6,Hillshade_9am,0.026024
49,Soil_Type36,0.024337


In [56]:
# Same ranking as for the first batch, computed on the second test batch.
second_batch = None
for i, batch in enumerate(test_ds):
    if i == 1:  # Indexing starts at 0, so the second batch has index 1
        second_batch = batch
        break  # Once the second batch is found, break the loop

# Forward pass to refresh `model.input_scores` for this batch.
data,label = second_batch
model(data)

# Average the attention scores over the first two axes (heads and batch rows).
feat_scores = model.input_scores
feat_scores = np.mean(feat_scores, axis=(0,1))

# NOTE(review): zip() pairs names and scores positionally and stops at the
# shorter sequence; this assumes `columns` lines up one-to-one with the
# model's feature axis — confirm for the dataset in use.
feat_rank = {}
for col,score in zip(columns,feat_scores):
    #print(f'{col}: {score}')
    feat_rank[col] = score

df_feat_rank = pd.DataFrame(list(feat_rank.items()), columns=['Feature', 'Score'])
# The sorted frame is the cell's displayed value; `df_feat_rank` itself stays unsorted.
df_feat_rank.sort_values(by='Score', ascending=False)

Unnamed: 0,Feature,Score
0,Elevation,0.131297
20,Soil_Type7,0.060262
50,Soil_Type37,0.057748
28,Soil_Type15,0.044375
21,Soil_Type8,0.037752
10,Wilderness_Area1,0.03608
5,Horizontal_Distance_To_Roadways,0.029566
7,Hillshade_Noon,0.029403
49,Soil_Type36,0.024609
6,Hillshade_9am,0.023704


In [18]:
def discretize_colum(data_clm, num_values=10):
    """ Discretize a column by quantiles

    Every value is replaced by the index of the rank-based bin it falls into.
    The rank of a value is the number of strictly smaller values in the
    column, so equal values always land in the same bin.

    Args:
        data_clm: One-dimensional column of sortable values (list, array or
            pandas Series).
        num_values: Number of bins; bin indices lie in ``0 .. num_values - 1``.

    Returns:
        Integer bin indices with one entry per input value. A pandas Series
        (same index and name) is returned for Series input, otherwise a
        NumPy array.

    Raises:
        ValueError: If ``num_values`` is not positive.
    """
    if num_values <= 0:
        raise ValueError(f'num_values must be positive, got {num_values}.')

    values = np.asarray(data_clm)
    # np.argsort alone yields the sorting permutation, not the rank of each
    # element; searching every value in the sorted column gives its rank.
    ranks = np.searchsorted(np.sort(values), values, side='left')
    bin_sz = (len(values) / num_values) + 1  # make sure all bins are in range 0-(num_values-1)
    q = (ranks // bin_sz).astype(np.int64)

    if isinstance(data_clm, pd.Series):
        return pd.Series(q, index=data_clm.index, name=data_clm.name)
    return q

# Re-load the Adult data set, this time straight from the UCI archive URL.
url_data = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

features = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
# NOTE: rebinds the global `label` that earlier cells use for batch labels.
label = "income"
columns = features + [label]
df = pd.read_csv(url_data, names=columns)

# The triple-quoted block below is a string expression, not executed code: it
# keeps a disabled discretization recipe and is echoed as the cell's output.
'''
# Fill NaN with something better?
df.fillna(0, inplace=True)
if True:
    columns_to_discr = [('age', 10), ('fnlwgt', 25), ('capital-gain', 10), ('capital-loss', 10),
                        ('hours-per-week', 10)]
    for clm, nvals in columns_to_discr:
        df[clm] = discretize_colum(df[clm], num_values=nvals)
        df[clm] = df[clm].astype(int).astype(str)
    df['education_num'] = df['education_num'].astype(int).astype(str)
    cat_idx = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
X = df[features].to_numpy()
y = df[label].to_numpy()'''

"\n# Fill NaN with something better?\ndf.fillna(0, inplace=True)\nif True:\n    columns_to_discr = [('age', 10), ('fnlwgt', 25), ('capital-gain', 10), ('capital-loss', 10),\n                        ('hours-per-week', 10)]\n    for clm, nvals in columns_to_discr:\n        df[clm] = discretize_colum(df[clm], num_values=nvals)\n        df[clm] = df[clm].astype(int).astype(str)\n    df['education_num'] = df['education_num'].astype(int).astype(str)\n    cat_idx = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]\nX = df[features].to_numpy()\ny = df[label].to_numpy()"

In [19]:
df.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [20]:
df.fillna(0, inplace=True)

In [22]:
df.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
