https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#class_weights

https://medium.com/@zergtant/use-weighted-loss-function-to-solve-imbalanced-data-classification-problems-749237f38b75

In [2]:
import sys
import tensorflow as tf
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# sys.path.append("../")

In [3]:
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

2.14.1


In [4]:
# Utility functions

def dataframe_to_dataset(dataframe, target, batch_size=128, shuffle=True):
    """Convert a pandas DataFrame into a batched ``tf.data.Dataset``.

    Every dataset element is a ``(features, labels)`` pair: ``features`` is a
    dict mapping each non-target column name to a tensor with a trailing axis
    of size 1, and ``labels`` holds the values of the ``target`` column(s).

    Args:
        dataframe: Source frame. It is not modified.
        target: Column label (or list of labels) to split off as the labels.
        batch_size: Number of rows per batch.
        shuffle: When True, rows are shuffled before batching with a shuffle
            buffer of ``batch_size * 2`` rows.

    Returns:
        The batched and prefetched dataset.
    """
    feature_frame = dataframe.drop(columns=target)  # returns a new frame
    labels = dataframe.loc[:, target]
    # Add a trailing axis so each scalar feature arrives with shape (1,).
    feature_arrays = {key: value.to_numpy()[:, tf.newaxis] for key, value in feature_frame.items()}
    dataset = tf.data.Dataset.from_tensor_slices((feature_arrays, labels))
    if shuffle:
        dataset = dataset.shuffle(batch_size * 2)
    # prefetch() counts elements of the *batched* dataset, so a buffer of
    # `batch_size` would mean that many whole batches; let tf.data tune it.
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

def _get_category_encoding_layer(name, dataset, dtype, category_output_mode='one_hot', max_tokens=None):
    """Build an encoder for one categorical feature.

    A lookup layer (``StringLookup`` for ``tf.string``, ``IntegerLookup`` for
    ``tf.int64``) is adapted on the values of feature ``name`` in ``dataset``
    and chained with a ``CategoryEncoding`` layer sized to the vocabulary.

    Args:
        name: Feature key in the dataset's feature dict; also used as the
            name of the ``CategoryEncoding`` layer.
        dataset: Dataset yielding ``(features_dict, labels)`` elements.
        dtype: ``tf.string`` or ``tf.int64``.
        category_output_mode: ``output_mode`` passed to ``CategoryEncoding``.
        max_tokens: ``max_tokens`` passed to the lookup layer.

    Returns:
        A callable mapping a feature tensor to its encoded tensor.

    Raises:
        ValueError: If ``dtype`` is neither ``tf.string`` nor ``tf.int64``.
    """
    if dtype == tf.string:
        index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
    elif dtype == tf.int64:
        index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)
    else:
        raise ValueError(f"Unsupported dtype {dtype!r} for categorical feature {name!r}")

    # Learn the vocabulary from this feature's values only.
    feature_ds = dataset.map(lambda x, y: x[name])
    index.adapt(feature_ds)
    encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size(), output_mode=category_output_mode, name=name)
    # Return a callable: there is no `feature` tensor to apply the layers to
    # yet; the caller supplies it later.
    return lambda feature: encoder(index(feature))

def _get_numerical_encoding_layer(name, dataset, is_normalization):
    """Build an encoder for one numerical feature.

    Args:
        name: Feature key in the dataset's feature dict.
        dataset: Dataset yielding ``(features_dict, labels)`` elements.
        is_normalization: When True, a ``Normalization(axis=None)`` layer is
            adapted on the feature's values; otherwise the feature is only
            cast to ``tf.float32``.

    Returns:
        A callable mapping a feature tensor to its encoded tensor.
    """
    if not is_normalization:
        return lambda feature: tf.cast(feature, dtype=tf.float32)

    feature_ds = dataset.map(lambda x, y: x[name])
    encoder = tf.keras.layers.Normalization(axis=None)
    encoder.adapt(feature_ds)
    # Return a callable: there is no `feature` tensor to apply the layer to
    # yet; the caller supplies it later.
    return lambda feature: encoder(feature)

def _create_input_layer(feature_names, feature_dtypes, categorical_column_names, numerical_column_names):
    """Create one ``tf.keras.Input`` of shape ``(1,)`` per feature.

    Categorical features keep their ``tf.string`` / ``tf.int64`` dtype;
    numerical features (any integer or floating dtype) are declared as
    ``tf.float32``. A feature listed as both is treated as categorical.

    Args:
        feature_names: Iterable of feature names to create inputs for.
        feature_dtypes: Mapping of feature name to its dataset dtype.
        categorical_column_names: Names of the categorical features.
        numerical_column_names: Names of the numerical features.

    Returns:
        Dict mapping feature name to its ``tf.keras.Input`` tensor.

    Raises:
        ValueError: If a feature is in neither list, or its dtype is not
            supported for the list it is in.
    """
    all_inputs = {}
    for name in feature_names:
        source_dtype = tf.as_dtype(feature_dtypes[name])
        if name in categorical_column_names and source_dtype in (tf.string, tf.int64):
            dtype = source_dtype
        elif name in numerical_column_names and (source_dtype.is_integer or source_dtype.is_floating):
            dtype = tf.float32
        else:
            # Without this branch `dtype` would be unbound on the first
            # feature, or silently reused from the previous one.
            raise ValueError(
                f"Feature {name!r} with dtype {source_dtype.name} is not a supported "
                "categorical (string/int64) or numerical (integer/float) column"
            )
        all_inputs[name] = tf.keras.Input(shape=(1,), dtype=dtype, name=name)

    return all_inputs

def _create_encoder_layers(dataset, feature_dtypes, categorical_column_names, numerical_column_names, category_output_mode, is_normalization):
    """Build the per-feature encoders for all categorical and numerical columns.

    Returns:
        A ``(categorical, numerical)`` pair of dicts, each mapping a column
        name to whatever the corresponding ``_get_*_encoding_layer`` helper
        returned for that column.
    """
    categorical = {
        column: _get_category_encoding_layer(column, dataset, feature_dtypes[column], category_output_mode)
        for column in categorical_column_names
    }
    numerical = {
        column: _get_numerical_encoding_layer(column, dataset, is_normalization)
        for column in numerical_column_names
    }
    return categorical, numerical

def encode_features(dataset, categorical_column_names, numerical_column_names, category_output_mode='one_hot', is_normalization=False):
    """Create Keras inputs for every feature in ``dataset`` and encode them.

    Args:
        dataset: Dataset yielding ``(features_dict, labels)`` elements; its
            element spec determines the feature names and dtypes.
        categorical_column_names: Names of the categorical features.
        numerical_column_names: Names of the numerical features.
        category_output_mode: Output mode for the categorical encoders.
        is_normalization: Whether numerical features are normalized.

    Returns:
        A tuple ``(input_layers, concatenate_layers, feature_indices)``:
        the dict of ``tf.keras.Input`` tensors, the tensor obtained by
        concatenating all encoded features, and a dict mapping each feature
        name to its ``[start, end]`` offsets along the last axis of that
        tensor.

    Raises:
        ValueError: If a feature has no encoder (it is in neither column list).
    """
    # Read the spec from the dataset that was passed in, not from a notebook global.
    feature_dtypes = {key: spec.dtype for key, spec in dataset.element_spec[0].items()}
    feature_names = list(feature_dtypes)

    input_layers = _create_input_layer(feature_names, feature_dtypes, categorical_column_names, numerical_column_names)
    cat_encoder_layers, num_encoder_layers = _create_encoder_layers(dataset, feature_dtypes,
                                                                    categorical_column_names, numerical_column_names,
                                                                    category_output_mode, is_normalization)

    feature_indices = {}
    encoded_features = []
    st = 0
    for name, input_layer in input_layers.items():
        if name in cat_encoder_layers:
            encoded_feature = cat_encoder_layers[name](input_layer)
        elif name in num_encoder_layers:
            encoded_feature = num_encoder_layers[name](input_layer)
        else:
            # Previously the encoding of the preceding feature was silently reused.
            raise ValueError(f"No encoder available for feature {name!r}")

        ed = st + encoded_feature.shape[-1]
        feature_indices[name] = [st, ed]
        st = ed
        encoded_features.append(encoded_feature)

    if len(encoded_features) == 1:
        # Concatenate needs at least two inputs.
        concatenate_layers = encoded_features[0]
    else:
        # The layer is configured first and then applied to the tensor list;
        # passing the list to the constructor would be taken as `axis`.
        concatenate_layers = tf.keras.layers.Concatenate(name='concatenate')(encoded_features)
    return input_layers, concatenate_layers, feature_indices

In [5]:
class DataConfig:
    """Plain container for the dataset-related settings of an experiment.

    Every constructor argument is stored unchanged on the instance under the
    same name.
    """

    def __init__(self, n_features, n_outputs, categorical_column_names, numerical_column_names, target,
                 category_output_mode='one_hot', is_normalization=False, batch_size=256):
        # Problem dimensions.
        self.n_features, self.n_outputs = n_features, n_outputs

        # Column roles.
        self.categorical_column_names = categorical_column_names
        self.numerical_column_names = numerical_column_names
        self.target = target

        # Pre-processing and batching options.
        self.category_output_mode = category_output_mode
        self.is_normalization = is_normalization
        self.batch_size = batch_size

class ModelConfig():
    """Plain container for the model hyper-parameters of an experiment.

    Every constructor argument is stored unchanged on the instance under the
    same name.

    Args:
        num_att: Stored as ``num_att``.
        r: Stored as ``r``.
        clf_num_layers: Stored as ``clf_num_layers``.
        clf_hidden_units: Stored as ``clf_hidden_units``.
        reduction_layer: Stored as ``reduction_layer``.
        ife_num_layers: Stored as ``ife_num_layers``. It used to be fixed at
            1; it is now configurable and still defaults to 1.
    """

    def __init__(self, num_att=16, r=1.5, clf_num_layers=1, clf_hidden_units=64, reduction_layer='flatten',
                 ife_num_layers=1):
        self.num_att = num_att
        self.r = r
        self.ife_num_layers = ife_num_layers
        self.clf_num_layers = clf_num_layers
        self.clf_hidden_units = clf_hidden_units
        self.reduction_layer = reduction_layer

In [27]:
class MyModel(tf.keras.Model):
    """Dense classifier that encodes a dict of raw tabular features itself.

    ``build_model`` must be called with a dataset before the model is
    invoked: it adapts one encoder per feature, records where each encoded
    feature lands in the concatenated vector, and creates the batch-norm
    layer used by ``call``.
    """

    def __init__(self):
        super().__init__()

        self.hidden1 = tf.keras.layers.Dense(units=84, activation='relu', name='hidden1')
        self.hidden2 = tf.keras.layers.Dense(units=48, activation='relu', name='hidden2')
        self.output_layer = tf.keras.layers.Dense(units=7, activation='softmax', name='output_layer')
        # feature name -> [start, end] offsets in the concatenated encoding
        self.feature_indices = {}
        # feature name -> callable that encodes that feature's tensor
        self.encoder_layers = {}
        # total width of the concatenated encoding; set by build_model
        self.n_features = 0
        # batch-norm layer applied to the concatenated encoding; set by build_model
        self.preprocess = None

    def _get_category_encoding_layer(self, name, dataset, dtype, category_output_mode='one_hot', max_tokens=None):
        """Return a callable that encodes categorical feature ``name``.

        A ``StringLookup`` (``tf.string``) or ``IntegerLookup`` (``tf.int64``)
        is adapted on the feature's values in ``dataset`` and chained with a
        ``CategoryEncoding`` layer sized to the adapted vocabulary.

        Raises:
            ValueError: If ``dtype`` is neither ``tf.string`` nor ``tf.int64``.
        """
        if dtype == tf.string:
            index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
        elif dtype == tf.int64:
            index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)
        else:
            raise ValueError(f"Unsupported dtype {dtype!r} for categorical feature {name!r}")

        feature_ds = dataset.map(lambda x, y: x[name])
        index.adapt(feature_ds)
        encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size(), output_mode=category_output_mode, name=name)
        # NOTE(review): the layers are only reachable through this closure, so
        # Keras presumably does not track them as sub-layers — confirm before
        # relying on weight saving for the encoders.
        return lambda feature: encoder(index(feature))

    def _get_numerical_encoding_layer(self, name, dataset, is_normalization):
        """Return a callable that encodes numerical feature ``name``.

        With ``is_normalization`` a ``Normalization(axis=None)`` layer is
        adapted on the feature's values; otherwise the feature is only cast
        to ``tf.float32``.
        """
        if not is_normalization:
            return lambda feature: tf.cast(feature, dtype=tf.float32)

        feature_ds = dataset.map(lambda x, y: x[name])
        encoder = tf.keras.layers.Normalization(axis=None)
        encoder.adapt(feature_ds)
        return lambda feature: encoder(feature)

    def _create_encoder_layers(self, dataset, feature_names, feature_dtypes,
                               categorical_column_names, numerical_column_names,
                               category_output_mode, is_normalization):
        """Populate ``encoder_layers``, ``feature_indices`` and ``n_features``.

        Features that are in neither column list get no encoder and are
        therefore ignored by ``call``.
        """
        for name in feature_names:
            if name in categorical_column_names:
                layer = self._get_category_encoding_layer(name, dataset, feature_dtypes[name], category_output_mode)
                self.encoder_layers[name] = layer
            elif name in numerical_column_names:
                layer = self._get_numerical_encoding_layer(name, dataset, is_normalization)
                self.encoder_layers[name] = layer

        st = 0
        for name, layer in self.encoder_layers.items():
            # Push one real batch through the encoder to learn its output width.
            example_input = next(iter(dataset.map(lambda x, y: x[name]))).numpy()
            example_output = layer(example_input)
            feature_size = int(example_output.shape[-1])
            # Advance by this feature's encoded width (not by self.n_features,
            # which is still 0 here and would collapse every range to [0, 0]).
            ed = st + feature_size
            self.feature_indices[name] = [st, ed]
            st = ed

        self.n_features = st

    def build_model(self, dataset, categorical_column_names, numerical_column_names, category_output_mode='one_hot', is_normalization=False):
        """Adapt the feature encoders on ``dataset`` and create the batch-norm layer.

        Args:
            dataset: Dataset yielding ``(features_dict, labels)`` elements.
            categorical_column_names: Names of the categorical features.
            numerical_column_names: Names of the numerical features.
            category_output_mode: Output mode for the categorical encoders.
            is_normalization: Whether numerical features are normalized.
        """
        feature_dtypes = {key: spec.dtype for key, spec in dataset.element_spec[0].items()}
        feature_names = list(feature_dtypes.keys())

        self._create_encoder_layers(dataset, feature_names, feature_dtypes,
                                    categorical_column_names, numerical_column_names,
                                    category_output_mode, is_normalization)

        self.preprocess = tf.keras.layers.BatchNormalization(name='preprocess_batch_norm')

    def call(self, inputs):
        """Encode ``inputs`` (dict of feature tensors) and return the softmax outputs.

        Raises:
            RuntimeError: If ``build_model`` has not been called yet.
        """
        if self.preprocess is None:
            raise RuntimeError("MyModel.build_model() must be called before the model is used")

        features = [self.encoder_layers[name](inputs[name]) for name in self.encoder_layers]

        features = tf.concat(features, axis=-1)
        x = self.preprocess(features)
        x = self.hidden1(x)
        x = self.hidden2(x)
        outputs = self.output_layer(x)
        return outputs

In [7]:
# Column names for the data file; they are supplied explicitly via `names=`.
features = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
cat_col_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
num_col_names = ['age', 'fnlwgt', 'education_num', 'capital-gain', 'capital-loss', 'hours-per-week']
target = 'income'
columns = features + [target]
filepath = 'adult.data'
df = pd.read_csv(filepath, names=columns)

print(df.shape)
# Replace all ' ?' with np.nan, and remove rows with missing values (np.nan) or replace all with 0.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
df.replace(' ?', np.nan, inplace=True)
df.dropna(inplace=True)

(32561, 15)


Check for missing values

In [8]:
#print(np.sum(df.isna(), axis=0))

In [9]:
from sklearn.model_selection import train_test_split

#y = np_data[:, 104:106]
#X = np_data[:,:106]
# Two-stage split with a fixed seed: 70% / 30% into (tmp, test), then 90% / 10%
# of `tmp` into (train, vald).
train_size = 0.7
tmp, test = train_test_split(df, train_size=train_size, random_state=0)
train, vald = train_test_split(tmp, train_size=0.9, random_state=0)

print(f'Training set: {train.shape}')
print(f'Validation set: {vald.shape}')
print(f'Test set: {test.shape}')

# Only the training dataset is shuffled; validation and test keep row order.
batch_size = 1024
train_ds = dataframe_to_dataset(train, target, batch_size=batch_size)
vald_ds = dataframe_to_dataset(vald, target, shuffle=False, batch_size=batch_size)
test_ds = dataframe_to_dataset(test, target, shuffle=False, batch_size=batch_size)

Training set: (19001, 15)
Validation set: (2112, 15)
Test set: (9049, 15)


In [28]:
# Configuration objects; they are created here but not passed to MyModel below.
# NOTE(review): n_features=15 equals the number of columns including the
# target ('features' lists 14 names) — confirm which count is intended.
data_config = DataConfig(n_features=15, n_outputs=2, 
                         categorical_column_names=cat_col_names, 
                         numerical_column_names=num_col_names, target=[target])
model_config = ModelConfig()

# Adapt the encoders on the training data, then push a single batch through
# the model.
model = MyModel()
model.build_model(train_ds, cat_col_names, num_col_names)
#outputs = model(concatenate_layers)
#tab_model = tf.keras.models.Model(inputs=all_inputs, outputs=outputs)
[(data, label)] = train_ds.take(1)
y_hat = model(data)

In [30]:
# Show the [start, end] offsets recorded for each feature by build_model.
model.feature_indices

{'age': ListWrapper([0, 1]),
 'workclass': ListWrapper([1, 9]),
 'fnlwgt': ListWrapper([9, 10]),
 'education': ListWrapper([10, 27]),
 'education_num': ListWrapper([27, 28]),
 'marital-status': ListWrapper([28, 36]),
 'occupation': ListWrapper([36, 51]),
 'relationship': ListWrapper([51, 58]),
 'race': ListWrapper([58, 64]),
 'sex': ListWrapper([64, 67]),
 'capital-gain': ListWrapper([67, 68]),
 'capital-loss': ListWrapper([68, 69]),
 'hours-per-week': ListWrapper([69, 70]),
 'native-country': ListWrapper([70, 112])}

In [78]:
from ife import IFENetClassifier

# NOTE(review): X_train and y_train are not defined in any cell visible in
# this notebook; this cell presumably relies on state from an earlier session.
n_features = X_train.shape[1]
# Number of classes = number of distinct label values in y_train.
_, counts = np.unique(y_train, return_counts=True)
n_classes = len(counts)
# Hyper-parameters forwarded to IFENetClassifier.
ife_num_layers = 1
clf_num_layers = 1
clf_hidden_units = [128]
reduction_layer = 'flatten'
num_att = 8
r = 5.6498

print(f'n_classes: {n_classes}')
print(f'n_features: {n_features}')

ife_params = {'n_features': n_features,
              'n_outputs': n_classes,
              'num_att': num_att,
              'r': r,
              'ife_num_layers': ife_num_layers, 
              'clf_num_layers': clf_num_layers,
              'clf_hidden_units': clf_hidden_units,
              'reduction_layer': reduction_layer
             }
# Rebinds `model`, replacing the MyModel instance created earlier.
model = IFENetClassifier(**ife_params)
# model = model.build(input_shape=(n_features,))

n_classes: 2
n_features: 106


In [79]:
# x = tf.keras.Input(shape=(n_features,))
# x = tf.keras.layers.Input(shape=input_shape)
# model = tf.keras.models.Model(inputs=[x], outputs=model.call(x))
# model.summary()

In [56]:
# NOTE(review): CategoricalCrossentropy presumably expects one-hot encoded
# labels — confirm that the datasets used for training provide them.
loss_fn = tf.keras.losses.CategoricalCrossentropy()

# Adam with a staircase exponential decay: the learning rate is multiplied by
# 0.95 every 2000 steps, starting from `lr`.
lr = 0.01
lr_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=lr, 
                                                              decay_steps=2000,
                                                              decay_rate=0.95,
                                                              staircase=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

# Early stopping watches val_loss; the checkpoint callback writes weights only.
# NOTE(review): save_best_only is not set on ModelCheckpoint, so `monitor`
# presumably does not restrict which epochs are saved — confirm.
checkpoint_path = 'checkpoints/ifeNet_adult.h5'
patience = 2
callbacks = [tf.keras.callbacks.EarlyStopping(patience=patience, monitor='val_loss'),
             tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, monitor='val_accuracy')]

# NOTE(review): the recorded training output shows "Epoch 1/500", so it was
# produced with a different `epochs` value than the one set here.
epochs = 2
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [58]:
# Destination for the final weights; it is only used by the cell that follows.
saved_model_path = 'saved_model/ifeNet_adult.h5'
# Train on train_ds and validate on vald_ds with the callbacks defined above.
model.fit(train_ds, validation_data=vald_ds, epochs=epochs, callbacks=callbacks)

Epoch 1/500
 2/27 [=>............................] - ETA: 5:15 - loss: 1.1173 - accuracy: 0.6099 

KeyboardInterrupt: 

In [None]:
# Load the weights written by the checkpoint callback and copy them to
# saved_model_path.
model.load_weights(checkpoint_path)
model.save_weights(saved_model_path)

In [None]:
# Run the model on four random rows so that model.input_scores is populated.
# Note that `tmp` reuses the name of the intermediate split frame created earlier.
inputs = tf.random.normal((4, n_features))
tmp = model(inputs)

# NOTE(review): `fe_train` is not defined in any cell visible in this notebook.
fe_train.get_feature_scores(model.input_scores)

In [14]:
# Best model
# Re-creates the classifier with the same hyper-parameters as the earlier
# IFENetClassifier cell and loads previously saved weights into it.

from ife import IFENetClassifier

# NOTE(review): X_train and y_train are not defined in any cell visible in
# this notebook; this cell presumably relies on state from an earlier session.
n_features = X_train.shape[1]
_, counts = np.unique(y_train, return_counts=True)
n_classes = len(counts)
ife_num_layers = 1
clf_num_layers = 1
clf_hidden_units = [128]
reduction_layer = 'flatten'
num_att = 8
r = 5.6498

print(f'n_classes: {n_classes}')
print(f'n_features: {n_features}')

ife_params = {'n_features': n_features,
              'n_outputs': n_classes,
              'num_att': num_att,
              'r': r,
              'ife_num_layers': ife_num_layers, 
              'clf_num_layers': clf_num_layers,
              'clf_hidden_units': clf_hidden_units,
              'reduction_layer': reduction_layer
             }
model = IFENetClassifier(**ife_params)

# Build the model for inputs of shape (batch, n_features) before loading weights.
model.build(input_shape=(None,n_features,))
#model.summary()

# path_saved_model is assigned but not used in this cell. checkpoint_path here
# differs from the path used by the training cell.
path_saved_model = 'saved_model/ifeNet_cover_24.h5'
checkpoint_path = 'checkpoints/weights.hdf5'
model.load_weights(checkpoint_path)

n_classes: 2
n_features: 106


In [24]:
# Accumulators for predicted and true class indices (float arrays, since they
# start from np.empty).
y_pred = np.empty((0,))
y_test = np.empty((0,))

# Only the first two batches of test_ds are evaluated.
for data,label in test_ds.take(2):
    y_hat = model(data)
    # Predicted class = index of the largest output along the last axis.
    y_hat = np.argmax(y_hat, axis=-1)
    y_pred = np.append(y_pred, y_hat.ravel())

    label = label.numpy()
    # NOTE(review): argmax over the last axis assumes the labels are one-hot
    # (or otherwise per-class) vectors — confirm against how test_ds was built.
    label = np.argmax(label, axis=-1)
    y_test = np.append(y_test, label.ravel())


In [25]:
# Accuracy, confusion matrix and per-class report for the predictions
# collected in the previous cell.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

1.0
[[750   0]
 [  0 250]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       750
         1.0       1.00      1.00      1.00       250

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



In [55]:
# Take the first batch of test_ds.
first_batch = None
for i, batch in enumerate(test_ds):
    if i == 0:  # Indexing starts at 0, so the first batch has index 0
        first_batch = batch
        break  # Once the first batch is found, break the loop

data, label = first_batch
# Forward pass; model.input_scores is read right after it.
model(data)

feat_scores = model.input_scores
# Average over the first two axes.
# NOTE(review): presumably leaves one score per input feature — confirm the
# shape of model.input_scores.
feat_scores = np.mean(feat_scores, axis=(0,1))

# Pair each score with a column name, in order.
# NOTE(review): `columns` also contains the target column and is zipped
# positionally — verify that it lines up with the model's input features.
feat_rank = {}
for col,score in zip(columns,feat_scores):
    #print(f'{col}: {score}')
    feat_rank[col] = score

df_feat_rank = pd.DataFrame(list(feat_rank.items()), columns=['Feature', 'Score'])
# Displayed sorted by score, highest first; df_feat_rank itself is not reordered.
df_feat_rank.sort_values(by='Score', ascending=False)

Unnamed: 0,Feature,Score
0,Elevation,0.12722
20,Soil_Type7,0.060129
50,Soil_Type37,0.057631
28,Soil_Type15,0.044712
21,Soil_Type8,0.039093
10,Wilderness_Area1,0.035264
5,Horizontal_Distance_To_Roadways,0.02963
7,Hillshade_Noon,0.029422
6,Hillshade_9am,0.026024
49,Soil_Type36,0.024337


In [56]:
# Take the second batch of test_ds and repeat the feature-score ranking on it.
second_batch = None
for i, batch in enumerate(test_ds):
    if i == 1:  # Indexing starts at 0, so the second batch has index 1
        second_batch = batch
        break  # Once the second batch is found, break the loop

data,label = second_batch
# Forward pass; model.input_scores is read right after it.
model(data)

feat_scores = model.input_scores
# Average over the first two axes.
# NOTE(review): presumably leaves one score per input feature — confirm the
# shape of model.input_scores.
feat_scores = np.mean(feat_scores, axis=(0,1))

# Pair each score with a column name, in order.
# NOTE(review): `columns` also contains the target column and is zipped
# positionally — verify that it lines up with the model's input features.
feat_rank = {}
for col,score in zip(columns,feat_scores):
    #print(f'{col}: {score}')
    feat_rank[col] = score

df_feat_rank = pd.DataFrame(list(feat_rank.items()), columns=['Feature', 'Score'])
# Displayed sorted by score, highest first; df_feat_rank itself is not reordered.
df_feat_rank.sort_values(by='Score', ascending=False)

Unnamed: 0,Feature,Score
0,Elevation,0.131297
20,Soil_Type7,0.060262
50,Soil_Type37,0.057748
28,Soil_Type15,0.044375
21,Soil_Type8,0.037752
10,Wilderness_Area1,0.03608
5,Horizontal_Distance_To_Roadways,0.029566
7,Hillshade_Noon,0.029403
49,Soil_Type36,0.024609
6,Hillshade_9am,0.023704


In [18]:
def discretize_colum(data_clm, num_values=10):
    """ Discretize a column by quantiles

    Each value is replaced by the index of the equal-population bin that its
    rank falls into: the smallest values get bin 0, and larger values never
    get a smaller bin than smaller values.

    Args:
        data_clm: 1-D sequence of sortable values (list, array or Series).
        num_values: Number of bins to aim for; must be positive.

    Returns:
        The bin index of every element as floats, in the original element
        order. A pandas Series input yields a Series with the same index and
        name; any other input yields a NumPy array.

    Raises:
        ValueError: If ``num_values`` is not positive.
    """
    if num_values <= 0:
        raise ValueError("num_values must be a positive integer")

    values = np.asarray(data_clm)
    n = len(values)
    # argsort gives the positions that would sort the data; the rank of each
    # element is the inverse of that permutation. (Using argsort directly
    # assigns bins to the wrong rows.)
    order = np.argsort(values, kind="stable")
    ranks = np.empty(n, dtype=np.int64)
    ranks[order] = np.arange(n)

    bin_sz = (n / num_values) + 1  # make sure all quantiles are in range 0-(num_quarts-1)
    q = ranks // bin_sz
    if isinstance(data_clm, pd.Series):
        return pd.Series(q, index=data_clm.index, name=data_clm.name)
    return q

# Re-read the Adult data directly from the UCI repository URL. This rebinds
# `features`, `columns` and `df` from the earlier loading cell, and binds
# `label` to the target column name.
url_data = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

features = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
label = "income"
columns = features + [label]
df = pd.read_csv(url_data, names=columns)

'''
# Fill NaN with something better?
df.fillna(0, inplace=True)
if True:
    columns_to_discr = [('age', 10), ('fnlwgt', 25), ('capital-gain', 10), ('capital-loss', 10),
                        ('hours-per-week', 10)]
    for clm, nvals in columns_to_discr:
        df[clm] = discretize_colum(df[clm], num_values=nvals)
        df[clm] = df[clm].astype(int).astype(str)
    df['education_num'] = df['education_num'].astype(int).astype(str)
    cat_idx = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
X = df[features].to_numpy()
y = df[label].to_numpy()'''

"\n# Fill NaN with something better?\ndf.fillna(0, inplace=True)\nif True:\n    columns_to_discr = [('age', 10), ('fnlwgt', 25), ('capital-gain', 10), ('capital-loss', 10),\n                        ('hours-per-week', 10)]\n    for clm, nvals in columns_to_discr:\n        df[clm] = discretize_colum(df[clm], num_values=nvals)\n        df[clm] = df[clm].astype(int).astype(str)\n    df['education_num'] = df['education_num'].astype(int).astype(str)\n    cat_idx = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]\nX = df[features].to_numpy()\ny = df[label].to_numpy()"

In [19]:
# Preview the first rows of the freshly loaded frame.
df.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [20]:
# Replace NaN entries with 0, modifying df in place.
df.fillna(0, inplace=True)

In [22]:
# Preview the frame again after the fillna step.
df.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
