In [1]:
import sys
import tensorflow as tf
import numpy as np
import pandas as pd

In [None]:
"""
Created on Mon Nov 10 14:10:00 2024
@author: Fathe, Abdulrahman and Mohd Halim
"""

import numpy as np
from sklearn.preprocessing import OneHotEncoder

class ColumnEncoder():
    """One-hot encode the categorical columns of a DataFrame and track column slices.

    Keyword Args:
        columns_cat: names of the columns that are one-hot encoded.
        columns_num: names of the numerical columns (stored only; columns that
            are not in ``columns_cat`` are passed through unchanged).

    Attributes:
        encoders: maps each fitted categorical column name to its OneHotEncoder.
        col_indices: maps each column name to the ``[start, end]`` slice bounds
            it occupies in the array returned by the latest ``transform`` call.
    """

    def __init__(self, **kwargs):
        self.columns_cat = kwargs['columns_cat']
        self.columns_num = kwargs['columns_num']
        self.encoders = {}
        self.col_indices = {}

    def fit(self, dataframe):
        """Fit one OneHotEncoder for every categorical column found in ``dataframe``."""
        for col in dataframe.columns:
            if col in self.columns_cat:
                self.encoders[col] = OneHotEncoder().fit(dataframe[[col]].to_numpy())

    def transform(self, dataframe):
        """Encode ``dataframe`` column by column into one 2-D array.

        Categorical columns are expanded with their fitted encoder; every other
        column is appended as-is. ``col_indices`` is rebuilt on each call so it
        always describes the array that was just returned.

        Raises:
            RuntimeError: if a categorical column has no fitted encoder. Other
                errors propagate unchanged instead of being hidden.
        """
        unfitted = [col for col in dataframe.columns
                    if col in self.columns_cat and col not in self.encoders]
        if unfitted:
            raise RuntimeError(f'Run fit() to fit the encoder. Unfitted columns: {unfitted}')

        col_indices = {}
        # Seed with an empty float block so the result dtype matches repeated np.append.
        blocks = [np.empty((len(dataframe), 0))]
        st = 0
        for col in dataframe.columns:
            enc_col = dataframe[[col]].to_numpy()
            if col in self.columns_cat:  # if categorical, encode
                enc_col = self.encoders[col].transform(enc_col).toarray()

            ed = st + enc_col.shape[-1]
            col_indices[col] = [st, ed]
            st = ed
            blocks.append(enc_col)

        self.col_indices = col_indices
        return np.concatenate(blocks, axis=-1)

    def get_feature_scores(self, input_scores, display=True):
        """Group per-position scores by the feature that produced each position.

        Args:
            input_scores: sequence indexable by slice, laid out like the output
                columns of the latest ``transform`` call.
            display: when True, print one ``name: scores`` line per feature.

        Returns:
            dict mapping each feature name to its slice of ``input_scores``.
        """
        feature_scores = {feat: input_scores[slice(*index)]
                          for feat, index in self.col_indices.items()}

        if display:
            for k, scores in feature_scores.items():
                print(f'{k}: {scores}')

        return feature_scores

In [134]:
# Column layout of the CSV: 14 feature columns followed by the label column.
target = 'income'
features = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
]
columns = [*features, target]

# Feature names grouped by the kind of encoding applied to them later on.
cat_col_names = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]
num_col_names = [
    'age', 'fnlwgt', 'education_num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]

# Column names are supplied explicitly through `names`.
filepath = 'adult.data'
df = pd.read_csv(filepath, names=columns)
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,0,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,0,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,0,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,1,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,1,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,1,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,0,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,1,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,5178,0,40,United-States,>50K


In [135]:
# Treat the ' ?' placeholder as missing, then fill every missing cell with the string '0'.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0+.
df.replace(' ?', np.nan, inplace=True)
# Alternative: drop incomplete rows instead of filling them.
#df.dropna(inplace=True)
df.fillna('0', inplace=True)

In [136]:
# Replace the label values with their integer category codes.
df[target] = pd.Categorical(df[target]).codes

In [161]:
# Per-feature dtypes with the label column excluded.
col_dtypes = df.dtypes.drop(target)
for column_name, column_dtype in col_dtypes.items():
    print(f'{column_name}: {column_dtype}')

print(col_dtypes.index.tolist())

age: int64
workclass: object
fnlwgt: int64
education: object
education_num: int64
marital-status: object
occupation: object
relationship: object
race: object
sex: int64
capital-gain: int64
capital-loss: int64
hours-per-week: int64
native-country: object
['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']


In [140]:
from sklearn.model_selection import train_test_split

#y = np_data[:, 104:106]
#X = np_data[:,:106]
# First split: 70% kept for training + validation, the remainder is the test set.
train_size = 0.7
tmp, test = train_test_split(df, random_state=0, train_size=train_size)
# Second split: 90% of the kept part for training, the remainder for validation.
train, vald = train_test_split(tmp, random_state=0, train_size=0.9)

print(f'Training set: {train.shape}')
print(f'Validation set: {vald.shape}')
print(f'Test set: {test.shape}')


def array_to_dataset(df, target, shuffle=True, batch_size=128):
    """Turn a DataFrame into a batched ``tf.data.Dataset`` of ``(features, label)``.

    Args:
        df: source DataFrame; it is copied, so the caller's frame is not modified.
        target: name of the label column, removed from the features.
        shuffle: when True, shuffle with a buffer of ``batch_size * 2`` and
            prefetch after batching; when False, only batch.
        batch_size: number of rows per batch.

    Returns:
        Dataset yielding ``({column name: values with a trailing axis}, labels)``.
    """
    frame = df.copy()
    labels = frame.pop(target)
    # Add a trailing axis so every feature column becomes a column vector.
    feature_columns = {
        column: series.to_numpy()[:, tf.newaxis] for column, series in frame.items()
    }
    dataset = tf.data.Dataset.from_tensor_slices((feature_columns, labels))
    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(batch_size * 2).batch(batch_size).prefetch(batch_size)

# Only the training split is shuffled; validation and test keep their row order.
batch_size = 1024
train_ds = array_to_dataset(train, target, shuffle=True, batch_size=batch_size)
vald_ds = array_to_dataset(vald, target, batch_size=batch_size, shuffle=False)
test_ds = array_to_dataset(test, target, batch_size=batch_size, shuffle=False)

Training set: (20512, 15)
Validation set: (2280, 15)
Test set: (9769, 15)


In [148]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None, output_mode='multi_hot'):
    """Build a lookup + category-encoding pipeline for one categorical feature.

    Args:
        name: key of the feature inside the dataset's feature dict.
        dataset: dataset yielding ``(features, label)`` pairs; the values of
            ``features[name]`` are used to learn the lookup vocabulary.
        dtype: column dtype. ``'object'`` selects ``StringLookup`` and
            ``'int64'`` selects ``IntegerLookup``.
        max_tokens: forwarded to the lookup layer.
        output_mode: forwarded to ``CategoryEncoding``.

    Returns:
        A callable that maps a feature tensor to its encoded tensor.

    Raises:
        ValueError: if ``dtype`` is neither ``'object'`` nor ``'int64'``.
    """
    # Create a layer that turns strings into integer indices.
    if dtype == 'object':
        index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
    # Otherwise, create a layer that turns integer values into integer indices.
    elif dtype == 'int64':
        index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)
    else:
        # Fail here with a clear message instead of an unbound `index` further down.
        raise ValueError(
            f"Unsupported dtype {dtype!r} for categorical feature {name!r}; "
            "expected 'object' or 'int64'."
        )

    # Prepare a `tf.data.Dataset` that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Encode the integer indices. The layer gets its own name (not the bare
    # feature name) so it does not clash with an input layer named after the
    # same feature when both end up in one functional model.
    encoder = tf.keras.layers.CategoryEncoding(
        num_tokens=index.vocabulary_size(),
        output_mode=output_mode,
        name=f'{name}_encoding',
    )

    # The lambda captures both layers so they can be used, or included in a
    # Keras Functional model, later.
    return lambda feature: encoder(index(feature))

# One encoder callable per categorical feature, adapted on the training dataset.
cat_encoder_layers = {
    name: get_category_encoding_layer(name, train_ds, col_dtypes[name], output_mode='one_hot')
    for name in cat_col_names
}

cat_encoder_layers

{'workclass': <function __main__.get_category_encoding_layer.<locals>.<lambda>(feature)>,
 'education': <function __main__.get_category_encoding_layer.<locals>.<lambda>(feature)>,
 'marital-status': <function __main__.get_category_encoding_layer.<locals>.<lambda>(feature)>,
 'occupation': <function __main__.get_category_encoding_layer.<locals>.<lambda>(feature)>,
 'relationship': <function __main__.get_category_encoding_layer.<locals>.<lambda>(feature)>,
 'race': <function __main__.get_category_encoding_layer.<locals>.<lambda>(feature)>,
 'sex': <function __main__.get_category_encoding_layer.<locals>.<lambda>(feature)>,
 'native-country': <function __main__.get_category_encoding_layer.<locals>.<lambda>(feature)>}

In [150]:
def get_numerical_encoding_layer(name, dataset, is_normalization=False):
    """Return a callable that encodes one numerical feature.

    Args:
        name: key of the feature inside the dataset's feature dict.
        dataset: dataset yielding ``(features, label)`` pairs.
        is_normalization: when True, adapt a ``Normalization`` layer on the
            feature values and return it wrapped in a callable; when False,
            return an identity callable.

    Returns:
        A one-argument callable applied to the feature tensor.
    """
    feature_ds = dataset.map(lambda features, label: features[name])

    if not is_normalization:
        return lambda feature: feature

    normalizer = tf.keras.layers.Normalization(axis=None)
    normalizer.adapt(feature_ds)
    return lambda feature: normalizer(feature)

# One encoder callable per numerical feature (identity, since normalization is left off).
num_encoder_layers = {
    name: get_numerical_encoding_layer(name, train_ds)
    for name in num_col_names
}

num_encoder_layers

{'age': <function __main__.get_numerical_encoding_layer.<locals>.<lambda>(feature)>,
 'fnlwgt': <function __main__.get_numerical_encoding_layer.<locals>.<lambda>(feature)>,
 'education_num': <function __main__.get_numerical_encoding_layer.<locals>.<lambda>(feature)>,
 'capital-gain': <function __main__.get_numerical_encoding_layer.<locals>.<lambda>(feature)>,
 'capital-loss': <function __main__.get_numerical_encoding_layer.<locals>.<lambda>(feature)>,
 'hours-per-week': <function __main__.get_numerical_encoding_layer.<locals>.<lambda>(feature)>}

In [153]:
# Every column except the label, in the original column order.
feature_names = list(columns)
feature_names.remove(target)

def create_input_layer(feature_names, cat_col_names, num_col_names, columns_dtypes=None):
    """Create one ``tf.keras.Input`` of shape ``(1,)`` per feature.

    Args:
        feature_names: names of the features to create inputs for.
        cat_col_names: names of the categorical features.
        num_col_names: names of the numerical features.
        columns_dtypes: mapping from feature name to column dtype. Defaults to
            the notebook-level ``col_dtypes`` when omitted.

    Returns:
        dict mapping each feature name to its input tensor. Categorical
        ``object`` columns become string inputs, categorical ``int64`` columns
        become int64 inputs, and numerical int64/float32/float64 columns become
        float32 inputs.

    Raises:
        ValueError: if a feature's dtype/category combination is not supported.
            Without this check the dtype chosen for the previous feature would
            be silently reused.
    """
    if columns_dtypes is None:
        columns_dtypes = col_dtypes

    all_inputs = {}
    for name in feature_names:
        column_dtype = columns_dtypes[name]
        is_categorical = name in cat_col_names
        is_numerical = name in num_col_names

        if is_categorical and column_dtype == 'object':
            dtype = tf.string
        elif is_categorical and column_dtype == 'int64':
            dtype = tf.int64
        elif is_numerical and column_dtype in ('int64', 'float32', 'float64'):
            dtype = tf.float32
        else:
            raise ValueError(
                f"Cannot create an input for feature {name!r}: dtype {column_dtype} "
                f"(categorical={is_categorical}, numerical={is_numerical}) is not supported."
            )

        all_inputs[name] = tf.keras.Input(shape=(1,), dtype=dtype, name=name)

    return all_inputs

# Build the Keras input tensors, keyed by feature name.
all_inputs = create_input_layer(
    feature_names=feature_names,
    cat_col_names=cat_col_names,
    num_col_names=num_col_names,
)

In [156]:
# Encode every input and record which slice of the concatenated tensor each
# feature occupies.
encoded_features = []

st = 0
ed = 0
feature_indices = {}
for name, input_layer in all_inputs.items():
    # Use `cat_encoder_layers` here: `encoder_layers` is never defined in this
    # notebook, so the old check only worked with leftover kernel state.
    if name in cat_encoder_layers:
        encoded_feature = cat_encoder_layers[name](input_layer)
    else:
        encoded_feature = num_encoder_layers[name](input_layer)

    # The width of the encoded feature is the size of its last axis.
    feature_size = encoded_feature.get_shape()[-1]
    ed = st + feature_size
    feature_indices[name] = [st, ed]
    st = ed
    encoded_features.append(encoded_feature)

all_features = tf.keras.layers.concatenate(encoded_features, name='concatenate')
feature_indices

{'age': [0, 1],
 'workclass': [1, 11],
 'fnlwgt': [11, 12],
 'education': [12, 29],
 'education_num': [29, 30],
 'marital-status': [30, 38],
 'occupation': [38, 54],
 'relationship': [54, 61],
 'race': [61, 67],
 'sex': [67, 70],
 'capital-gain': [70, 71],
 'capital-loss': [71, 72],
 'hours-per-week': [72, 73],
 'native-country': [73, 116]}

In [171]:
class FeatureEncoder():
    """Factory for per-feature encoder callables (categorical and numerical)."""

    def __init__(self):
        # Stateless: every input is passed to the factory methods directly.
        pass

    def _get_category_encoding_layer(self, name, dataset, dtype, max_tokens=None, output_mode='multi_hot'):
        """Build a lookup + category-encoding pipeline for one categorical feature.

        Args:
            name: key of the feature inside the dataset's feature dict.
            dataset: dataset yielding ``(features, label)`` pairs; the values of
                ``features[name]`` are used to learn the lookup vocabulary.
            dtype: column dtype. ``'object'`` selects ``StringLookup`` and
                ``'int64'`` selects ``IntegerLookup``.
            max_tokens: forwarded to the lookup layer.
            output_mode: forwarded to ``CategoryEncoding``.

        Returns:
            A callable that maps a feature tensor to its encoded tensor.

        Raises:
            ValueError: if ``dtype`` is neither ``'object'`` nor ``'int64'``.
        """
        # Pick the lookup layer first so an unsupported dtype fails with a
        # clear message instead of an unbound `index` later on.
        if dtype == 'object':
            index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
        elif dtype == 'int64':
            index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)
        else:
            raise ValueError(
                f"Unsupported dtype {dtype!r} for categorical feature {name!r}; "
                "expected 'object' or 'int64'."
            )

        feature_ds = dataset.map(lambda x, y: x[name])
        index.adapt(feature_ds)

        # Encode the integer indices. The layer gets its own name (not the bare
        # feature name) so it does not clash with an input layer named after
        # the same feature when both end up in one functional model.
        encoder = tf.keras.layers.CategoryEncoding(
            num_tokens=index.vocabulary_size(),
            output_mode=output_mode,
            name=f'{name}_encoding',
        )
        return lambda feature: encoder(index(feature))

    def _get_numerical_encoding_layer(self, name, dataset, is_normalization=False):
        """Return a callable that encodes one numerical feature.

        With ``is_normalization=True`` a ``Normalization`` layer is adapted on
        the values of ``features[name]``; otherwise the callable is an identity.
        """
        feature_ds = dataset.map(lambda x, y: x[name])

        if is_normalization:
            encoder = tf.keras.layers.Normalization(axis=None)
            encoder.adapt(feature_ds)
            return lambda feature: encoder(feature)
        else:
            return lambda feature: feature


class PreprocessingLayer():
    """Build Keras inputs, per-feature encoders and their concatenation.

    Args:
        dataset: dataset yielding ``(features, label)`` pairs, used to adapt the encoders.
        columns_dtypes: mapping from feature name to column dtype; its keys
            define the feature names and their order.
        categorical_column_names: names of the categorical features.
        numerical_column_names: names of the numerical features.
        category_output_mode: output mode passed to the categorical encoders.
        is_normalization: whether numerical features are normalized.

    Attributes:
        input_layers: dict of input tensors keyed by feature name; ``None``
            until ``create_preprocessing_layer`` has run.
        concatenation_layer: concatenation of all encoded features; ``None``
            until ``create_preprocessing_layer`` has run.
        feature_indices: maps each feature name to the ``[start, end]`` slice
            bounds it occupies in ``concatenation_layer``.
    """

    def __init__(self, dataset, columns_dtypes, categorical_column_names, numerical_column_names,
                 category_output_mode, is_normalization):
        self._feature_encoder = FeatureEncoder()

        self._dataset = dataset
        self._columns_dtypes = columns_dtypes
        self._feature_names = list(columns_dtypes.keys())
        self._categorical_column_names = categorical_column_names
        self._numerical_column_names = numerical_column_names
        self._category_output_mode = category_output_mode
        self._is_normalization = is_normalization

        self.input_layers = None
        self.concatenation_layer = None
        self.feature_indices = {}

    def _create_input_layer(self):
        """Create one ``tf.keras.Input`` of shape ``(1,)`` per feature.

        Raises:
            ValueError: if a feature's dtype/category combination is not
                supported. Without this check the dtype chosen for the previous
                feature would be silently reused.
        """
        all_inputs = {}
        for name in self._feature_names:
            column_dtype = self._columns_dtypes[name]
            is_categorical = name in self._categorical_column_names
            is_numerical = name in self._numerical_column_names

            if is_categorical and column_dtype == 'object':
                dtype = tf.string  # object cat
            elif is_categorical and column_dtype == 'int64':
                dtype = tf.int64  # int64 cat
            elif is_numerical and column_dtype in ('int64', 'float32', 'float64'):
                dtype = tf.float32  # numeric features are fed as float32
            else:
                raise ValueError(
                    f"Cannot create an input for feature {name!r}: dtype {column_dtype} "
                    f"(categorical={is_categorical}, numerical={is_numerical}) is not supported."
                )

            all_inputs[name] = tf.keras.Input(shape=(1,), dtype=dtype, name=name)

        return all_inputs

    def _create_encoder_layers(self):
        """Return ``(categorical encoders, numerical encoders)`` keyed by feature name."""
        cat_encoder_layers = {}
        for name in self._categorical_column_names:
            cat_encoder_layers[name] = self._feature_encoder._get_category_encoding_layer(
                name, self._dataset, self._columns_dtypes[name],
                output_mode=self._category_output_mode)

        num_encoder_layers = {}
        for name in self._numerical_column_names:
            num_encoder_layers[name] = self._feature_encoder._get_numerical_encoding_layer(
                name, self._dataset, is_normalization=self._is_normalization)

        return cat_encoder_layers, num_encoder_layers

    def _encode_inputs(self, input_layers, cat_encoder_layers, num_encoder_layers):
        """Apply the matching encoder to every input and compute its slice bounds.

        Returns:
            ``(encoded_features, feature_indices)``: the encoded tensors in
            input order, and a dict mapping each feature name to ``[start, end]``.
        """
        encoded_features = []
        feature_indices = {}
        st = 0
        for name, input_layer in input_layers.items():
            # Look the name up in the local categorical encoders, not in an
            # unrelated notebook-level variable.
            if name in cat_encoder_layers:
                encoded_feature = cat_encoder_layers[name](input_layer)
            else:
                encoded_feature = num_encoder_layers[name](input_layer)

            # The width of the encoded feature is the size of its last axis.
            ed = st + encoded_feature.get_shape()[-1]
            feature_indices[name] = [st, ed]
            st = ed
            encoded_features.append(encoded_feature)

        return encoded_features, feature_indices

    def create_preprocessing_layer(self):
        """Build inputs and encoders, then store the concatenated encoded features.

        Sets ``input_layers``, ``feature_indices`` and ``concatenation_layer``.
        """
        input_layers = self._create_input_layer()
        cat_encoder_layers, num_encoder_layers = self._create_encoder_layers()

        encoded_features, self.feature_indices = self._encode_inputs(
            input_layers, cat_encoder_layers, num_encoder_layers)

        # Keep the inputs so a model can be built from this object alone.
        self.input_layers = input_layers
        self.concatenation_layer = tf.keras.layers.concatenate(encoded_features, name='concatenate')
        

# Build the preprocessing graph on the training dataset: one-hot categorical
# encoding, numerical features left unnormalized.
preprocessing_layer = PreprocessingLayer(
    dataset=train_ds,
    columns_dtypes=col_dtypes,
    categorical_column_names=cat_col_names,
    numerical_column_names=num_col_names,
    category_output_mode='one_hot',
    is_normalization=False,
)
preprocessing_layer.create_preprocessing_layer()
preprocessing_layer.concatenation_layer

#all_inputs = create_input_layer(feature_names, cat_col_names, num_col_names)



<KerasTensor: shape=(None, 116) dtype=float32 (created by layer 'concatenate')>

In [None]:

# Encode every input and record which slice of the concatenated tensor each
# feature occupies.
encoded_features = []
st = 0
ed = 0
feature_indices = {}
for name, input_layer in all_inputs.items():
    # Use `cat_encoder_layers` here: `encoder_layers` is never defined in this
    # notebook, so the old check only worked with leftover kernel state.
    if name in cat_encoder_layers:
        encoded_feature = cat_encoder_layers[name](input_layer)
    else:
        encoded_feature = num_encoder_layers[name](input_layer)

    # The width of the encoded feature is the size of its last axis.
    feature_size = encoded_feature.get_shape()[-1]
    ed = st + feature_size
    feature_indices[name] = [st, ed]
    st = ed
    encoded_features.append(encoded_feature)

all_features = tf.keras.layers.concatenate(encoded_features, name='concatenate')
#feature_indices

In [None]:
# Classifier head on the concatenated features: one hidden layer with dropout,
# then a single output unit with no activation.
x = tf.keras.layers.Dense(units=32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(rate=0.5)(x)
output = tf.keras.layers.Dense(units=1)(x)
model = tf.keras.Model(inputs=all_inputs, outputs=output)

In [126]:
# The loss is configured with from_logits=True, so it is given the raw model output.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=["accuracy"],
    run_eagerly=True,
)

In [141]:
# Train for 10 epochs, evaluating on the validation dataset after each one.
model.fit(
    train_ds,
    validation_data=vald_ds,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

KeyboardInterrupt: 

In [129]:
# Show the label batch. `label` is assigned by the `train_ds.take(1)` unpacking
# cell that appears further down in this notebook, so that cell must run first.
label

<tf.Tensor: shape=(6,), dtype=string, numpy=
array([b' <=50K', b' >50K', b' <=50K', b' <=50K', b' <=50K', b' <=50K'],
      dtype=object)>

In [124]:
# Take a single (features, label) batch from the training dataset and run the
# model on its features.
((data, label),) = train_ds.take(1)
model(data)

<tf.Tensor: shape=(6, 116), dtype=float32, numpy=
array([[2.20000e+01, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.22272e+05, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 9.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
 