## Neural-Net for Tabular data
### <u>Problem statement 3:</u> Variable length features
The following dataset is a synthetic dataset generated from `make_classification`. <br>
One variable holds a variable-length set of features. How can we take those features into account? In the following, we assume there are 10 different types of credit card, and each credit card has features such as `Valid_Date`, `Daily_Max_Transaction`, etc.  

In [11]:
from sklearn.datasets import make_classification
import numpy as np

In [12]:
# Synthetic 4-class problem: 10,000 rows with 30 features each, of which
# 10 are informative; every class is made of 2 clusters.
dataset = make_classification(
    n_samples=10_000,
    n_classes=4,
    n_clusters_per_class=2,
    n_features=30,
    n_informative=10,
)
X, y = dataset

In [13]:
# Sanity check: list the distinct class labels returned by make_classification.
np.unique(y)

array([0, 1, 2, 3])

In [14]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X, then standardise X with the fitted statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [15]:
# base_classes[k] holds the scaled rows whose label equals k, for k = 0..3.
base_classes = [X_scaled[y == label] for label in range(4)]

In [16]:
import numpy as np

# Number of synthetic samples generated for each of the two target classes.
num_points = 5000

# Probability of drawing a card from base class 0, 1, 2 and 3 respectively.
class1_dist = [0.5, 0.5, 0, 0]
class2_dist = [0, 0.2, 0.6, 0.2]

def make_var_len_feature_point(dist, max_cards=10, min_cards=3, classes=None):
    """Build one zero-padded sample holding a random number of cards.

    Between ``min_cards`` and ``max_cards`` (inclusive) cards are drawn. For
    each card a base class is picked according to ``dist`` and one row of that
    class is sampled uniformly at random. Unused card slots stay all-zero so
    that every sample has the same shape.

    Args:
        dist: Probability of picking each base class; one entry per class.
        max_cards: Number of card slots in the returned sample.
        min_cards: Smallest number of real (non-padding) cards to draw.
        classes: Sequence of 2-D arrays, one per base class. All arrays are
            assumed to have the same number of columns. Defaults to the
            module-level ``base_classes``.

    Returns:
        Array of shape ``(1, max_cards, n_features)`` with the real cards
        first, followed by zero rows.
    """
    if classes is None:
        classes = base_classes

    # Feature width is taken from the data instead of being hard-coded.
    n_features = classes[0].shape[1]
    num_cards = np.random.randint(min_cards, max_cards + 1)

    # Start from all zeros so slots that are never filled act as padding.
    point = np.zeros((1, max_cards, n_features))
    for slot in range(num_cards):
        # choose which distribution the credit card comes from
        base_class = np.random.choice(len(dist), p=dist)
        candidates = classes[base_class]
        point[0, slot] = candidates[np.random.choice(candidates.shape[0])]
    return point


# Generate num_points samples per target class. Each call yields an array
# with a leading axis of length 1, so concatenating along axis 0 stacks the
# samples into a single 3-D array per class.
class1_points = np.concatenate(
    [make_var_len_feature_point(class1_dist) for _ in range(num_points)]
)
class2_points = np.concatenate(
    [make_var_len_feature_point(class2_dist) for _ in range(num_points)]
)

In [17]:
class2_points.shape # (n_people, n_credit_cards, n_features_per_card), e.g. validity, end date, ...

(5000, 10, 30)

In [18]:
def sample_generator(batch_size):
    """Yield balanced training batches forever.

    Each batch holds ``batch_size // 2`` rows of ``class1_points`` (label 0)
    followed by the same number of rows of ``class2_points`` (label 1). A
    single set of random row indices, drawn with replacement, is used to
    index both arrays.

    Yields:
        A ``(inputs, targets)`` pair of dicts keyed by ``'no_inputs'`` and
        ``'output'`` respectively.
    """
    half = batch_size // 2
    while True:
        rows = np.random.choice(class1_points.shape[0], half)
        features = np.concatenate((class1_points[rows], class2_points[rows]))
        labels = np.concatenate((np.zeros(half), np.ones(half)))
        yield {'no_inputs': features}, {'output': labels}

In [19]:
from tensorflow.keras.layers import (Input, Dropout, Dense, 
                                     BatchNormalization, Embedding, 
                                     Flatten, Concatenate, Conv1D, 
                                     Activation, GlobalAveragePooling1D,
                                     GlobalMaxPool1D, RepeatVector
                                    )
from tensorflow.keras.models import Model

# Shared hyper-parameters: the dropout rate and the training batch size.
p = 0.1
batch_size = 32

In [22]:
# Why 1x1 convolutions: a Conv1D with kernel size 1 applies the same learned
# function to every credit card, so each card's feature set is processed
# separately. Global max/average pooling over the card axis then combines the
# information from all cards into a single vector.


def _per_card_features(cards):
    """Apply Dropout -> Conv1D(10 filters, kernel size 1) -> ReLU."""
    cards = Dropout(p)(cards)
    cards = Conv1D(10, 1)(cards)
    return Activation('relu')(cards)


# ------ Get features from each credit card -------
# E.g. 10 credit card records having 30 features each.
inputs = Input((10, 30), name='no_inputs')

x = _per_card_features(inputs)
global_ave = GlobalAveragePooling1D()(x)
global_max = GlobalMaxPool1D()(x)
x = Concatenate()([global_ave, global_max])
x = BatchNormalization()(x)

# ------ Skip connection (concatenation) -------
# Repeat the pooled vector once per card and append it to the raw inputs, so
# the second per-card pass sees each card together with the global context.
x = RepeatVector(10)(x)
x = Concatenate()([inputs, x])

x = _per_card_features(x)
global_ave = GlobalAveragePooling1D()(x)
global_max = GlobalMaxPool1D()(x)
x = Concatenate()([global_ave, global_max])
x = BatchNormalization()(x)

# ------ Dense layers -------
for units in (100, 20, 10):
    x = Dropout(p)(x)
    x = Dense(units, activation='relu')(x)
    x = BatchNormalization()(x)

x = Dropout(p)(x)
out = Dense(1, activation='sigmoid', name='output')(x)


In [28]:
# Wrap the layer graph in a Model, then print every layer's output shape.
model = Model(inputs=inputs, outputs=out)

layer_shapes = [layer.output_shape for layer in model.layers]
for shape in layer_shapes:
    print(shape)

[(None, 10, 30)]
(None, 10, 30)
(None, 10, 10)
(None, 10, 10)
(None, 10)
(None, 10)
(None, 20)
(None, 20)
(None, 10, 20)
(None, 10, 50)
(None, 10, 50)
(None, 10, 10)
(None, 10, 10)
(None, 10)
(None, 10)
(None, 20)
(None, 20)
(None, 20)
(None, 100)
(None, 100)
(None, 100)
(None, 20)
(None, 20)
(None, 20)
(None, 10)
(None, 10)
(None, 10)
(None, 1)


In [24]:
# Configure training: RMSprop optimizer, binary cross-entropy loss, and
# accuracy reported as a metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
# model.summary()  # uncomment to print the layer-by-layer summary

In [25]:
# Train on the endless batch generator.
# - `Model.fit` accepts Python generators directly; `fit_generator` is
#   deprecated in TF 2.x.
# - `steps_per_epoch` must be a whole number of batches, so floor division is
#   used instead of `/`, which produced the float 312.5.
# - `max_queue_size=10` is the documented default in TF 2.x and is omitted.
model.fit(
    sample_generator(batch_size),
    steps_per_epoch=10_000 // batch_size,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f7968303c90>