## Neural-Net for Tabular data
### <u>Problem statement 4:</u> Ordered variable features
The following dataset is a synthetic dataset generated from `make_classification`. <br>
In this case, each datapoint has an ordered set of features. We assume that whether someone defaults on a loan depends on their monthly stipend, so the chronological order of the months has to be taken into account. 

In [2]:
from sklearn.datasets import make_classification

In [4]:
# Synthetic 4-class pool of 30-dimensional points; these are later used as
# building blocks for the ordered sequences.
dataset_params = dict(
    n_samples=10_000,
    n_features=30,
    n_informative=10,
    n_clusters_per_class=2,
    n_classes=4,
)
base_dataset = make_classification(**dataset_params)

# make_classification returns the (features, labels) pair.
X, y = base_dataset

X.shape, y.shape

((10000, 30), (10000,))

In [5]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; the fitted scaler is kept so the same
# transformation can be re-applied later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [7]:
# base_classes[k] holds every scaled row whose label is k (k = 0..3).
base_classes = [X_scaled[y == label] for label in range(4)]

In [9]:
import numpy as np

# Number of sequences generated for each of the two target classes.
num_points = 5_000

# Starting probabilities over the four base classes for each target class.
class1_dist = np.array([0.5, 0.5, 0.0, 0.0])
class2_dist = np.array([0.0, 0.2, 0.6, 0.2])

def make_var_len_feature_point(dist, class_pools=None):
    """Build one zero-padded, variable-length sequence of cumulative features.

    A sequence length is drawn uniformly from 3..10. At every step a base
    class is sampled from the running class distribution, one random row of
    that class is added to the running total, and a snapshot of that total
    becomes the step's feature row. After each draw the chosen class gains
    probability mass, so a sequence tends to keep using classes it has
    already used. Steps beyond the drawn length are filled with zeros.

    Args:
        dist: Initial probabilities over the base classes (must sum to 1 and
            have one entry per pool). The caller's array is not modified.
        class_pools: Optional sequence of 2-D arrays, one per base class,
            whose rows are the candidate feature vectors. Defaults to the
            module-level ``base_classes``.

    Returns:
        Array of shape ``(1, 10, n_features)``.
    """
    pools = base_classes if class_pools is None else class_pools
    n_classes = len(pools)
    n_feat = pools[0].shape[1]
    max_len = 10

    # Float copy: leaves the caller's array untouched and keeps the update
    # below valid even when `dist` is given with an integer dtype.
    sequence_dist = np.array(dist, dtype=float)

    feature_sets = []
    running_total = np.zeros((1, n_feat))
    num_features = np.random.randint(3, max_len + 1)
    for _ in range(num_features):
        # choose which distribution the transaction comes from
        base_class = np.random.choice(n_classes, 1, p=sequence_dist)
        base_class_points = pools[base_class[0]]
        feature_set_idx = np.random.choice(base_class_points.shape[0], 1)

        # Rebind instead of `+=`: an in-place add would mutate the array
        # already stored in `feature_sets`, making every step identical to
        # the final cumulative sum.
        running_total = running_total + base_class_points[feature_set_idx]
        feature_sets.append(running_total)

        # now make it more likely to come from the same dist
        dist_update = np.zeros(n_classes)
        dist_update[base_class] = 1
        sequence_dist = sequence_dist + dist_update
        sequence_dist = sequence_dist / sequence_dist.sum()

    # Zero-pad so every sequence has exactly `max_len` steps.
    feature_sets.extend(
        np.zeros((1, n_feat)) for _ in range(max_len - num_features))

    return np.concatenate(feature_sets)[np.newaxis, :, :]


# Generate num_points sequences per target class and stack them along the
# first axis. Class 1 is generated completely before class 2.
class1_points = np.concatenate(
    [make_var_len_feature_point(class1_dist) for _ in range(num_points)])

class2_points = np.concatenate(
    [make_var_len_feature_point(class2_dist) for _ in range(num_points)])

In [10]:
# Sanity check: each class array should be (num_points, 10 steps, 30 features).
class2_points.shape, class1_points.shape 

((5000, 10, 30), (5000, 10, 30))

In [12]:
def sample_generator(batch_size):
    """Yield balanced training batches forever.

    Each batch holds ``batch_size // 2`` sequences taken from
    ``class1_points`` (label 0) followed by the same number taken from
    ``class2_points`` (label 1). One set of row indices, drawn with
    ``np.random.choice``, is used to index both arrays.

    Args:
        batch_size: Requested batch size; the yielded batch has
            ``2 * (batch_size // 2)`` rows.

    Yields:
        A ``({'num_inputs': features}, {'output': labels})`` pair keyed by
        the model's input and output layer names.
    """
    half = batch_size // 2
    while True:
        rows = np.random.choice(class1_points.shape[0], half)
        negatives = class1_points[rows]
        positives = class2_points[rows]

        features = np.concatenate((negatives, positives))
        labels = np.concatenate((np.zeros(half), np.ones(half)))

        yield {'num_inputs': features}, {'output': labels}

In [16]:
from tensorflow.keras.layers import (Input, Dropout, Dense, 
                                     BatchNormalization, Concatenate, 
                                     Activation, RepeatVector, 
                                     Bidirectional, GRU
                                    )
from tensorflow.keras.models import Model

# Shared hyper-parameters: mini-batch size used by the sample generator and
# the dropout probability applied in front of every trainable layer.
batch_size = 32
p = 0.1

In [17]:
# The order of the steps matters, so the sequence is read by an RNN: a GRU
# (Gated Recurrent Unit) wrapped in a Bidirectional layer, so that relations
# are picked up from both ends of the sequence.
def _encode_sequence(tensor):
    """Apply Dropout -> Bidirectional GRU(10) -> BatchNormalization."""
    tensor = Dropout(p)(tensor)
    tensor = Bidirectional(GRU(10))(tensor)
    return BatchNormalization()(tensor)


inputs = Input(shape=(10, 30), name='num_inputs')

# First pass: compress the whole sequence into a single summary vector.
x = _encode_sequence(inputs)

# --- Skip connection (concatenation): append the global context to the
# --- local context. The summary vector is repeated once per step and glued
# --- onto the raw inputs, so the second pass sees every step together with
# --- the information extracted from all 10 steps.
x = RepeatVector(10)(x)
x = Concatenate()([inputs, x])

# Second pass over the context-augmented sequence.
x = _encode_sequence(x)

# ---- Dense layers: Dropout, then (Dense -> BatchNorm -> Dropout) per width.
x = Dropout(p)(x)
for units in (100, 20, 10):
    x = Dense(units, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(p)(x)

# ------ Output: one sigmoid unit for the binary target.
out = Dense(1, activation='sigmoid', name='output')(x)



In [19]:
# Wrap the layer graph into a trainable model.
model = Model(inputs=inputs, outputs=out)

# Print each layer's output shape, in graph order, as a quick shape check.
for shape in (layer.output_shape for layer in model.layers):
    print(shape)

[(None, 10, 30)]
(None, 10, 30)
(None, 20)
(None, 20)
(None, 10, 20)
(None, 10, 50)
(None, 10, 50)
(None, 20)
(None, 20)
(None, 20)
(None, 100)
(None, 100)
(None, 100)
(None, 20)
(None, 20)
(None, 20)
(None, 10)
(None, 10)
(None, 10)
(None, 1)


In [21]:
# Binary target with a sigmoid output: train with binary cross-entropy and
# report accuracy while fitting.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
num_inputs (InputLayer)         [(None, 10, 30)]     0                                            
__________________________________________________________________________________________________
dropout_6 (Dropout)             (None, 10, 30)       0           num_inputs[0][0]                 
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 20)           2520        dropout_6[0][0]                  
__________________________________________________________________________________________________
batch_normalization_v2_5 (Batch (None, 20)           80          bidirectional_2[0][0]            
____________________________________________________________________________________________

In [22]:
# Train on the endless stream of balanced batches. One "epoch" is defined as
# roughly one pass over the 10,000 generated sequences (2 classes x 5,000).
# steps_per_epoch has to be an integer, so use floor division: a plain `/`
# would pass the float 312.5.
# NOTE(review): newer TensorFlow releases deprecate fit_generator in favour
# of model.fit, which accepts generators directly - confirm against the
# installed version before switching.
model.fit_generator(
    sample_generator(batch_size),
    steps_per_epoch=10_000 // batch_size,
    epochs=20,
    max_queue_size=10,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f3694542190>