# CIFAR10 MLP

A convolutional neural network should perform a lot better on this problem; however, I will still try an MLP model and see how far I can get with it.

## Imports

In [16]:
import os
import time
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
from mltoolkit.utils import dump_keras_model,dump_arrays,dump_sklearn_model,\
                            get_tf_logdir

## Functions

In [20]:
def get_tf_logdir(name,root="C:\\Users\\pi314\\Learning\\Data Science\\TensorFlow\\"):
    '''
    Create and return a fresh, timestamped log directory for TensorBoard.

    Parameters:
    name: str
        Suffix appended to the timestamp, separated by '_'. When empty (or
        otherwise falsy) the directory is named by the timestamp alone.
    root: str
        Directory under which the "Tensorboard" folder is placed.

    Returns:
    str
        Path of the form <root>/Tensorboard/<timestamp>[_<name>]. The
        directory is created if it does not exist yet.
    '''
    stamp = time.strftime("%Y-%m-%d_%H-%M-%S")
    run_dir = stamp+'_'+name if name else stamp
    target = os.path.join(root,"Tensorboard",run_dir)
    os.makedirs(target,exist_ok=True)
    return target

In [108]:
def make_mlp(Input_X,output_y,hidden_layers,neurons,
             flatten=False,
             hid_activation='relu',
             hid_initializer='glorot_uniform',
             hid_regularizer=None,
             out_activation=None,
             batch_norm=False):
    '''
    Build an (uncompiled) Sequential multi-layer perceptron.

    Parameters:
    ----------
    Input_X: array
        Sample of the input data. Only `Input_X.shape[1:]` is read; it
        becomes the shape of the Input layer.
    output_y: array
        Target values. Its dtype selects the output layer (see Returns).
    hidden_layers: int
        Number of hidden Dense layers.
    neurons: int
        Number of units in every hidden Dense layer.
    flatten: bool, Default: False
        If True, a Flatten layer is added right after the input.
    hid_activation: Default: 'relu'
        Activation of the hidden layers.
    hid_initializer: Default: 'glorot_uniform'
        Kernel initializer of the hidden layers.
    hid_regularizer: Default: None
        Kernel regularizer of the hidden layers.
    out_activation: Default: None
        Activation of the output layer. Only used for the multi-unit
        output; the single-unit output is always linear.
    batch_norm: bool, Default: False
        If True, a BatchNormalization layer is added before the first
        hidden layer, and each hidden block becomes
        Dense -> BatchNormalization -> Activation.

    Returns:
    -------
    keras Sequential model
        Ends with Dense(1) when `output_y.dtype` compares equal to `int` or
        `float`, otherwise with Dense(number of unique values in output_y).
    '''
    model = keras.models.Sequential([
        keras.layers.Input(shape=Input_X.shape[1:])
    ])
    if flatten:
        model.add(keras.layers.Flatten())
    if batch_norm:
        model.add(keras.layers.BatchNormalization())

    # A plain loop instead of a list comprehension used only for its side
    # effects; layers are still created in the same order.
    for _ in range(hidden_layers):
        if batch_norm:
            # The activation is applied after normalisation, so the Dense
            # layer itself stays linear.
            model.add(keras.layers.Dense(neurons,
                                         kernel_initializer=hid_initializer,
                                         kernel_regularizer=hid_regularizer))
            model.add(keras.layers.BatchNormalization())
            model.add(keras.layers.Activation(hid_activation))
        else:
            model.add(keras.layers.Dense(neurons,
                                         activation=hid_activation,
                                         kernel_initializer=hid_initializer,
                                         kernel_regularizer=hid_regularizer))

    # NOTE(review): integer class labels whose dtype equals the platform
    # default int also take the single-unit branch -- confirm this is the
    # intended way to tell regression from classification.
    if output_y.dtype == int or output_y.dtype == float:
        model.add(keras.layers.Dense(1,activation=None))
    else:
        model.add(keras.layers.Dense(np.unique(output_y).size,activation=out_activation))
    return model

In [218]:
def dump_keras_model(model,path="Trained Models\\",filename=None,
                     yhat=None,scores=None,compress=5,save_weights=False,
                     weights_precision='half'):
    '''
    FOR Keras Model ONLY.
    Save the model with `model.save` and optionally dump its weights (.csv)
    and the given predictions / scores (.pkl). Every output name is prefixed
    with the current timestamp.

    Parameters:
    ----------
    model: keras model
        The keras model to be dumped.
    path: str, Default: "Trained Models\\"
        The directory to dump the model into. Created if missing.
    filename:
        The filename to be dumped. If None, `model.name` is used.
    yhat: array, list, dict etc., Default: None
        Predicted datasets of the model. Skipped when None.
    scores: array, list, dict etc., Default: None
        Evaluation scores of the model. Skipped when None.
    compress: int, Default: 5
        Compression level for yhat and scores only (joblib.dump).
    save_weights: bool, Default: False
        If True, every entry of `model.weights` is also written as a .csv
        file into a "<timestamp>_<filename>_weights" directory.
    weights_precision: str | {'half','full'}, Default: 'half'
        If 'half', weights are written with pandas `to_csv` default float
        formatting and consume less space. If 'full', weights are written
        with np.savetxt using 18 decimal places.

    Raises:
    ------
    ValueError
        If `save_weights` is True and `weights_precision` is neither 'half'
        nor 'full'. Raised before anything is written.
    '''
    # Reject a bad option up front rather than after the model is on disk.
    if save_weights and weights_precision not in ('half','full'):
        raise ValueError("Only 'half' or 'full' are allowed for 'weights_precision'")

    # An empty path means "current directory"; nothing to create then.
    if path:
        os.makedirs(path,exist_ok=True)

    timestamp = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
    if not filename: filename = model.name
    base = timestamp+'_'+filename
    # os.path.join keeps the outputs inside `path` even when it has no
    # trailing separator.
    model_path = os.path.join(path,base)

    # Dump keras model
    model.save(filepath=model_path)

    if save_weights:
        weights_dir = model_path+'_weights'
        os.makedirs(weights_dir,exist_ok=True)
        for weight in model.weights:
            # Drop a ":<n>" suffix if present; partition leaves names without
            # a colon intact (find()/slice would cut their last character).
            name = weight.name.partition(':')[0].replace('/','_')
            csv_path = os.path.join(weights_dir,f'{name}_weights.csv')
            if weights_precision == 'full':
                np.savetxt(csv_path,weight.numpy(),delimiter=',',fmt='%.18f',encoding='utf-8')
            else:
                pd.DataFrame(weight.numpy()).to_csv(csv_path,index=False)

    # When the model was saved as a directory, keep the pickles inside it;
    # when it was saved as a single file, write them next to that file
    # because no such directory exists.
    if os.path.isdir(model_path):
        extras_prefix = os.path.join(model_path,base)
    else:
        extras_prefix = model_path
    # Dump yhat
    if yhat is not None:
        joblib.dump(yhat,extras_prefix+"_yhat.pkl",compress=compress)
    # Dump scores
    if scores is not None:
        joblib.dump(scores,extras_prefix+"_scores.pkl",compress=compress)

## Loading the datasets

In [2]:
X_train_trans = joblib.load("Datasets\\X_train_trans.pkl")
X_test_trans = joblib.load("Datasets\\X_test_trans.pkl")
y_train_raw = joblib.load("Datasets\\Raw Data\\y_train_raw.pkl")
y_test_raw = joblib.load("Datasets\\Raw Data\\y_test_raw.pkl")

X_train_trans.shape,X_test_trans.shape,y_train_raw.shape,y_test_raw.shape

((50000, 700), (10000, 700), (50000,), (10000,))

Note that X_train and X_test have both been PCA-transformed to 700 components to reduce the dimensionality.

## Splitting the Datasets

In [3]:
X_train,X_valid,y_train,y_valid = train_test_split(X_train_trans,y_train_raw,test_size=0.1,stratify=y_train_raw)

## MLP (20 x 100)

mlp_0: Default MLP with 20 hidden layers and 100 neurons each

In [91]:
# Keyword arguments keep the activation and initializer bound to the right
# parameters; passed positionally, 'elu' would land on `flatten` and
# 'he_normal' on `hid_activation`.
mlp_0 = make_mlp(X_train_trans,y_train,20,100,
                 hid_activation='elu',hid_initializer='he_normal',
                 out_activation='softmax')

In [92]:
mlp_0.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_105 (Dense)           (None, 100)               70100     
                                                                 
 dense_106 (Dense)           (None, 100)               10100     
                                                                 
 dense_107 (Dense)           (None, 100)               10100     
                                                                 
 dense_108 (Dense)           (None, 100)               10100     
                                                                 
 dense_109 (Dense)           (None, 100)               10100     
                                                                 
 dense_110 (Dense)           (None, 100)               10100     
                                                                 
 dense_111 (Dense)           (None, 100)              

In [56]:
mlp_0_logdir = get_tf_logdir("mlp_0")
mlp_0_tfboard = keras.callbacks.TensorBoard(mlp_0_logdir)

In [93]:
mlp_0_early = keras.callbacks.EarlyStopping(patience=10)
mlp_0_opt = keras.optimizers.Nadam(learning_rate=0.001)
mlp_0_schedule = keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy',factor=0.5,patience=3)

In [94]:
mlp_0.compile(optimizer=mlp_0_opt,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [95]:
mlp_0.fit(X_train,y_train,batch_size=500,epochs=200,
          callbacks=[mlp_0_tfboard,mlp_0_early,mlp_0_schedule],
          validation_data=[X_valid,y_valid])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200


<keras.callbacks.History at 0x2a450dd89d0>

In [96]:
mlp_0.evaluate(X_test_trans,y_test_raw)



[2.4150688648223877, 0.45649999380111694]

### Saving model

In [219]:
dump_keras_model(mlp_0,filename="mlp_0.h5",save_weights=True,weights_precision='half')

## MLP (20 x 100, Batch Norm)

In [100]:
# Keyword arguments keep the activation and initializer bound to the right
# parameters; passed positionally, 'elu' would land on `flatten` and
# 'he_normal' on `hid_activation`.
mlp_1 = make_mlp(X_train_trans,y_train,20,100,
                 hid_activation='elu',hid_initializer='he_normal',
                 out_activation='softmax',batch_norm=True)

In [101]:
mlp_1.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization (BatchN  (None, 700)              2800      
 ormalization)                                                   
                                                                 
 dense_126 (Dense)           (None, 100)               70100     
                                                                 
 batch_normalization_1 (Batc  (None, 100)              400       
 hNormalization)                                                 
                                                                 
 activation (Activation)     (None, 100)               0         
                                                                 
 dense_127 (Dense)           (None, 100)               10100     
                                                                 
 batch_normalization_2 (Batc  (None, 100)             

In [102]:
mlp_1_logdir = get_tf_logdir("mlp_1")
mlp_1_tfboard = keras.callbacks.TensorBoard(mlp_1_logdir)

In [103]:
mlp_1_early = keras.callbacks.EarlyStopping(patience=10)
mlp_1_opt = keras.optimizers.Nadam(learning_rate=0.001)
mlp_1_schedule = keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy',factor=0.5,patience=3)

In [104]:
mlp_1.compile(optimizer=mlp_1_opt,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [105]:
mlp_1.fit(X_train,y_train,batch_size=500,epochs=200,
          callbacks=[mlp_1_tfboard,mlp_1_early,mlp_1_schedule],
          validation_data=[X_valid,y_valid])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200


<keras.callbacks.History at 0x2a45d201210>

### Saving model

In [220]:
dump_keras_model(mlp_1,filename="mlp_1_batch_norm.h5",save_weights=True,weights_precision='half')

## MLP (20 x 100, no PCA)

I suspect that PCA actually makes the training worse. Let's try once more with the same configuration as mlp_0, but on the raw dataset without the PCA transformation.

In [106]:
X_train_raw = joblib.load("Datasets\\Raw Data\\X_train_raw.pkl")
X_test_raw = joblib.load("Datasets\\Raw Data\\X_test_raw.pkl")
X_train_raw.shape,X_test_raw.shape

((50000, 32, 32, 3), (10000, 32, 32, 3))

In [117]:
X_train_nopca,X_valid_nopca,y_train_nopca,y_valid_nopca = train_test_split(X_train_raw,y_train_raw,test_size=0.1,stratify=y_train_raw)

In [119]:
X_train_nopca.shape,X_valid_nopca.shape,y_train_nopca.shape,y_valid_nopca.shape

((45000, 32, 32, 3), (5000, 32, 32, 3), (45000,), (5000,))

In [110]:
# make_mlp only reads the input's shape, so no copy of the image array is
# needed; the labels come from the same no-PCA split as the inputs.
mlp_2 = make_mlp(X_train_nopca,y_train_nopca,20,100,flatten=True,
                 hid_activation='elu',hid_initializer='he_normal',
                 out_activation='softmax')

In [111]:
mlp_2.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 3072)              0         
                                                                 
 dense_168 (Dense)           (None, 100)               307300    
                                                                 
 dense_169 (Dense)           (None, 100)               10100     
                                                                 
 dense_170 (Dense)           (None, 100)               10100     
                                                                 
 dense_171 (Dense)           (None, 100)               10100     
                                                                 
 dense_172 (Dense)           (None, 100)               10100     
                                                                 
 dense_173 (Dense)           (None, 100)              

In [112]:
mlp_2_logdir = get_tf_logdir("mlp_2")
mlp_2_tfboard = keras.callbacks.TensorBoard(mlp_2_logdir)

In [126]:
mlp_2_early = keras.callbacks.EarlyStopping(patience=10)
mlp_2_opt = keras.optimizers.Nadam(learning_rate=0.001)
mlp_2_schedule = keras.callbacks.ReduceLROnPlateau(monitor='val_loss',factor=0.5,patience=3)

In [127]:
mlp_2.compile(optimizer=mlp_2_opt,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [128]:
mlp_2.fit(X_train_nopca,y_train_nopca,batch_size=500,epochs=200,
          callbacks=[mlp_2_tfboard,mlp_2_early,mlp_2_schedule],
          validation_data=[X_valid_nopca,y_valid_nopca])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x2a434c23b20>

In [129]:
mlp_2.evaluate(X_test_raw,y_test_raw)



[1.5838027000427246, 0.43309998512268066]

It seems my guess was wrong: PCA actually gives better test accuracy (0.4565 with PCA versus 0.4331 without).

### Saving model

In [224]:
dump_keras_model(mlp_2,filename="mlp_2_nopca.h5",save_weights=True,weights_precision='half')

## MLP (20 x 100, SELU)

SELU activation requires the input to be scaled to mean 0 and std 1.

In [9]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test_trans)

In [13]:
mlp_3 = keras.models.Sequential([keras.layers.Input((700,))])
# 20 hidden SELU layers of 100 units each, added with a plain loop rather
# than a list comprehension evaluated only for its side effects.
for _ in range(20):
    mlp_3.add(keras.layers.Dense(100,activation='selu',kernel_initializer='lecun_normal'))
# 10-way softmax output.
mlp_3.add(keras.layers.Dense(10,activation='softmax',kernel_initializer='lecun_normal'))

In [14]:
mlp_3.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_21 (Dense)            (None, 100)               70100     
                                                                 
 dense_22 (Dense)            (None, 100)               10100     
                                                                 
 dense_23 (Dense)            (None, 100)               10100     
                                                                 
 dense_24 (Dense)            (None, 100)               10100     
                                                                 
 dense_25 (Dense)            (None, 100)               10100     
                                                                 
 dense_26 (Dense)            (None, 100)               10100     
                                                                 
 dense_27 (Dense)            (None, 100)              

In [17]:
mlp_3_logdir = get_tf_logdir("mlp_3")
mlp_3_tfboard = keras.callbacks.TensorBoard(mlp_3_logdir)

In [18]:
mlp_3_early = keras.callbacks.EarlyStopping(patience=10)
mlp_3_opt = keras.optimizers.Nadam(learning_rate=0.001)
mlp_3_schedule = keras.callbacks.ReduceLROnPlateau(monitor='val_loss',factor=0.5,patience=3)

In [19]:
mlp_3.compile(optimizer=mlp_3_opt,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [20]:
mlp_3.fit(X_train_scaled,y_train,batch_size=500,epochs=200,
          callbacks=[mlp_3_tfboard,mlp_3_early,mlp_3_schedule],
          validation_data=[X_valid_scaled,y_valid])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200


<keras.callbacks.History at 0x2909dc34760>

In [21]:
mlp_3.evaluate(X_test_scaled,y_test_raw)



[2.4533722400665283, 0.40959998965263367]

### Saving model

In [23]:
dump_keras_model(mlp_3,filename="mlp_3_selu.h5",save_weights=True,weights_precision='half')

This model is not particularly useful, except that it converges faster than the one with batch normalization.

## MLP (20 x 100, Dropout)

Since the first model, mlp_0, performs the best but looks like it is overfitting, we will try applying Dropout regularization to mlp_0 and see if it improves the prediction accuracy.

In [53]:
mlp_4_dropout_rate = 0.05
mlp_4 = keras.models.Sequential([
    keras.layers.Input((700,))
])
# Dropout is applied to the inputs and again after every hidden layer.
mlp_4.add(keras.layers.Dropout(mlp_4_dropout_rate))
# Plain loop rather than a nested list comprehension evaluated only for its
# side effects; layers are added in the same Dense -> Dropout order.
for _ in range(20):
    mlp_4.add(keras.layers.Dense(100,activation='elu',kernel_initializer='he_normal'))
    mlp_4.add(keras.layers.Dropout(mlp_4_dropout_rate))
mlp_4.add(keras.layers.Dense(10,activation='softmax'))

In [54]:
mlp_4.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout_106 (Dropout)       (None, 700)               0         
                                                                 
 dense_148 (Dense)           (None, 100)               70100     
                                                                 
 dropout_107 (Dropout)       (None, 100)               0         
                                                                 
 dense_149 (Dense)           (None, 100)               10100     
                                                                 
 dropout_108 (Dropout)       (None, 100)               0         
                                                                 
 dense_150 (Dense)           (None, 100)               10100     
                                                                 
 dropout_109 (Dropout)       (None, 100)              

In [27]:
mlp_4_logdir = get_tf_logdir("mlp_4")
mlp_4_tfboard = keras.callbacks.TensorBoard(mlp_4_logdir)

In [55]:
mlp_4_early = keras.callbacks.EarlyStopping(patience=10)
mlp_4_opt = keras.optimizers.Nadam(learning_rate=0.001)
mlp_4_schedule = keras.callbacks.ReduceLROnPlateau(monitor='val_loss',factor=0.5,patience=3)

In [56]:
mlp_4.compile(optimizer=mlp_4_opt,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [57]:
mlp_4.fit(X_train,y_train,batch_size=500,epochs=200,
          callbacks=[mlp_4_tfboard,mlp_4_early,mlp_4_schedule],
          validation_data=[X_valid,y_valid])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200


<keras.callbacks.History at 0x290a34d0190>

In [58]:
mlp_4.evaluate(X_test_trans,y_test_raw)



[1.3993505239486694, 0.5192000269889832]

### Saving model

In [59]:
dump_keras_model(mlp_4,filename="mlp_4_dropout.h5",save_weights=True,weights_precision='half')

This model is not particularly useful, except that it converges faster than the one with batch normalization.

After regularizing the exact same model as mlp_0 with dropout, the test accuracy increases to 0.5192.\
Although that may not sound very impressive, this approach is still better than the other approaches tried in mlp_1 to mlp_3.

## Conclusion

The best MLP model that I have found so far for classifying the CIFAR10 dataset is as follows:

Model: Sequential (20 hidden layers with 100 neurons each)\
Activation: ELU\
Initialization: He Normal\
Regularization: Early Stopping, Dropout (rate = 0.05)\
Optimizer: Nadam\
Loss Function: Sparse Categorical Cross Entropy\
Learning Rate Schedule: Performance Scheduler (factor = 0.5, patience = 3)\
Output Activation: Softmax

Evaluation Accuracy: 0.5192