In [1]:
from tensorflow.keras import datasets
import tensorflow as tf
import optuna

# loading CIFAR100 dataset
(train_images, train_labels), (test_images, test_labels) = datasets.cifar100.load_data()
def train_augment(image, label):
    # normalize image
    image = tf.cast(image, tf.float32) / 255.0
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_flip_up_down(image)
    image = tf.image.rot90(image, k=tf.random.uniform(shape=[], minval=0, maxval=4, dtype=tf.int32))
    image = tf.image.random_crop(image, size=[32, 32, 3])
    
    return image, label  # Return both image and label

def test_augment(image, label):
    return image, label

In [2]:
from sklearn.model_selection import train_test_split
# splitting validation set and test set
test_images, val_images, test_labels, val_labels = train_test_split(test_images, test_labels, test_size=0.5, random_state=42)

In [3]:
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    .map(train_augment, num_parallel_calls=tf.data.AUTOTUNE)  # Now works correctly
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
val_ds = (
    tf.data.Dataset.from_tensor_slices((val_images, val_labels))
    .map(test_augment, num_parallel_calls=tf.data.AUTOTUNE)  # Ensure label is passed correctly
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

test_ds = (
    tf.data.Dataset.from_tensor_slices((test_images, test_labels))
    .map(test_augment, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)


Finding best architecture complexity between 1, 2 and 3 conv layers

In [4]:
def conv_block(k, filter_size = (3, 3), dropout=0, regularization=0):
    block = [
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(k, filter_size, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(regularization)),
    ]
    if dropout:
        block.append(tf.keras.layers.Dropout(dropout))
    return block

def final_block(dropout = 0, regularization=0):
    block = [
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(regularization)),
    ]
    if dropout:
        block.append(tf.keras.layers.Dropout(dropout))
    block.append(tf.keras.layers.Dense(100, activation='softmax'))
    return block


In [5]:
def create_model(num_layers,base = 5, kernels = (3, 3), regularization=0, dropout_conv=0, dropout_dense=0):
    inputs = tf.keras.layers.Input(shape=(32, 32, 3))
    x = tf.keras.layers.Conv2D(2**base, kernels, activation='relu')(inputs)
    
    for layer_number in range(num_layers):
        for sub_layer in conv_block(2**(base+layer_number), kernels, dropout_conv, regularization):
            try :
                x = sub_layer(x)
            except ValueError:
                pass
            
    for sub_layer in final_block(dropout_dense,regularization):
        x = sub_layer(x)
    model = tf.keras.Model(inputs=inputs, outputs=x)
        
    return model

In [6]:
model = create_model(3, kernels = (7,7), regularization=1, dropout_conv=1, dropout_dense=1)

In [7]:
def objective(trial):
    lr = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
    model = create_model(num_layers = trial.suggest_int('num_layers', 0, 3), 
                         base = trial.suggest_int('base', 4, 7),
                         kernels = trial.suggest_categorical("kernels", [(3, 3), (5, 5), (7, 7)]),
                         regularization = trial.suggest_float('regularization', 1e-6, 1e-3, log=True),
                         dropout_conv = trial.suggest_float('dropout_conv', 0, 0.3),
                         dropout_dense = trial.suggest_float('dropout_dense', 0, 0.3),
                         )
                         
                    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.fit(train_ds, epochs=10, validation_data=(val_ds), verbose=0)
    return model.evaluate(test_ds, verbose=0)[1]

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Run Optuna optimization
study = optuna.create_study(direction="maximize")  
study.optimize(objective, n_trials=50)
# Best hyperparameters
print("Best hyperparameters:", study.best_params)

[I 2025-02-12 12:27:07,458] A new study created in memory with name: no-name-7254ab8d-18bf-4a4e-9174-a6c5d733cf14
[I 2025-02-12 12:30:34,111] Trial 0 finished with value: 0.06639999896287918 and parameters: {'learning_rate': 1.5095468077916711e-05, 'num_layers': 3, 'base': 6, 'kernels': (7, 7), 'regularization': 0.0001979669373286028, 'dropout_conv': 0.2923313796841476, 'dropout_dense': 0.12002978223693585}. Best is trial 0 with value: 0.06639999896287918.
[I 2025-02-12 12:32:53,007] Trial 1 finished with value: 0.06419999897480011 and parameters: {'learning_rate': 0.00011472347754379394, 'num_layers': 1, 'base': 7, 'kernels': (3, 3), 'regularization': 3.198444845007553e-05, 'dropout_conv': 0.05891105885245885, 'dropout_dense': 0.17764114906686554}. Best is trial 0 with value: 0.06639999896287918.
[I 2025-02-12 12:56:29,270] Trial 2 finished with value: 0.05040000006556511 and parameters: {'learning_rate': 0.0005482136313599965, 'num_layers': 1, 'base': 6, 'kernels': (3, 3), 'regulariz

Best hyperparameters: {'learning_rate': 0.00032430897891660293, 'num_layers': 0, 'base': 5, 'kernels': (7, 7), 'regularization': 5.162667587400127e-05, 'dropout_conv': 0.11100109380399757, 'dropout_dense': 0.15221621493494913}


Optuna optimizes hyperparameters using search algorithms, such as:

1️⃣ Bayesian Optimization (TPE - Tree-structured Parzen Estimator)
- Learns from past trials to suggest better hyperparameters.
- Efficient for complex search spaces.

2️⃣ Grid Search & Random Search
- Random search: Tries random values in the defined range.
- Grid search: Tests all possible combinations (not recommended for large search spaces).

3️⃣ Pruning (Early Stopping)
- Stops bad trials early to save compute time.

In [10]:
import pickle

# Save the model
with open('best_params.pkl', 'wb') as f:
    pickle.dump(study.best_params, f)

In [11]:
import pandas as pd
pd.DataFrame(study.trials_dataframe()).to_csv('study.csv', index=False)

In [12]:
pd.DataFrame(study.best_params).iloc[:1].T

Unnamed: 0,0
learning_rate,0.000324
num_layers,0.0
base,5.0
kernels,7.0
regularization,5.2e-05
dropout_conv,0.111001
dropout_dense,0.152216


In [13]:
best_params = study.best_params.copy()
del best_params['learning_rate']
model = create_model(**best_params)

In [14]:
# WANDB
import wandb
from wandb.integration.keras import WandbMetricsLogger

# Start a new W&B run
wandb.init(project="cifar100")

# learning rate reducer
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6
)
# Early stopping for f1
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

[34m[1mwandb[0m: Currently logged in as: [33mmlahmadmostafa[0m ([33mmlahmadmostafa-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [15]:
class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.true_positives = self.add_weight(name='tp', initializer='zeros')
        self.false_positives = self.add_weight(name='fp', initializer='zeros')
        self.false_negatives = self.add_weight(name='fn', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_pred = tf.round(tf.clip_by_value(y_pred, 0, 1))
        y_true = tf.cast(y_true, tf.float32)

        tp = tf.reduce_sum(tf.cast(y_true * y_pred, tf.float32))
        fp = tf.reduce_sum(tf.cast((1 - y_true) * y_pred, tf.float32))
        fn = tf.reduce_sum(tf.cast(y_true * (1 - y_pred), tf.float32))

        self.true_positives.assign_add(tp)
        self.false_positives.assign_add(fp)
        self.false_negatives.assign_add(fn)

    def result(self):
        precision = self.true_positives / (self.true_positives + self.false_positives + tf.keras.backend.epsilon())
        recall = self.true_positives / (self.true_positives + self.false_negatives + tf.keras.backend.epsilon())
        return 2 * ((precision * recall) / (precision + recall + tf.keras.backend.epsilon()))

    def reset_states(self):
        self.true_positives.assign(0)
        self.false_positives.assign(0)
        self.false_negatives.assign(0)


In [16]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=study.best_params['learning_rate']), 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy', F1Score()])

model.fit(train_ds, epochs=100, validation_data=(val_ds), 
          callbacks=[WandbMetricsLogger(), early_stopping, lr_scheduler])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100


<keras.callbacks.History at 0x2150e5d8280>