In [1]:
import os
import optuna
from optuna.trial import TrialState
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import datasets
from torchvision import transforms
from pathlib import Path
from optuna_dashboard import run_server

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# SQLite file backing the Optuna study. The default is the original
# machine-specific location; set OPTUNA_DB_PATH to use the notebook elsewhere.
DB_PATH = Path(
    os.environ.get(
        "OPTUNA_DB_PATH",
        "/Users/maryamhomayoon/PycharmProjects/optuna/optuna-examples/db.sqlite3",
    )
)
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
STORAGE = f"sqlite:///{DB_PATH.as_posix()}"

SEED = 42

DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {DEVICE}")
# Always seed the default generator: models are built before `.to(DEVICE)` and
# the training DataLoader shuffles, so seeding only the MPS generator would
# leave weight initialisation and batch order unseeded.
torch.manual_seed(SEED)
if DEVICE.type == "mps":
    torch.mps.manual_seed(SEED)

BATCHSIZE = 128  # samples per mini-batch for both loaders
CLASSES = 10     # number of output classes of the classifier head
DIR = os.getcwd()  # dataset root passed to torchvision
EPOCHS = 10      # training epochs per trial
# N_TRAIN_EXAMPLES = BATCHSIZE * 30
# N_VALID_EXAMPLES = BATCHSIZE * 10

Using device: mps


In [None]:
# run for simple model with only linear layer
def define_model(trial):
    """Sample a fully connected classifier for flattened 28x28 inputs.

    The trial chooses the number of hidden layers ("n_layers", 1-10) and the
    width of each hidden layer ("n_units_l{i}", 4-128). Every hidden layer is
    `Linear` followed by `ReLU`; the head is `Linear(..., CLASSES)` followed
    by `LogSoftmax(dim=1)`.

    Returns:
        (nn.Sequential, list[tuple[int, int]]): the model and the
        (in_features, out_features) pair of every `Linear` layer, in order,
        including the classifier head.
    """
    depth = trial.suggest_int("n_layers", 1, 10)

    # Layer widths from the input to the logits: [784, h_0, ..., h_{n-1}, CLASSES].
    widths = [28 * 28]
    for idx in range(depth):
        widths.append(trial.suggest_int(f"n_units_l{idx}", 4, 128))
    widths.append(CLASSES)

    in_out_features = list(zip(widths[:-1], widths[1:]))

    modules = []
    for fan_in, fan_out in in_out_features[:-1]:
        modules += [nn.Linear(fan_in, fan_out), nn.ReLU()]

    # Classifier head: no ReLU, log-probabilities over the classes.
    head_in, head_out = in_out_features[-1]
    modules += [nn.Linear(head_in, head_out), nn.LogSoftmax(dim=1)]

    return nn.Sequential(*modules), in_out_features

In [None]:
# Model with conv and pooling and linear layer
def define_model(trial):
    """Sample a conv / pool / linear classifier for 1x28x28 FashionMNIST images.

    Search space, per layer index ``i``:
      * layer 0 is always a convolution;
      * while the tensor is still spatial, "layer_type_{i}" picks one of
        conv, pool, global_pool or linear;
      * once the tensor has been flattened only linear layers are allowed.

    A final `Linear(..., CLASSES)` + `LogSoftmax(dim=1)` head is appended
    (the `Linear` is skipped only if global pooling was used and the feature
    count already equals CLASSES).

    NOTE: a later cell defines `define_model` again; whichever cell ran last
    is the one `objective` calls.

    Returns:
        (nn.Sequential, list[dict]): the model and one description per layer.
        conv2d / maxpool2d / global_avg_pool entries carry "input_shape" as
        the (channels, height, width) seen by the layer *before* it is
        applied; linear entries carry "in_features" / "out_features".

    Raises:
        optuna.exceptions.TrialPruned: if a pooling layer is sampled for a
            feature map smaller than 2x2 (the pooled output would be empty).
    """
    layers = []
    # FashionMNIST is 28x28 grayscale images
    in_channels = 1
    in_height = 28
    in_width = 28

    current_features = None
    # True while the activations are still (C, H, W); controls which layer
    # types may follow so that consecutive layers have compatible shapes.
    spatial_mode = True
    # conv -> global pooling -> classifier is a valid way to finish the network
    used_global_pool = False

    layer_descriptions = []

    n_layers = trial.suggest_int("n_layers", 1, 10)

    for layer_idx in range(n_layers):
        if layer_idx == 0:
            # The first layer always consumes the image directly.
            layer_type = "conv"
        elif spatial_mode:
            layer_type = trial.suggest_categorical(
                f"layer_type_{layer_idx}",
                ["conv", "pool", "global_pool", "linear"]
            )
        else:
            # Once spatial structure is gone, only Linear is allowed
            layer_type = "linear"

        if layer_type == "conv":
            out_channels = trial.suggest_categorical(
                f"conv_out_channels_{layer_idx}", [16, 32, 64, 128, 256, 512]
            )
            kernel_size = trial.suggest_int(
                f"conv_kernel_{layer_idx}", 1, 7, step=2  # odd kernels only
            )
            stride = trial.suggest_categorical(
                f"conv_stride_{layer_idx}", [1, 2]
            )

            # Shape seen by this layer, captured before it is overwritten.
            c_in, h_in, w_in = in_channels, in_height, in_width

            # padding="same" is rejected by nn.Conv2d when stride != 1, so use
            # the explicit equivalent for odd kernels: output = ceil(input / stride).
            padding = kernel_size // 2

            layers.append(
                nn.Conv2d(
                    in_channels=c_in,
                    out_channels=out_channels,
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=padding,
                )
            )
            layers.append(nn.ReLU())

            in_channels = out_channels
            in_height = (h_in + stride - 1) // stride
            in_width = (w_in + stride - 1) // stride

            layer_descriptions.append({
                "type": "conv2d",
                "out_channels": out_channels,
                "kernel_size": kernel_size,
                "stride": stride,
                "input_shape": (c_in, h_in, w_in),
            })

        elif layer_type == "pool":
            c_in, h_in, w_in = in_channels, in_height, in_width

            # A 2x2 / stride-2 pool on a map smaller than 2x2 yields an empty
            # tensor; reject the architecture instead of failing mid-training.
            if min(h_in, w_in) < 2:
                raise optuna.exceptions.TrialPruned(
                    f"layer {layer_idx}: cannot max-pool a {h_in}x{w_in} feature map"
                )

            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))

            in_height = h_in // 2
            in_width = w_in // 2
            # channels unchanged

            layer_descriptions.append({
                "type": "maxpool2d",
                "kernel_size": 2,
                "stride": 2,
                "input_shape": (c_in, h_in, w_in),
            })

        elif layer_type == "global_pool":
            c_in, h_in, w_in = in_channels, in_height, in_width

            layers.append(nn.AdaptiveAvgPool2d((1, 1)))
            # (N, C, 1, 1) -> (N, C) so that a following Linear sees C features.
            layers.append(nn.Flatten())

            current_features = c_in
            in_channels = None
            in_height = None
            in_width = None

            spatial_mode = False
            used_global_pool = True

            layer_descriptions.append({
                "type": "global_avg_pool",
                "features": current_features,
                "input_shape": (c_in, h_in, w_in),
            })

        else:  # linear
            if spatial_mode:
                layers.append(nn.Flatten())
                current_features = in_channels * in_height * in_width
                spatial_mode = False

                in_channels = None
                in_height = None
                in_width = None

            out_features = trial.suggest_int(
                f"linear_out_{layer_idx}", 16, 128
            )

            layers.append(nn.Linear(current_features, out_features))
            layers.append(nn.ReLU())

            layer_descriptions.append({
                "type": "linear",
                "in_features": current_features,
                "out_features": out_features,
            })

            current_features = out_features

    # Final classifier head
    if used_global_pool:
        # If global pooling was used, the features may already match CLASSES
        if current_features != CLASSES:
            layers.append(nn.Linear(current_features, CLASSES))
            layer_descriptions.append({
                "type": "linear",
                "in_features": current_features,
                "out_features": CLASSES,
            })
    else:
        # No global pool -> must flatten (if still spatial) and use Linear
        if spatial_mode:
            layers.append(nn.Flatten())
            current_features = in_channels * in_height * in_width

        layers.append(nn.Linear(current_features, CLASSES))
        layer_descriptions.append({
            "type": "linear",
            "in_features": current_features,
            "out_features": CLASSES,
        })

    layers.append(nn.LogSoftmax(dim=1))

    return nn.Sequential(*layers), layer_descriptions

In [3]:
def define_model(trial):
    """Sample a conv / pool / linear classifier for 1x28x28 FashionMNIST images.

    Search space, per layer index ``i``:
      * layer 0 is always a convolution;
      * while the tensor is still spatial, "layer_type_{i}" picks one of
        conv, pool, global_pool or linear;
      * once the tensor has been flattened only linear layers are allowed.

    A final `Linear(..., CLASSES)` + `LogSoftmax(dim=1)` head is appended
    (the `Linear` is skipped only if global pooling was used and the feature
    count already equals CLASSES).

    Returns:
        (nn.Sequential, list[dict]): the model and one description per layer.
        conv2d / maxpool2d / global_avg_pool entries carry "input_shape" as
        the (channels, height, width) seen by the layer before it is applied;
        linear entries carry "in_features" / "out_features".

    Raises:
        optuna.exceptions.TrialPruned: if a pooling layer is sampled for a
            feature map smaller than 2x2. Repeated halving of a 28x28 input
            can reach 1x1, and pooling that again would produce a 0x0 map
            that fails in the forward pass and aborts the whole study.
    """
    layers = []
    in_channels = 1
    in_height = 28
    in_width = 28

    current_features = None
    spatial_mode = True        # activations are still (C, H, W)
    used_global_pool = False

    layer_descriptions = []

    n_layers = trial.suggest_int("n_layers", 1, 10)

    for layer_idx in range(n_layers):
        if layer_idx == 0:
            layer_type = "conv"
        elif spatial_mode:
            layer_type = trial.suggest_categorical(
                f"layer_type_{layer_idx}",
                ["conv", "pool", "global_pool", "linear"]
            )
        else:
            # Already flattened: only linear layers make sense.
            layer_type = "linear"

        if layer_type == "conv":
            out_channels = trial.suggest_categorical(
                f"conv_out_channels_{layer_idx}", [16, 32, 64, 128, 256, 512]
            )
            kernel_size = trial.suggest_int(
                f"conv_kernel_{layer_idx}", 1, 7, step=2  # odd only
            )
            stride = trial.suggest_categorical(
                f"conv_stride_{layer_idx}", [1, 2]
            )

            # Save input shape BEFORE the layer (used for the latency estimate)
            c_in, h_in, w_in = in_channels, in_height, in_width

            # With an odd kernel this padding gives output = ceil(input / stride)
            # for both stride 1 and stride 2.
            padding = kernel_size // 2

            layers.append(
                nn.Conv2d(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=padding,
                )
            )
            layers.append(nn.ReLU())

            # Track the output size with the same ceil rule.
            out_h = (h_in + stride - 1) // stride
            out_w = (w_in + stride - 1) // stride

            in_channels = out_channels
            in_height = out_h
            in_width = out_w

            layer_descriptions.append({
                "type": "conv2d",
                "out_channels": out_channels,
                "kernel_size": kernel_size,
                "stride": stride,
                "input_shape": (c_in, h_in, w_in),
            })

        elif layer_type == "pool":
            c_in, h_in, w_in = in_channels, in_height, in_width

            # A 2x2 / stride-2 pool needs at least a 2x2 input; reject the
            # architecture up front rather than crash during training.
            if min(h_in, w_in) < 2:
                raise optuna.exceptions.TrialPruned(
                    f"layer {layer_idx}: cannot max-pool a {h_in}x{w_in} feature map"
                )

            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))

            in_height = h_in // 2
            in_width = w_in // 2

            layer_descriptions.append({
                "type": "maxpool2d",
                "kernel_size": 2,
                "stride": 2,
                "input_shape": (c_in, h_in, w_in),
            })

        elif layer_type == "global_pool":
            c_in, h_in, w_in = in_channels, in_height, in_width

            layers.append(nn.AdaptiveAvgPool2d((1, 1)))
            layers.append(nn.Flatten())  # (N, C, 1, 1) -> (N, C) so Linear works

            current_features = c_in

            spatial_mode = False
            used_global_pool = True

            # After flatten, no more spatial dims
            in_channels = None
            in_height = None
            in_width = None

            layer_descriptions.append({
                "type": "global_avg_pool",
                "features": current_features,
                "input_shape": (c_in, h_in, w_in),
            })

        else:  # linear
            if spatial_mode:
                # First linear layer after spatial layers: flatten C*H*W.
                layers.append(nn.Flatten())
                current_features = in_channels * in_height * in_width
                spatial_mode = False

                in_channels = None
                in_height = None
                in_width = None

            out_features = trial.suggest_int(
                f"linear_out_{layer_idx}", 16, 128
            )

            layers.append(nn.Linear(current_features, out_features))
            layers.append(nn.ReLU())

            layer_descriptions.append({
                "type": "linear",
                "in_features": current_features,
                "out_features": out_features,
            })

            current_features = out_features

    # Final classifier head
    if used_global_pool:
        if current_features != CLASSES:
            layers.append(nn.Linear(current_features, CLASSES))
            layer_descriptions.append({
                "type": "linear",
                "in_features": current_features,
                "out_features": CLASSES,
            })
    else:
        if spatial_mode:
            layers.append(nn.Flatten())
            current_features = in_channels * in_height * in_width

        layers.append(nn.Linear(current_features, CLASSES))
        layer_descriptions.append({
            "type": "linear",
            "in_features": current_features,
            "out_features": CLASSES,
        })

    layers.append(nn.LogSoftmax(dim=1))

    return nn.Sequential(*layers), layer_descriptions

In [4]:
def estimate_linear_latency(in_features, out_features, num_cores=1):
    """Return an operation-count cost proxy for training one `Linear` layer.

    The forward cost is one operation per weight plus one per bias. Backward
    is charged at twice the forward cost (gradients w.r.t. input, weight and
    bias) and the Adam update at three times the forward cost. The total is
    divided by ``num_cores``.

    Returns:
        float: ``6 * (in_features * out_features + out_features) / num_cores``.
    """
    forward_ops = in_features * out_features + out_features  # weights + bias
    phase_costs = (
        forward_ops,      # forward pass
        2 * forward_ops,  # backward pass
        3 * forward_ops,  # optimizer update
    )
    return sum(phase_costs) / num_cores

In [5]:
def estimate_conv_latency(input_channels, output_channels, input_height, input_width, kernel_size, stride=1, num_cores=1):
    """Return an operation-count cost proxy for training one conv layer.

    The output size is taken as ``ceil(input / stride)`` per spatial axis.
    Each output value costs ``input_channels * kernel_size**2 + 1`` operations
    (the ``+ 1`` is the bias). Backward is charged at twice the forward cost
    and the optimizer at three times the forward cost; the total is divided
    by ``num_cores``.

    NOTE(review): the optimizer term scales with the number of output
    positions rather than with the number of kernel weights -- confirm this
    is the intended model before comparing conv and linear layers.

    Returns:
        float: ``6 * forward / num_cores``.
    """
    def _strided_extent(extent):
        # ceil(extent / stride) using integer arithmetic
        return (extent + stride - 1) // stride

    out_values = (
        output_channels
        * _strided_extent(input_height)
        * _strided_extent(input_width)
    )
    ops_per_value = input_channels * (kernel_size * kernel_size) + 1  # + bias

    forward_ops = ops_per_value * out_values
    phase_costs = (forward_ops, 2 * forward_ops, 3 * forward_ops)
    return sum(phase_costs) / num_cores

In [6]:
def estimate_maxpool_latency(channels, input_height, input_width, kernel_size=2, stride=2, num_cores=1):
    """Return an operation-count cost proxy for one `MaxPool2d` layer.

    The output size is ``input // stride`` per spatial axis and every output
    value costs ``kernel_size**2 - 1`` comparisons. Backward is charged the
    same as forward; there is no optimizer term. The total is divided by
    ``num_cores``.

    Returns:
        float: ``2 * forward / num_cores``.
    """
    pooled_values = channels * (input_height // stride) * (input_width // stride)
    comparisons_per_value = kernel_size * kernel_size - 1

    one_pass = comparisons_per_value * pooled_values
    # forward + backward, no parameters to update
    return (one_pass + one_pass) / num_cores

In [7]:
def estimate_global_avg_pool_latency(channels, input_height, input_width, num_cores=1):
    """Return an operation-count cost proxy for one global average pooling layer.

    Forward is charged one operation per input value
    (``channels * input_height * input_width``) and backward the same amount;
    there is no optimizer term. The total is divided by ``num_cores``.

    Returns:
        float: ``2 * channels * input_height * input_width / num_cores``.
    """
    values_per_channel = input_height * input_width
    one_pass = channels * values_per_channel
    # forward (sum + divide) and backward (spread the gradient) cost the same
    return (one_pass + one_pass) / num_cores

In [8]:
def get_mnist():
    """Build the FashionMNIST train and test data loaders.

    Both datasets live under ``DIR`` and are converted with `ToTensor`; the
    training split is downloaded if it is missing. Batches hold ``BATCHSIZE``
    samples; only the training loader shuffles.

    Returns:
        (DataLoader, DataLoader): ``(train_loader, test_loader)``.
    """
    train_set = datasets.FashionMNIST(
        DIR, train=True, download=True, transform=transforms.ToTensor()
    )
    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=BATCHSIZE, shuffle=True
    )

    test_set = datasets.FashionMNIST(
        DIR, train=False, transform=transforms.ToTensor()
    )
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size=BATCHSIZE, shuffle=False
    )

    return train_loader, test_loader

In [None]:
# run for simple linear model
def objective(trial):
    """Train the sampled fully connected model; return ``(accuracy, latency)``.

    Expects `define_model` to be the linear variant, i.e. to return the model
    together with a list of ``(in_features, out_features)`` pairs. The latency
    is the sum of `estimate_linear_latency` over those pairs and is also stored
    on the trial as the user attribute "latency_ms_est". The model is trained
    for ``EPOCHS`` epochs with Adam (lr=0.001) and NLL loss; the accuracy on
    the test split after the last epoch is returned.
    """
    # Generate the model.
    model, out = define_model(trial)
    model = model.to(DEVICE)

    # The optimizer is fixed (not part of the search space).
    lr = 0.001
    optimizer = optim.Adam(model.parameters(), lr=lr)

    latency = 0.0
    for in_f, out_f in out:
        # estimate_linear_latency takes no batch argument; passing one raised
        # TypeError before any training could start.
        latency += estimate_linear_latency(in_f, out_f, num_cores=1)
    trial.set_user_attr("latency_ms_est", latency)

    # Get the FashionMNIST dataset.
    train_loader, valid_loader = get_mnist()

    # Training of the model.
    for epoch in range(EPOCHS):
        model.train()
        for data, target in train_loader:
            # The model only has linear layers, so flatten each image first.
            data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)

            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

        # Validation of the model.
        model.eval()
        correct = 0
        with torch.no_grad():
            for data, target in valid_loader:
                data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)
                output = model(data)
                # Get the index of the max log-probability.
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / len(valid_loader.dataset)

    return accuracy, latency

In [9]:
# run for conv model
def objective(trial):
    """Train the sampled conv model; return ``(accuracy, latency)``.

    The latency is the sum of the per-layer estimates for every entry of the
    layer descriptions returned by `define_model` (conv2d, maxpool2d,
    global_avg_pool and linear; any other type contributes nothing). It is
    also stored on the trial as the user attribute "latency_ms_est". The model
    is trained for ``EPOCHS`` epochs with Adam (lr=0.001) and NLL loss; the
    accuracy on the test split after the last epoch is returned.
    """
    model, layer_specs = define_model(trial)
    model = model.to(DEVICE)

    # Fixed optimizer; it is not part of the search space.
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # --- analytic cost of the architecture -------------------------------
    latency = 0.0
    for spec in layer_specs:
        kind = spec["type"]
        if kind == "linear":
            latency += estimate_linear_latency(spec["in_features"], spec["out_features"])
        elif kind in ("conv2d", "maxpool2d", "global_avg_pool"):
            # All spatial layers record the (C, H, W) shape they receive.
            channels, height, width = spec["input_shape"]
            if kind == "conv2d":
                latency += estimate_conv_latency(
                    input_channels=channels,
                    output_channels=spec["out_channels"],
                    input_height=height,
                    input_width=width,
                    kernel_size=spec["kernel_size"],
                    stride=spec["stride"],
                )
            elif kind == "maxpool2d":
                latency += estimate_maxpool_latency(
                    channels=channels,
                    input_height=height,
                    input_width=width,
                    kernel_size=spec["kernel_size"],
                    stride=spec["stride"],
                )
            else:
                latency += estimate_global_avg_pool_latency(
                    channels=channels, input_height=height, input_width=width
                )

    trial.set_user_attr("latency_ms_est", latency)

    # --- data ------------------------------------------------------------
    train_loader, valid_loader = get_mnist()

    # --- train / evaluate --------------------------------------------------
    for _ in range(EPOCHS):
        model.train()
        for images, labels in train_loader:
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)

            optimizer.zero_grad()
            loss = F.nll_loss(model(images), labels)
            loss.backward()
            optimizer.step()

        model.eval()
        correct = 0
        with torch.no_grad():
            for images, labels in valid_loader:
                images = images.to(DEVICE)
                labels = labels.to(DEVICE)
                # Predicted class = index of the largest log-probability.
                predicted = model(images).argmax(dim=1, keepdim=True)
                correct += predicted.eq(labels.view_as(predicted)).sum().item()

        # Overwritten every epoch; the value from the final epoch is returned.
        accuracy = correct / len(valid_loader.dataset)

    return accuracy, latency

In [10]:
# Two objectives, in the order `objective` returns them:
# accuracy (maximize) and estimated latency (minimize).
# load_if_exists=True resumes the stored study of the same name.
study = optuna.create_study(
    study_name="first try for the conv model",
    storage=STORAGE,
    load_if_exists=True,
    directions=["maximize", "minimize"],
)
# Stop after 100 trials or 600 seconds, whichever comes first.
study.optimize(objective, timeout=600, n_trials=100)

# pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
# complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

# print("Study statistics: ")
# print("  Number of finished trials: ", len(study.trials))
# print("  Number of pruned trials: ", len(pruned_trials))
# print("  Number of complete trials: ", len(complete_trials))

# best_trials = study.best_trials
# print(f"\nNumber of Pareto-optimal trials: {len(best_trials)}")

# for t in best_trials:
#     print(f"  Values: accuracy={t.values[0]:.4f}, latency={t.values[1]:.4f}")
#     print("  Params:")
#     for k, v in t.params.items():
#         print(f"    {k}: {v}")

[I 2026-02-07 12:23:38,955] Using an existing study with name 'first try for the conv model' instead of creating a new one.
[I 2026-02-07 12:24:02,577] Trial 1 finished with values: [0.902, 5009820.0] and parameters: {'n_layers': 3, 'conv_out_channels_0': 16, 'conv_kernel_0': 7, 'conv_stride_0': 2, 'layer_type_1': 'pool', 'layer_type_2': 'conv', 'conv_out_channels_2': 512, 'conv_kernel_2': 1, 'conv_stride_2': 1}.
[I 2026-02-07 12:25:33,685] Trial 2 finished with values: [0.8702, 50061618.0] and parameters: {'n_layers': 8, 'conv_out_channels_0': 128, 'conv_kernel_0': 1, 'conv_stride_0': 1, 'layer_type_1': 'linear', 'linear_out_1': 81, 'linear_out_2': 17, 'linear_out_3': 56, 'linear_out_4': 108, 'linear_out_5': 32, 'linear_out_6': 28, 'linear_out_7': 34}.
[I 2026-02-07 12:26:07,838] Trial 3 finished with values: [0.8896, 17191320.0] and parameters: {'n_layers': 8, 'conv_out_channels_0': 32, 'conv_kernel_0': 7, 'conv_stride_0': 2, 'layer_type_1': 'conv', 'conv_out_channels_1': 64, 'conv_k

In [11]:
# Start the Optuna Dashboard server for the studies stored in STORAGE.
# The server keeps running inside this cell (the output below reports
# http://localhost:8080/ and "Hit Ctrl-C to quit"), so interrupt the kernel
# to stop it before running further cells.
run_server(STORAGE)

Bottle v0.13.4 server starting up (using WSGIRefServer())...
Listening on http://localhost:8080/
Hit Ctrl-C to quit.

127.0.0.1 - - [07/Feb/2026 12:34:47] "GET / HTTP/1.1" 302 0
127.0.0.1 - - [07/Feb/2026 12:34:47] "GET /dashboard HTTP/1.1" 200 4145
127.0.0.1 - - [07/Feb/2026 12:34:48] "GET /static/bundle.js HTTP/1.1" 200 4140378
127.0.0.1 - - [07/Feb/2026 12:34:48] "GET /favicon.ico HTTP/1.1" 200 7670
127.0.0.1 - - [07/Feb/2026 12:34:48] "GET /api/studies HTTP/1.1" 200 314
  study, target=target, evaluator=PedAnovaImportanceEvaluator()
127.0.0.1 - - [07/Feb/2026 12:34:50] "GET /api/studies/2/param_importances?evaluator=ped_anova HTTP/1.1" 200 835
127.0.0.1 - - [07/Feb/2026 12:34:50] "GET /api/studies/2?after=0 HTTP/1.1" 200 52815
127.0.0.1 - - [07/Feb/2026 12:34:50] "GET /api/meta HTTP/1.1" 200 64
127.0.0.1 - - [07/Feb/2026 12:34:50] "GET /api/studies/2/param_importances?evaluator=ped_anova HTTP/1.1" 200 835
127.0.0.1 - - [07/Feb/2026 12:35:00] "GET /api/studies/2?after=11 HTTP/1.1" 2