In [53]:
import torch
import random
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
import os
from torch import nn

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import cv2

import json
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset

from sklearn.utils.class_weight import compute_class_weight

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler

# Local application/library imports
from utils import load_search_space

import optuna

from sklearn.metrics import (
    RocCurveDisplay, PrecisionRecallDisplay,
    ConfusionMatrixDisplay, roc_auc_score, average_precision_score
)

## DATASET

In [3]:
SEED = 64

# Set random seeds
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Dataset Info
# adult_income_cleaned, framingham_cleaned, preprocessed_heloc, diabetes
dataset_name = 'boston'        
dataset_subpath = 'Regression/boston'       
task_type = 'Regression'

In [None]:
# Dataset Info
# adult_income_cleaned, framingham_cleaned, preprocessed_heloc, diabetes
dataset_name = 'cmc'        
dataset_subpath = 'Multiclass/cmc'       
task_type = 'Multiclass'

In [None]:
# Dataset Info
# adult_income_cleaned, framingham_cleaned, preprocessed_heloc, diabetes
dataset_name = 'preprocessed_heloc'        
dataset_subpath = 'Binary/heloc'       
task_type = 'Binary'

In [5]:
df = pd.read_csv(f"./data/{dataset_subpath}/{dataset_name}.csv")

In [6]:
df.shape 

(1232, 15)

In [7]:
df.head()

Unnamed: 0,Team,League,Year,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,RS
0,ARI,NL,2012,688,81,0.328,0.418,0.259,0,___null___,___null___,162,0.317,0.415,734
1,ATL,NL,2012,600,94,0.32,0.389,0.247,1,4.0,5.0,162,0.306,0.378,700
2,BAL,AL,2012,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403,712
3,BOS,AL,2012,806,69,0.315,0.415,0.26,0,___null___,___null___,162,0.331,0.428,734
4,CHC,NL,2012,759,61,0.302,0.378,0.24,0,___null___,___null___,162,0.335,0.424,613


In [8]:
reduce = True if len(df) > 20000 else False

## LOAD AND PREPROCESS

In [9]:
def prepare_target_tensor(y, task):
    task = task.lower()
    if isinstance(y, pd.Series):
        y = y.to_numpy()
    elif isinstance(y, list):
        y = np.array(y)
        
    if task == "regression" or task == "binary":
        return torch.as_tensor(y, dtype=torch.float32).reshape(-1, 1)
    elif task == "multiclass":
        return torch.as_tensor(y, dtype=torch.long)
    else:
        raise ValueError(f"Unsupported task type: {task}")

In [10]:
import os, json
from typing import Optional, Tuple, Union
import cv2
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.utils.class_weight import compute_class_weight

def _read_split_rgb(images_folder: str, split: str, problem_type: str) -> np.ndarray:
    """Read RGB uint8 images for a split based on <split>/<problem_type>.csv (column 'images')."""
    csv_path = os.path.join(images_folder, split, f"{problem_type}.csv")
    df = pd.read_csv(csv_path)
    img_paths = [os.path.join(images_folder, split, p) for p in df["images"].tolist()]
    imgs = []
    for p in img_paths:
        im = cv2.imread(p, cv2.IMREAD_COLOR)
        if im is None:
            raise FileNotFoundError(f"Could not read image: {p}")
        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)  # HxWx3 uint8
        imgs.append(im)
    return np.stack(imgs, axis=0)  # [N,H,W,3] uint8 (assumes same size as you stated)

def _pad_constant_right_bottom_batch(
    imgs_uint8: np.ndarray,
    target_size: Union[int, Tuple[int, int]],
    fill_rgb01: np.ndarray
) -> np.ndarray:
    """
    Constant pad (right/bottom) to target size with fill = TRAIN mean RGB in [0,1].
    Input:  imgs_uint8 [N,H,W,3] uint8
    Output: float32 [N,3,Ht,Wt] in [0,1]
    """
    if isinstance(target_size, int):
        tw, th = target_size, target_size
    else:
        tw, th = int(target_size[0]), int(target_size[1])

    N = imgs_uint8.shape[0]
    out = np.empty((N, 3, th, tw), dtype=np.float32)
    fill = fill_rgb01.reshape(1, 1, 3)  # (1,1,3) in [0,1]

    for i in range(N):
        im01 = imgs_uint8[i].astype(np.float32) / 255.0  # [H,W,3] in [0,1]
        h, w, _ = im01.shape
        if w > tw or h > th:
            raise ValueError(f"Image {w}x{h} larger than target {tw}x{th}. Increase target_size or resize upstream.")
        canvas = np.empty((th, tw, 3), dtype=np.float32)
        canvas[:] = fill
        canvas[:h, :w, :] = im01
        out[i] = np.transpose(canvas, (2, 0, 1))
    return out

def load_and_preprocess_data(
    df, dataset_name, images_folder,
    problem_type, task_type,
    seed: int = 42, batch_size: int = 32, device: str = 'cpu',
    pad_images: bool = False, target_size: Optional[Union[int, Tuple[int, int]]] = None,
):
    task_type = task_type.lower()

    # ----- Config -----
    with open(f"./configs/preprocess/{dataset_name}.json") as f:
        config = json.load(f)
    categorical_cols = config["categorical_cols"]
    numerical_cols = config["numerical_cols"]
    encoding = config["encoding"]

    # ----- Features / target -----
    X = df[numerical_cols + categorical_cols].copy()
    y = df.iloc[:, -1].copy()

    le = None
    if encoding.get("target") == "label":
        le = LabelEncoder()
        y = le.fit_transform(y)
        label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    else:
        label_mapping = None

    # ----- Splits (70/15/15) -----
    if task_type == "regression":
        X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=seed)
        X_val_raw,   X_test_raw, y_val,  y_test  = train_test_split(X_temp_raw, y_temp, test_size=0.5, random_state=seed)
    else:
        X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=seed, stratify=y)
        X_val_raw,   X_test_raw, y_val,  y_test  = train_test_split(
            X_temp_raw, y_temp, test_size=0.5, random_state=seed, stratify=y_temp
        )

    # ----- Class weights (optional) -----
    class_weight = None
    if task_type in ["binary", "multiclass"]:
        cw_vals = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
        classes_sorted = np.sort(np.unique(y_train))
        if task_type == "binary":
            wd = dict(zip(classes_sorted, cw_vals))
            pos_weight = wd[1] / wd[0]
            class_weight = torch.tensor(pos_weight, dtype=torch.float32)
            print(f"Binary pos_weight (for BCEWithLogitsLoss): {class_weight.item():.6f}")
        else:
            class_weight = torch.tensor(cw_vals, dtype=torch.float32)
            print(f"Multiclass class weights (for CrossEntropyLoss): {class_weight.tolist()}")

    # ----- ColumnTransformer (fit on TRAIN only) -----
    transformers = []
    if encoding.get("numerical_features") == "minmax":
        transformers.append(("num", MinMaxScaler(), numerical_cols))
    elif encoding.get("numerical_features") == "standard":
        transformers.append(("num", StandardScaler(), numerical_cols))
    if categorical_cols and encoding.get("categorical_features") == "onehot":
        transformers.append(("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_cols))

    if transformers:
        preprocessor = ColumnTransformer(transformers=transformers)
        X_train = preprocessor.fit_transform(X_train_raw)
        X_val   = preprocessor.transform(X_val_raw)
        X_test  = preprocessor.transform(X_test_raw)

        if "cat" in preprocessor.named_transformers_:
            cat_feature_names = preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_cols)
            all_feature_names = numerical_cols + list(cat_feature_names)
        else:
            all_feature_names = numerical_cols + categorical_cols

        X_train_num = pd.DataFrame(X_train, columns=all_feature_names, index=X_train_raw.index)
        X_val_num   = pd.DataFrame(X_val,   columns=all_feature_names, index=X_val_raw.index)
        X_test_num  = pd.DataFrame(X_test,  columns=all_feature_names, index=X_test_raw.index)
    else:
        all_feature_names = numerical_cols + categorical_cols
        X_train_num = pd.DataFrame(X_train_raw, columns=all_feature_names, index=X_train_raw.index)
        X_val_num   = pd.DataFrame(X_val_raw,   columns=all_feature_names, index=X_val_raw.index)
        X_test_num  = pd.DataFrame(X_test_raw,  columns=all_feature_names, index=X_test_raw.index)

    print(f"Shapes — Train: {X_train_num.shape}, Val: {X_val_num.shape}, Test: {X_test_num.shape}")
    print(f"Numerical features: {len(numerical_cols)} — {numerical_cols}")
    print(f"Categorical features: {len(categorical_cols)} — {categorical_cols}")
    print(f"Total features: {X_train_num.shape[1]}")
    if label_mapping:
        print(f"Target label mapping: {label_mapping}")

    # ----- Images (uint8 RGB) -----
    X_train_img_u8 = _read_split_rgb(images_folder, "train", problem_type)
    X_val_img_u8   = _read_split_rgb(images_folder, "val",   problem_type)
    X_test_img_u8  = _read_split_rgb(images_folder, "test",  problem_type)

    # ----- Optional padding with TRAIN mean (no normalization) -----
    if pad_images:
        if target_size is None:
            raise ValueError("pad_images=True requires target_size (int or (W,H)).")
        train_mean_rgb01 = (X_train_img_u8.astype(np.float32) / 255.0).reshape(-1, 3).mean(axis=0).astype(np.float32)
        X_train_arr = _pad_constant_right_bottom_batch(X_train_img_u8, target_size, train_mean_rgb01)
        X_val_arr   = _pad_constant_right_bottom_batch(X_val_img_u8,   target_size, train_mean_rgb01)
        X_test_arr  = _pad_constant_right_bottom_batch(X_test_img_u8,  target_size, train_mean_rgb01)

        if isinstance(target_size, int):
            tw = th = int(target_size)
        else:
            tw, th = int(target_size[0]), int(target_size[1])
        imgs_shape = (3, th, tw)
    else:
        # Scale to [0,1] and convert to NCHW.
        X_train_arr = (X_train_img_u8.astype(np.float32) / 255.0).transpose(0, 3, 1, 2)
        X_val_arr   = (X_val_img_u8.astype(np.float32)   / 255.0).transpose(0, 3, 1, 2)
        X_test_arr  = (X_test_img_u8.astype(np.float32)  / 255.0).transpose(0, 3, 1, 2)
        _, C, H, W = X_train_arr.shape
        imgs_shape = (C, H, W)

    # ----- Tensors & DataLoaders -----
    X_train_num_tensor = torch.as_tensor(X_train_num.values, dtype=torch.float32)
    X_val_num_tensor   = torch.as_tensor(X_val_num.values,   dtype=torch.float32)
    X_test_num_tensor  = torch.as_tensor(X_test_num.values,  dtype=torch.float32)

    X_train_img_tensor = torch.from_numpy(X_train_arr)  # float32 [0,1], NCHW
    X_val_img_tensor   = torch.from_numpy(X_val_arr)
    X_test_img_tensor  = torch.from_numpy(X_test_arr)

    y_train_tensor = prepare_target_tensor(y_train, task_type)
    y_val_tensor   = prepare_target_tensor(y_val,   task_type)
    y_test_tensor  = prepare_target_tensor(y_test,  task_type)

    train_dataset = TensorDataset(X_train_num_tensor, X_train_img_tensor, y_train_tensor)
    val_dataset   = TensorDataset(X_val_num_tensor,   X_val_img_tensor,   y_val_tensor)
    test_dataset  = TensorDataset(X_test_num_tensor,  X_test_img_tensor,  y_test_tensor)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  pin_memory=True)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, pin_memory=True)
    test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False, pin_memory=True)

    attributes = X_train_num.shape[1]
    print("Images shape (C,H,W):", imgs_shape)
    print("Attributes:", attributes)

    return train_loader, val_loader, test_loader, attributes, imgs_shape, le, class_weight


## MODEL ARCHITECTURES

In [11]:
def find_divisors(n):
    divisors = []
    for i in range(1, int(n**0.5) + 1):
        if n % i == 0:
            divisors.append(i)
            if i != n // i:  # Check to include both divisors if they are not the same
                divisors.append(n // i)
    divisors.sort()
    return divisors

### Vision Transformer

In [12]:
from models.vit_pytorch.simple_vit_with_register_tokens import ViT

In [13]:
import torch
from torch import nn

# --- helpers ---
def get_act(name: str):
    return nn.ReLU if str(name).lower() == "relu" else nn.GELU

class ViTMLP(nn.Module):
    """
    Hybrid ViT + Tabular MLP (simple concat fusion, no projections, no LayerNorm).
    Required params:
      ViT: patch_size, dim, depth, heads, mlp_dim, vit_dropout, vit_emb_dropout
      Tab: activation, mlp_hidden_dims, tab_dropout
      Fusion: fusion_hidden_dims, fusion_dropout
    """
    def __init__(self, imgs_shape, num_input_dim, params, task, num_classes=None):
        super().__init__()
        act = get_act(params["activation"])

        # ---- ViT branch (pooled vector of size dim) ----
        self.vit = ViT(
            image_size=imgs_shape,
            patch_size=params["patch_size"],
            dim=params["dim"],
            depth=params["depth"],
            heads=params["heads"],
            mlp_dim=params["mlp_dim"] * params["dim"],  # expansion ratio → hidden dim
            dropout=params["vit_dropout"],
            emb_dropout=params["vit_emb_dropout"]
        )
        vit_out_dim = int(params["dim"])

        # ---- Tabular MLP branch (no LayerNorm) ----
        tab_layers, in_dim = [], int(num_input_dim)
        tab_dropout = float(params.get("tab_dropout", 0.0))
        for h in params["mlp_hidden_dims"]:
            h = int(h)
            tab_layers += [
                nn.Linear(in_dim, h),
                act(),
                nn.Dropout(tab_dropout) if tab_dropout > 0 else nn.Identity()
            ]
            in_dim = h
        self.tabular_mlp = nn.Sequential(*tab_layers)
        tab_out_dim = in_dim if tab_layers else int(num_input_dim)

        # ---- Simple concat fusion head (no LayerNorm) ----
        fused_in_dim = vit_out_dim + tab_out_dim
        fusion_layers = []
        fusion_dropout = float(params.get("fusion_dropout", 0.0))
        for h in params["fusion_hidden_dims"]:
            h = int(h)
            fusion_layers += [
                nn.Linear(fused_in_dim, h),
                act(),
                nn.Dropout(fusion_dropout) if fusion_dropout > 0 else nn.Identity()
            ]
            fused_in_dim = h

        out_dim = 1 if task in ("regression", "binary") else int(num_classes)
        fusion_layers.append(nn.Linear(fused_in_dim, out_dim))
        self.fusion_mlp = nn.Sequential(*fusion_layers)

    def forward(self, num_input, vit_input):
        vit_feat = self.vit(vit_input)         # (B, dim)
        tab_feat = self.tabular_mlp(num_input) # (B, tab_out)
        fused = torch.cat([vit_feat, tab_feat], dim=1)
        return self.fusion_mlp(fused)

### CNN

In [14]:
import math
import torch
import torch.nn as nn

# Reuse your get_act helper
def get_act(name: str):
    return nn.ReLU if str(name).lower() == "relu" else nn.GELU

# ---------------- Stem ----------------
class UnifiedStem(nn.Module):
    """
    - '3x3' stem: safe for tiny images (3x3, 5x5, 32x32).
    - '7x7' stem (+ optional maxpool): classic ImageNet style for large images.
    Only apply 7x7+stride2 when max(H,W) >= 64; otherwise fallback to 3x3.
    """
    def __init__(self, C, stem_width, stem_type="3x3", use_maxpool=True, H=None, W=None):
        super().__init__()
        large = (max(H or 0, W or 0) >= 64)
        if stem_type == "7x7" and large:
            layers = [
                nn.Conv2d(C, stem_width, kernel_size=7, stride=2, padding=3, bias=False),
                nn.BatchNorm2d(stem_width),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, padding=1) if use_maxpool else nn.Identity(),
            ]
        else:
            layers = [
                nn.Conv2d(C, stem_width, kernel_size=3, stride=1, padding=1, bias=False),
                nn.BatchNorm2d(stem_width),
                nn.ReLU(inplace=True),
            ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

# --------------- Basic Block ---------------
class BasicBlock(nn.Module):
    expansion = 1
    def __init__(self, in_ch, out_ch, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1   = nn.BatchNorm2d(out_ch)
        self.relu  = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2   = nn.BatchNorm2d(out_ch)

        self.down = None
        if stride != 1 or in_ch != out_ch:
            self.down = nn.Sequential(
                nn.Conv2d(in_ch, out_ch, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_ch),
            )

    def forward(self, x):
        identity = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        if self.down is not None:
            identity = self.down(identity)
        out = self.relu(out + identity)
        return out

# --------------- ResNet Backbone (features only) ---------------
class ResNetBackboneAnySize(nn.Module):
    """
    Classic ResNet (BasicBlocks), size-agnostic via AdaptiveAvgPool2d(1).
    Outputs a feature vector (no classifier).
    """
    def __init__(self, params, imgs_shape):
        super().__init__()
        C, H, W = imgs_shape
        assert params["in_channels"] == C, "in_channels must match imgs_shape[0]"

        # knobs
        stem_type     = params["stem_type"]
        use_maxpool   = params["use_maxpool"]
        stem_width    = params["stem_width"]
        blocks_ps     = params["blocks_per_stage"]  # e.g., "[2,2,2,2]"
        n_stages      = len(blocks_ps)
        base_width    = params["base_width"]
        width_mul     = params["width_mul"]

        # stem
        self.stem = UnifiedStem(C, stem_width, stem_type=stem_type, use_maxpool=use_maxpool, H=H, W=W)

        # stage widths
        B = int(base_width * width_mul)
        all_out = [B, B*2, B*4, B*8][:n_stages]
        blocks_ps = [max(1, int(x)) for x in list(blocks_ps)[:n_stages]]

        # approximate current spatial size after stem
        curH, curW = H, W
        if stem_type == "7x7" and max(H, W) >= 64:
            curH = max(1, curH // 2)
            curW = max(1, curW // 2)
            if use_maxpool:
                curH = max(1, curH // 2)
                curW = max(1, curW // 2)

        in_planes = stem_width
        layers = []

        def can_downsample(h, w):
            return (h >= 4 and w >= 4)

        for si in range(n_stages):
            out_planes = all_out[si]
            n_blocks   = blocks_ps[si]
            stride = 2 if (si > 0 and can_downsample(curH, curW)) else 1
            layers.append(BasicBlock(in_planes, out_planes, stride=stride))
            in_planes = out_planes
            if stride == 2:
                curH = max(1, curH // 2)
                curW = max(1, curW // 2)
            for _ in range(n_blocks - 1):
                layers.append(BasicBlock(in_planes, out_planes, stride=1))

        self.features = nn.Sequential(*layers)
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.flat = nn.Flatten()

        # infer feature dim
        with torch.no_grad():
            dummy = torch.zeros(1, C, max(1, H), max(1, W))
            x = self.stem(dummy)
            x = self.features(x)
            x = self.pool(x)
            x = self.flat(x)
            self.feat_dim = x.shape[1]

    def forward(self, x):
        x = self.stem(x)
        x = self.features(x)
        x = self.pool(x)
        x = self.flat(x)
        return x  # (B, feat_dim)

# CNN

def get_act(name: str):
    return nn.ReLU if str(name).lower() == "relu" else nn.GELU

class CNNMLP(nn.Module):
    """
    CNN backbone -> MLP head (mirrors ViTMLP: encoder + MLP head).
    Uses your ResNetBackboneAnySize.
    """
    def __init__(self, imgs_shape, num_input_dim, params, task, num_classes=None, device="cuda"):
        super().__init__()
        self.task = task.lower()
        act = get_act(params.get("activation", "relu"))

        # Backbone (reuse your class; params must include cnn knobs)
        self.backbone = ResNetBackboneAnySize(params, imgs_shape)
        feat_dim = self.backbone.feat_dim
        
        # ---- Tabular MLP branch ----
        tab_layers, in_dim = [], num_input_dim
        tab_dropout = float(params["tab_dropout"])
        for h in params["mlp_hidden_dims"]:
            tab_layers += [
                nn.Linear(in_dim, h),
                act(),
                nn.Dropout(tab_dropout) if tab_dropout > 0 else nn.Identity()
            ]
            in_dim = h
        self.tab_mlp = nn.Sequential(*tab_layers)
        tab_out = in_dim if tab_layers else num_input_dim

        # ---- Fusion (concat) ----
        fused_dim = feat_dim + tab_out
        fusion_layers = []
        fusion_dropout = float(params["fusion_dropout"])
        for h in params["fusion_hidden_dims"]:
            fusion_layers += [
                nn.Linear(fused_dim, h),
                act(),
                nn.Dropout(fusion_dropout) if fusion_dropout > 0 else nn.Identity()
            ]
            fused_dim = h

        out_dim = 1 if task in ("regression", "binary") else num_classes
        fusion_layers.append(nn.Linear(fused_dim, out_dim))
        self.fusion = nn.Sequential(*fusion_layers)

    def forward(self, num_input, img_input):
        cnn_feat = self.backbone(img_input)
        tab_feat = self.tab_mlp(num_input) if len(self.tab_mlp) else num_input
        fused = torch.cat([cnn_feat, tab_feat], dim=1)
        out = self.fusion(fused)
        return out


### Resnet50

In [15]:
import torchvision.models as models

In [16]:
class ResNetMLP(nn.Module):
    def __init__(self, imgs_shape, num_input_dim, params, task_type, num_classes=None):
        super(ResNetMLP, self).__init__()

        # Load a ResNet50 with or without pretrained weights
        base_resnet = models.resnet50(weights=None)

        self.resnet_backbone = nn.Sequential(*list(base_resnet.children())[:-1])  # (B, 2048, 1, 1)
        self.flatten = nn.Flatten()  # Converts (B, 2048, 1, 1) → (B, 2048)

        # Tabular MLP branch
        tabular_layers = []
        input_dim = num_input_dim
        for hidden_dim in params["mlp_hidden_dims"]:
            tabular_layers.append(nn.Linear(input_dim, hidden_dim))
            tabular_layers.append(nn.ReLU())
            input_dim = hidden_dim
        self.tabular_mlp = nn.Sequential(*tabular_layers)

        # Create a dummy image based on the input image shape to calculate the output size
        dummy_img = torch.randn(4, *imgs_shape)  # (B, 3, H, W)
        with torch.no_grad():
            # Pass the dummy image through ResNet to get feature map
            img_feat = self.resnet_backbone(dummy_img)  
            resnet_output_dim = self.flatten(img_feat)

        # Fusion MLP head (ResNet features + Tabular MLP)
        fusion_input_dim = resnet_output_dim.shape[1] + input_dim
        fusion_layers = []
        for hidden_dim in params.get("fusion_hidden_dims", [128]):
            fusion_layers.append(nn.Linear(fusion_input_dim, hidden_dim))
            fusion_layers.append(nn.ReLU())
            fusion_input_dim = hidden_dim

        output_dim = 1 if task_type in ["regression", "binary"] else num_classes
        fusion_layers.append(nn.Linear(fusion_input_dim, output_dim))
        self.fusion_mlp = nn.Sequential(*fusion_layers)

        # Output activation
        self.activation = nn.Identity()

    def forward(self, num_input, img_input):
        # ResNet feature extraction
        img_feat = self.resnet_backbone(img_input)  # (B, 2048, 1, 1)
        img_feat = self.flatten(img_feat)           # (B, 2048)

        # Tabular feature extraction
        tab_feat = self.tabular_mlp(num_input)      # (B, D_tabular)

        # Fusion and classification
        fusion = torch.cat([img_feat, tab_feat], dim=1)

        output = self.fusion_mlp(fusion)

        return self.activation(output)

## COMPILE AND FIT

In [17]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import joblib
import os
import gc
import copy

from models.utils import get_loss_fn, calculate_metrics, calculate_metrics_hybrid, calculate_metrics_hybrid_manuel, calculate_metrics_from_numpy, get_class_weighted_loss_fn

In [18]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch.optim.lr_scheduler import OneCycleLR
import matplotlib.pyplot as plt
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import os

def compile_and_fit(model, train_loader, val_loader, test_loader, dataset_name, 
                    model_name, image_name, trial_name=None, task='regression', epochs=200, max_lr=1, 
                    div_factor=10, final_div_factor=1, device='cuda', weight_decay=1e-2, pct_start=0.3, save_model=False, class_weights=None, save_dir=None, study=None, patch=None, verbose=False):
    model = model.to(device)
    
    if class_weights != None:
        loss_fn = get_class_weighted_loss_fn(task, class_weights)
    else:
        loss_fn = get_loss_fn(task)

    # Compute min_lr from max_lr and div_factor
    min_lr = max_lr / div_factor

    optimizer = optim.AdamW(model.parameters(), lr=min_lr, weight_decay=weight_decay)
    
    total_steps = epochs * len(train_loader)
    scheduler = OneCycleLR(optimizer, max_lr=max_lr, div_factor=div_factor, final_div_factor=final_div_factor, total_steps=total_steps, pct_start=pct_start, anneal_strategy="cos")
    
    best_val_loss = float('inf')
    best_model = None
    best_epoch = 0
    #early_stopping_counter = 0
    #patience = 10  # Early stopping patience

    history = {'train_loss': [], 'val_loss': [], 'learning_rate': [], 'epoch_time': []}

    if task == 'regression':
        history.update({'train_mse': [],  'val_mse': [], 'train_mae': [],  'val_mae': [], 'train_rmse': [], 'val_rmse': [], 'train_r2': [], 'val_r2': []})
    elif task in ['binary', 'multiclass']:
        history.update({'train_accuracy': [], 'val_accuracy': [], 'train_precision': [], 'val_precision': [], 'train_recall': [], 'val_recall': [], 'train_f1': [], 'val_f1': []})

    start_time = time.time()
    
    for epoch in range(epochs):
        epoch_start_time = time.time()

        model.train()
        train_loss = 0.0
        train_preds = []
        train_targets = []

        for num_data, img_data, targets in train_loader:
            num_data, img_data, targets = num_data.to(device, non_blocking=True), img_data.to(device, non_blocking=True), targets.to(device, non_blocking=True)
            
            optimizer.zero_grad()
            outputs = model(num_data, img_data)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            scheduler.step()
            
            train_loss += loss.item()
            train_preds.extend(outputs.cpu().detach().numpy())
            train_targets.extend(targets.cpu().numpy())


        train_loss /= len(train_loader)
        if task == 'multiclass':
            y_train_pred = np.vstack(train_preds)
            y_train_true = train_targets
        else:
            y_train_pred = np.concatenate(train_preds)
            y_train_true = np.concatenate(train_targets)
        train_metrics = calculate_metrics_from_numpy(y_train_true, y_train_pred, task)

        model.eval()
        val_loss = 0.0
        val_preds = []
        val_targets = []
        with torch.no_grad():
            for num_data, img_data, targets in val_loader:
                num_data, img_data, targets = num_data.to(device, non_blocking=True), img_data.to(device, non_blocking=True), targets.to(device, non_blocking=True)
                outputs = model(num_data, img_data)
                loss = loss_fn(outputs, targets)
                
                val_loss += loss.item()
                val_preds.extend(outputs.cpu().numpy())
                val_targets.extend(targets.cpu().numpy())

        val_loss /= len(val_loader)
        if task == 'multiclass':
            y_val_pred = np.vstack(val_preds)
            y_val_true = val_targets
        else:
            y_val_pred = np.concatenate(val_preds)
            y_val_true = np.concatenate(val_targets)
        val_metrics = calculate_metrics_from_numpy(y_val_true, y_val_pred, task)
        
        # Get the current learning rate
        current_lr = scheduler.get_last_lr()

        epoch_time = time.time() - epoch_start_time

        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['learning_rate'].append(current_lr)
        history['epoch_time'].append(epoch_time)

        for k, v in train_metrics.items():
            history[f'train_{k}'].append(v)
        for k, v in val_metrics.items():
            history[f'val_{k}'].append(v)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = copy.deepcopy(model.state_dict())
            best_epoch = epoch + 1
            #early_stopping_counter = 0
        #else:
        #    early_stopping_counter += 1
        #    if early_stopping_counter >= patience:
        #        print(f"Early stopping at epoch {epoch + 1}")
        #        break

    total_time = time.time() - start_time
    model.load_state_dict(best_model)

    # Recompute metrics using the best model
    train_metrics, y_true_train, y_pred_train, y_prob_train = calculate_metrics_hybrid(model, train_loader, device, class_weights, task)
    val_metrics, y_true_val, y_pred_val, y_prob_val  = calculate_metrics_hybrid(model, val_loader, device, class_weights, task)
    test_metrics, y_true_test, y_pred_test, y_prob_test = calculate_metrics_hybrid(model, test_loader, device, class_weights, task)

    # Store recomputed metrics
    metrics = {
        'train_loss': train_metrics['loss'],
        'val_loss': val_metrics['loss'],
        'test_loss': test_metrics['loss'],
        'min_lr': min_lr,
        'max_lr': max_lr,
        'total_time': total_time,
        'average_epoch_time': sum(history['epoch_time']) / len(history['epoch_time'])
    }

    # Add task-specific metrics
    for k in train_metrics:
        if k != 'loss':
            metrics[f'train_{k}'] = train_metrics[k]
    for k in val_metrics:
        if k != 'loss':
            metrics[f'val_{k}'] = val_metrics[k]
    for k in test_metrics:
        if k != 'loss':
            metrics[f'test_{k}'] = test_metrics[k]
            
    if verbose:     
        print(f"\nTraining completed in {total_time:.2f} seconds")
        print(f"Best model found at epoch {best_epoch}/{epochs}")
        print(f"Best Train Loss: {metrics['train_loss']:.4f}, Best Val Loss: {metrics['val_loss']:.4f}")
        print(metrics)
    
    if save_model:
        if model_name == "CNN_hybrid":
            save_path = os.path.join(save_dir, f"{model_name}/{image_name}/best_model/{trial_name}")
        else:
            save_path = os.path.join(save_dir, f"{model_name}/{image_name}/best_model/{trial_name}")
        os.makedirs(save_path, exist_ok=True)

        plot_metric(history['train_loss'], history['val_loss'], 'Loss', save_path)
        if task == 'regression':
            plot_metric(history['train_mse'], history['val_mse'], 'MSE', save_path)
            plot_metric(history['train_rmse'], history['val_rmse'], 'RMSE', save_path)
        else:
            plot_metric(history['train_accuracy'], history['val_accuracy'], 'Accuracy', save_path)
            plot_metric(history['train_f1'], history['val_f1'], 'F1', save_path)

        plot_learning_rate(history['learning_rate'], save_path)

        # Save metrics
        os.makedirs(save_path, exist_ok=True)
        with open(f'{save_path}/best_model_metrics.txt', 'w') as f:
            for key, value in metrics.items():
                f.write(f'{key}: {value}\n')

        # Save model
        torch.save(best_model, f"{save_path}/best_model.pth")
        print(f"Best model saved to {save_path}/best_model.pth")

        # Additional plots for classification
        if task in ["binary"]:
            plot_extra("Train", y_true_train, y_pred_train, y_prob_train, save_path)
            plot_extra("Validation", y_true_val, y_pred_val, y_prob_val, save_path)
            plot_extra("Test", y_true_test, y_pred_test, y_prob_test, save_path)

    del model
    torch.cuda.empty_cache()
    gc.collect()

    return metrics


def plot_extra(split_name, y_true, y_pred, y_prob, save_path):
    y_true = y_true.ravel()
    y_pred = y_pred.ravel()

    # ROC Curve
    RocCurveDisplay.from_predictions(y_true, y_prob)
    auc_score = roc_auc_score(y_true, y_prob)
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random')
    plt.title(f"{split_name} ROC Curve (AUC = {auc_score:.2f})")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(save_path, f"{split_name.lower()}_roc_curve.png"))
    plt.close("all")

    # Precision-Recall Curve
    PrecisionRecallDisplay.from_predictions(y_true, y_prob)
    avg_prec = average_precision_score(y_true, y_prob)
    plt.title(f"{split_name} PR Curve (AP = {avg_prec:.2f})")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid(True)
    plt.savefig(os.path.join(save_path, f"{split_name.lower()}_pr_curve.png"))
    plt.close("all")

    # Normalized confusion matrix
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred, normalize='true').plot(cmap='Blues')
    plt.title(f"{split_name} Confusion Matrix (Normalized)")
    plt.grid(False)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.savefig(os.path.join(save_path, f"{split_name.lower()}_confusion_matrix_normalized.png"))
    plt.close("all")

    # Raw confusion matrix
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred, normalize=None).plot(cmap='Blues')
    plt.title(f"{split_name} Confusion Matrix (Counts)")
    plt.grid(False)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.savefig(os.path.join(save_path, f"{split_name.lower()}_confusion_matrix_counts.png"))
    plt.close("all")


def plot_metric(train_metric, val_metric, metric_name, save_path):
    plt.figure()
    plt.plot(train_metric, label=f'Train {metric_name}')
    plt.plot(val_metric, label=f'Validation {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()
    plt.title(f'{metric_name} vs. Epoch')
    save_path = f"{save_path}/{metric_name.lower()}_plot.png"
    plt.savefig(save_path)
    plt.close("all")

def plot_learning_rate(learning_rates, save_path):
    plt.figure()
    plt.plot(learning_rates)
    plt.xlabel('Epoch')
    plt.ylabel('Learning Rate')
    plt.title('Learning Rate vs. Epoch')
    save_path = f"{save_path}/learning_rate_plot.png"
    plt.savefig(save_path)
    plt.close("all")

# EXPERIMENTS

## HyViT

In [19]:
save_dir =  os.path.join("logs", task_type, dataset_name)
model_name = "ViT_hybrid"

# Load config
with open(f"./configs/preprocess/{dataset_name}.json") as f:
    config = json.load(f)
batch_size = config["batch_size"]

epochs = 50
n_trials = 50

if task_type.lower() == 'multiclass':
    num_classes = df.iloc[:,-1].nunique()
else:
    num_classes = 1

device='cuda:0' if torch.cuda.is_available() else 'cpu'

In [20]:
save_dir =  os.path.join("logs", task_type, dataset_name)
model_name = "CNN_hybrid"

# Load config
with open(f"./configs/preprocess/{dataset_name}.json") as f:
    config = json.load(f)
batch_size = config["batch_size"]

epochs = 50
n_trials = 50

if task_type.lower() == 'multiclass':
    num_classes = df.iloc[:,-1].nunique()
else:
    num_classes = 1

device='cuda:0' if torch.cuda.is_available() else 'cpu'

In [21]:
def objective(trial, model_name, image_name, task_type, 
              train_loader, val_loader, test_loader,
              divisors, attributes, imgs_shape, num_classes=None,
              device='cuda', save_dir=None, class_weight=None, epochs=100, path_vision=None, path_mlp=None):
    
    if model_name == "ViT_hybrid":
        task = task_type.lower()
        
        params = load_search_space(model_name, trial)

        with open(f"{path_vision}/best_params.json", "r") as f:
            params_best_vit = json.load(f)
            
        params_best_vit = {
            k: v for k, v in params_best_vit.items()
            if k in ["patch_size", "dim", "depth", "heads", "mlp_dim", "dropout", "emb_dropout"]
        }
        
        with open(f"{path_mlp}/best_params.json", "r") as f:
            params_best_mlp = json.load(f)
            
        params_best_mlp = {
            k: v for k, v in params_best_mlp.items()
            if k in ["mlp_hidden_dims", "dropout"]
        }
        
        params_best_vit["vit_dropout"] = params_best_vit.pop("dropout")
        params_best_vit["vit_emb_dropout"] = params_best_vit.pop("emb_dropout")
        params_best_mlp["tab_dropout"] = params_best_mlp.pop("dropout")
            
        params = {**params, **params_best_vit, **params_best_mlp}

        params["mlp_hidden_dims"] = json.loads(params["mlp_hidden_dims"])

        params["fusion_hidden_dims"] = json.loads(params["fusion_hidden_dims"])

        with open(f"configs/optuna_search/{model_name}.json", "r") as f:
            full_config = json.load(f)

        config = full_config[model_name]["fit"]  # Access the model key

        # Initialize model
        model = ViTMLP(imgs_shape[1], attributes, params, task, num_classes)
    else:
        task = task_type.lower()

        params = load_search_space(model_name, trial)

        with open(f"{path_vision}/best_params.json", "r") as f:
            params_best_cnn = json.load(f)
            
        params_best_cnn = {
            k: v for k, v in params_best_cnn.items()
            if k in ["in_channels", "activation", "stem_type", "use_maxpool", "stem_width",
                     "blocks_per_stage", "base_width", "width_mul"]
        }
        
        with open(f"{path_mlp}/best_params.json", "r") as f:
            params_best_mlp = json.load(f)
        
        params_best_mlp = {
            k: v for k, v in params_best_mlp.items()
            if k in ["mlp_hidden_dims", "dropout"]
        }
        
        params_best_mlp["tab_dropout"] = params_best_mlp.pop("dropout")
        
        with open(f"configs/optuna_search/{model_name}.json", "r") as f:
            full_config = json.load(f)
            
        params = {**params, **params_best_cnn, **params_best_mlp}
            
        # parse head dims safely (keeps your JSON format)
        params["mlp_hidden_dims"] = json.loads(params["mlp_hidden_dims"])
        
        params["fusion_hidden_dims"] = json.loads(params["fusion_hidden_dims"])
                    
        params["blocks_per_stage"] = json.loads(params["blocks_per_stage"])
            
        config = full_config[model_name]["fit"]  # Access the model key
        
        # Build and train model
        model = CNNMLP(imgs_shape, attributes, params, task, num_classes)
        
    metrics = compile_and_fit(
        model,
        train_loader, val_loader, test_loader,
        dataset_name=dataset_name,
        model_name=f"trial_{trial.number}",
        image_name=image_name,
        task=task,  # assumed to be defined externally
        max_lr=trial.suggest_float("max_lr", config["max_lr"][1], config["max_lr"][2], log=True),
        div_factor=trial.suggest_int("div_factor", config["div_factor"][1], config["div_factor"][2]),
        final_div_factor=trial.suggest_int("final_div_factor", config["final_div_factor"][1], config["final_div_factor"][2]),
        weight_decay=trial.suggest_float("weight_decay", config["weight_decay"][1], config["weight_decay"][2], log=True),
        pct_start=trial.suggest_float("pct_start", config["pct_start"][1], config["pct_start"][2]),
        epochs=epochs,
        save_model=False,
        class_weights=class_weight
    )

    save_dir = os.path.join(save_dir, model_name, image_name, "optuna")
    os.makedirs(save_dir, exist_ok=True)

    if task == 'regression':
        score = metrics["val_rmse"]
        with open(f"{save_dir}/optuna_trials_log.txt", "a") as f:
            f.write(f"Trial {trial.number} - VAL-RMSE: {score:.4f}, Params: {params}\n")
            f.write("=" * 60 + "\n")
    
    elif task == 'binary':
        score = metrics["val_roc_auc"]
        with open(f"{save_dir}/optuna_trials_log.txt", "a") as f:
            f.write(f"Trial {trial.number} - VAL-AUC: {score:.4f}, Params: {params}\n")
            f.write("=" * 60 + "\n")

    elif task == 'multiclass':
        score = metrics["val_accuracy"]
        with open(f"{save_dir}/optuna_trials_log.txt", "a") as f:
            f.write(f"Trial {trial.number} - VAL-Accuracy: {score:.4f}, Params: {params}\n")
            f.write("=" * 60 + "\n")
    else:
        raise ValueError(f"Unsupported task type: {task_type}")
    
    return score



In [22]:
import random
import numpy as np
import torch

def set_model_seed(seed: int):
    # Python built-in RNG
    random.seed(seed)
    # NumPy RNG
    np.random.seed(seed)
    # Torch RNG
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you use multi-GPU
    
    # For reproducibility
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


In [23]:
# === benchmark_eval.py ========================================================
from numbers import Number
import os
import json
import numpy as np
import optuna
import torch
from torch import nn

# -----------------------------------------------------------------------------
# NEW: optional calflops (robust fallback if not installed)
# -----------------------------------------------------------------------------
from calflops import calculate_flops as _calflops_calc
_HAVE_CALFLOPS = True

# -----------------------------------------------------------------------------
# Config (adjust as needed)
# -----------------------------------------------------------------------------
TOP_K = 5
SINGLE_PASS_SEED = 0              # seed for one-time eval of top-K
FINAL_SEEDS = [0, 1, 2, 3, 4]     # seeds for the final winner
FULL_EPOCHS = 100

# -----------------------------------------------------------------------------
# Helpers
# -----------------------------------------------------------------------------
def _count_params(model: nn.Module, trainable_only: bool = False) -> int:
    if trainable_only:
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
    return sum(p.numel() for p in model.parameters())

def _ensure_dir(p: str):
    os.makedirs(p, exist_ok=True)
    return p

def _is_minimize_study(study):
    try:
        return study.direction == optuna.study.StudyDirection.MINIMIZE
    except Exception:
        try:
            return study.directions[0] == optuna.study.StudyDirection.MINIMIZE
        except Exception:
            return True  # fallback

def primary_val_key_for_task(task_type: str):
    t = task_type.lower()
    if t == "regression":   # lower is better
        return "val_rmse", True
    if t == "binary":       # higher is better
        return "val_roc_auc", False
    if t == "multiclass":   # higher is better
        return "val_accuracy", False
    return "val_loss", True

def _sort_trials(trials, minimize: bool):
    return sorted(trials, key=lambda t: t.value, reverse=not minimize)

def _metric_or_default(m: dict, key: str, minimize: bool):
    if key in m and isinstance(m[key], (Number, np.floating, np.integer)):
        return float(m[key])
    return (np.inf if minimize else -np.inf)

# -----------------------------------------------------------------------------
# NEW: FLOPs/MACs helpers (hybrid-aware)
# -----------------------------------------------------------------------------
def _humanize(n: float, unit: str = "") -> str:
    try:
        n = float(n)
        for u in ["", "K", "M", "G", "T", "P"]:
            if abs(n) < 1000.0:
                return f"{n:.3f}{u}{unit}"
            n /= 1000.0
        return f"{n:.3f}E{unit}"
    except Exception:
        return str(n)

def _try_flops_single_input(model: nn.Module, input_shape):
    """
    calflops on a single-input submodule (e.g., vit, tabular_mlp, fusion_mlp).
    Returns dict or None.
    """
    if not _HAVE_CALFLOPS:
        return None
    try:
        flops, macs, params_cf = _calflops_calc(
            model=model,
            input_shape=tuple(int(x) for x in input_shape),
            output_as_string=False
        )
        return {
            "flops": float(flops),
            "macs": float(macs),
            "params_from_calflops": float(params_cf),
            "flops_str": _humanize(flops),
            "macs_str": _humanize(macs),
            "input_shape": tuple(int(x) for x in input_shape),
            "tool": "calflops"
        }
    except Exception:
        return None

def _probe_branch_dims(model, imgs_shape, attributes):
    """
    Infer (visual_out_dim, tabular_out_dim) by a tiny no-grad forward.
    Handles:
      - ViTMLP: model.vit + model.tabular_mlp (may be empty)
      - CNNMLP: model.backbone (with .feat_dim or forward) + model.tab_mlp (may be empty)
      - Fallback: model.cnn if present
    """
    with torch.no_grad():
        dev = next(model.parameters()).device

        # Resolve image shape (C,H,W)
        if isinstance(imgs_shape, (list, tuple)) and len(imgs_shape) >= 3:
            C, H, W = int(imgs_shape[-3]), int(imgs_shape[-2]), int(imgs_shape[-1])
        else:
            C, H, W = 3, 224, 224

        dummy_img = torch.zeros(1, C, H, W, device=dev)
        dummy_tab = torch.zeros(1, int(attributes), device=dev)

        # ---- Visual branch out dim ----
        if hasattr(model, "vit"):
            vis_feat = model.vit(dummy_img)
            vis_out_dim = int(vis_feat.shape[1])
        elif hasattr(model, "backbone"):
            # Prefer cached feat_dim if provided by the backbone
            if hasattr(model.backbone, "feat_dim"):
                vis_out_dim = int(model.backbone.feat_dim)
            else:
                vis_feat = model.backbone(dummy_img)
                vis_out_dim = int(vis_feat.shape[1])
        elif hasattr(model, "cnn"):
            vis_feat = model.cnn(dummy_img)
            vis_out_dim = int(vis_feat.shape[1])
        else:
            raise RuntimeError("Visual branch not found (expected .vit, .backbone, or .cnn).")

        # ---- Tabular branch out dim ----
        if hasattr(model, "tab_mlp"):
            tab_mlp = model.tab_mlp
            if isinstance(tab_mlp, nn.Sequential) and len(tab_mlp) == 0:
                tab_out_dim = int(attributes)
            else:
                tab_feat = tab_mlp(dummy_tab)
                tab_out_dim = int(tab_feat.shape[1])
        else:
            # No explicit tab MLP—assume raw tab features are used
            tab_out_dim = int(attributes)

    return vis_out_dim, tab_out_dim

        

def _try_compute_flops_hybrid(model, imgs_shape, attributes, vit_or_cnn_out_dim, tab_out_dim, batch_size: int = 1):
    """
    Compute FLOPs/MACs for a hybrid model by summing three parts:
      - Visual branch with image input (B,C,H,W)
      - Tabular MLP with input (B, attributes)
      - Fusion MLP with input (B, vit_or_cnn_out_dim + tab_out_dim)
    Works for:
      - ViTMLP:    .vit + .tabular_mlp + .fusion_mlp
      - CNNMLP:    .backbone + .tab_mlp + .fusion
    Returns a dict with totals and a per-part breakdown, or None if unavailable.
    """
    if not _HAVE_CALFLOPS:
        return None

    # Resolve image shape (C,H,W)
    if isinstance(imgs_shape, (list, tuple)) and len(imgs_shape) >= 3:
        C, H, W = imgs_shape[-3], imgs_shape[-2], imgs_shape[-1]
    else:
        C, H, W = 3, 224, 224

    # --- Visual module (ViT or CNN backbone)
    vis_mod = None
    if hasattr(model, "vit"):
        vis_mod = model.vit
    elif hasattr(model, "backbone"):
        vis_mod = model.backbone
    elif hasattr(model, "cnn"):
        vis_mod = model.cnn  # if you ever expose a raw cnn module

    vis_info = _try_flops_single_input(vis_mod, (batch_size, int(C), int(H), int(W))) if vis_mod is not None else None

    # --- Tabular module
    tab_mod = None
    if hasattr(model, "tabular_mlp"):
        tab_mod = model.tabular_mlp
    elif hasattr(model, "tab_mlp"):
        tab_mod = model.tab_mlp

    tab_info = _try_flops_single_input(tab_mod, (batch_size, int(attributes))) if tab_mod is not None else None

    # --- Fusion module
    fusion_mod = None
    if hasattr(model, "fusion_mlp"):
        fusion_mod = model.fusion_mlp
    elif hasattr(model, "fusion"):
        fusion_mod = model.fusion

    fused_in = int(vit_or_cnn_out_dim) + int(tab_out_dim)
    fusion_info = _try_flops_single_input(fusion_mod, (batch_size, fused_in)) if fusion_mod is not None else None

    parts = {"vision": vis_info, "tabular": tab_info, "fusion": fusion_info}
    if not any(p is not None for p in parts.values()):
        return None

    total_flops = sum(p["flops"] for p in parts.values() if p is not None)
    total_macs  = sum(p["macs"]  for p in parts.values() if p is not None)

    return {
        "flops": float(total_flops),
        "macs": float(total_macs),
        "flops_str": _humanize(total_flops),
        "macs_str": _humanize(total_macs),
        "parts": parts,
        "tool": "calflops(hybrid-sum)",
        "inputs": {
            "image_input_shape": (batch_size, int(C), int(H), int(W)),
            "tabular_input_shape": (batch_size, int(attributes)),
            "fusion_input_shape":  (batch_size, fused_in),
        }
    }


# -----------------------------------------------------------------------------
# Build + train + return metrics (with param counts + FLOPs) for a trial
# -----------------------------------------------------------------------------
def evaluate_best_model(
    best_trial,
    train_loader, val_loader, test_loader,
    dataset_name, image_name, task_type,
    save_dir, imgs_shape, attributes, trial_name,
    class_weight=None, num_classes=None, epochs=10,
    path_vision=None, path_mlp=None,
):
    """
    Builds model from the trial, counts params, trains, saves, and returns metrics.
    - For ViT_hybrid: loads frozen best vision+MLP branches from path_vision/path_mlp; ONLY fusion comes from best_trial.
    - Else (CNN+MLP hybrid): loads frozen best CNN from path_vision and MLP from path_mlp; ONLY fusion comes from best_trial.
    """
    task = task_type.lower()
    best_params = best_trial.params

    print(f"\nBest Trial: {best_trial.number}")
    print(f"  Best Score: {best_trial.value:.4f}")
    print("  Best Hyperparameters:")
    for k, v in best_params.items():
        print(f"    {k}: {v}")

    # ---------------- Build model params ----------------
    if model_name == "ViT_hybrid":
        # fusion hyperparams that come from the trial
        architecture_params = {
            "activation": best_params["activation"],
            "fusion_hidden_dims": best_params["fusion_hidden_dims"],
            "fusion_dropout": best_params["fusion_dropout"],
        }

        if (path_vision is None) or (path_mlp is None):
            raise RuntimeError("ViT_hybrid requires path_vision and path_mlp.")

        # 1) Load frozen best ViT branch
        with open(os.path.join(path_vision, "best_params.json"), "r") as f:
            params_best_v = json.load(f)
        params_best_v = {
            k: v for k, v in params_best_v.items()
            if k in ["patch_size", "dim", "depth", "heads", "mlp_dim", "dropout", "emb_dropout"]
        }
        params_best_v["vit_dropout"]     = params_best_v.pop("dropout")
        params_best_v["vit_emb_dropout"] = params_best_v.pop("emb_dropout")

        # 2) Load frozen best tabular MLP
        with open(os.path.join(path_mlp, "best_params.json"), "r") as f:
            params_best_t = json.load(f)
        params_best_t = {
            k: v for k, v in params_best_t.items()
            if k in ["mlp_hidden_dims", "dropout", "activation"]
        }
        params_best_t["tab_dropout"] = params_best_t.pop("dropout")
        if isinstance(params_best_t.get("mlp_hidden_dims"), str):
            params_best_t["mlp_hidden_dims"] = json.loads(params_best_t["mlp_hidden_dims"])

        # merge
        architecture_params = {**architecture_params, **params_best_v, **params_best_t}

        # ensure lists are lists
        architecture_params["fusion_hidden_dims"] = json.loads(architecture_params["fusion_hidden_dims"])

        patch = architecture_params["patch_size"]
        model = ViTMLP(imgs_shape[1], attributes, architecture_params, task, num_classes)

        # Fit params (tuned for fusion in hybrid HPO)
        fit_params = {
            "max_lr": best_params["max_lr"],
            "div_factor": best_params["div_factor"],
            "final_div_factor": best_params["final_div_factor"],
            "weight_decay": best_params["weight_decay"],
            "pct_start": best_params["pct_start"],
        }

        # FLOPs dims: probe actual branch outputs
        vit_or_cnn_out_dim, tab_out_dim = _probe_branch_dims(model, imgs_shape, attributes)

        # ---------------- Count & save param stats ----------------
        total_params = _count_params(model, trainable_only=False)
        trainable_params = _count_params(model, trainable_only=True)
        print(f"  Params: total={total_params:,}  trainable={trainable_params:,}")

        base_dir = _ensure_dir(os.path.join(save_dir, f"{model_name}/{image_name}/best_model/{trial_name}"))

        # ---------------- NEW: FLOPs/MACs (hybrid breakdown, saved; no prints) ----------------
        flops_info = _try_compute_flops_hybrid(
            model, imgs_shape=imgs_shape, attributes=attributes,
            vit_or_cnn_out_dim=vit_or_cnn_out_dim, tab_out_dim=tab_out_dim, batch_size=1
        )

        # Save compact stats + compute summary
        model_stats = {
            "total_params": int(total_params),
            "trainable_params": int(trainable_params),
            "architecture_params": architecture_params,
        }
        if flops_info is not None:
            model_stats.update({
                "flops": flops_info["flops"],
                "macs": flops_info["macs"],
                "flops_str": flops_info["flops_str"],
                "macs_str": flops_info["macs_str"],
            })
        with open(os.path.join(base_dir, "model_stats.json"), "w") as f:
            json.dump(model_stats, f, indent=4)

        # Save detailed FLOPs breakdown
        if flops_info is not None:
            with open(os.path.join(base_dir, "flops_details.json"), "w") as f:
                json.dump(flops_info, f, indent=4)

    else:
        # CNN+MLP hybrid (vision = CNN; tabular = MLP). Fusion hyperparams from trial.
        architecture_params = {
            "activation": best_params["activation"],
            "fusion_hidden_dims": best_params["fusion_hidden_dims"],
            "fusion_dropout": best_params["fusion_dropout"],
        }
        
        if (path_vision is None) or (path_mlp is None):
            raise RuntimeError("CNN hybrid requires path_vision and path_mlp.")

        # 1) Frozen best CNN
        with open(os.path.join(path_vision, "best_params.json"), "r") as f:
            params_best_cnn = json.load(f)
        params_best_cnn = {
            k: v for k, v in params_best_cnn.items()
            if k in ["in_channels", "activation", "stem_type", "use_maxpool", "stem_width",
                     "n_stages", "blocks_per_stage", "base_width", "width_mul"]
        }

        # 2) Frozen best MLP (tabular)
        with open(os.path.join(path_mlp, "best_params.json"), "r") as f:
            params_best_t = json.load(f)
        params_best_t = {
            k: v for k, v in params_best_t.items()
            if k in ["mlp_hidden_dims", "dropout", "activation"]
        }
        params_best_t["tab_dropout"] = params_best_t.pop("dropout")
        if isinstance(params_best_t.get("mlp_hidden_dims"), str):
            params_best_t["mlp_hidden_dims"] = json.loads(params_best_t["mlp_hidden_dims"])

        # merge
        architecture_params = {**architecture_params, **params_best_cnn, **params_best_t}

        # ensure lists are lists
        architecture_params["fusion_hidden_dims"] = json.loads(architecture_params["fusion_hidden_dims"])
        architecture_params["blocks_per_stage"] = json.loads(architecture_params["blocks_per_stage"])

        patch = ""
        model = CNNMLP(imgs_shape, attributes, architecture_params, task, num_classes)

        fit_params = {
            "max_lr": best_params["max_lr"],
            "div_factor": best_params["div_factor"],
            "final_div_factor": best_params["final_div_factor"],
            "weight_decay": best_params["weight_decay"],
            "pct_start": best_params["pct_start"],
        }

        # FLOPs dims: probe actual branch outputs
        vit_or_cnn_out_dim, tab_out_dim = _probe_branch_dims(model, imgs_shape, attributes)

        # ---------------- Count & save param stats ----------------
        total_params = _count_params(model, trainable_only=False)
        trainable_params = _count_params(model, trainable_only=True)
        print(f"  Params: total={total_params:,}  trainable={trainable_params:,}")

        base_dir = _ensure_dir(os.path.join(save_dir, f"{model_name}/{image_name}/best_model/{trial_name}"))

        # ---------------- NEW: FLOPs/MACs (hybrid breakdown, saved; no prints) ----------------
        flops_info = _try_compute_flops_hybrid(
            model, imgs_shape=imgs_shape, attributes=attributes,
            vit_or_cnn_out_dim=vit_or_cnn_out_dim, tab_out_dim=tab_out_dim, batch_size=1
        )

        # Save compact stats + compute summary
        model_stats = {
            "total_params": int(total_params),
            "trainable_params": int(trainable_params),
            "architecture_params": architecture_params,
        }
        if flops_info is not None:
            model_stats.update({
                "flops": flops_info["flops"],
                "macs": flops_info["macs"],
                "flops_str": flops_info["flops_str"],
                "macs_str": flops_info["macs_str"],
            })
        with open(os.path.join(base_dir, "model_stats.json"), "w") as f:
            json.dump(model_stats, f, indent=4)

        # Save detailed FLOPs breakdown
        if flops_info is not None:
            with open(os.path.join(base_dir, "flops_details.json"), "w") as f:
                json.dump(flops_info, f, indent=4)

    # ---------------- Train & evaluate ----------------
    metrics = compile_and_fit(
        model,
        train_loader, val_loader, test_loader,
        dataset_name=dataset_name,
        image_name=image_name,
        model_name=model_name,
        trial_name=trial_name,
        task=task,
        max_lr=fit_params["max_lr"],
        div_factor=fit_params["div_factor"],
        final_div_factor=fit_params["final_div_factor"],
        weight_decay=fit_params["weight_decay"],
        pct_start=fit_params["pct_start"],
        epochs=epochs,
        save_model=True,
        class_weights=class_weight,
        save_dir=save_dir,
        patch=(architecture_params["patch_size"] if model_name == "ViT_hybrid" else "")
    )

    # Augment metrics with parameter counts + (if available) FLOPs
    metrics["total_params"] = int(total_params)
    metrics["trainable_params"] = int(trainable_params)
    if flops_info is not None:
        metrics["flops"] = flops_info["flops"]
        metrics["macs"] = flops_info["macs"]
        metrics["flops_str"] = flops_info["flops_str"]
        metrics["macs_str"] = flops_info["macs_str"]
    return metrics

# -----------------------------------------------------------------------------
# Top-K → single-pass → winner → multi-seed driver
# -----------------------------------------------------------------------------
def run_topk_and_multiseed(
    study, model_name, dataset_name, name, task_type, save_dir,
    imgs_shape, attributes, num_classes, class_weight,
    train_loader, val_loader, test_loader,
    path_vision=None, path_mlp=None,
):
    # Load patch_size from the frozen best ViT (used only for printing headers in hybrid)
    if model_name == "ViT_hybrid":
        with open(os.path.join(path_vision, "best_params.json"), "r") as f:
            params_best_vit = json.load(f)
        patch_size = params_best_vit["patch_size"]
    else:
        patch_size = None

    minimize = _is_minimize_study(study)
    completed = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    if not completed:
        raise RuntimeError("No completed trials in the study.")

    top_trials = _sort_trials(completed, minimize)[:TOP_K]
    primary_key, primary_minimize = primary_val_key_for_task(task_type)
    maximize_primary = not primary_minimize

    print(f"\nEvaluating top-{len(top_trials)} trials once at {FULL_EPOCHS} epochs (seed={SINGLE_PASS_SEED})...\n")

    # Single-pass over top-K
    single_pass_results = []  # list of (trial, trial_name, metrics_dict)
    for trial in top_trials:
        if model_name == "ViT_hybrid":
            trial_name = f"trial_{trial.number}_patch{patch_size}"
            header = f"(Trial {trial.number}, ValObjective: {trial.value:.4f}, patch_size={patch_size})"
        else:
            trial_name = f"trial_{trial.number}"
            header = f"(Trial {trial.number}, ValObjective: {trial.value:.4f})"

        print(f"→ Single-pass full run {header}")
        set_model_seed(SINGLE_PASS_SEED)

        metrics = evaluate_best_model(
            best_trial=trial,
            train_loader=train_loader, val_loader=val_loader, test_loader=test_loader,
            dataset_name=dataset_name,
            image_name=name,
            task_type=task_type,
            save_dir=save_dir,
            imgs_shape=imgs_shape,
            attributes=attributes,
            class_weight=class_weight,
            num_classes=num_classes,
            epochs=FULL_EPOCHS,
            trial_name=trial_name,
            path_vision=path_vision, path_mlp=path_mlp
        )
        if not isinstance(metrics, dict):
            raise TypeError(f"evaluate_best_model must return dict, got: {type(metrics)}")

        # brief printout (metric + params)
        if primary_key in metrics:
            print(f"   {primary_key}={float(metrics[primary_key]):.6f}")
        tp = metrics.get("total_params"); trp = metrics.get("trainable_params")
        if tp is not None:
            print(f"   params: total={tp:,}, trainable={trp:,}")
        single_pass_results.append((trial, trial_name, metrics))

    # Winner by primary metric
    if maximize_primary:
        winner_tuple = max(single_pass_results, key=lambda x: _metric_or_default(x[2], primary_key, primary_minimize))
    else:
        winner_tuple = min(single_pass_results, key=lambda x: _metric_or_default(x[2], primary_key, primary_minimize))

    best_trial, best_trial_name, best_single_metrics = winner_tuple
    best_primary_val = _metric_or_default(best_single_metrics, primary_key, primary_minimize)
    print(f"\nWinner after single-pass: Trial {best_trial.number} ({best_trial_name}) "
          f"by {primary_key}={best_primary_val:.6f}")

    # Multi-seed evaluation of winner
    print(f"\nRe-running winner with seeds {FINAL_SEEDS} at {FULL_EPOCHS} epochs...\n")
    winner_save_path = _ensure_dir(os.path.join(save_dir, f"{model_name}/{name}/best_model/{best_trial_name}"))

    per_seed_metrics = []
    numeric_keys = None

    for s in FINAL_SEEDS:
        set_model_seed(s)
        m = evaluate_best_model(
            best_trial=best_trial,
            train_loader=train_loader, val_loader=val_loader, test_loader=test_loader,
            dataset_name=dataset_name,
            image_name=name,
            task_type=task_type,
            save_dir=save_dir,
            imgs_shape=imgs_shape,
            attributes=attributes,
            class_weight=class_weight,
            num_classes=num_classes,
            epochs=FULL_EPOCHS,
            trial_name=f"{best_trial_name}_seed{s}",
            path_vision=path_vision, path_mlp=path_mlp
        )
        if not isinstance(m, dict):
            raise TypeError(f"evaluate_best_model must return dict, got: {type(m)}")

        if numeric_keys is None:
            numeric_keys = [k for k, v in m.items() if isinstance(v, (Number, np.floating, np.integer))]
        per_seed_metrics.append(m)

        # quick line
        pk_val = _metric_or_default(m, primary_key, primary_minimize)
        extras = []
        for k in ["test_loss", "test_accuracy", "test_roc_auc", "test_rmse", "val_loss"]:
            if k in m and isinstance(m[k], (Number, np.floating, np.integer)):
                extras.append(f"{k}={float(m[k]):.6f}")
        print(f"   Seed {s}: {primary_key}={pk_val:.6f}" + (", " + ", ".join(extras) if extras else ""))

    # Aggregate across seeds
    aggregates = {}
    for k in (numeric_keys or []):
        vals = [float(m[k]) for m in per_seed_metrics if k in m]
        if not vals:
            continue
        mean_k = float(np.mean(vals))
        std_k = float(np.std(vals, ddof=1)) if len(vals) > 1 else 0.0
        aggregates[k] = {"mean": mean_k, "std": std_k}

    # Param counts (same across seeds) — take from single-pass winner metrics
    winner_total_params = best_single_metrics.get("total_params")
    winner_train_params = best_single_metrics.get("trainable_params")
    # Optional: FLOPs (if evaluate saved them in metrics)
    winner_flops_str = best_single_metrics.get("flops_str")
    winner_macs_str  = best_single_metrics.get("macs_str")
    winner_flops_num = best_single_metrics.get("flops")
    winner_macs_num  = best_single_metrics.get("macs")

    # Save summary (includes compute if available)
    out_file = os.path.join(winner_save_path, "winner_multi_seed_summary.txt")
    with open(out_file, "w", encoding="utf-8") as f:
        f.write("# Final winner multi-seed evaluation\n")
        if model_name == "ViT_hybrid":
            f.write(f"patch_size: {patch_size}\n")
        f.write(f"trial_number: {best_trial.number}\n")
        f.write(f"primary_metric: {primary_key}\n")
        f.write(f"seeds: {FINAL_SEEDS}\n")
        if (winner_total_params is not None) or (winner_train_params is not None):
            f.write("model_size:\n")
            if winner_total_params is not None:
                f.write(f"  total_params: {winner_total_params}\n")
            if winner_train_params is not None:
                f.write(f"  trainable_params: {winner_train_params}\n")
        if winner_flops_str is not None and winner_macs_str is not None:
            f.write("compute:\n")
            f.write(f"  flops: {winner_flops_str}\n")
            f.write(f"  macs: {winner_macs_str}\n")
            if winner_flops_num is not None and winner_macs_num is not None:
                f.write(f"  flops_num: {winner_flops_num}\n")
                f.write(f"  macs_num: {winner_macs_num}\n")
        f.write("per_seed_metrics:\n")
        for s, m in zip(FINAL_SEEDS, per_seed_metrics):
            f.write(f"  - seed: {s}\n")
            for k in (numeric_keys or []):
                if k in m:
                    f.write(f"      {k}: {float(m[k]):.6f}\n")
        f.write("aggregates:\n")
        for k, mm in aggregates.items():
            f.write(f"  {k}:\n")
            f.write(f"    mean: {mm['mean']:.6f}\n")
            f.write(f"    std: {mm['std']:.6f}\n")

    # Console summary
    if primary_key in aggregates:
        print(f"\nWinner aggregated {primary_key}: {aggregates[primary_key]['mean']:.6f} "
              f"± {aggregates[primary_key]['std']:.6f}")
    elif "val_loss" in aggregates:
        print(f"\nWinner aggregated val_loss: {aggregates['val_loss']['mean']:.6f} "
              f"± {aggregates['val_loss']['std']:.6f}")
    if winner_total_params is not None:
        print(f"Model params: total={winner_total_params:,}, trainable={winner_train_params:,}")
    print(f"Saved multi-seed summary to: {out_file}")

    # Return winner identifiers & aggregates for downstream use
    return {
        "winner_trial_number": best_trial.number,
        "winner_trial_name": best_trial_name,
        "primary_metric": primary_key,
        "aggregates": aggregates,
        "total_params": winner_total_params,
        "trainable_params": winner_train_params,
        "flops": winner_flops_num,
        "macs": winner_macs_num,
        "summary_path": out_file,
    }


In [24]:
from torch.utils.data import DataLoader, Subset
import torch
import numpy as np

def reduce_dataloader(train_loader, fraction=0.25, stratify=True, seed=42):
    """
    Return a new DataLoader that draws from ~fraction of the original train dataset.
    For classification (TensorDataset(..., y)), uses a stratified subsample.
    """
    assert 0 < fraction <= 1.0
    ds = train_loader.dataset
    n = len(ds)
    num_keep = max(1, int(round(n * fraction)))
    idx = np.arange(n)

    # Try stratified pick if labels are available (TensorDataset last tensor is y)
    subset_idx = None
    if stratify and hasattr(ds, "tensors") and len(ds.tensors) >= 2:
        y = ds.tensors[-1].cpu().numpy().ravel()
        try:
            from sklearn.model_selection import StratifiedShuffleSplit
            sss = StratifiedShuffleSplit(n_splits=1, train_size=fraction, random_state=seed)
            chosen, _ = next(sss.split(idx, y))
            subset_idx = idx[chosen]
        except Exception:
            subset_idx = None  # fallback to random below

    # Fallback: random subset with a fixed seed
    if subset_idx is None:
        g = torch.Generator().manual_seed(seed)
        subset_idx = torch.randperm(n, generator=g)[:num_keep].tolist()

    # Build subset dataset and a new DataLoader (reuse original loader settings)
    subset = Subset(ds, subset_idx)  # official Subset utility
    new_loader = DataLoader(
        subset,
        batch_size=train_loader.batch_size,
        shuffle=True,                               # shuffle within the subset
        num_workers=getattr(train_loader, "num_workers", 0),
        pin_memory=getattr(train_loader, "pin_memory", False),
        drop_last=getattr(train_loader, "drop_last", False),
        persistent_workers=getattr(train_loader, "persistent_workers", False),
    )
    return new_loader

In [25]:
if model_name == "ViT_hybrid":
    vision_name = "vit"
else:
    vision_name = "CNN"

### EXPERIMENT: TINTO

In [38]:
#Select the model and the parameters
if task_type.lower() == "regression":
    problem_type = "regression"
else:
    problem_type = "supervised"
name = f"TINTO_blur"

#Define the dataset path and the folder where the images will be saved
images_folder = f"SyntheticImages/{task_type}/{dataset_name}/{name}"

In [39]:
train_loader, val_loader, test_loader, attributes, imgs_shape, label_encoder, class_weight  = load_and_preprocess_data(df, dataset_name, images_folder, problem_type, task_type, seed=SEED, batch_size=batch_size, device=device)

Shapes — Train: (862, 72), Val: (185, 72), Test: (185, 72)
Numerical features: 8 — ['Year', 'RA', 'W', 'OBP', 'SLG', 'BA', 'OOBP', 'OSLG']
Categorical features: 6 — ['Team', 'League', 'Playoffs', 'RankSeason', 'RankPlayoffs', 'G']
Total features: 72
Images shape (C,H,W): (3, 20, 20)
Attributes: 72


In [40]:
# Determine possible patch sizes for the Vision Transformer by finding divisors of the image width
divisors = find_divisors(imgs_shape[1])
divisors

[1, 2, 4, 5, 10, 20]

In [41]:
divisors = [2, 4, 5, 10]

In [42]:
path_vision=f"./logs/{task_type}/{dataset_name}/{vision_name}/{name}/best_model/trial_11"
path_mlp=f"./logs/{task_type}/{dataset_name}/mlp/best_model/trial_38"

In [43]:
import optuna
study = optuna.create_study(direction="minimize" if task_type.lower() == "regression" else "maximize")
study.optimize(lambda trial: objective(
    trial=trial,
    model_name=model_name,
    image_name=name,
    task_type=task_type,
    num_classes=num_classes,
    train_loader=reduce_dataloader(train_loader) if reduce else train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    divisors=divisors,
    attributes=attributes,
    imgs_shape=imgs_shape,
    device=device,
    save_dir=save_dir,
    class_weight=None,
    epochs=epochs,
    path_vision=path_vision,
    path_mlp=path_mlp
), n_trials=n_trials)

[I 2025-12-16 01:39:38,819] A new study created in memory with name: no-name-66e44275-5d5b-46c1-b6e2-2f687d8290db
[I 2025-12-16 01:39:57,044] Trial 0 finished with value: 24.34951592498812 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[64,64]', 'fusion_dropout': 0.23590550280075764, 'max_lr': 0.001673273132425617, 'div_factor': 475, 'final_div_factor': 497, 'weight_decay': 0.0022338035414797485, 'pct_start': 0.2204038817190335}. Best is trial 0 with value: 24.34951592498812.
[I 2025-12-16 01:40:13,855] Trial 1 finished with value: 46.55179721792436 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[32,32]', 'fusion_dropout': 0.04348186011664775, 'max_lr': 5.758367890492339e-05, 'div_factor': 871, 'final_div_factor': 744, 'weight_decay': 0.00045543363743479363, 'pct_start': 0.356977655318339}. Best is trial 0 with value: 24.34951592498812.
[I 2025-12-16 01:40:31,605] Trial 2 finished with value: 21.951059872840208 and parameters: {'activation': 'relu', 'fusio

In [44]:
result = run_topk_and_multiseed(
     study=study,
     model_name=model_name,
     dataset_name=dataset_name,
     name=name,
     task_type=task_type,
     save_dir=save_dir,
     imgs_shape=imgs_shape,
     attributes=attributes,
     num_classes=num_classes,
     class_weight=None,
     train_loader=train_loader, val_loader=val_loader, test_loader=test_loader,
     path_vision=path_vision, path_mlp=path_mlp,
 )
print(result)


Evaluating top-5 trials once at 100 epochs (seed=0)...

→ Single-pass full run (Trial 34, ValObjective: 20.1786)

Best Trial: 34
  Best Score: 20.1786
  Best Hyperparameters:
    activation: relu
    fusion_hidden_dims: [256]
    fusion_dropout: 0.03362183376308595
    max_lr: 0.009923163465679907
    div_factor: 449
    final_div_factor: 610
    weight_decay: 0.0003778396434377797
    pct_start: 0.31936554705694625
  Params: total=577,689  trainable=577,689

------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                                 

### EXPERIMENT: IGTD

In [45]:
#Select the model and the parameters
if task_type.lower() == "regression":
    problem_type = "regression"
else:
    problem_type = "supervised"
name = f"IGTD"

#Define the dataset path and the folder where the images will be saved
images_folder = f"SyntheticImages/{task_type}/{dataset_name}/{name}"

In [46]:
train_loader, val_loader, test_loader, attributes, imgs_shape, label_encoder, class_weight  = load_and_preprocess_data(df, dataset_name, images_folder, problem_type, task_type, seed=SEED, batch_size=batch_size, device=device, pad_images=False, target_size=12)

Shapes — Train: (862, 72), Val: (185, 72), Test: (185, 72)
Numerical features: 8 — ['Year', 'RA', 'W', 'OBP', 'SLG', 'BA', 'OOBP', 'OSLG']
Categorical features: 6 — ['Team', 'League', 'Playoffs', 'RankSeason', 'RankPlayoffs', 'G']
Total features: 72
Images shape (C,H,W): (3, 9, 9)
Attributes: 72


In [47]:
# Determine possible patch sizes for the Vision Transformer by finding divisors of the image width
divisors = find_divisors(imgs_shape[1])
divisors

[1, 3, 9]

In [48]:
divisors = [1]

In [49]:
path_vision=f"./logs/{task_type}/{dataset_name}/{vision_name}/{name}/best_model/trial_52"
path_mlp=f"./logs/{task_type}/{dataset_name}/mlp/best_model/trial_38"

In [50]:
import optuna
study = optuna.create_study(direction="minimize" if task_type.lower() == "regression" else "maximize")
study.optimize(lambda trial: objective(
    trial=trial,
    model_name=model_name,
    image_name=name,
    task_type=task_type,
    num_classes=num_classes,
    train_loader=reduce_dataloader(train_loader) if reduce else train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    divisors=divisors,
    attributes=attributes,
    imgs_shape=imgs_shape,
    device=device,
    save_dir=save_dir,
    class_weight=None,
    epochs=epochs,
    path_vision=path_vision,
    path_mlp=path_mlp
), n_trials=n_trials)

[I 2025-12-16 02:01:33,071] A new study created in memory with name: no-name-71b26f6c-429a-40b7-bcdf-97fdfcd39cc6
[I 2025-12-16 02:01:56,236] Trial 0 finished with value: 21.504423883983296 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[]', 'fusion_dropout': 0.03300149089649712, 'max_lr': 0.00043434159417251293, 'div_factor': 793, 'final_div_factor': 260, 'weight_decay': 0.008548004624615758, 'pct_start': 0.21992522403162432}. Best is trial 0 with value: 21.504423883983296.
[I 2025-12-16 02:02:18,804] Trial 1 finished with value: 20.013870007466853 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[128,128,64]', 'fusion_dropout': 0.16044842147991284, 'max_lr': 0.0007379689221641153, 'div_factor': 589, 'final_div_factor': 202, 'weight_decay': 0.001299019882240533, 'pct_start': 0.11920953511236854}. Best is trial 1 with value: 20.013870007466853.
[I 2025-12-16 02:02:40,146] Trial 2 finished with value: 20.777877281526422 and parameters: {'activation': 'relu', 

In [51]:
result = run_topk_and_multiseed(
     study=study,
     model_name=model_name,
     dataset_name=dataset_name,
     name=name,
     task_type=task_type,
     save_dir=save_dir,
     imgs_shape=imgs_shape,
     attributes=attributes,
     num_classes=num_classes,
     class_weight=None,
     train_loader=train_loader, val_loader=val_loader, test_loader=test_loader,
     path_vision=path_vision, path_mlp=path_mlp,
 )
print(result)


Evaluating top-5 trials once at 100 epochs (seed=0)...

→ Single-pass full run (Trial 28, ValObjective: 19.3027)

Best Trial: 28
  Best Score: 19.3027
  Best Hyperparameters:
    activation: relu
    fusion_hidden_dims: [256,128,64]
    fusion_dropout: 0.015863516678051887
    max_lr: 0.00018948370634940513
    div_factor: 11
    final_div_factor: 537
    weight_decay: 2.3609548800261087e-06
    pct_start: 0.3453619262098891
  Params: total=5,501,761  trainable=5,501,761

------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                    

### REFINED

In [52]:
#Select the model and the parameters
if task_type.lower() == "regression":
    problem_type = "regression"
else:
    problem_type = "supervised"
name = f"REFINED"

#Define the dataset path and the folder where the images will be saved
images_folder = f"SyntheticImages/{task_type}/{dataset_name}/{name}"

In [53]:
train_loader, val_loader, test_loader, attributes, imgs_shape, label_encoder, class_weight  = load_and_preprocess_data(df, dataset_name, images_folder, problem_type, task_type, seed=SEED, batch_size=batch_size, device=device, pad_images=False, target_size=12)

Shapes — Train: (862, 72), Val: (185, 72), Test: (185, 72)
Numerical features: 8 — ['Year', 'RA', 'W', 'OBP', 'SLG', 'BA', 'OOBP', 'OSLG']
Categorical features: 6 — ['Team', 'League', 'Playoffs', 'RankSeason', 'RankPlayoffs', 'G']
Total features: 72
Images shape (C,H,W): (3, 9, 9)
Attributes: 72


In [54]:
# Determine possible patch sizes for the Vision Transformer by finding divisors of the image width
divisors = find_divisors(imgs_shape[1])
divisors

[1, 3, 9]

In [55]:
divisors = [1]

In [56]:
path_vision=f"./logs/{task_type}/{dataset_name}/{vision_name}/{name}/best_model/trial_64"
path_mlp=f"./logs/{task_type}/{dataset_name}/mlp/best_model/trial_38"

In [57]:
import optuna
study = optuna.create_study(direction="minimize" if task_type.lower() == "regression" else "maximize")
study.optimize(lambda trial: objective(
    trial=trial,
    model_name=model_name,
    image_name=name,
    task_type=task_type,
    num_classes=num_classes,
    train_loader=reduce_dataloader(train_loader) if reduce else train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    divisors=divisors,
    attributes=attributes,
    imgs_shape=imgs_shape,
    device=device,
    save_dir=save_dir,
    class_weight=None,
    epochs=epochs,
    path_vision=path_vision,
    path_mlp=path_mlp
), n_trials=n_trials)

[I 2025-12-16 02:27:56,210] A new study created in memory with name: no-name-09f399b1-8034-4740-8691-7fc3ee6d508b
[I 2025-12-16 02:28:34,398] Trial 0 finished with value: 33.52452897459933 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[128,128,64]', 'fusion_dropout': 0.26174050067022525, 'max_lr': 4.776127755142375e-05, 'div_factor': 80, 'final_div_factor': 847, 'weight_decay': 0.0007952349191815828, 'pct_start': 0.24799464568528354}. Best is trial 0 with value: 33.52452897459933.
[I 2025-12-16 02:29:09,859] Trial 1 finished with value: 61.4334505595791 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[128]', 'fusion_dropout': 0.12553688051819545, 'max_lr': 3.1452418124908296e-05, 'div_factor': 57, 'final_div_factor': 640, 'weight_decay': 0.0022708618267781406, 'pct_start': 0.22943556239518603}. Best is trial 0 with value: 33.52452897459933.
[I 2025-12-16 02:29:44,645] Trial 2 finished with value: 26.485397058081247 and parameters: {'activation': 'relu', 'f

In [58]:
result = run_topk_and_multiseed(
     study=study,
     model_name=model_name,
     dataset_name=dataset_name,
     name=name,
     task_type=task_type,
     save_dir=save_dir,
     imgs_shape=imgs_shape,
     attributes=attributes,
     num_classes=num_classes,
     class_weight=None,
     train_loader=train_loader, val_loader=val_loader, test_loader=test_loader,
     path_vision=path_vision, path_mlp=path_mlp,
 )
print(result)


Evaluating top-5 trials once at 100 epochs (seed=0)...

→ Single-pass full run (Trial 25, ValObjective: 19.3616)

Best Trial: 25
  Best Score: 19.3616
  Best Hyperparameters:
    activation: relu
    fusion_hidden_dims: [512,256]
    fusion_dropout: 0.08592854270805945
    max_lr: 0.0020836507415057464
    div_factor: 402
    final_div_factor: 620
    weight_decay: 1.2573808170279042e-05
    pct_start: 0.18129438627387234
  Params: total=880,041  trainable=880,041

------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                           

### EXPERIMENT: BarGraph

In [59]:
#Select the model and the parameters
if task_type.lower() == "regression":
    problem_type = "regression"
else:
    problem_type = "supervised"

name = f"BarGraph"

#Define the dataset path and the folder where the images will be saved
images_folder = f"SyntheticImages/{task_type}/{dataset_name}/{name}"

In [60]:
train_loader, val_loader, test_loader, attributes, imgs_shape, label_encoder, class_weight  = load_and_preprocess_data(df, dataset_name, images_folder, problem_type, task_type, seed=SEED, batch_size=batch_size, device=device, pad_images=False, target_size=24)

Shapes — Train: (862, 72), Val: (185, 72), Test: (185, 72)
Numerical features: 8 — ['Year', 'RA', 'W', 'OBP', 'SLG', 'BA', 'OOBP', 'OSLG']
Categorical features: 6 — ['Team', 'League', 'Playoffs', 'RankSeason', 'RankPlayoffs', 'G']
Total features: 72
Images shape (C,H,W): (3, 72, 72)
Attributes: 72


In [61]:
# Determine possible patch sizes for the Vision Transformer by finding divisors of the image width
divisors = find_divisors(imgs_shape[1])
divisors

[1, 2, 3, 4, 6, 8, 9, 12, 18, 24, 36, 72]

In [62]:
divisors = [2, 4]

In [63]:
path_vision=f"./logs/{task_type}/{dataset_name}/{vision_name}/{name}/best_model/trial_56"
path_mlp=f"./logs/{task_type}/{dataset_name}/mlp/best_model/trial_38"

In [64]:
import optuna
study = optuna.create_study(direction="minimize" if task_type.lower() == "regression" else "maximize")
study.optimize(lambda trial: objective(
    trial=trial,
    model_name=model_name,
    image_name=name,
    task_type=task_type,
    num_classes=num_classes,
    train_loader=reduce_dataloader(train_loader) if reduce else train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    divisors=divisors,
    attributes=attributes,
    imgs_shape=imgs_shape,
    device=device,
    save_dir=save_dir,
    class_weight=None,
    epochs=epochs,
    path_vision=path_vision,
    path_mlp=path_mlp
), n_trials=n_trials)

[I 2025-12-16 03:09:47,672] A new study created in memory with name: no-name-fdb2c8d4-7fe1-4b22-90b5-b6e84b944aba
[I 2025-12-16 03:10:36,200] Trial 0 finished with value: 37.33165501046414 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[64,32,16]', 'fusion_dropout': 0.2626508011548726, 'max_lr': 0.0007031910841264855, 'div_factor': 840, 'final_div_factor': 717, 'weight_decay': 0.00028016279905767526, 'pct_start': 0.2160919273967954}. Best is trial 0 with value: 37.33165501046414.
[I 2025-12-16 03:11:20,662] Trial 1 finished with value: 20.565744068755755 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[256,128]', 'fusion_dropout': 0.1378821235215841, 'max_lr': 0.003851951475482363, 'div_factor': 353, 'final_div_factor': 506, 'weight_decay': 0.005518502763427382, 'pct_start': 0.12404499282863476}. Best is trial 1 with value: 20.565744068755755.
[I 2025-12-16 03:12:06,240] Trial 2 finished with value: 69.25297898037492 and parameters: {'activation': 'relu', '

In [65]:
result = run_topk_and_multiseed(
     study=study,
     model_name=model_name,
     dataset_name=dataset_name,
     name=name,
     task_type=task_type,
     save_dir=save_dir,
     imgs_shape=imgs_shape,
     attributes=attributes,
     num_classes=num_classes,
     class_weight=None,
     train_loader=train_loader, val_loader=val_loader, test_loader=test_loader,
     path_vision=path_vision, path_mlp=path_mlp,
 )
print(result)


Evaluating top-5 trials once at 100 epochs (seed=0)...

→ Single-pass full run (Trial 18, ValObjective: 19.7555)

Best Trial: 18
  Best Score: 19.7555
  Best Hyperparameters:
    activation: relu
    fusion_hidden_dims: [256]
    fusion_dropout: 0.04377821931181073
    max_lr: 0.0018178552104879883
    div_factor: 800
    final_div_factor: 788
    weight_decay: 0.009744240826576232
    pct_start: 0.1414015012083698
  Params: total=621,249  trainable=621,249

------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                                  

### EXPERIMENT: DistanceMatrix

In [66]:
#Select the model and the parameters
if task_type.lower() == "regression":
    problem_type = "regression"
else:
    problem_type = "supervised"

name = f"DistanceMatrix"

#Define the dataset path and the folder where the images will be saved
images_folder = f"SyntheticImages/{task_type}/{dataset_name}/{name}"

In [67]:
train_loader, val_loader, test_loader, attributes, imgs_shape, label_encoder, class_weight  = load_and_preprocess_data(df, dataset_name, images_folder, problem_type, task_type, seed=SEED, batch_size=batch_size, device=device, pad_images=False, target_size=24)

Shapes — Train: (862, 72), Val: (185, 72), Test: (185, 72)
Numerical features: 8 — ['Year', 'RA', 'W', 'OBP', 'SLG', 'BA', 'OOBP', 'OSLG']
Categorical features: 6 — ['Team', 'League', 'Playoffs', 'RankSeason', 'RankPlayoffs', 'G']
Total features: 72
Images shape (C,H,W): (3, 72, 72)
Attributes: 72


In [68]:
# Determine possible patch sizes for the Vision Transformer by finding divisors of the image width
divisors = find_divisors(imgs_shape[1])
divisors

[1, 2, 3, 4, 6, 8, 9, 12, 18, 24, 36, 72]

In [69]:
path_vision=f"./logs/{task_type}/{dataset_name}/{vision_name}/{name}/best_model/trial_60"
path_mlp=f"./logs/{task_type}/{dataset_name}/mlp/best_model/trial_38"

In [70]:
divisors = [2, 4]

In [71]:
import optuna
study = optuna.create_study(direction="minimize" if task_type.lower() == "regression" else "maximize")
study.optimize(lambda trial: objective(
    trial=trial,
    model_name=model_name,
    image_name=name,
    task_type=task_type,
    num_classes=num_classes,
    train_loader=reduce_dataloader(train_loader) if reduce else train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    divisors=divisors,
    attributes=attributes,
    imgs_shape=imgs_shape,
    device=device,
    save_dir=save_dir,
    class_weight=None,
    epochs=epochs,
    path_vision=path_vision,
    path_mlp=path_mlp
), n_trials=n_trials)

[I 2025-12-16 04:04:13,948] A new study created in memory with name: no-name-b623e1f5-6d97-48ab-a57a-1892e6c22370
[I 2025-12-16 04:05:05,877] Trial 0 finished with value: 30.654077793200084 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[128,64]', 'fusion_dropout': 0.2191149287526321, 'max_lr': 9.46519678801181e-05, 'div_factor': 975, 'final_div_factor': 460, 'weight_decay': 0.0008418483970265773, 'pct_start': 0.3799860201700934}. Best is trial 0 with value: 30.654077793200084.
[I 2025-12-16 04:05:59,650] Trial 1 finished with value: 24.025840835068212 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[32,32]', 'fusion_dropout': 0.10041174618472498, 'max_lr': 0.0006746871952544025, 'div_factor': 224, 'final_div_factor': 459, 'weight_decay': 1.7062528550646727e-06, 'pct_start': 0.36656110711226475}. Best is trial 1 with value: 24.025840835068212.
[I 2025-12-16 04:06:52,370] Trial 2 finished with value: 21.4450300243236 and parameters: {'activation': 'relu', 'f

In [72]:
result = run_topk_and_multiseed(
     study=study,
     model_name=model_name,
     dataset_name=dataset_name,
     name=name,
     task_type=task_type,
     save_dir=save_dir,
     imgs_shape=imgs_shape,
     attributes=attributes,
     num_classes=num_classes,
     class_weight=None,
     train_loader=train_loader, val_loader=val_loader, test_loader=test_loader,
     path_vision=path_vision, path_mlp=path_mlp,
 )
print(result)


Evaluating top-5 trials once at 100 epochs (seed=0)...

→ Single-pass full run (Trial 36, ValObjective: 18.9038)

Best Trial: 36
  Best Score: 18.9038
  Best Hyperparameters:
    activation: relu
    fusion_hidden_dims: [512,256]
    fusion_dropout: 0.09978174634647179
    max_lr: 0.0036171957405806074
    div_factor: 813
    final_div_factor: 796
    weight_decay: 0.009951159409282585
    pct_start: 0.2411762114666556
  Params: total=814,593  trainable=814,593

------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                              

### EXPERIMENT: Combination

In [73]:
#Select the model and the parameters
if task_type.lower() == "regression":
    problem_type = "regression"
else:
    problem_type = "supervised"

name = f"Combination"

#Define the dataset path and the folder where the images will be saved
images_folder = f"SyntheticImages/{task_type}/{dataset_name}/{name}"

In [74]:
train_loader, val_loader, test_loader, attributes, imgs_shape, label_encoder, class_weight  = load_and_preprocess_data(df, dataset_name, images_folder, problem_type, task_type, seed=SEED, batch_size=batch_size, device=device, pad_images=False, target_size=24)

Shapes — Train: (862, 72), Val: (185, 72), Test: (185, 72)
Numerical features: 8 — ['Year', 'RA', 'W', 'OBP', 'SLG', 'BA', 'OOBP', 'OSLG']
Categorical features: 6 — ['Team', 'League', 'Playoffs', 'RankSeason', 'RankPlayoffs', 'G']
Total features: 72
Images shape (C,H,W): (3, 72, 72)
Attributes: 72


In [75]:
# Determine possible patch sizes for the Vision Transformer by finding divisors of the image width
divisors = find_divisors(imgs_shape[1])
divisors

[1, 2, 3, 4, 6, 8, 9, 12, 18, 24, 36, 72]

In [76]:
divisors = [2, 4]

In [77]:
path_vision=f"./logs/{task_type}/{dataset_name}/{vision_name}/{name}/best_model/trial_91"
path_mlp=f"./logs/{task_type}/{dataset_name}/mlp/best_model/trial_38"

In [78]:
import optuna
study = optuna.create_study(direction="minimize" if task_type.lower() == "regression" else "maximize")
study.optimize(lambda trial: objective(
    trial=trial,
    model_name=model_name,
    image_name=name,
    task_type=task_type,
    num_classes=num_classes,
    train_loader=reduce_dataloader(train_loader) if reduce else train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    divisors=divisors,
    attributes=attributes,
    imgs_shape=imgs_shape,
    device=device,
    save_dir=save_dir,
    class_weight=None,
    epochs=epochs,
    path_vision=path_vision,
    path_mlp=path_mlp
), n_trials=n_trials)

[I 2025-12-16 05:01:58,987] A new study created in memory with name: no-name-a85b3d55-c6a6-419b-94db-b6a2eafa404a
[I 2025-12-16 05:02:59,359] Trial 0 finished with value: 25.13047009794584 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[256,128,64]', 'fusion_dropout': 0.2540054955085003, 'max_lr': 8.822101135082419e-05, 'div_factor': 871, 'final_div_factor': 902, 'weight_decay': 9.399855179367282e-06, 'pct_start': 0.16240713635424858}. Best is trial 0 with value: 25.13047009794584.
[I 2025-12-16 05:04:00,335] Trial 1 finished with value: 526.9720106419315 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[128]', 'fusion_dropout': 0.10882525812723393, 'max_lr': 1.9993133623816988e-05, 'div_factor': 519, 'final_div_factor': 349, 'weight_decay': 3.6964184832200727e-05, 'pct_start': 0.29601790972849185}. Best is trial 0 with value: 25.13047009794584.
[I 2025-12-16 05:05:02,278] Trial 2 finished with value: 21.24306742959593 and parameters: {'activation': 'relu', 

In [79]:
result = run_topk_and_multiseed(
     study=study,
     model_name=model_name,
     dataset_name=dataset_name,
     name=name,
     task_type=task_type,
     save_dir=save_dir,
     imgs_shape=imgs_shape,
     attributes=attributes,
     num_classes=num_classes,
     class_weight=None,
     train_loader=train_loader, val_loader=val_loader, test_loader=test_loader,
     path_vision=path_vision, path_mlp=path_mlp,
 )
print(result)


Evaluating top-5 trials once at 100 epochs (seed=0)...

→ Single-pass full run (Trial 37, ValObjective: 19.5858)

Best Trial: 37
  Best Score: 19.5858
  Best Hyperparameters:
    activation: relu
    fusion_hidden_dims: [512]
    fusion_dropout: 0.12127302372628164
    max_lr: 0.0036376390246940513
    div_factor: 817
    final_div_factor: 595
    weight_decay: 3.245030382463053e-06
    pct_start: 0.35992753648560194
  Params: total=782,945  trainable=782,945

------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                                

### EXPERIMENT: SuperTML

In [32]:
#Select the model and the parameters
if task_type.lower() == "regression":
    problem_type = "regression"
else:
    problem_type = "supervised"

name = f"SuperTML"

#Define the dataset path and the folder where the images will be saved
images_folder = f"SyntheticImages/{task_type}/{dataset_name}/{name}"

In [33]:
train_loader, val_loader, test_loader, attributes, imgs_shape, label_encoder, class_weight  = load_and_preprocess_data(df, dataset_name, images_folder, problem_type, task_type, seed=SEED, batch_size=batch_size, device=device)

Shapes — Train: (862, 72), Val: (185, 72), Test: (185, 72)
Numerical features: 8 — ['Year', 'RA', 'W', 'OBP', 'SLG', 'BA', 'OOBP', 'OSLG']
Categorical features: 6 — ['Team', 'League', 'Playoffs', 'RankSeason', 'RankPlayoffs', 'G']
Total features: 72
Images shape (C,H,W): (3, 224, 224)
Attributes: 72


In [34]:
# Determine possible patch sizes for the Vision Transformer by finding divisors of the image width
divisors = find_divisors(imgs_shape[1])
divisors

[1, 2, 4, 7, 8, 14, 16, 28, 32, 56, 112, 224]

In [35]:
divisors = [16, 28, 32, 56, 112]

In [36]:
path_vision=f"./logs/{task_type}/{dataset_name}/{vision_name}/{name}/best_model/trial_71"
path_mlp=f"./logs/{task_type}/{dataset_name}/mlp/best_model/trial_38"

In [37]:
import optuna
study = optuna.create_study(direction="minimize" if task_type.lower() == "regression" else "maximize")
study.optimize(lambda trial: objective(
    trial=trial,
    model_name=model_name,
    image_name=name,
    task_type=task_type,
    num_classes=num_classes,
    train_loader=reduce_dataloader(train_loader) if reduce else train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    divisors=divisors,
    attributes=attributes,
    imgs_shape=imgs_shape,
    device=device,
    save_dir=save_dir,
    class_weight=None,
    epochs=epochs,
    path_vision=path_vision,
    path_mlp=path_mlp
), n_trials=n_trials)

[I 2025-12-16 16:35:31,635] A new study created in memory with name: no-name-a5187c94-9164-4927-83f3-e1d364eddffe
[I 2025-12-16 16:36:09,314] Trial 0 finished with value: 24.54771790007296 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[64]', 'fusion_dropout': 0.27557766707284714, 'max_lr': 0.0029811140756228166, 'div_factor': 65, 'final_div_factor': 159, 'weight_decay': 4.908968978839107e-06, 'pct_start': 0.308077718620802}. Best is trial 0 with value: 24.54771790007296.
[I 2025-12-16 16:36:46,189] Trial 1 finished with value: 23.90646190674056 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[256,256]', 'fusion_dropout': 0.1981266007290174, 'max_lr': 0.00013264703039176558, 'div_factor': 163, 'final_div_factor': 699, 'weight_decay': 1.569961149841879e-05, 'pct_start': 0.12219358858880182}. Best is trial 1 with value: 23.90646190674056.
[I 2025-12-16 16:37:22,522] Trial 2 finished with value: 25.526604630175797 and parameters: {'activation': 'relu', 'fusion

In [38]:
result = run_topk_and_multiseed(
     study=study,
     model_name=model_name,
     dataset_name=dataset_name,
     name=name,
     task_type=task_type,
     save_dir=save_dir,
     imgs_shape=imgs_shape,
     attributes=attributes,
     num_classes=num_classes,
     class_weight=None,
     train_loader=train_loader, val_loader=val_loader, test_loader=test_loader,
     path_vision=path_vision, path_mlp=path_mlp,
 )
print(result)


Evaluating top-5 trials once at 100 epochs (seed=0)...

→ Single-pass full run (Trial 31, ValObjective: 20.5985)

Best Trial: 31
  Best Score: 20.5985
  Best Hyperparameters:
    activation: relu
    fusion_hidden_dims: [256]
    fusion_dropout: 0.10542356406752211
    max_lr: 0.0038593839426685596
    div_factor: 165
    final_div_factor: 509
    weight_decay: 0.00012504161601556435
    pct_start: 0.26771610126421264
  Params: total=12,489,777  trainable=12,489,777

------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                         

### EXPERIMENT: FeatureWrap

In [39]:
#Select the model and the parameters
if task_type.lower() == "regression":
    problem_type = "regression"
else:
    problem_type = "supervised"

name = f"FeatureWrap"

#Define the dataset path and the folder where the images will be saved
images_folder = f"SyntheticImages/{task_type}/{dataset_name}/{name}"

In [40]:
train_loader, val_loader, test_loader, attributes, imgs_shape, label_encoder, class_weight  = load_and_preprocess_data(df, dataset_name, images_folder, problem_type, task_type, seed=SEED, batch_size=batch_size, device=device)

Shapes — Train: (862, 72), Val: (185, 72), Test: (185, 72)
Numerical features: 8 — ['Year', 'RA', 'W', 'OBP', 'SLG', 'BA', 'OOBP', 'OSLG']
Categorical features: 6 — ['Team', 'League', 'Playoffs', 'RankSeason', 'RankPlayoffs', 'G']
Total features: 72
Images shape (C,H,W): (3, 10, 10)
Attributes: 72


In [41]:
# Determine possible patch sizes for the Vision Transformer by finding divisors of the image width
divisors = find_divisors(imgs_shape[1])
divisors

[1, 2, 5, 10]

In [42]:
divisors = [1, 2]

In [43]:
path_vision=f"./logs/{task_type}/{dataset_name}/{vision_name}/{name}/best_model/trial_58"
path_mlp=f"./logs/{task_type}/{dataset_name}/mlp/best_model/trial_38"

In [44]:
import optuna
study = optuna.create_study(direction="minimize" if task_type.lower() == "regression" else "maximize")
study.optimize(lambda trial: objective(
    trial=trial,
    model_name=model_name,
    image_name=name,
    task_type=task_type,
    num_classes=num_classes,
    train_loader=reduce_dataloader(train_loader) if reduce else train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    divisors=divisors,
    attributes=attributes,
    imgs_shape=imgs_shape,
    device=device,
    save_dir=save_dir,
    class_weight=None,
    epochs=epochs,
    path_vision=path_vision,
    path_mlp=path_mlp
), n_trials=n_trials)

[I 2025-12-16 17:18:36,074] A new study created in memory with name: no-name-9273b031-6f61-4f7a-9242-49da27ff044c
[I 2025-12-16 17:18:50,016] Trial 0 finished with value: 23.983639544912954 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[256]', 'fusion_dropout': 0.11070268565962542, 'max_lr': 0.00034370255985647605, 'div_factor': 544, 'final_div_factor': 627, 'weight_decay': 2.3765414762184266e-06, 'pct_start': 0.27128468879789597}. Best is trial 0 with value: 23.983639544912954.
[I 2025-12-16 17:19:04,136] Trial 1 finished with value: 675.2891278926382 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[256]', 'fusion_dropout': 0.11987182810733296, 'max_lr': 1.1127345453114277e-05, 'div_factor': 678, 'final_div_factor': 272, 'weight_decay': 0.003212128155429053, 'pct_start': 0.16071755081564296}. Best is trial 0 with value: 23.983639544912954.
[I 2025-12-16 17:19:17,430] Trial 2 finished with value: 24.541278553232384 and parameters: {'activation': 'relu', 'f

In [45]:
result = run_topk_and_multiseed(
     study=study,
     model_name=model_name,
     dataset_name=dataset_name,
     name=name,
     task_type=task_type,
     save_dir=save_dir,
     imgs_shape=imgs_shape,
     attributes=attributes,
     num_classes=num_classes,
     class_weight=None,
     train_loader=train_loader, val_loader=val_loader, test_loader=test_loader,
     path_vision=path_vision, path_mlp=path_mlp,
 )
print(result)


Evaluating top-5 trials once at 100 epochs (seed=0)...

→ Single-pass full run (Trial 11, ValObjective: 20.7489)

Best Trial: 11
  Best Score: 20.7489
  Best Hyperparameters:
    activation: relu
    fusion_hidden_dims: [128,64]
    fusion_dropout: 0.001955269809990917
    max_lr: 0.007447532173072955
    div_factor: 304
    final_div_factor: 585
    weight_decay: 0.000521013207818956
    pct_start: 0.2272947214860106
  Params: total=3,941,145  trainable=3,941,145

------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                           

### EXPERIMENT: BIE

In [46]:
#Select the model and the parameters
if task_type.lower() == "regression":
    problem_type = "regression"
else:
    problem_type = "supervised"

name = f"BIE"

#Define the dataset path and the folder where the images will be saved
images_folder = f"SyntheticImages/{task_type}/{dataset_name}/{name}"

In [47]:
train_loader, val_loader, test_loader, attributes, imgs_shape, label_encoder, class_weight  = load_and_preprocess_data(df, dataset_name, images_folder, problem_type, task_type, seed=SEED, batch_size=batch_size, device=device)

Shapes — Train: (862, 72), Val: (185, 72), Test: (185, 72)
Numerical features: 8 — ['Year', 'RA', 'W', 'OBP', 'SLG', 'BA', 'OOBP', 'OSLG']
Categorical features: 6 — ['Team', 'League', 'Playoffs', 'RankSeason', 'RankPlayoffs', 'G']
Total features: 72
Images shape (C,H,W): (3, 72, 72)
Attributes: 72


In [48]:
# Determine possible patch sizes for the Vision Transformer by finding divisors of the image width
divisors = find_divisors(imgs_shape[1])
divisors

[1, 2, 3, 4, 6, 8, 9, 12, 18, 24, 36, 72]

In [49]:
divisors = [8, 16, 32]

In [50]:
path_vision=f"./logs/{task_type}/{dataset_name}/{vision_name}/{name}/best_model/trial_84"
path_mlp=f"./logs/{task_type}/{dataset_name}/mlp/best_model/trial_38"

In [51]:
import optuna
study = optuna.create_study(direction="minimize" if task_type.lower() == "regression" else "maximize")
study.optimize(lambda trial: objective(
    trial=trial,
    model_name=model_name,
    image_name=name,
    task_type=task_type,
    num_classes=num_classes,
    train_loader=reduce_dataloader(train_loader) if reduce else train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    divisors=divisors,
    attributes=attributes,
    imgs_shape=imgs_shape,
    device=device,
    save_dir=save_dir,
    class_weight=None,
    epochs=epochs,
    path_vision=path_vision,
    path_mlp=path_mlp
), n_trials=n_trials)

[I 2025-12-16 17:34:30,216] A new study created in memory with name: no-name-fad0e737-bf2b-4792-9474-f6c4f20fdbb4
[I 2025-12-16 17:35:00,676] Trial 0 finished with value: 86.1462379701299 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[256]', 'fusion_dropout': 0.1989233051605483, 'max_lr': 2.6254626712152412e-05, 'div_factor': 420, 'final_div_factor': 134, 'weight_decay': 1.4897183163940669e-06, 'pct_start': 0.20982068409536475}. Best is trial 0 with value: 86.1462379701299.
[I 2025-12-16 17:35:30,758] Trial 1 finished with value: 40.737690757358685 and parameters: {'activation': 'relu', 'fusion_hidden_dims': '[512,256]', 'fusion_dropout': 0.2912242218212105, 'max_lr': 1.6632873090856905e-05, 'div_factor': 668, 'final_div_factor': 126, 'weight_decay': 4.12546592252506e-06, 'pct_start': 0.20449003000747082}. Best is trial 1 with value: 40.737690757358685.
[I 2025-12-16 17:36:01,134] Trial 2 finished with value: 22.523572314676308 and parameters: {'activation': 'relu', 'fu

In [52]:
result = run_topk_and_multiseed(
     study=study,
     model_name=model_name,
     dataset_name=dataset_name,
     name=name,
     task_type=task_type,
     save_dir=save_dir,
     imgs_shape=imgs_shape,
     attributes=attributes,
     num_classes=num_classes,
     class_weight=None,
     train_loader=train_loader, val_loader=val_loader, test_loader=test_loader,
     path_vision=path_vision, path_mlp=path_mlp,
 )
print(result)


Evaluating top-5 trials once at 100 epochs (seed=0)...

→ Single-pass full run (Trial 40, ValObjective: 20.0878)

Best Trial: 40
  Best Score: 20.0878
  Best Hyperparameters:
    activation: relu
    fusion_hidden_dims: [256]
    fusion_dropout: 0.1474103954497869
    max_lr: 0.006688288664302725
    div_factor: 590
    final_div_factor: 349
    weight_decay: 7.172937510867053e-06
    pct_start: 0.39377741704711483
  Params: total=3,480,017  trainable=3,480,017

------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                              