In [1]:
# %%
# Environment report and project import setup for the notebook.
import os, sys, importlib, time, json
import numpy as np
import torch

# Print interpreter / framework versions and the visible GPU (or "CPU only").
print("Python:", sys.version)
print("Torch :", torch.__version__)
print("CUDA  :", torch.version.cuda)
print("GPU   :", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU only")

# If the project root is not the current directory, add it to the path as below
# sys.path.append("/path/to/your/project")

# 0) Package setup
from pathlib import Path
import sys, importlib

PROJECT_ROOT = Path("/caefs/user/mmingyeong/2508_slchallence")
SRC_DIR = PROJECT_ROOT / "src"
# Put the src directory first on sys.path so its modules import by bare name.
sys.path.insert(0, str(SRC_DIR))

# 2) Correct imports (using the package path)
import utils as utils
import model as model
import data_loader as data_loader

# Reload so edits made to these modules are picked up without a kernel restart.
importlib.reload(utils)
importlib.reload(model)
importlib.reload(data_loader)

# NOTE(review): `src.model` / `src.data_loader` are registered under different
# module names than `model` / `data_loader` above, so the reloads do not refresh
# them. Only SRC_DIR is added to sys.path here, so `src.*` presumably resolves
# through the working directory — confirm.
from src.model import convnextv2_atto, convnextv2_nano, convnextv2_tiny
from src.data_loader import get_dataloaders, LensFITSBinaryDataset

print("✅ imports via 'src.*' ready")



Python: 3.12.0 | packaged by Anaconda, Inc. | (main, Oct  2 2023, 17:29:18) [GCC 11.2.0]
Torch : 2.6.0+cu124
CUDA  : 12.4
GPU   : Quadro RTX 5000


  from .autonotebook import tqdm as notebook_tqdm


✅ imports via 'src.*' ready


In [2]:
from model import convnextv2_atto, convnextv2_nano, convnextv2_tiny

In [3]:
# %%
# Imports for prediction/evaluation plus the dataset directory constants that
# later cells read as notebook-level globals.
import types, sys, os, json
from pathlib import Path
import torch
import pandas as pd
import matplotlib.pyplot as plt

# -----------------------
# Project paths
# -----------------------
PROJECT_ROOT = Path("/caefs/user/mmingyeong/2508_slchallence")
SRC_DIR = PROJECT_ROOT / "src"
# Make the modules under src/ importable by bare name (predict, evaluate, ...).
sys.path.insert(0, str(SRC_DIR))

import predict
import evaluate

# -----------------------
# Data paths
# -----------------------
# One directory per (source, class) pair; consumed by _mk_args_for_train.
SLSIM_LENSES_DIR      = "/caefs/data/IllustrisTNG/slchallenge/slsim_lenses/slsim_lenses"
SLSIM_NONLENSES_DIR   = "/caefs/data/IllustrisTNG/slchallenge/slsim_nonlenses/slsim_nonlenses"
HSC_DEG_LENSES_DIR    = "/caefs/data/IllustrisTNG/slchallenge/hsc_lenses/hsc_lenses"
HSC_DEG_NONLENSES_DIR = "/caefs/data/IllustrisTNG/slchallenge/hsc_nonlenses/hsc_nonlenses"

In [4]:
# %% Optuna wrapper that calls your existing train.py (super simple)
import os, csv, time, shutil, types, optuna, math
import numpy as np
import torch

import train as train_mod   # <-- uses your src/train.py already on sys.path

# === USER-PROVIDED PATHS must exist as variables in your notebook ===
# SLSIM_LENSES_DIR, SLSIM_NONLENSES_DIR, HSC_DEG_LENSES_DIR, HSC_DEG_NONLENSES_DIR

def _mk_args_for_train(trial, save_dir, *, 
                       arch, lr, batch, weight_decay, drop_path,
                       epochs=20, patience=8, seed=42,
                       take_train_frac=0.10, take_val_fraction=0.20,
                       num_workers=8):
    """Build an argparse-like object for train.main()."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    ns = types.SimpleNamespace(
        # data dirs
        slsim_lenses=SLSIM_LENSES_DIR,
        slsim_nonlenses=SLSIM_NONLENSES_DIR,
        hsc_lenses=HSC_DEG_LENSES_DIR,
        hsc_nonlenses=HSC_DEG_NONLENSES_DIR,
        # dataloader
        batch_size=batch,
        num_workers=num_workers,
        no_augment=True,                 # HPO에는 증강 끄고 고정
        take_train_frac=take_train_frac, # 5–10% sweep
        take_val_fraction=take_val_fraction,
        take_test_fraction=None,
        # split
        train_frac=0.70, val_frac=0.15, test_frac=0.15,
        # model/optim
        model_size=arch, drop_path=drop_path,
        lr=lr, weight_decay=weight_decay,
        cosine=False, warmup_epochs=0,
        # train
        epochs=epochs, patience=patience, min_delta=0.0,
        seed=seed, device=device, log_every=200,
        # save
        save_dir=save_dir,
        # preprocessing toggles: your plan = Gaussian + normalization, no padding
        apply_padding=False,
        out_size_when_padded=64,
        apply_normalization=True,
        clip_q=0.997,
        low_clip_q=None,
        use_mad=False,
        # smoothing
        smoothing_mode="gaussian",
        gaussian_sigma=1.0,
        guided_radius=2,
        guided_eps=1e-2,
    )
    return ns

def _read_best_val_auc(csv_path: str) -> float:
    """Parse training_log.csv and return max val_auc (ignoring NaN)."""
    if not os.path.exists(csv_path):
        return float("nan")
    best = float("-inf")
    with open(csv_path, "r") as f:
        reader = csv.DictReader(f)
        for row in reader:
            try:
                v = float(row["val_auc"])
                if math.isfinite(v) and v > best:
                    best = v
            except Exception:
                continue
    return best if best != float("-inf") else float("nan")

def run_hpo_with_train(
    n_trials=8,
    epochs=20,
    patience=8,
    take_train_frac=0.10,
    take_val_fraction=0.20,   # also use only 20% of validation for speed (or None if desired)
    seed=42,
    num_workers=8,
    out_root="./optuna_runs",
):
    """Run an in-memory Optuna study whose objective calls ``train_mod.main``.

    Each trial samples ``arch``, ``lr``, ``batch_size``, ``weight_decay`` and
    ``drop_path``, trains into its own directory under ``out_root`` and is
    scored by the best ``val_auc`` found in that directory's
    ``training_log.csv``.

    Args:
        n_trials: Number of Optuna trials to run.
        epochs, patience, seed, num_workers: Forwarded to the training args.
        take_train_frac, take_val_fraction: Subsampling fractions forwarded to
            the training args.
        out_root: Parent directory for the per-trial output directories
            (created if missing).

    Returns:
        optuna.Study: The finished study. Completed trials carry the user
        attributes ``save_dir`` and ``duration_sec``.
    """
    os.makedirs(out_root, exist_ok=True)

    def objective(trial: optuna.trial.Trial) -> float:
        # --- minimal search space (matches your CLI) ---
        arch = trial.suggest_categorical("arch", ["atto", "nano", "tiny"])
        lr = trial.suggest_float("lr", 3e-4, 1e-3, log=True)
        batch = trial.suggest_categorical("batch_size", [128, 256])
        wd = trial.suggest_float("weight_decay", 3e-5, 3e-4, log=True)
        dp = trial.suggest_float("drop_path", 0.0, 0.10)

        save_dir = os.path.join(out_root, f"trial_{trial.number:03d}_{arch}_bs{batch}")
        os.makedirs(save_dir, exist_ok=True)
        # Record the directory up front so it is available even if training fails.
        trial.set_user_attr("save_dir", save_dir)

        args = _mk_args_for_train(
            trial, save_dir,
            arch=arch, lr=lr, batch=batch,
            weight_decay=wd, drop_path=dp,
            epochs=epochs, patience=patience, seed=seed,
            take_train_frac=take_train_frac,
            take_val_fraction=take_val_fraction,
            num_workers=num_workers,
        )

        # --- train using your train.main() ---
        t0 = time.time()
        train_mod.main(args)
        dur = time.time() - t0

        # --- read best val AUC from training_log.csv ---
        csv_path = os.path.join(save_dir, "training_log.csv")
        best_val_auc = _read_best_val_auc(csv_path)
        if math.isnan(best_val_auc):
            # No usable val_auc was logged: score the trial as chance level,
            # but say so instead of substituting the value silently.
            print(f"[Optuna] trial {trial.number}: no finite val_auc in {csv_path}; using 0.5")
            best_val_auc = 0.5

        # attach extras for later inspection
        trial.set_user_attr("duration_sec", int(dur))
        return float(best_val_auc)

    study = optuna.create_study(direction="maximize")  # in-memory
    print("[Optuna] starting fresh in-memory study")
    study.optimize(objective, n_trials=n_trials)

    print("\n=== HPO finished ===")
    # best_value / best_trial raise when nothing completed (e.g. n_trials=0),
    # so only print the summary when at least one trial finished.
    has_completed = any(
        t.state == optuna.trial.TrialState.COMPLETE for t in study.trials
    )
    if not has_completed:
        print("[Optuna] no completed trials; nothing to summarize")
        return study

    best_trial = study.best_trial
    print(f"Best AUC: {best_trial.value:.6f}")
    print("Best params:", best_trial.params)
    print("Artifacts in:", best_trial.user_attrs["save_dir"])

    return study

# ---- EXAMPLE RUN ----
# study = run_hpo_with_train(
#     n_trials=4,
#     epochs=20, patience=8,
#     take_train_frac=0.10,   # 10% of train only
#     take_val_fraction=0.20, # 20% of val only (speed)
#     num_workers=8,
# )


In [5]:
# Small search: 4 trials, each limited to 20 epochs with early-stopping patience 8.
study = run_hpo_with_train(
    n_trials=4,
    epochs=20,
    patience=8,
    take_train_frac=0.10,    # 10% of train only
    take_val_fraction=0.20,  # 20% of val only (speed)
    num_workers=8,
)


[I 2025-09-05 21:25:15,492] A new study created in memory with name: no-name-dad64b6b-ec31-4bd2-a64d-cc202f9b9721


[Optuna] starting fresh in-memory study
2025-09-05 21:25:15 [INFO] [train] Logger initialized -> ./optuna_runs/trial_000_nano_bs256/train.log
2025-09-05 21:25:15 [INFO] [train] 🚀 Configuration
2025-09-05 21:25:15 [INFO] [train]   slsim_lenses: /caefs/data/IllustrisTNG/slchallenge/slsim_lenses/slsim_lenses
2025-09-05 21:25:15 [INFO] [train]   slsim_nonlenses: /caefs/data/IllustrisTNG/slchallenge/slsim_nonlenses/slsim_nonlenses
2025-09-05 21:25:15 [INFO] [train]   hsc_lenses: /caefs/data/IllustrisTNG/slchallenge/hsc_lenses/hsc_lenses
2025-09-05 21:25:15 [INFO] [train]   hsc_nonlenses: /caefs/data/IllustrisTNG/slchallenge/hsc_nonlenses/hsc_nonlenses
2025-09-05 21:25:15 [INFO] [train]   batch_size: 256
2025-09-05 21:25:15 [INFO] [train]   num_workers: 8
2025-09-05 21:25:15 [INFO] [train]   no_augment: True
2025-09-05 21:25:15 [INFO] [train]   take_train_frac: 0.1
2025-09-05 21:25:15 [INFO] [train]   take_val_fraction: 0.2
2025-09-05 21:25:15 [INFO] [train]   take_test_fraction: None
2025-0

                                                                     

2025-09-05 21:31:01 [INFO] [train] ✅ Epoch 1: best model updated (val_loss=0.503336)
2025-09-05 21:31:01 [INFO] [train] 📉 Epoch 001/20 | Train Loss 0.6140 Acc 68.06% | Val Loss 0.5033 Acc 76.28% AUC 0.8453 | LR 5.36e-04 | 342.0s


[W 2025-09-05 21:33:22,432] Trial 0 failed with parameters: {'arch': 'nano', 'lr': 0.0005358804106038669, 'batch_size': 256, 'weight_decay': 5.6345802047845344e-05, 'drop_path': 0.0022994550074793008} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/users/mmingyeong/.local/lib/python3.12/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_116667/712984078.py", line 108, in objective
    train_mod.main(args)
  File "/caefs/user/mmingyeong/2508_slchallence/src/train.py", line 308, in main
    train_loss, train_acc = train_one_epoch(
                            ^^^^^^^^^^^^^^^^
  File "/caefs/user/mmingyeong/2508_slchallence/src/train.py", line 141, in train_one_epoch
    for i, batch in enumerate(pbar, 1):
  File "/home/users/mmingyeong/.local/lib/python3.12/site-packages/tqdm/std.py", line 1181, in __iter__
    for obj in iterable:

Unexpected exception formatting exception. Falling back to standard exception


In [None]:
# %% HPO visualization utilities (save figures/HTML and a compact JSON report)
import os, csv, json, math
from typing import List, Tuple
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna
from optuna.trial import TrialState

def trials_df(study: optuna.Study) -> pd.DataFrame:
    """Tabulate the completed trials of ``study``, one row per trial.

    Each row holds the trial's sampled parameters plus ``trial`` (its number),
    ``val_auc`` (its objective value) and the ``duration_sec`` / ``save_dir``
    user attributes (``None`` when absent).

    Returns:
        A DataFrame of completed trials; when there are none, an empty frame
        with the expected column names.
    """
    completed = (t for t in study.trials if t.state == TrialState.COMPLETE)
    records = [
        {
            **t.params,
            "trial": t.number,
            "val_auc": t.value,
            "duration_sec": t.user_attrs.get("duration_sec", None),
            "save_dir": t.user_attrs.get("save_dir", None),
        }
        for t in completed
    ]
    if records:
        return pd.DataFrame(records)

    empty_columns = [
        "trial", "val_auc", "duration_sec", "save_dir",
        "arch", "lr", "batch_size", "weight_decay", "drop_path",
    ]
    return pd.DataFrame(columns=empty_columns)

def _safe_title(s: str) -> str:
    return "".join(c if c.isalnum() or c in "._- " else "_" for c in s)

def plot_opt_history_matplotlib(df: pd.DataFrame, out_path: str):
    """Save a line plot of validation AUC against trial number.

    Args:
        df: Table with ``trial`` and ``val_auc`` columns.
        out_path: Image file to write (saved at 180 dpi).
    """
    trial_ids = df["trial"].values
    aucs = df["val_auc"].values
    by_trial = np.argsort(trial_ids)  # draw points in increasing trial order

    fig, ax = plt.subplots()
    ax.plot(trial_ids[by_trial], aucs[by_trial], marker="o")
    ax.set_xlabel("Trial")
    ax.set_ylabel("Validation AUC")
    ax.set_title("Optimization History (matplotlib)")
    fig.tight_layout()
    fig.savefig(out_path, dpi=180)
    plt.close(fig)

def plot_box_auc_by_arch(df: pd.DataFrame, out_path: str):
    """Save a box plot of validation AUC grouped by architecture.

    Does nothing when ``df`` has no ``arch`` column or no rows to group.

    Args:
        df: Table with ``arch`` and ``val_auc`` columns.
        out_path: Image file to write (saved at 180 dpi).
    """
    if "arch" not in df.columns:
        return

    # Single pass over the groups keeps labels and values aligned.
    labels, groups = [], []
    for arch, sub in df.groupby("arch"):
        labels.append(str(arch))
        groups.append(sub["val_auc"].values)
    if not groups:
        return

    fig, ax = plt.subplots()
    ax.boxplot(groups, showmeans=True)
    # Matplotlib 3.9 deprecates boxplot(labels=...) in favour of tick_labels,
    # which older releases do not accept. Labelling the default box positions
    # (1..N) directly works on both.
    ax.set_xticks(list(range(1, len(labels) + 1)))
    ax.set_xticklabels(labels)
    ax.set_ylabel("Validation AUC")
    ax.set_title("AUC distribution by architecture")
    fig.tight_layout()
    fig.savefig(out_path, dpi=180)
    plt.close(fig)

def plot_scatter_lr_auc_per_arch(df: pd.DataFrame, out_dir: str):
    """Save one scatter plot of log10(lr) vs validation AUC per architecture.

    Files are written to ``out_dir`` as ``scatter_lr_auc_<arch>.png``. Points
    are split into one series per ``batch_size`` when that column exists.
    Does nothing when ``df`` lacks the ``arch`` or ``lr`` column.

    Args:
        df: Table with ``arch``, ``lr`` and ``val_auc`` columns.
        out_dir: Directory for the images (created if missing).
    """
    if "arch" not in df.columns or "lr" not in df.columns:
        return

    # Create the target directory here, as export_report does, so the helper
    # also works when called on its own.
    os.makedirs(out_dir, exist_ok=True)
    split_by_batch = "batch_size" in df.columns  # same for every group

    for arch, g in df.groupby("arch"):
        fig, ax = plt.subplots()
        if split_by_batch:
            # One series per batch size to show cluster separation;
            # the default color cycle is fine.
            for b, gb in g.groupby("batch_size"):
                ax.scatter(np.log10(gb["lr"].values), gb["val_auc"].values,
                           label=f"batch={b}", s=30)
            ax.legend()
        else:
            ax.scatter(np.log10(g["lr"].values), g["val_auc"].values, s=30)
        ax.set_xlabel("log10(lr)")
        ax.set_ylabel("Validation AUC")
        ax.set_title(f"LR vs AUC ({arch})")
        fig.tight_layout()
        fig.savefig(
            os.path.join(out_dir, f"scatter_lr_auc_{_safe_title(str(arch))}.png"),
            dpi=180,
        )
        plt.close(fig)

def plot_heatmap_arch_batch_best(df: pd.DataFrame, out_path: str):
    """Save a heatmap of the maximum validation AUC per (arch, batch_size) cell.

    Does nothing when ``df`` lacks the ``arch`` or ``batch_size`` column.

    Args:
        df: Table with ``arch``, ``batch_size`` and ``val_auc`` columns.
        out_path: Image file to write (saved at 180 dpi).
    """
    required = ("arch", "batch_size")
    if any(col not in df.columns for col in required):
        return

    # Rows are architectures, columns are batch sizes, cells are the best AUC.
    best_grid = df.pivot_table(
        index="arch", columns="batch_size", values="val_auc", aggfunc="max"
    )
    n_rows, n_cols = best_grid.shape

    fig, ax = plt.subplots()
    image = ax.imshow(best_grid.values, aspect="auto")
    ax.set_xticks(np.arange(n_cols))
    ax.set_xticklabels(list(best_grid.columns))
    ax.set_yticks(np.arange(n_rows))
    ax.set_yticklabels(list(best_grid.index))
    fig.colorbar(image, ax=ax, label="Best val AUC")
    ax.set_title("Best AUC by (arch × batch)")
    fig.tight_layout()
    fig.savefig(out_path, dpi=180)
    plt.close(fig)

def plot_topk_learning_curves(study: optuna.Study, out_dir: str, top_k: int = 5):
    """Save the validation-AUC-per-epoch curve of the ``top_k`` best trials.

    Only completed trials that have a ``save_dir`` user attribute and an
    existing ``training_log.csv`` in that directory are considered. Trials are
    ranked by objective value (highest first) and each curve is written to
    ``out_dir`` as ``val_auc_curve_rank<rank>_<trial dir name>.png``.

    Args:
        study: Study whose trials are inspected.
        out_dir: Directory for the images (created if missing).
        top_k: Number of best trials to plot.
    """
    # Create the target directory here, as export_report does, so the helper
    # also works when called on its own.
    os.makedirs(out_dir, exist_ok=True)

    # Collect (val_auc, csv_path) for every trial that has a readable log.
    pool: List[Tuple[float, str]] = []
    for t in study.trials:
        if t.state != TrialState.COMPLETE:
            continue
        sd = t.user_attrs.get("save_dir")
        if not sd:
            continue
        csv_path = os.path.join(sd, "training_log.csv")
        if os.path.exists(csv_path):
            pool.append((float(t.value), csv_path))
    pool.sort(key=lambda item: item[0], reverse=True)

    for rank, (auc, csv_path) in enumerate(pool[:top_k], 1):
        epochs, val_auc = [], []
        # newline="" is what the csv module expects for files it parses.
        with open(csv_path, "r", newline="") as f:
            for row in csv.DictReader(f):
                try:
                    ep = int(row["epoch"])
                    va = float(row["val_auc"])
                except (KeyError, TypeError, ValueError):
                    # Missing column, short row, or unparseable number: skip the row.
                    continue
                if math.isfinite(va):
                    epochs.append(ep)
                    val_auc.append(va)
        if not epochs:
            continue

        fig, ax = plt.subplots()
        ax.plot(epochs, val_auc, marker="")
        ax.set_xlabel("Epoch")
        ax.set_ylabel("Validation AUC")
        ax.set_title(f"Val AUC curve (rank {rank}, best={auc:.4f})")
        fig.tight_layout()
        # Name the file after the trial directory that holds the log.
        base = os.path.basename(os.path.dirname(csv_path))
        fig.savefig(
            os.path.join(out_dir, f"val_auc_curve_rank{rank}_{_safe_title(base)}.png"),
            dpi=180,
        )
        plt.close(fig)

def _json_safe(value):
    """Convert one table cell into a value that strict JSON can represent."""
    if value is pd.NA or value is pd.NaT:
        return None
    if isinstance(value, np.generic):
        value = value.item()  # numpy scalar -> plain Python scalar
    if isinstance(value, float) and not math.isfinite(value):
        return None  # NaN / inf have no JSON spelling; write null instead
    return value


def export_report(df: pd.DataFrame, out_dir: str, top_k: int = 10):
    """Write the trial table and a compact JSON summary into ``out_dir``.

    Produces ``trials.csv`` (all rows, sorted by ``val_auc`` descending) and
    ``report.json`` with two keys: ``best`` (the top row, or ``{}`` for an
    empty table) and ``leaderboard`` (the first ``top_k`` rows).

    Missing or non-finite numeric cells are written as ``null`` so that
    ``report.json`` is valid JSON for any parser.

    Args:
        df: Table with at least a ``val_auc`` column.
        out_dir: Output directory (created if missing).
        top_k: Number of rows kept in the leaderboard.
    """
    os.makedirs(out_dir, exist_ok=True)

    # Full table
    df_sorted = df.sort_values("val_auc", ascending=False)
    df_sorted.to_csv(os.path.join(out_dir, "trials.csv"), index=False)

    # Compact JSON
    best_row = df_sorted.iloc[0].to_dict() if len(df_sorted) else {}
    board_rows = df_sorted.head(top_k).to_dict(orient="records")
    report = {
        "best": {k: _json_safe(v) for k, v in best_row.items()},
        "leaderboard": [
            {k: _json_safe(v) for k, v in row.items()} for row in board_rows
        ],
    }
    with open(os.path.join(out_dir, "report.json"), "w") as f:
        # allow_nan=False guarantees the file never contains a bare NaN token.
        json.dump(report, f, indent=2, allow_nan=False)
def viz_hpo(study: optuna.Study, out_dir: str = "./hpo_figs", top_k_curves: int = 5):
    """Write every HPO figure and report for ``study`` into ``out_dir``.

    Steps: export ``trials.csv`` / ``report.json``, try the Optuna Plotly
    views as HTML, draw the matplotlib views, then plot learning curves for
    the best trials.

    Args:
        study: Study to visualize.
        out_dir: Output directory (created if missing).
        top_k_curves: Number of best trials whose learning curves are drawn.

    Returns:
        The DataFrame of completed trials, or ``None`` when no trial completed.
    """
    os.makedirs(out_dir, exist_ok=True)
    df = trials_df(study)
    if df.empty:
        print("[viz] No completed trials to visualize.")
        return None

    # 1) Save a compact report + CSV
    export_report(df, out_dir, top_k=10)

    # 2) Plotly (if available)
    try:
        from optuna.visualization import (
            plot_optimization_history,
            plot_param_importances,
            plot_parallel_coordinate,
        )
    except Exception as e:
        print(f"[viz] Plotly visuals unavailable or failed: {e}")
    else:
        plotly_views = (
            ("opt_history.html", plot_optimization_history),
            ("param_importance.html", plot_param_importances),
            ("parallel_coord.html", plot_parallel_coordinate),
        )
        for file_name, make_figure in plotly_views:
            # Best-effort and isolated per view: one failing plot must not
            # prevent the others from being written.
            try:
                make_figure(study).write_html(os.path.join(out_dir, file_name))
            except Exception as e:
                print(f"[viz] Plotly visuals unavailable or failed ({file_name}): {e}")

    # 3) Matplotlib fallbacks / complementary views
    plot_opt_history_matplotlib(df, os.path.join(out_dir, "opt_history_matplotlib.png"))
    plot_box_auc_by_arch(df, os.path.join(out_dir, "auc_box_by_arch.png"))
    plot_scatter_lr_auc_per_arch(df, out_dir)
    plot_heatmap_arch_batch_best(df, os.path.join(out_dir, "heatmap_arch_batch_best.png"))

    # 4) Learning curves for top-K trials (reads training_log.csv)
    plot_topk_learning_curves(study, out_dir, top_k=top_k_curves)

    # Return DataFrame for interactive inspection
    return df


In [None]:
df = viz_hpo(study, out_dir="./hpo_figs", top_k_curves=5)
# viz_hpo returns None when no trial completed, so only preview a real table.
df.head() if df is not None else None
