In [13]:
import os

KAGGLE =  os.getenv("KAGGLE_URL_BASE") is not None
COLAB = os.getenv("COLAB_GPU") is not None
TPU = os.getenv("XRT_TPU_CONFIG") is not None
LOCAL = not KAGGLE and not COLAB

if not LOCAL:
    !git clone https://github.com/nclibz/MRKnee/

if COLAB:
    os.chdir('/content/MRKnee/')
    !git checkout v3
    from google.colab import drive
    drive.mount('/content/drive')
    DATADIR = "/content/drive/MyDrive/MRKnee/data"
    if TPU:
        !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

if KAGGLE:
    os.chdir('/kaggle/working/MRKnee/')
    !git checkout v3
    DATADIR = "/kaggle/input/mrknee/MRNet"
    
    if TPU:
        !pip install torchtext==0.9
        !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
        !python pytorch-xla-env-setup.py --version 1.8

if not LOCAL:
    !pip install --quiet "pytorch-lightning>=1.4.9" "torchmetrics>=0.5" "timm" "neptune-client" "optuna" "PyMySql" "torch-tb-profiler"
    !pip install albumentations --upgrade --quiet
    BACKBONE = "tf_efficientnetv2_s_in21k"

if LOCAL:
    DATADIR = "data"
    BACKBONE = "tf_mobilenetv3_small_minimal_100"
    %load_ext autoreload
    %autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
DIAGNOSIS = "acl"
PLANE = "sagittal"

In [15]:
from src.model import MRKnee
from src.data import MRKneeDataModule
from src.augmentations import Augmentations
from src.callbacks import Callbacks
from src.cfg import Cfg
import pytorch_lightning as pl
import optuna

pl.seed_everything(123)

Global seed set to 123


123

In [16]:

def objective(trial, diagnosis=DIAGNOSIS, plane=PLANE, backbone=BACKBONE, datadir=DATADIR, profile = False):

    model = MRKnee(
        backbone=backbone,
        drop_rate=0.0,
        final_drop=0.0,
        learning_rate=0.0001,
        log_auc=True,
        log_ind_loss=False,
        adam_wd=0.01,
        max_epochs=20,
        precision=32,
    )

    augs = Augmentations(
        model,
        shift_limit=0.20,
        scale_limit=0.20,
        rotate_limit=30,
        reverse_p=0.5,
        same_range=True,
        indp_normalz=True,
    )

    dm = MRKneeDataModule(
        datadir=datadir,
        diagnosis=diagnosis,
        plane=plane,
        transforms=augs,
        clean=True,
        num_workers=2,
        pin_memory=True,
        trim_train=True,
    )

    configs = Cfg(model = model, dm = dm, augs = augs)
    cfg = configs.get_cfg()
    
    if trial is not None:
        callbacks = Callbacks(cfg, trial, neptune_name="tester")
        neptune_logger = callbacks.get_neptune_logger()
        list_of_cbs = callbacks.get_callbacks()
        fast_dev_run = False
    else:
        neptune_logger = False
        list_of_cbs = None
        fast_dev_run = 50

    profiler = pl.profiler.PyTorchProfiler(dirpath = "src/logs", filename = "profiler") if profile else False
        


    trainer = pl.Trainer(
        gpus=1,
        precision=cfg["precision"],
        max_epochs=cfg["max_epochs"],
        logger=neptune_logger,
        log_every_n_steps=100,
        num_sanity_val_steps=0,
        callbacks=list_of_cbs,
        progress_bar_refresh_rate=20,
        deterministic=True,
        profiler = profiler, 
        fast_dev_run = fast_dev_run,
    )

    trainer.fit(model, dm)

    ## UPLOAD BEST CHECKPOINTS TO LOG
    if trial is not None:
        callbacks.upload_best_checkpoints()

    return callbacks.model_checkpoint.best_model_score.item()


In [17]:
# For testing
#objective(trial = None)

In [18]:

pruner = optuna.pruners.HyperbandPruner(min_resource=10)
sampler = optuna.samplers.TPESampler(multivariate=True)
storage = optuna.storages.RDBStorage(
    url="mysql+pymysql://admin:Testuser1234@database-1.c17p2riuxscm.us-east-2.rds.amazonaws.com/optuna",
    heartbeat_interval=120,
    grace_period=360,
)
study_name = f"{DIAGNOSIS}_{PLANE}_{BACKBONE}"

study = optuna.create_study(
    storage=storage,
    study_name=study_name,
    load_if_exists=True,
    sampler=sampler,
    pruner=pruner,
    direction="minimize",
)
#study.enqueue_trial({
#    'dropout': 55,
#    'lr': 3.e-4,
#    'rotate': 25,
#    'scale': 8,
#    'shift': 10,
#    'adam_wd': 0.0900
#    })


study.optimize(objective, n_trials=40, timeout=8 * 60 * 60)

[32m[I 2021-10-07 14:50:54,257][0m Using an existing study with name 'acl_sagittal_tf_mobilenetv3_small_minimal_100' instead of creating a new one.[0m
NeptuneLogger will work in online mode
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type        | Params
-------------------------------------------
0 | backbone   | MobileNetV3 | 1.0 M 
1 | final_drop | Dropout     | 0     
2 | clf        | Linear      | 1.0 K 
-------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.082     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: -1it [00:00, ?it/s]https://ui.neptune.ai/nclibz/tester/e/TES-6
Epoch 0:  92%|█████████▏| 1140/1241 [01:16<00:06, 14.98it/s, loss=1.01, v_num=ES-6]



Epoch 0: 100%|██████████| 1241/1241 [01:21<00:00, 15.22it/s, loss=1.17, v_num=ES-6, val_loss=0.727, val_auc=0.727]

Epoch 0, global step 1120: val_loss reached 0.72745 (best 0.72745), saving model to "/home/nicolai/OneDrive/Forskning/Projekter/MRKnee/checkpoints/trial3/epoch=00-val_loss=0.73-val_auc=0.73.ckpt" as top 3


Epoch 1:   0%|          | 0/1241 [00:00<00:01, 1083.80it/s, loss=1.17, v_num=ES-6, val_loss=0.727, val_auc=0.727] 



Epoch 1:  92%|█████████▏| 1140/1241 [01:10<00:06, 16.08it/s, loss=0.671, v_num=ES-6, val_loss=0.727, val_auc=0.727, train_loss=1.180]



Epoch 1: 100%|██████████| 1241/1241 [01:15<00:00, 16.36it/s, loss=0.582, v_num=ES-6, val_loss=0.647, val_auc=0.788, train_loss=1.180]

Epoch 1, global step 2241: val_loss reached 0.64667 (best 0.64667), saving model to "/home/nicolai/OneDrive/Forskning/Projekter/MRKnee/checkpoints/trial3/epoch=01-val_loss=0.65-val_auc=0.79.ckpt" as top 3


Epoch 2:   0%|          | 0/1241 [00:00<00:00, 1373.38it/s, loss=0.582, v_num=ES-6, val_loss=0.647, val_auc=0.788, train_loss=1.180] 



Epoch 2:  92%|█████████▏| 1140/1241 [01:15<00:06, 15.03it/s, loss=0.485, v_num=ES-6, val_loss=0.647, val_auc=0.788, train_loss=0.778]



Epoch 2: 100%|██████████| 1241/1241 [01:21<00:00, 15.31it/s, loss=0.517, v_num=ES-6, val_loss=0.674, val_auc=0.817, train_loss=0.778]

Epoch 2, global step 3362: val_loss reached 0.67356 (best 0.64667), saving model to "/home/nicolai/OneDrive/Forskning/Projekter/MRKnee/checkpoints/trial3/epoch=02-val_loss=0.67-val_auc=0.82.ckpt" as top 3


Epoch 3:   0%|          | 0/1241 [00:00<00:05, 238.27it/s, loss=0.517, v_num=ES-6, val_loss=0.674, val_auc=0.817, train_loss=0.778] 



Epoch 3:  92%|█████████▏| 1140/1241 [01:16<00:06, 14.86it/s, loss=0.364, v_num=ES-6, val_loss=0.674, val_auc=0.817, train_loss=0.490]



Epoch 3: 100%|██████████| 1241/1241 [01:22<00:00, 15.08it/s, loss=0.419, v_num=ES-6, val_loss=0.858, val_auc=0.668, train_loss=0.490]

Epoch 3, global step 4483: val_loss was not in top 3


Epoch 4:   0%|          | 0/1241 [00:00<00:02, 424.70it/s, loss=0.419, v_num=ES-6, val_loss=0.858, val_auc=0.668, train_loss=0.490] 



Epoch 4:  92%|█████████▏| 1140/1241 [01:19<00:07, 14.30it/s, loss=0.0698, v_num=ES-6, val_loss=0.858, val_auc=0.668, train_loss=0.257]



Epoch 4: 100%|██████████| 1241/1241 [01:25<00:00, 14.46it/s, loss=0.0688, v_num=ES-6, val_loss=1.150, val_auc=0.725, train_loss=0.257]

Epoch 4, global step 5604: val_loss was not in top 3


Epoch 5:   0%|          | 0/1241 [00:00<00:02, 525.93it/s, loss=0.0688, v_num=ES-6, val_loss=1.150, val_auc=0.725, train_loss=0.257] 



Epoch 5:  92%|█████████▏| 1140/1241 [01:14<00:06, 15.42it/s, loss=0.0224, v_num=ES-6, val_loss=1.150, val_auc=0.725, train_loss=0.0965]



Epoch 5: 100%|██████████| 1241/1241 [01:19<00:00, 15.67it/s, loss=0.0227, v_num=ES-6, val_loss=0.764, val_auc=0.809, train_loss=0.0965]

Epoch 5, global step 6725: val_loss was not in top 3


Epoch 6:   0%|          | 0/1241 [00:00<00:03, 340.01it/s, loss=0.0227, v_num=ES-6, val_loss=0.764, val_auc=0.809, train_loss=0.0965] 



Epoch 6:  92%|█████████▏| 1140/1241 [01:18<00:06, 14.58it/s, loss=0.00837, v_num=ES-6, val_loss=0.764, val_auc=0.809, train_loss=0.0266]



Epoch 6: 100%|██████████| 1241/1241 [01:23<00:00, 14.86it/s, loss=0.00837, v_num=ES-6, val_loss=1.060, val_auc=0.808, train_loss=0.0266]

Epoch 6, global step 7846: val_loss was not in top 3


Epoch 7:   0%|          | 0/1241 [00:00<00:00, 1351.26it/s, loss=0.00837, v_num=ES-6, val_loss=1.060, val_auc=0.808, train_loss=0.0266] 



Epoch 7:  92%|█████████▏| 1140/1241 [01:16<00:06, 14.83it/s, loss=0.00155, v_num=ES-6, val_loss=1.060, val_auc=0.808, train_loss=0.00803]



Epoch 7: 100%|██████████| 1241/1241 [01:22<00:00, 15.11it/s, loss=0.00156, v_num=ES-6, val_loss=1.010, val_auc=0.836, train_loss=0.00803]

Epoch 7, global step 8967: val_loss was not in top 3


Epoch 8:   0%|          | 0/1241 [00:00<00:02, 512.44it/s, loss=0.00156, v_num=ES-6, val_loss=1.010, val_auc=0.836, train_loss=0.00803] 



Epoch 8:  92%|█████████▏| 1140/1241 [01:15<00:06, 15.03it/s, loss=0.000838, v_num=ES-6, val_loss=1.010, val_auc=0.836, train_loss=0.00306]



Epoch 8: 100%|██████████| 1241/1241 [01:21<00:00, 15.31it/s, loss=0.000838, v_num=ES-6, val_loss=0.919, val_auc=0.836, train_loss=0.00306]

Epoch 8, global step 10088: val_loss was not in top 3


Epoch 9:   0%|          | 0/1241 [00:00<00:03, 316.15it/s, loss=0.000838, v_num=ES-6, val_loss=0.919, val_auc=0.836, train_loss=0.00306] 



Epoch 9:  73%|███████▎  | 900/1241 [01:02<00:23, 14.34it/s, loss=0.00126, v_num=ES-6, val_loss=0.919, val_auc=0.836, train_loss=0.00128] 

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


Epoch 9:  73%|███████▎  | 900/1241 [01:13<00:27, 12.32it/s, loss=0.00126, v_num=ES-6, val_loss=0.919, val_auc=0.836, train_loss=0.00128]

[32m[I 2021-10-07 15:04:39,402][0m Trial 3 finished with value: 0.6466671228408813 and parameters: {}. Best is trial 3 with value: 0.646667.[0m
NeptuneLogger will work in online mode
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type        | Params
-------------------------------------------
0 | backbone   | MobileNetV3 | 1.0 M 
1 | final_drop | Dropout     | 0     
2 | clf        | Linear      | 1.0 K 
-------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.082     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Epoch 9:  73%|███████▎  | 900/1241 [01:23<00:31, 10.77it/s, loss=0.00126, v_num=ES-6, val_loss=0.919, val_auc=0.836, train_loss=0.00128]
https://ui.neptune.ai/nclibz/tester/e/TES-7
Epoch 0:   0%|          | 0/1241 [00:00<00:03, 392.47it/s]  



Epoch 0:  60%|█████▉    | 740/1241 [00:50<00:33, 14.79it/s, loss=1.21, v_num=ES-7] 

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
[33m[W 2021-10-07 15:05:40,106][0m Trial 4 failed because of the following error: AttributeError("'NoneType' object has no attribute 'tolist'")[0m
Traceback (most recent call last):
  File "/home/nicolai/miniconda3/envs/dl/lib/python3.9/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_9268/379676973.py", line 71, in objective
    callbacks.upload_best_checkpoints()
  File "/home/nicolai/OneDrive/Forskning/Projekter/MRKnee/src/callbacks.py", line 63, in upload_best_checkpoints
    "best_val_loss", self.model_checkpoint.best_model_score.tolist()
AttributeError: 'NoneType' object has no attribute 'tolist'


AttributeError: 'NoneType' object has no attribute 'tolist'

Epoch 0:  60%|█████▉    | 740/1241 [01:07<00:45, 11.03it/s, loss=1.21, v_num=ES-7]

Info (NVML): The operating system has blocked the request.. GPU usage metrics may not be reported. For more information, see https://docs-legacy.neptune.ai/logging-and-managing-experiment-results/logging-experiment-data.html#hardware-consumption 
Info (NVML): The operating system has blocked the request.. GPU usage metrics may not be reported. For more information, see https://docs-legacy.neptune.ai/logging-and-managing-experiment-results/logging-experiment-data.html#hardware-consumption 
