In [1]:
import os
import shutil
from functools import partial

import torch
import segmentation_models_pytorch as smp
from omegaconf import OmegaConf
from pytorch_lightning import Trainer
from pytorch_lightning.logging import TestTubeLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.utilities.cloud_io import load as pl_load
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining

from src.dl.lightning_model import SegModel
from src.conf.conf_schema import Schema
from src.conf.config import CONFIG


numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject


numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject


numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject


numpy.ufunc size changed, may indicate binary incompatibility. Expected 216, got 192


can't resolve package from __spec__ or __package__, falling back on __name__ and __path__


numpy.ufunc size changed, may indicate binary incompatibility. Expected 216, got 192


can't resolve package from __spec__ or __package__, falling back on __name__ and __path__



In [2]:
class TuneReportCallback(Callback):
    """Forward validation metrics from the Lightning trainer to Ray Tune.

    At the end of every validation run the ``avg_val_loss`` and
    ``avg_val_accuracy`` entries of ``trainer.callback_metrics`` are reported
    to Tune under the names ``loss`` and ``mean_accuracy``.
    """

    def on_validation_end(self, trainer, pl_module):
        metrics = trainer.callback_metrics
        # `.item()` turns the stored values into plain Python numbers.
        val_loss = metrics["avg_val_loss"].item()
        val_accuracy = metrics["avg_val_accuracy"].item()
        tune.report(loss=val_loss, mean_accuracy=val_accuracy)


class CheckpointCallback(Callback):
    """Save a trainer checkpoint into Ray Tune's checkpoint directory.

    At the end of every validation run a directory is requested from Tune for
    the trainer's current ``global_step`` and the checkpoint is written inside
    it under the fixed file name ``checkpoint``.
    """

    def on_validation_end(self, trainer, pl_module):
        with tune.checkpoint_dir(step=trainer.global_step) as checkpoint_dir:
            # Fixed file name: code that restores a trial has to look for
            # "<checkpoint_dir>/checkpoint".
            checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
            trainer.save_checkpoint(checkpoint_path)

In [3]:
# g = pl_load(lightning_model.fm.model_checkpoint("last").as_posix())

In [4]:
# lightning_model._load_model_state

In [5]:
# trainer.current_epoch

In [6]:
config = CONFIG

def train_tune_checkpoint(
    training_args,
    dataset_args,
    experiment_args,
    checkpoint_dir=None,
    num_epochs=10,
    num_gpus=0):
    """Train a U-Net ``SegModel`` for one trial, optionally resuming from a checkpoint.

    Args:
        training_args: First positional argument. It is not read here; the
            ``SegModel`` is still built with the module-level
            ``config.training_args``.
            NOTE(review): when launched via ``tune.run`` this slot presumably
            receives the trial's hyperparameter dict (lr, batch_size,
            edge_weight), which is therefore not forwarded to the model yet --
            confirm against ``SegModel`` before wiring it through.
        dataset_args: Dataset settings handed to ``SegModel``.
        experiment_args: Experiment settings; ``model_name``,
            ``experiment_version`` and ``experiment_root_dir`` are read here
            and the whole object is handed to ``SegModel``.
        checkpoint_dir: Directory holding a file named ``checkpoint`` (the
            layout written by ``CheckpointCallback``) or a direct path to a
            checkpoint file. Falsy means train from scratch.
        num_epochs: Upper bound on training epochs for the ``Trainer``.
        num_gpus: Number of GPUs given to the ``Trainer``.
    """
    tt_logger = TestTubeLogger(
        save_dir=tune.get_trial_dir(),
        name=experiment_args.model_name,
        version=experiment_args.experiment_version
    )

    trainer = Trainer(
        default_root_dir=experiment_args.experiment_root_dir,
        # Honour the values chosen by the caller instead of silently falling
        # back to the global config.
        max_epochs=num_epochs,
        gpus=num_gpus,
        logger=tt_logger,
        progress_bar_refresh_rate=0,
        # Checkpoint first so a checkpoint exists by the time metrics are reported.
        callbacks=[CheckpointCallback(), TuneReportCallback()],
        profiler=True
    )

    # The network is built the same way whether or not we resume.
    base_model = smp.Unet(
        encoder_name="resnext50_32x4d",
        classes=2
    )

    pl_model = SegModel(
        base_model,
        dataset_args,
        experiment_args,
        config.training_args
    )

    if checkpoint_dir:
        # CheckpointCallback writes "<dir>/checkpoint"; a direct path to a
        # checkpoint file is accepted unchanged.
        if os.path.isdir(checkpoint_dir):
            checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
        else:
            checkpoint_path = checkpoint_dir

        # Keep tensors on the storage they are deserialised to (no device remap).
        checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage)
        pl_model.load_state_dict(checkpoint['state_dict'])
        trainer.current_epoch = checkpoint["epoch"]

    trainer.fit(pl_model)
    
#train_tune_checkpoint(
#    config.dataset_args,
#    config.experiment_args,
#    config.training_args,
#    checkpoint_dir="/home/leos/Dippa/results/tests/Unet/version_test_pannuke_unet2/epoch=6.ckpt",
#    data_dir="/home/leos/Dippa/patches/hdf5/pannuke/patch256_train_pannuke.pytable",
#    num_epochs=10,
#    num_gpus=1
#)

In [7]:
# Number of GPUs bound to the trainable in the preview below.
gpus_per_trial = 1

# Bare last expression: the notebook echoes the bound keyword arguments so
# they can be inspected before a search is launched.
partial(
    train_tune_checkpoint,
    dataset_args=config.dataset_args,
    experiment_args=config.experiment_args,
    num_epochs=config.training_args.num_epochs,
    num_gpus=gpus_per_trial,
)

functools.partial(<function train_tune_checkpoint at 0x7fd62258b840>, dataset_args={'dataset': 'kumar', 'class_types': 'binary', 'patches_dtype': 'hdf5', 'hdf5_patches_root_dir': '/home/local/leos/Dippa_test/patches/hdf5', 'npy_patches_root_dir': '/home/local/leos/Dippa_test/patches/npy', 'phases': ['train', 'valid', 'test'], 'tissues': []}, experiment_args={'model_name': 'Unet', 'experiment_version': 'test_pannuke_unet2', 'experiment_root_dir': '/home/local/leos/Dippa_test/results/tests'}, num_epochs=7, num_gpus=1)

In [8]:
def tune_pbt(
    config,
    num_samples=10,
    num_epochs=10,
    gpus_per_trial=1) -> None:
    """Launch a Population Based Training search over ``train_tune_checkpoint``.

    Args:
        config: Object exposing ``dataset_args`` and ``experiment_args``; both
            are bound to the trainable.
        num_samples: Passed to ``tune.run`` as ``num_samples``.
        num_epochs: Bound to the trainable as ``num_epochs``.
        gpus_per_trial: GPUs requested per trial and bound to the trainable as
            ``num_gpus``.
    """
    tune.register_trainable('train_tune_checkpoint', train_tune_checkpoint)

    # Starting point for every trial; the scheduler mutates these values.
    initial_hparams = {"edge_weight": 1, "lr": 1e-3, "batch_size": 6}

    # Minimise the reported "loss", perturbing every 4 training iterations.
    mutations = {
        "lr": lambda: tune.loguniform(1e-4, 1e-1).func(None),
        "batch_size": [4, 8, 16],
        "edge_weight": [1.1, 1.2, 1.5, 2],
    }
    pbt_scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="loss",
        mode="min",
        perturbation_interval=4,
        hyperparam_mutations=mutations,
    )

    cli_reporter = CLIReporter(
        parameter_columns=["edge_weight", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"],
    )

    # Everything except the first positional argument and `checkpoint_dir`
    # is fixed up front.
    trainable = partial(
        train_tune_checkpoint,
        dataset_args=config.dataset_args,
        experiment_args=config.experiment_args,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )

    tune.run(
        trainable,
        resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
        config=initial_hparams,
        num_samples=num_samples,
        scheduler=pbt_scheduler,
        progress_reporter=cli_reporter,
        name="tune_pbt",
    )


In [None]:
# Launch the search with one sample, one epoch and one GPU per trial.
tune_pbt(
    config,
    num_samples=1,
    num_epochs=1,
    gpus_per_trial=1,
)


unclosed file <_io.TextIOWrapper name='/proc/driver/nvidia/gpus/0000:01:00.0/information' mode='r' encoding='UTF-8'>

2020-09-23 19:05:47,466	INFO resource_spec.py:231 -- Starting Ray with 14.4 GiB memory available for workers and up to 7.22 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).

unclosed file <_io.TextIOWrapper name='/tmp/ray/session_2020-09-23_19-05-47_464850_16710/logs/redis-shard_0.err' mode='a' encoding='utf-8'>


unclosed file <_io.TextIOWrapper name='/tmp/ray/session_2020-09-23_19-05-47_464850_16710/logs/redis-shard_0.out' mode='a' encoding='utf-8'>


unclosed file <_io.TextIOWrapper name='/tmp/ray/session_2020-09-23_19-05-47_464850_16710/logs/redis.err' mode='a' encoding='utf-8'>


unclosed file <_io.TextIOWrapper name='/tmp/ray/session_2020-09-23_19-05-47_464850_16710/logs/redis.out' mode='a' encoding='utf-8'>


unclosed file <_io.TextIOWrapper name='/tmp/ray/session_2020-09-23_19-05-47_464850_16710/logs/gcs

== Status ==
Memory usage on this node: 8.4/31.2 GiB
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 1/12 CPUs, 1/1 GPUs, 0.0/14.4 GiB heap, 0.0/4.98 GiB objects (0/1.0 GPUType:RTX)
Result logdir: /home/leos/ray_results/tune_pbt
Number of trials: 1 (1 RUNNING)
+---------------------+----------+-------+---------------+-------+--------------+
| Trial name          | status   | loc   |   edge_weight |    lr |   batch_size |
|---------------------+----------+-------+---------------+-------+--------------|
| DEFAULT_a5081_00000 | RUNNING  |       |             1 | 0.001 |            6 |
+---------------------+----------+-------+---------------+-------+--------------+




[2m[36m(pid=16786)[0m 
[2m[36m(pid=16786)[0m numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject
[2m[36m(pid=16786)[0m 
[2m[36m(pid=16786)[0m 
[2m[36m(pid=16786)[0m numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject
[2m[36m(pid=16786)[0m 
[2m[36m(pid=16786)[0m 
[2m[36m(pid=16786)[0m numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject
[2m[36m(pid=16786)[0m 
[2m[36m(pid=16786)[0m 
[2m[36m(pid=16786)[0m numpy.ufunc size changed, may indicate binary incompatibility. Expected 216, got 192
[2m[36m(pid=16786)[0m 
[2m[36m(pid=16786)[0m 
[2m[36m(pid=16786)[0m can't resolve package from __spec__ or __package__, falling back on __name__ and __path__
[2m[36m(pid=16786)[0m 
[2m[36m(pid=16786)[0m 
[2m[36m(pid=16786)[0m numpy.ufunc size changed, may indicate binary inco

[2m[36m(pid=16786)[0m ['/home/leos/.local/lib/python3.6/site-packages/git/ext/gitdb', '/home/leos/.local/lib/python3.6/site-packages/ray/thirdparty_files', '/home/local/leos/Dippa_test/notebooks', '/home/leos/.local/lib/python3.6/site-packages', '/home/leos/.local/lib/python3.6/site-packages/ray/pickle5_files', '/home/local/leos/.local/lib/python3.6/site-packages/ray/workers', '/usr/lib/python36.zip', '/usr/lib/python3.6', '/usr/lib/python3.6/lib-dynload', '/home/leos/.local/lib/python3.6/site-packages', '/home/local/leos/Dippa_test', '/usr/local/lib/python3.6/dist-packages', '/usr/lib/python3/dist-packages', '/home/leos/.local/lib/python3.6/site-packages/IPython/extensions', '/home/leos/.local/lib/python3.6/site-packages/gitdb/ext/smmap']
Validation sanity check: 0it [00:00, ?it/s]


[2m[36m(pid=16786)[0m 
[2m[36m(pid=16786)[0m   | Name  | Type             | Params
[2m[36m(pid=16786)[0m -------------------------------------------
[2m[36m(pid=16786)[0m 0 | model | Unet             | 31 M  
[2m[36m(pid=16786)[0m 1 | CE    | CrossEntropyLoss | 0     


Validation sanity check:  50%|█████     | 1/2 [00:02<00:02,  2.66s/it]
Validation sanity check: 100%|██████████| 2/2 [00:03<00:00,  2.01s/it]
Epoch 1:   0%|          | 0/845 [00:00<?, ?it/s]                      
Epoch 1:   0%|          | 1/845 [00:03<46:18,  3.29s/it, loss=0.860, v_num=test_pannuke_unet2, train_loss=0.86]
Epoch 1:   0%|          | 2/845 [00:04<28:09,  2.00s/it, loss=0.865, v_num=test_pannuke_unet2, train_loss=0.871]
Epoch 1:   0%|          | 3/845 [00:04<22:09,  1.58s/it, loss=0.862, v_num=test_pannuke_unet2, train_loss=0.856]
Epoch 1:   0%|          | 4/845 [00:06<23:05,  1.65s/it, loss=0.869, v_num=test_pannuke_unet2, train_loss=0.888]
Epoch 1:   1%|          | 5/845 [00:07<20:19,  1.45s/it, loss=0.875, v_num=test_pannuke_unet2, train_loss=0.899]
Epoch 1:   1%|          | 6/845 [00:07<18:29,  1.32s/it, loss=0.875, v_num=test_pannuke_unet2, train_loss=0.874]
Epoch 1:   1%|          | 7/845 [00:08<16:55,  1.21s/it, loss=0.874, v_num=test_pannuke_unet2, train_loss=0.87