# This notebook must be located in the `pytorch-pycm-template` folder to run Ray Tune. Move the file there before running it.

```   
pytorch-pycm-template  
L raytuner.ipynb    
```   

# 0. Set up preparation
## Auto-Reload

In [1]:
%load_ext autoreload
%autoreload 2
# Always reload all modules before executing Python code

## Settings to change

In [3]:
from functools import partial
import ray
from ray import tune as raytune
from ray.tune.schedulers import ASHAScheduler
from ray.train import get_checkpoint

In [4]:
# --- Paths: replace these placeholders before running the notebook ---
# Appended to sys.path in the "Import workspace" cell so the project modules can be imported.
module_dir = 'your workspace directory'
# Used as the default of the `--config` argument when the training config is parsed.
config_path = 'train config file path'

# --- Ray Tune settings ---
# Passed to raytune.run as `storage_path`.
tmp_dir = 'raytune save dir'
# Resources reserved for each trial.
resources_per_trial = {"cpu": 4, "gpu": 1}
# Used as the epoch count / early-stop value of every trial and as `max_t` of the scheduler.
MAXEPOCHS = 10
# Number of hyperparameter samples (passed to raytune.run as `num_samples`).
raytune_trial_num = 2

# Search space. Every key here is read by `ray_tune`, which overrides the
# matching entry of the training config when the key is present.
# NOTE(review): "otimp" looks like a typo for "optim"; `ray_tune` reads the same
# spelling, so rename both places together if it is ever corrected.
ray_tune_config = {
    "batch_size": raytune.choice([8, 16, 32, 64, 128, 512]),
    "dropout": raytune.choice([0, 0.1, 0.2, 0.3, 0.4, 0.5]),
    "loss": raytune.choice(['ce_loss', 'bce_loss']),
    "otimp": raytune.choice(['AdamW', 'Lamb', 'Lion']),
    "lr": raytune.loguniform(1e-4, 1e-1),
    "weight_decay": raytune.choice([0, 0.1, 0.2, 0.3, 0.4, 0.5]),
}

# Start a local Ray instance. With 8 CPUs / 2 GPUs registered and 4 CPUs / 1 GPU
# per trial, up to two trials fit at the same time.
# ignore_reinit_error=True lets this cell be re-run in a live kernel: without it
# a second ray.init() call raises RuntimeError because Ray is already running.
ray.init(
    dashboard_host='127.0.0.1',
    dashboard_port=1130,
    num_cpus=8,
    num_gpus=2,
    ignore_reinit_error=True,
)

2025-01-17 10:18:22,041	INFO worker.py:1812 -- Started a local Ray instance. View the dashboard at 127.0.0.1:1130


0,1
Python version:,3.10.12
Ray version:,2.40.0
Dashboard:,http://127.0.0.1:1130


## Import workspace

In [None]:
import os
import sys
# Ask CUDA to enumerate devices by PCI bus ID.
os.environ.update({"CUDA_DEVICE_ORDER": "PCI_BUS_ID"})

# Put the workspace directory on the import path, then echo the entry just added.
sys.path += [module_dir]
print(sys.path[-1])

In [6]:
import argparse
import collections
import torch
import numpy as np

from torchinfo import summary
import model.model as module_arch

from torchvision import transforms
import data_loader.transforms as module_transforms
import data_loader.npz_loaders as module_data
import model.optim as module_optim
import model.lr_scheduler as module_lr_scheduler
import model.loss as module_loss
import model.metric as module_metric

from parse_config import ConfigParser
from runner import Trainer
from utils import prepare_device, reset_device, fix_random_seed

In [7]:
# Fix random seeds for reproducibility.
# The manual seeding below is kept for reference only; `fix_random_seed`
# (imported from `utils`) is called instead, with no arguments.
# NOTE(review): presumably it applies equivalent torch / cuDNN / numpy
# settings -- confirm in utils.
# SEED = 123
# torch.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False
# np.random.seed(SEED)
fix_random_seed()

In [13]:
def ray_tune(config, train_config):
    """Train one model for a single Ray Tune trial.

    Hyperparameters present in ``config`` overwrite the matching entries of
    ``train_config``. The data loaders, model, loss, metrics, optimizer and
    optional LR scheduler are then built from it and handed to ``Trainer``.
    After training, a model summary is written to the logger.

    Args:
        config: Hyperparameters sampled for this trial. The keys
            ``batch_size``, ``dropout``, ``loss``, ``otimp``, ``lr`` and
            ``weight_decay`` are each applied only when present.
        train_config: Parsed training configuration. It is modified in place
            (transforms, tuned hyperparameters and trainer settings).
    """
    logger = train_config.get_logger('train')

    # ---- data loaders ----
    loader_args = train_config['data_loader']['args']
    if 'trsfm' in loader_args:
        # Each entry maps a transform name in `module_transforms` to its
        # keyword arguments; None means "construct without arguments".
        tf_list = [
            getattr(module_transforms, name)() if kwargs is None
            else getattr(module_transforms, name)(**kwargs)
            for name, kwargs in loader_args['trsfm'].items()
        ]
        loader_args['trsfm'] = transforms.Compose(tf_list)
    # raytune: trial-specific batch size
    if 'batch_size' in config:
        loader_args['batch_size'] = config['batch_size']
    train_data_loader = train_config.init_obj('data_loader', module_data, mode='train').dataloader
    valid_data_loader = train_config.init_obj('data_loader', module_data, mode='valid').dataloader

    # ---- model ----
    classes = train_data_loader.dataset.classes
    # raytune: trial-specific dropout rate
    if 'dropout' in config:
        train_config['arch']['args']['drop_rate'] = config['dropout']
    model = train_config.init_obj('arch', module_arch)

    # Move the model to the selected device; wrap it for multi-GPU training
    # when more than one device id was returned.
    device, device_ids = prepare_device(train_config['n_gpu'])
    model = model.to(device)
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids)

    # ---- loss and metrics ----
    # raytune: the sampled loss name takes precedence over the configured one
    loss_name = config['loss'] if 'loss' in config else train_config['loss']
    criterion = getattr(module_loss, loss_name)
    metrics = [getattr(module_metric, met) for met in train_config['metrics']]

    # ---- optimizer and (optional) LR scheduler ----
    trainable_params = (p for p in model.parameters() if p.requires_grad)
    # raytune: optimizer type and its tuned arguments
    optim_cfg = train_config['optimizer']
    if 'otimp' in config:
        optim_cfg['type'] = config['otimp']
    for hp_name in ('lr', 'weight_decay'):
        if hp_name in config:
            optim_cfg['args'][hp_name] = config[hp_name]

    optimizer = train_config.init_obj('optimizer', module_optim, trainable_params)

    if 'lr_scheduler' in train_config.config:
        lr_scheduler = train_config.init_obj('lr_scheduler', module_lr_scheduler, optimizer)
    else:
        lr_scheduler = None
    if lr_scheduler is None:
        print('lr_scheduler is not set.\n')

    # ---- trainer settings forced for tuning runs ----
    trainer_cfg = train_config['trainer']
    trainer_cfg.update(epochs=MAXEPOCHS, early_stop=MAXEPOCHS, tensorboard=False)
    for split in ('train', 'valid'):
        trainer_cfg['tensorboard_projector'][split] = False
    trainer_cfg.update(tensorboard_pred_plot=False, save_performance_plot=True)

    trainer = Trainer(
        model=model,
        criterion=criterion,
        metric_ftns=metrics,
        plottable_metric_ftns=None,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        config=train_config,
        classes=classes,
        device=device,
        data_loader=train_data_loader,
        valid_data_loader=valid_data_loader,
        da_ftns=None,
        raytune=True,
    )
    trainer.train()

    # ---- model summary (optional) ----
    # Done after training because `summary` pushes data through the model.
    first_inputs = next(iter(train_data_loader))[0].to(device)
    input_size = first_inputs.shape
    logger.info('\nInput_size: {}'.format(input_size))
    model_info = str(summary(model, input_size=input_size, verbose=0))
    logger.info('{}\n'.format(model_info))

    reset_device('cache')

In [9]:
def _str_to_bool(value):
    """Convert a command-line string to a bool.

    ``type=bool`` must not be used with argparse: ``bool('False')`` is True
    because any non-empty string is truthy, so ``-t False`` would enable
    test mode.

    Args:
        value: The raw argument (a string on the command line; a bool is
            returned unchanged).

    Returns:
        True for true/t/yes/y/1/on, False for false/f/no/n/0/off or an empty
        string (case-insensitive).

    Raises:
        argparse.ArgumentTypeError: If the value is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# Command-line style arguments understood by ConfigParser.from_args.
# `%(default)s` makes the help text show the real default instead of a stale one.
args = argparse.ArgumentParser(description='PyTorch Template')
args.add_argument('-c', '--config', default=config_path, type=str, help='config file path (default: %(default)s)')
args.add_argument('-r', '--resume', default=None, type=str, help='path to latest checkpoint (default: None)')
args.add_argument('-d', '--device', default='0, 1', type=str, help='indices of GPUs to enable (default: %(default)s)')
args.add_argument('-t', '--test', default=False, type=_str_to_bool, help='Whether to enable test mode (default: False)')

# Custom CLI options that override values from the JSON config file.
# `target` is the ';'-separated key path of the overridden entry.
CustomArgs = collections.namedtuple('CustomArgs', 'flags type target')
options = [
    CustomArgs(['--lr', '--learning_rate'], type=float, target='optimizer;args;lr'),
    CustomArgs(['--bs', '--batch_size'], type=int, target='data_loader;args;batch_size'),
]
train_config = ConfigParser.from_args(args, options)
print(f"setting epoch: {train_config['trainer']['epochs']}")

setting epoch: 100


In [10]:
# ASHA scheduler used by the tuning run to stop poorly performing trials early.
# The parameter notes below follow the Ray Tune ASHAScheduler documentation.
scheduler = ASHAScheduler(
    metric="BACC",  # reported metric the trials are compared on
    mode="max",  # higher BACC is better
    max_t=MAXEPOCHS,  # upper bound on training time units per trial
    grace_period=1,  # a trial runs at least this long before it can be stopped
    reduction_factor=2,  # halving rate between successive rungs
)

In [None]:
# Launch the hyperparameter search. Ray Tune calls `ray_tune(config, ...)` once
# per trial with a `config` sampled from `ray_tune_config`.
result = raytune.run(
    partial(ray_tune, train_config=train_config),  # bind the parsed training config to every trial
    resources_per_trial=resources_per_trial,  # CPUs / GPUs reserved for each trial
    config=ray_tune_config,  # search space
    num_samples=raytune_trial_num,  # number of configurations sampled from the search space
    scheduler=scheduler,  # ASHA early stopping
    storage_path=tmp_dir)  # where Ray Tune stores its results

In [None]:
# Select the trial whose last reported BACC is the highest.
best_trial = result.get_best_trial("BACC", "max", "last")

# get_best_trial returns None when no trial reported the metric (for example if
# every trial failed). Fail with a clear message instead of an AttributeError
# on `None.config` below.
if best_trial is None:
    raise RuntimeError(
        "No trial reported the 'BACC' metric; check that the tuning run "
        "finished and that the trials report 'BACC'."
    )

print(f"Best trial config: {best_trial.config}")
print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
print(f"Best trial final validation Baccuracy: {best_trial.last_result['BACC']}")