## Ray training with PyTorch

In [2]:
# !pip install 'ray[default]' --quiet
!pip install tensorboardX --quiet



In [4]:
import torch
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
import torchvision.transforms as transforms

import ray
from ray.util.sgd.torch import TorchTrainer
from ray.util.sgd.torch import TrainingOperator
# https://github.com/kuangliu/pytorch-cifar/blob/master/models/resnet.py
from ray.util.sgd.torch.resnet import ResNet18


## Initialize a remote Ray cluster

In [3]:
# Start a remote Ray cluster through the Hyperplane helper and keep the
# returned handle so the cluster can be stopped at the end of the notebook.
from hyperplane.ray_common import initialize_ray_cluster, stop_ray_cluster, find_ray_workers
# Cluster sizing: number of Ray workers, CPU cores per worker, RAM (GB) per worker.
num_workers = 2
cpu_core_per_worker = 7
ram_gb_per_worker = 6  # allocatable RAM per node: 110 GB on 16_128 nodes, 12 GB on 16_16 nodes, 27 GB on 32_32 nodes
# `ray_cluster` is the handle later passed to stop_ray_cluster().
ray_cluster = initialize_ray_cluster(num_workers, cpu_core_per_worker, ram_gb_per_worker)

deleting pod ray-worker-5e72135e-563b-40c3-b148-cc539f2fda7b
deleting pod ray-worker-6af3e495-afd7-4968-acdd-37975c8c1ace
👉 Hyperplane: selecting worker node pool
best pool spec {'pool_env_var': 'DASK_POOL_16_16', 'allocatable_cores': 15.0, 'allocatable_ram': 12.0}




Waiting for worker ray-worker-1274a214-9c4a-4935-b0be-8b7bca9a7175...
Waiting for worker ray-worker-32267120-50f5-479e-b2e0-70873efba81a...


[2m[33m(raylet, ip=10.1.157.3)[0m [2021-12-08 05:44:43,964 E 16 16] agent_manager.cc:134: Not all required Ray dependencies for the runtime_env feature were found. To install the required dependencies, please run `pip install 'ray[default]'`.
[2m[33m(raylet, ip=10.1.157.3)[0m [2021-12-08 05:44:43,964 E 16 16] worker_pool.cc:566: [Eagerly] Couldn't create a runtime environment for job 01000000.


In [5]:
def cifar_creator(config):
    """Build the CIFAR-10 dataloaders used in `train` and `validate`.

    Args:
        config: Trainer config dict. ``config["batch"]`` is the batch size
            applied to both loaders.

    Returns:
        A ``(train_loader, validation_loader)`` tuple. The training loader
        reads the CIFAR-10 training split; the validation loader reads the
        held-out test split.
    """
    tfms = transforms.Compose([
        transforms.ToTensor(),
        # Per-channel mean / std normalization.
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    train_loader = DataLoader(
        CIFAR10(root="~/data", train=True, download=True, transform=tfms),
        batch_size=config["batch"])
    # train=False selects the test split, so validation metrics are computed
    # on images the model was not trained on.
    validation_loader = DataLoader(
        CIFAR10(root="~/data", train=False, download=True, transform=tfms),
        batch_size=config["batch"])
    return train_loader, validation_loader

def optimizer_creator(model, config):
    """Create the SGD optimizer for ``model``.

    Args:
        model: Module whose ``parameters()`` will be optimized.
        config: Trainer config dict; ``config["lr"]`` is the learning rate.

    Returns:
        A ``torch.optim.SGD`` instance over the model's parameters.
    """
    trainable_params = model.parameters()
    learning_rate = config["lr"]
    return torch.optim.SGD(trainable_params, lr=learning_rate)

# Assemble the training operator handed to TorchTrainer from the four
# creator callables: data, model, optimizer and loss.
CustomTrainingOperator = TrainingOperator.from_creators(
    # Returns the (train, validation) dataloaders.
    data_creator=cifar_creator,
    # Returns the nn.Module to train.
    model_creator=ResNet18,
    # Returns the optimizer for that model.
    optimizer_creator=optimizer_creator,
    # Loss function class.
    loss_creator=torch.nn.CrossEntropyLoss,
)


In [6]:
# Create the distributed trainer that runs CustomTrainingOperator on the
# Ray cluster.
trainer = TorchTrainer(
    training_operator_cls=CustomTrainingOperator,
    config={
        "lr": 0.01,   # used in optimizer_creator
        "batch": 64,  # used in data_creator
    },
    # Amount of parallelism. Reuses the worker count the cluster was created
    # with so the two values cannot drift apart.
    num_workers=num_workers,
    # NOTE(review): this checks for CUDA on the notebook process, not on the
    # Ray workers — confirm the workers have matching hardware.
    use_gpu=torch.cuda.is_available(),
    use_tqdm=True)

[2m[36m(pid=55, ip=10.1.158.3)[0m 2021-12-08 05:45:12,295	INFO distributed_torch_runner.py:58 -- Setting up process group for: tcp://10.1.158.3:37499 [rank=0]
[2m[36m(pid=54, ip=10.1.157.3)[0m 2021-12-08 05:45:12,621	INFO distributed_torch_runner.py:58 -- Setting up process group for: tcp://10.1.158.3:37499 [rank=1]
0it [00:00, ?it/s]ip=10.1.158.3)[0m 
0it [00:00, ?it/s]ip=10.1.157.3)[0m 


[2m[36m(DistributedTorchRunner pid=55, ip=10.1.158.3)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/data/cifar-10-python.tar.gz
[2m[36m(DistributedTorchRunner pid=54, ip=10.1.157.3)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 434176/170498071 [00:00<00:43, 3926175.35it/s]
  0%|          | 434176/170498071 [00:00<00:44, 3825303.08it/s]
  3%|▎         | 4677632/170498071 [00:00<00:06, 25587248.60it/s]
  2%|▏         | 3563520/170498071 [00:00<00:08, 19004184.77it/s]
  6%|▌         | 10182656/170498071 [00:00<00:04, 38470912.76it/s]
  4%|▍         | 6725632/170498071 [00:00<00:06, 24562229.29it/s]
  9%|▉         | 15458304/170498071 [00:00<00:03, 43997172.69it/s]
  6%|▌         | 9904128/170498071 [00:00<00:05, 27313987.95it/s]
 12%|█▏        | 20471808/170498071 [00:00<00:03, 46054359.53it/s]
  8%|▊         | 13066240/170498071 [00:00<00:05, 28663002.55it/s]
 15%|█▌        | 25829376/170498071 [00:00<00:02, 48473165.10it/s]
 10%|▉         | 16310272/170498071 [00:00<00:05, 29828920.03it/s]
 18%|█▊        | 31006720/170498071 [00:00<00:02, 49532843.98it/s]
 11%|█▏        | 19472384/170498071 [00:00<00:0

[2m[36m(DistributedTorchRunner pid=54, ip=10.1.157.3)[0m Extracting /root/data/cifar-10-python.tar.gz to /root/data


170500096it [00:03, 42925697.02it/s]                               


[2m[36m(DistributedTorchRunner pid=55, ip=10.1.158.3)[0m Extracting /root/data/cifar-10-python.tar.gz to /root/data
[2m[36m(DistributedTorchRunner pid=54, ip=10.1.157.3)[0m Files already downloaded and verified
[2m[36m(DistributedTorchRunner pid=55, ip=10.1.158.3)[0m Files already downloaded and verified


In [None]:
# Run training and keep the returned stats, then print the validation result.
stats = trainer.train()
validation_stats = trainer.validate()
print(validation_stats)

In [None]:
# Persist the trainer state, then release the training workers.
# shutdown() sits in `finally` so the workers are released even if writing
# the checkpoint fails; "success!" is only printed when the save succeeded.
try:
    torch.save(trainer.state_dict(), "checkpoint.pt")
finally:
    trainer.shutdown()
print("success!")

In [7]:
# Tear down the remote Ray cluster created by initialize_ray_cluster();
# per the cell output, this deletes the ray-worker pods.
stop_ray_cluster(ray_cluster)

Deleting ray-worker-1274a214-9c4a-4935-b0be-8b7bca9a7175
Deleting ray-worker-32267120-50f5-479e-b2e0-70873efba81a
