# PyTorch and Dask

Creating a cluster and performing some computation

In [1]:
from dask_jobqueue import SLURMCluster
from distributed import Client, LocalCluster
from dask import delayed
import dask

# How many Dask workers we ask the cluster to scale to
num_nodes = 4

# Each SLURM job hosts one worker process with 2 cores and memory="64g"
cluster = SLURMCluster(cores=2, processes=1, memory="64g")
cluster.scale(num_nodes)

# For purely local experimentation, swap the cluster above for:
# cluster = LocalCluster(processes=False)

# Connect this notebook to the cluster; the bare name displays its summary
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://192.168.0.234:8787/status,

0,1
Dashboard: http://192.168.0.234:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://192.168.0.234:35139,Workers: 0
Dashboard: http://192.168.0.234:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [2]:
# Note how Dask queues up our worker jobs in anticipation of work:
# the `dask-wor` entries in the queue listing are the workers we requested
!squeue

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
              1281     batch dask-wor mhar0048 PD       0:00      1 (None)
              1280     batch dask-wor mhar0048 PD       0:00      1 (None)
              1279     batch dask-wor mhar0048 PD       0:00      1 (None)
              1278     batch dask-wor mhar0048 PD       0:00      1 (None)
              1219     batch Jupyter      bpal  R    2:00:17      1 mlerp-node05
              1215     batch Jupyter    yiliao  R 1-02:40:35      1 mlerp-node09
              1214     batch Jupyter    yiliao  R 1-02:44:11      1 mlerp-node05
              1221     batch Jupyter  mhar0048  R    1:28:19      1 mlerp-node09


In [3]:
# The adapt method lets the cluster scale out (up to num_nodes workers) as we need the compute
# ...and scale back (down to zero workers) when we're idle
cluster.adapt(minimum=0, maximum=num_nodes)

<distributed.deploy.adaptive.Adaptive at 0x7f26e65f96c0>

In [4]:
# Check the queue again to see the current state of our worker jobs
!squeue

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
              1219     batch Jupyter      bpal  R    2:00:18      1 mlerp-node05
              1215     batch Jupyter    yiliao  R 1-02:40:36      1 mlerp-node09
              1214     batch Jupyter    yiliao  R 1-02:44:12      1 mlerp-node05
              1221     batch Jupyter  mhar0048  R    1:28:20      1 mlerp-node09
              1278     batch dask-wor mhar0048  R       0:01      1 mlerp-node05
              1279     batch dask-wor mhar0048  R       0:01      1 mlerp-node05
              1280     batch dask-wor mhar0048  R       0:01      1 mlerp-node05
              1281     batch dask-wor mhar0048  R       0:01      1 mlerp-node05


In [5]:
# dask.array (da) lets us scale array work out to the cluster more efficiently than NumPy:
# the array is split into chunks (see the Chunk column in the summary below)
import dask.array as da
x = da.random.random((1000, 1000, 1000))
x

Unnamed: 0,Array,Chunk
Bytes,7.45 GiB,119.21 MiB
Shape,"(1000, 1000, 1000)","(250, 250, 250)"
Count,64 Tasks,64 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 7.45 GiB 119.21 MiB Shape (1000, 1000, 1000) (250, 250, 250) Count 64 Tasks 64 Chunks Type float64 numpy.ndarray",1000  1000  1000,

Unnamed: 0,Array,Chunk
Bytes,7.45 GiB,119.21 MiB
Shape,"(1000, 1000, 1000)","(250, 250, 250)"
Count,64 Tasks,64 Chunks
Type,float64,numpy.ndarray


In [6]:
# Dask evaluates lazily: `x` above only describes the work to be done.
# Calling compute() runs that work and returns the concrete value.
# NOTE: this brings the whole array (~7.45 GiB per the summary above) back as a
# single NumPy array, so only do this when the result fits in memory.
x.compute()

array([[[0.80405356, 0.9411835 , 0.36007594, ..., 0.02030094,
         0.21212197, 0.04754458],
        [0.37128642, 0.41310666, 0.3325612 , ..., 0.03798152,
         0.97033965, 0.2630301 ],
        [0.95302605, 0.60717619, 0.62279257, ..., 0.02378131,
         0.70891056, 0.81519154],
        ...,
        [0.30911321, 0.73415001, 0.82579783, ..., 0.7332855 ,
         0.18230327, 0.76288013],
        [0.65094855, 0.05512554, 0.26758382, ..., 0.35832722,
         0.20738108, 0.90697372],
        [0.26231634, 0.25318755, 0.69022836, ..., 0.71864553,
         0.37859832, 0.76530203]],

       [[0.34433539, 0.66980218, 0.68812066, ..., 0.32081888,
         0.56447307, 0.12123862],
        [0.55811689, 0.12125049, 0.65635326, ..., 0.73970724,
         0.19334334, 0.98051351],
        [0.48441865, 0.21664218, 0.58182796, ..., 0.07877517,
         0.14306654, 0.25959297],
        ...,
        [0.35796157, 0.09189718, 0.26532135, ..., 0.3219118 ,
         0.5925287 , 0.76672455],
        [0.4

In [29]:
# Let's switch to a LocalCluster for easier interactive development.
# This makes all code execute locally.
# We need processes=False so that multiprocessing is allowed inside Dask tasks,
# which the local cluster requires in order to work with PyTorch.
client.shutdown()
cluster = LocalCluster(processes=False)
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 32809 instead


### Let's see how Dask works with a typical PyTorch workflow
Content adapted from: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html

In [40]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.multiprocessing as mp

# Preprocessing: convert each image to a tensor, then normalise every channel
# with mean 0.5 and standard deviation 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# CIFAR-10 training split, stored under ./data (downloaded when requested)
batch_size = 1024
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

# Shuffled batches, loaded by two worker processes created with the "fork" context
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    multiprocessing_context=mp.get_context("fork"),
)

Files already downloaded and verified


In [31]:
# Define a simple conv net
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small CNN classifier: six 3x3 convolutions followed by three linear layers.

    conv1, conv3 and conv5 use stride 2 (the others stride 1), and every
    convolution uses padding 1. The first linear layer takes 4 * 4 * 64
    features, which is what a 3x32x32 input (e.g. CIFAR-10) produces after the
    convolutions. The network outputs 10 unnormalised scores per sample.
    """

    # (in_channels, out_channels, stride) for conv1 .. conv6, in creation order
    _CONV_SPECS = (
        (3, 16, 2),
        (16, 16, 1),
        (16, 32, 2),
        (32, 32, 1),
        (32, 64, 2),
        (64, 64, 1),
    )
    # (in_features, out_features) for fc1 .. fc3, in creation order
    _FC_SPECS = (
        (4 * 4 * 64, 4 * 64),
        (4 * 64, 64),
        (64, 10),
    )

    def __init__(self):
        super().__init__()
        # Layers are registered as conv1..conv6 and fc1..fc3 so that the
        # state_dict keys used by saved checkpoints stay the same.
        for idx, (c_in, c_out, stride) in enumerate(self._CONV_SPECS, start=1):
            setattr(self, f"conv{idx}", nn.Conv2d(c_in, c_out, 3, stride=stride, padding=1))
        for idx, (n_in, n_out) in enumerate(self._FC_SPECS, start=1):
            setattr(self, f"fc{idx}", nn.Linear(n_in, n_out))

    def forward(self, x):
        """Return the class scores for a batch of images ``x``."""
        # Convolutional feature extractor: ReLU after every convolution
        for idx in range(1, len(self._CONV_SPECS) + 1):
            conv = getattr(self, f"conv{idx}")
            x = F.relu(conv(x))

        # Flatten everything except the batch dimension for the linear head
        x = torch.flatten(x, 1)

        # Two hidden linear layers with ReLU, then the raw output layer
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [32]:
import torch.optim as optim
from tqdm.notebook import tqdm
criterion = nn.CrossEntropyLoss()

# Train one epoch
def train(loader, path="./model", load=False, test=False):
    """Train a fresh ``Net`` for one pass over ``loader`` and checkpoint it.

    The model and optimizer are rebuilt on every call, so training state is
    carried from one call to the next only through the checkpoint at ``path``.
    This is what allows each epoch to be submitted as a separate Dask task.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches.
        path: File the model/optimizer state is saved to (and, when ``load``
            is true, restored from before training).
        load: When true, resume from the checkpoint at ``path`` instead of
            starting from freshly initialised weights.
        test: When true, stop after the first batch (quick smoke test).

    Returns:
        The mean loss over the batches processed; with ``test=True`` this is
        the loss of the single batch that was run.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    # Initialise model, optimizer and device
    model = Net()
    optimizer = optim.Adam(model.parameters(), lr=3e-4)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load state from disk so that we can split up the job
    if load:
        # map_location lets a checkpoint written on a GPU worker be restored
        # on a CPU-only one (and vice versa)
        state = torch.load(path, map_location=device)
        model.load_state_dict(state["model"])
    # The model is moved first so the optimizer state is restored against
    # parameters that already live on `device`
    model.to(device)
    if load:
        optimizer.load_state_dict(state["optimizer"])

    # A typical PyTorch training loop
    running_loss = 0.0
    num_batches = 0
    # Iterate the loader that was passed in, not a notebook-level global
    for inputs, labels in loader:
        # put the inputs on the device
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.detach().item()
        num_batches += 1

        # Stop after one batch when testing
        if test:
            print("When running in a local cluster you can see print statements")
            break

    if num_batches == 0:
        raise ValueError("loader yielded no batches; nothing to train on")

    torch.save({
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict()
        }, path)

    # In test mode exactly one batch ran, so this is that batch's loss
    return running_loss / num_batches

In [33]:
# Test our code locally first: test=True stops after a single batch and returns its loss
client.submit(train, trainloader, test=True).result()

When running in a local cluster you can see print statements


2.3047292232513428

In [36]:
# We need to turn off the nanny to allow for multiprocessing inside Dask jobs,
# which the cluster requires in order to work with PyTorch.
# job_extra_directives passes extra SLURM requirements so that each job gets a GPU.
client.shutdown()
cluster = SLURMCluster(
    memory="64g",
    processes=1,
    cores=2,
    job_extra_directives=["--gres=gpu:1"],
    nanny=False,
)
# A newly created cluster has no workers, so submitted tasks would wait forever.
# Let it request GPU jobs on demand and release them again when idle.
cluster.adapt(minimum=0, maximum=num_nodes)

client = Client(cluster)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 42077 instead


0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://192.168.0.234:42077/status,

0,1
Dashboard: http://192.168.0.234:42077/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://192.168.0.234:37145,Workers: 0
Dashboard: http://192.168.0.234:42077/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [37]:
# Test our code on the SLURM cluster: test=True stops after a single batch and returns its loss
client.submit(train, trainloader, test=True).result()

2.30440616607666

In [38]:
# Run the training loop: each epoch is one Dask task, and state is handed from
# one epoch to the next through the checkpoint that train() writes to disk.
# NOTE(review): assumes the checkpoint path is visible to every worker
# (e.g. a shared filesystem) - confirm for your cluster.
epochs = 2

with tqdm(total=epochs) as pbar:
    for epoch in range(epochs):
        # Start from fresh weights on the first epoch and resume from the
        # checkpoint afterwards (passing `epochs` here was always truthy, so
        # even epoch 0 resumed from whatever checkpoint was already on disk)
        loss = client.submit(train, trainloader, load=epoch > 0).result()
        pbar.update()
        pbar.set_postfix(loss=loss)
        print(f"epoch: {epoch} loss: {loss : .3f}")
client.shutdown()

  0%|          | 0/2 [00:00<?, ?it/s]

2023-02-03 05:54:49,086 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client


epoch: 0 loss:  2.250
epoch: 1 loss:  1.999
