# 01 Distributed Dask Example

#### Objective: 
- Demonstrate how to spin up multiple nodes for Distributed Dask using the Cloudera CDSW workers API
- Show how to stop the workers once the work is complete

#### Creating the Dask Cluster with the utils module

In [1]:
import cdsw_dask_utils
import cdsw

# Start a Dask cluster with three workers. The helper returns a description
# of the cluster (scheduler address, scheduler process, worker metadata).
#
# The scheduler runs in the current session, and the Dask dashboard becomes
# available in the nine-dot menu at the upper right corner of the CDSW app.
cluster = cdsw_dask_utils.run_dask_cluster(n=3, cpu=1, memory=1, nvidia_gpu=0)

# Attach a Dask client to the scheduler address reported in the cluster
# description; later cells submit work through `client`.
from dask.distributed import Client
client = Client(cluster["scheduler_address"])

Waiting for Dask scheduler to become ready...
Dask scheduler is ready
IDs ['ar4nrrt7q0wmarcv', 'sn10s6r349gpjy7q', '4sytsz6kd1wcw580']


#### Dask Scheduler UI

In [None]:
# Build a clickable link to the Dask scheduler UI for this session.
import os

# IPython.display is the documented public import location for HTML;
# IPython.core.display is an internal module.
from IPython.display import HTML

# Both values are read from the session environment. os.environ.get() yields
# None when a variable is unset (e.g. outside CDSW), in which case the link
# below will not resolve.
engine_id = os.environ.get('CDSW_ENGINE_ID')
cdsw_domain = os.environ.get('CDSW_DOMAIN')

# Format the URL once and reuse it for both the href and the link text.
scheduler_ui_url = 'http://read-only-{}.{}'.format(engine_id, cdsw_domain)

# Last expression of the cell, so the notebook renders the anchor tag.
HTML('<a  target="_blank" rel="noopener noreferrer" href="{0}">{0}</a>'
     .format(scheduler_ui_url))

#### Notice that we passed the scheduler address of the cluster we created via the utils module to the Dask Client instance

In [2]:
import json

# Block until every worker launched for this cluster has registered with the
# scheduler. A fixed sleep can be too short (workers missing from the report
# below) or needlessly long; the timeout keeps the cell from hanging forever
# if a worker never comes up.
client.wait_for_workers(n_workers=len(cluster["workers"]), timeout=120)

# Pretty-print the scheduler's own description, including registered workers.
print(json.dumps(client.scheduler_info(), indent=4))

{
    "type": "Scheduler",
    "id": "Scheduler-89c55540-0e96-4aa4-b250-07fcbcce6df7",
    "address": "tcp://10.0.85.15:2323",
    "services": {
        "dashboard": 8100
    },
    "workers": {
        "tcp://10.0.108.198:43015": {
            "type": "Worker",
            "id": "tcp://10.0.108.198:43015",
            "host": "10.0.108.198",
            "resources": {},
            "local_directory": "/home/cdsw/worker-lfbi8z02",
            "name": "tcp://10.0.108.198:43015",
            "nthreads": 16,
            "memory_limit": 999997440,
            "last_seen": 1601163360.4828508,
            "services": {
                "dashboard": 35573
            },
            "metrics": {
                "cpu": 0.0,
                "memory": 103501824,
                "time": 1601163360.4820123,
                "read_bytes": 286.03887227945313,
                "write_bytes": 812.1103646535522,
                "num_fds": 27,
                "executing": 0,
                "in_memory": 0,


#### Just some basic operations

In [3]:
def square(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)

def neg(x):
    """Return the arithmetic negation of ``x``."""
    negated = -x
    return negated

In [4]:
# Submit one `square` task per input; `A` is what client.map returns and is
# reused as the input of the next step.
A = client.map(square, range(10))

# Collect the computed values from the cluster and print them.
squares = client.gather(A)
print(squares)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


In [5]:
# Chain a second step: apply `neg` to each result of the previous map by
# passing `A` straight into client.map.
B = client.map(neg, A)

# Collect the negated values from the cluster and print them.
negated_squares = client.gather(B)
print(negated_squares)

[0, -1, -4, -9, -16, -25, -36, -49, -64, -81]


In [6]:
# Reduce on the cluster: submit a single `sum` task over the results in `B`.
total = client.submit(sum, B)

# .result() waits for the task and returns its value.
summed = total.result()
print("Result: ", summed)

Result:  -285


In [7]:
# Bare expression: the notebook renders the object returned by client.submit
# itself; the summed value is obtained with total.result().
total

#### Printing Scheduler and Worker Metadata

In [8]:
# Display the cluster description returned by run_dask_cluster: scheduler
# address, scheduler process, and per-worker metadata.
# NOTE(review): the rendered output contains usernames and internal URLs —
# consider clearing it before sharing the notebook.
cluster

{'scheduler_address': '10.0.85.15:2323',
 'scheduler_proc': <subprocess.Popen at 0x7ff264a4b860>,
 'workers': [{'adhoc': False,
   'app_url': 'http://ar4nrrt7q0wmarcv.ml-2f4cffbb-91e.demo-aws.ylcu-atmi.cloudera.site/',
   'assets_cdn_root': 'http://consoles.ml-2f4cffbb-91e.demo-aws.ylcu-atmi.cloudera.site/0/644/ar4nrrt7q0wmarcv/',
   'batch': True,
   'biller': {'html_url': 'http://ml-2f4cffbb-91e.demo-aws.ylcu-atmi.cloudera.site/pauldefusco',
    'id': 9,
    'name': 'Paul de Fusco',
    'url': 'http://ml-2f4cffbb-91e.demo-aws.ylcu-atmi.cloudera.site/api/v1/users/pauldefusco',
    'username': 'pauldefusco'},
   'cluster_id': 1,
   'cost': 0,
   'cpu': 1,
   'created_at': '2020-09-26T23:35:33.844Z',
   'creator': {'html_url': 'http://ml-2f4cffbb-91e.demo-aws.ylcu-atmi.cloudera.site/pauldefusco',
    'id': 9,
    'name': 'Paul de Fusco',
    'url': 'http://ml-2f4cffbb-91e.demo-aws.ylcu-atmi.cloudera.site/api/v1/users/pauldefusco',
    'username': 'pauldefusco'},
   'deleted': False,
   

#### Stopping CDSW Workers

#### Note: if you have multiple notebooks running different Dask clusters, pass the IDs of this notebook's workers to `stop_workers` as arguments; calling it without arguments stops all worker engines

In [9]:
# Stop the CDSW worker engines.
#
# Parameter:
#   worker_id (int, optional) - The ID numbers of the worker engines that must be stopped.
#   If an ID is not provided, all the worker engines on the cluster will be stopped.
#
# No IDs are passed below, so all worker engines are stopped.

cdsw.stop_workers()

[<Response [204]>, <Response [204]>, <Response [204]>]

#### Next, we will get an introduction to Dask DataFrames