## S3 plugin to download data to dask workers

In [None]:
!pip install boto3 --quiet
!pip install s3urls --quiet

### Custom Dask worker plugin for plain files and `.tar.gz` archives

In [1]:
from distributed.diagnostics.plugin import WorkerPlugin
class S3DownloadPlugin(WorkerPlugin):
    """Worker plugin that downloads one S3 object and unpacks it if it is a gzipped tarball.

    Parameters
    ----------
    s3Url : str
        URL of the object; it is split into bucket and key with ``s3urls.parse_url``.
    filename : str
        Local path the object is downloaded to. Names ending in ``tar.gz`` or
        ``tgz`` are treated as archives: extracted, then deleted.
    extract_dir : str, optional
        Directory archives are extracted into. Defaults to ``/tmp/``.
    """

    def __init__(self, s3Url, filename, extract_dir='/tmp/'):
        self.s3Url = s3Url
        self.filename = filename
        self.extract_dir = extract_dir

    def setup(self, worker):
        """Download ``self.s3Url`` to ``self.filename`` and extract it when it is an archive.

        Parameters
        ----------
        worker
            The worker object handed to the plugin; stored on ``self.worker``.
        """
        self.worker = worker
        # Imported inside setup, so these packages only need to be importable
        # in the process where setup actually runs.
        import logging
        import os
        import tarfile

        import boto3
        from s3urls import parse_url

        logger = logging.getLogger("embeddings microservice")
        logger.debug("downloading file...")
        vocab_parsed_url = parse_url(self.s3Url)
        s3 = boto3.client('s3')
        s3.download_file(vocab_parsed_url['bucket'], vocab_parsed_url['key'], self.filename)
        logger.debug("done downloading....")

        if self.filename.endswith(("tar.gz", "tgz")):
            logger.debug("extracting....")
            # NOTE(review): extractall() writes whatever member paths the archive
            # contains; only point this plugin at archives you trust.
            # The context manager closes the archive even if extraction fails.
            with tarfile.open(self.filename, "r:gz") as tar:
                tar.extractall(path=self.extract_dir)
            # Only reached after a successful extraction: drop the archive itself.
            os.remove(self.filename)
            logger.debug("done extracting....")

In [2]:
# Archive to ship to the dask workers; it is assumed to already sit in an S3 bucket.
tgz_file_global = "aclImdb_v1.tar.gz"
filename = tgz_file_global
s3Url = "s3://d2v-tmp/demo/data/" + tgz_file_global
print(s3Url)

s3://d2v-tmp/demo/data/aclImdb_v1.tar.gz


## Download data to the local machine

In [9]:
import boto3
import click
from s3urls import parse_url
import tarfile

import logging, traceback, sys, os    
# Split the S3 URL into bucket and key, then fetch the object to the local path `filename`.
vocab_parsed_url = parse_url(s3Url)
s3 = boto3.client('s3')
s3.download_file(
    Bucket=vocab_parsed_url['bucket'],
    Key=vocab_parsed_url['key'],
    Filename=filename,
)

## Upload data to dask workers

In [4]:
# Start a dask cluster through the hyperplane helper, asking for num_workers=1.
# The call is unpacked into `client` and `cluster`, which later cells use to
# register worker plugins and to shut the cluster down.
from hyperplane import notebook_common as nc
client, cluster = nc.initialize_cluster(num_workers=1)

👉 Hyperplane: selecting worker node pool
👉 Hyperplane: selecting scheduler node pool
Creating scheduler pod on cluster. This may take some time.
👉 Hyperplane: spinning up a dask cluster with a scheduler as a standalone container.
👉 Hyperplane: In a few minutes you'll be able to access the dashboard at https://ds.hyperplane.dev/dask-cluster-45548bff-fd74-4136-8efa-8fde70d27961/status
👉 Hyperplane: to get logs from all workers, do `cluster.get_logs()`


## Upload file to remote workers and extract if it's a compressed file
- First, install the necessary packages on the workers.
- Then wait about 30 seconds for all workers to finish installing before moving on to the next step.

In [None]:
from dask.distributed import PipInstall
import time
# Packages to pip-install (with --upgrade) on the workers, registered ahead of
# the S3 download plugin cell.
worker_packages = ["boto3", "s3urls", "botocore", "click"]
plugin = PipInstall(packages=worker_packages, pip_options=["--upgrade"])
client.register_worker_plugin(plugin)

In [None]:
# Build the download plugin from the S3 URL and archive name set earlier, then register it.
s3_download_plugin = S3DownloadPlugin(s3Url=s3Url, filename=tgz_file_global)
client.register_worker_plugin(s3_download_plugin)

In [17]:
# Shut down the dask cluster created earlier in the notebook.
cluster.close()