## import libraries

In [6]:
!pip install mxnet==1.8.0 --quiet
!pip install autogluon.tabular --quiet

[0m

In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

## Read in data

In [3]:
## parameter cell
# Location of the raw IoT telemetry CSV.
data_path = 's3://shakdemo-aws/data/iot/iot_telemetry_data.csv'
# Name of the target column; it is dropped from the frame before inference.
label = "motion"
# Device identifier (not referenced by the cells visible in this notebook).
device = '001'

In [4]:
# Load the raw telemetry and keep only the feature columns: the target
# (`label`) and the `device` identifier are dropped before inference.
data = pd.read_csv(data_path)
data_nolab = data.drop(columns=[label, 'device'])

# Stack the feature frame `replication_factor` times to simulate a larger
# batch (1 = original size). ignore_index keeps row labels unique when the
# factor is raised above 1.
replication_factor = 1
data_inference = pd.concat([data_nolab] * replication_factor, ignore_index=True)
data_inference.shape

(405184, 7)

## Start a Dask cluster for fast distributed inference

In [5]:
import hyperplane.notebook_common as nc


## choose number of workers and processes per worker
num_workers = 1
nprocs = 15

## node size
cors_per_worker = 15   # CPU cores per worker node
total_memory = 110     # memory per worker node, in GB


# Derive the per-process resources once and reuse them for both the cluster
# request and the log line, so what is printed is exactly what was requested.
ram_gb_per_proc = total_memory / nprocs
# Never request zero threads when there are more processes than cores.
nthreads = max(1, cors_per_worker // nprocs)


client, cluster = nc.initialize_cluster(
        nprocs = nprocs,
        nthreads = nthreads,
        ram_gb_per_proc = ram_gb_per_proc,
        cores_per_worker = cors_per_worker,
        num_workers = num_workers,
        node_selector = {}
    )

print(f'spinning up {num_workers} {total_memory} G workers nprocs = {nprocs} nthreads = {nthreads} ram_per_proc = {ram_gb_per_proc} G')


👉 Hyperplane: selecting worker node pool
👉 Hyperplane: selecting scheduler node pool
Creating scheduler pod on cluster. This may take some time.
👉 Hyperplane: spinning up a dask cluster with a scheduler as a standalone container.
👉 Hyperplane: In a few minutes you'll be able to access the dashboard at https://shakdemo2.hyperplane.dev/dask-cluster-8633cfe2-b58a-4d0c-9a8e-96a25201659c/status
👉 Hyperplane: to get logs from all workers, do `cluster.get_logs()`
spinning up 1 110 G workers nprocs = 15 nthreads = 1 ram_per_proc = 7.333333333333333 G


<a href="#results">Go to Results</a>

In [7]:
%%time
from dask.distributed import PipInstall
plugin = PipInstall(packages=["mxnet==1.8.0","autogluon"])
client.register_worker_plugin(plugin)

CPU times: user 339 ms, sys: 78.6 ms, total: 418 ms
Wall time: 6min 3s


{'tcp://10.0.178.172:33463': {'status': 'OK'},
 'tcp://10.0.178.172:35079': {'status': 'OK'},
 'tcp://10.0.178.172:35351': {'status': 'OK'},
 'tcp://10.0.178.172:35411': {'status': 'OK'},
 'tcp://10.0.178.172:36197': {'status': 'OK'},
 'tcp://10.0.178.172:38159': {'status': 'OK'},
 'tcp://10.0.178.172:38277': {'status': 'OK'},
 'tcp://10.0.178.172:41825': {'status': 'OK'},
 'tcp://10.0.178.172:43091': {'status': 'OK'},
 'tcp://10.0.178.172:43215': {'status': 'OK'},
 'tcp://10.0.178.172:43775': {'status': 'OK'},
 'tcp://10.0.178.172:44483': {'status': 'OK'},
 'tcp://10.0.178.172:46523': {'status': 'OK'},
 'tcp://10.0.178.172:46685': {'status': 'OK'},
 'tcp://10.0.178.172:46765': {'status': 'OK'}}

## download the trained model to workers

In [8]:
def download_from_cloud(local_file_name, remote_file_name):
    """
    Download a file or directory from GCS or S3 to the local filesystem.

    Args:
        local_file_name: Local destination path.
        remote_file_name: Source URL; must start with ``gs://`` or ``s3://``.

    Returns:
        True if the download completed, False if it failed. Failures are
        printed rather than raised, so callers should check the result.

    Raises:
        NameError: If the URL scheme is neither ``gs`` nor ``s3``.
    """
    cloud_name = remote_file_name.split('://')[0]
    # Import each filesystem library lazily so that only the backend that is
    # actually used needs to be installed.
    if cloud_name == 'gs':
        import os
        import gcsfs
        fs = gcsfs.GCSFileSystem(project=os.environ['GCP_PROJECT'])
    elif cloud_name == 's3':
        import s3fs
        fs = s3fs.S3FileSystem()
    else:
        raise NameError(f'cloud name {cloud_name} unknown')
    try:
        print(f'downloading from {remote_file_name} to {local_file_name}...')
        # recursive=True so that a remote directory is copied with its contents
        fs.get(remote_file_name, local_file_name, recursive=True)
        print("done downloading!")
    except Exception as exp:
        # Best-effort: report the problem and signal failure to the caller
        print(f"download failed: {exp}")
        return False

    return True

In [9]:
def download_model():
    """
    Fetch the trained model directory from S3 into ``models``, relative to
    the current working directory of the process that runs this function.

    Returns:
        'success' if the local model directory exists and is non-empty after
        the download attempt, otherwise 'failed'.
    """
    import os
    local_file_path = 'models'
    remote_file_path = f"s3://shakdemo-aws/demo/iot/{local_file_path}"
    download_from_cloud(local_file_path, remote_file_path)
    # download_from_cloud prints errors instead of raising, so confirm the
    # files actually landed on disk before reporting success.
    if os.path.isdir(local_file_path) and os.listdir(local_file_path):
        return 'success'
    return 'failed'

In [10]:
%%time
# Run download_model on every worker process; the displayed result maps each
# worker address to the status string the function returned.
client.run(download_model)

CPU times: user 2.7 ms, sys: 747 µs, total: 3.45 ms
Wall time: 1.26 s


{'tcp://10.0.178.172:33463': 'success',
 'tcp://10.0.178.172:35079': 'success',
 'tcp://10.0.178.172:35351': 'success',
 'tcp://10.0.178.172:35411': 'success',
 'tcp://10.0.178.172:36197': 'success',
 'tcp://10.0.178.172:38159': 'success',
 'tcp://10.0.178.172:38277': 'success',
 'tcp://10.0.178.172:41825': 'success',
 'tcp://10.0.178.172:43091': 'success',
 'tcp://10.0.178.172:43215': 'success',
 'tcp://10.0.178.172:43775': 'success',
 'tcp://10.0.178.172:44483': 'success',
 'tcp://10.0.178.172:46523': 'success',
 'tcp://10.0.178.172:46685': 'success',
 'tcp://10.0.178.172:46765': 'success'}

## Read data onto the dask workers and do distributed inference 

In [None]:
from dask import dataframe as dd
# Use one partition per worker currently known to the client.
n_partitions = len(client.has_what())
df = dd.from_pandas(data_inference, npartitions=n_partitions)
print(len(df), df.npartitions)
df.head(2)

In [None]:
def inference_function(df: pd.DataFrame, model_path) -> pd.DataFrame:
    """
    Score a DataFrame (or a single Dask partition) with a saved AutoGluon predictor.

    Args:
        df: Feature rows to score. The frame is left unmodified.
        model_path: Directory containing a previously trained ``TabularPredictor``.

    Returns:
        A new DataFrame with the columns of ``df`` plus a ``pred`` column
        holding the predictions.
    """
    # Imported inside the function so it is self-contained when it is shipped
    # to Dask workers.
    from autogluon.tabular import TabularPredictor
    predictor = TabularPredictor.load(model_path)
    y_pred = predictor.predict(df)
    # assign() returns a new frame, so the caller's DataFrame is not mutated
    # and re-running inference on the same input stays idempotent.
    return df.assign(pred=y_pred.values)

In [None]:
from numpy import dtype

# Output schema handed to Dask: the input feature columns followed by the
# appended prediction column, each mapped to its dtype.
meta = {
    column: dtype(kind)
    for column, kind in [
        ('ts', 'float64'),
        ('co', 'float64'),
        ('humidity', 'float64'),
        ('light', 'bool'),
        ('lpg', 'float64'),
        ('smoke', 'float64'),
        ('temp', 'float64'),
        ('pred', 'bool'),
    ]
}

<a id='results'></a>

In [None]:
# Apply inference_function to every partition of the Dask DataFrame; `meta`
# declares the output columns and dtypes.
# NOTE(review): download_model saves to the relative path 'models', while this
# call reads '/root/models/' — confirm the workers' working directory is /root.
df_pred = df.map_partitions(inference_function, model_path = '/root/models/', meta= meta)

In [None]:
%%time
# Compute the predictions on the cluster and bring the result back as a
# local object, then show its shape and the first two rows.
df_pred_local = df_pred.compute()
print(df_pred_local.shape)
df_pred_local.head(2)


In [None]:
# Close the client connection first, then the cluster it was attached to.
client.close()
cluster.close()

## inference in pandas 

In [None]:
import time
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
starttime = time.perf_counter()
# NOTE(review): './models/' must exist where this notebook kernel runs;
# client.run(download_model) presumably only downloaded it on the Dask workers — confirm.
df_pred_local_pandas = inference_function(data_inference, model_path= './models/')
print(f'finished in {time.perf_counter() - starttime} seconds')

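With distributed Dask nodes, the inference was run on **2 Dask nodes** in parallel and took about **28 seconds** in total. That is **over a 2x speed-up** compared to pandas, plus the ability to handle data **larger than memory**. (Note: these figures refer to a run with 2 nodes, whereas the cluster cell above is configured with `num_workers = 1`; re-run both timings to reproduce the comparison.)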