In [1]:
# nuclio: ignore
import nuclio

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import dask
import dask.dataframe as dd
from dask.distributed import Client

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact, TableArtifact
from mlrun.mlutils import gcf_clear

from yellowbrick import ClassBalance

from typing import List



In [3]:
# Render floats with two decimals in any pandas output produced by this function.
pd.set_option("display.float_format", lambda x: "%.2f" % x)

def summarize(
    context: MLClientCtx,
    dask_key: str = "dask_key",
    label_column: str = "labels",
    class_labels: List[str] = [],  # never mutated below, only read
    plot_hist: bool = True,
    plots_dest: str = "plots",
    alt_scheduler: str = None
) -> None:
    """Summarize a table published on a dask cluster

    Connects to dask client through the function context, or through an optional
    user-supplied scheduler, fetches the published dataset, materializes it as a
    pandas DataFrame and logs three plot artifacts (pair-plot histograms, class
    balance, feature correlation) plus a ``scale_pos_weight`` result.

    :param context:         the function context
    :param dask_key:        key of dataframe in dask client "datasets" attribute
    :param label_column:    ground truth column label
    :param class_labels:    label for each class in tables and plots; when empty,
                            the unique values of the label column are used
    :param plot_hist:       (True) set this to False for large tables; skips the
                            pair-plot / histograms artifact
    :param plots_dest:      destination folder of summary plots (relative to artifact_path)
    :param alt_scheduler:   (None) an alternative scheduler file to connect with

    :raises Exception: when neither ``alt_scheduler`` nor ``context.dask_client``
                       is available, or when ``dask_key`` is not published on
                       the cluster
    """
    # An explicit scheduler file wins over the one carried by the context.
    if alt_scheduler:
        scheduler_file = str(alt_scheduler)
    elif hasattr(context, "dask_client"):
        scheduler_file = str(context.dask_client)
    else:
        raise Exception("out of luck, no dask_client or scheduler file!")
    dask_client = Client(scheduler_file=scheduler_file)

    if dask_key not in dask_client.datasets:
        context.logger.info(f"only these datasets are available {dask_client.datasets} in client {dask_client}")
        raise Exception("dataset not found on dask cluster")

    # Pull the whole table into local memory; every plot below works on pandas data.
    table = dask_client.get_dataset(dask_key).compute()

    # The pair plot is the expensive step, so it is the one `plot_hist` controls.
    if plot_hist:
        gcf_clear(plt)
        sns.pairplot(table, hue=label_column, diag_kws={'bw': 1.5})
        context.log_artifact(PlotArtifact('histograms',  body=plt.gcf()),
                             local_path=f"{plots_dest}/hist.html")

    gcf_clear(plt)
    # pop() removes the label column so that only features remain for the correlation plot.
    labels = table.pop(label_column)
    if not class_labels:
        class_labels = labels.unique()
    class_balance_model = ClassBalance(labels=class_labels)
    class_balance_model.fit(labels)

    # Ratio of the first class count to the second; it is only defined when at
    # least two classes are present, otherwise indexing support_[1] would fail.
    support = class_balance_model.support_
    if len(support) >= 2:
        scale_pos_weight = support[0] / support[1]
        context.log_result("scale_pos_weight", f"{scale_pos_weight:0.2f}")
    else:
        context.logger.info(
            f"scale_pos_weight not computed, found {len(support)} class(es) in column {label_column}")
    context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()),
                         local_path=f"{plots_dest}/imbalance.html")

    gcf_clear(plt)
    tblcorr = table.corr()
    ax = plt.axes()
    sns.heatmap(tblcorr, ax=ax, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    context.log_artifact(PlotArtifact("correlation",  body=plt.gcf()),
                         local_path=f"{plots_dest}/corr.html")
    # otherwise shows last plot:
    gcf_clear(plt)

In [4]:
# nuclio: end-code

### mlconfig

In [5]:
from mlrun import mlconf

In [6]:
# Keep an already-configured DB path; fall back to the current directory otherwise.
mlconf.dbpath = mlconf.dbpath or './'
# Bare expression so the effective value is displayed as the cell output.
mlconf.dbpath

'http://mlrun-api:8080'

In [7]:
# Branch of the mlrun/functions repository used to build the default hub URL.
vcs_branch = 'development'
base_vcs = f'https://raw.githubusercontent.com/mlrun/functions/{vcs_branch}/'

# "{name}" has to stay a literal placeholder inside the hub URL template, so this
# part is a plain string rather than an f-string: no `name` variable exists in
# the notebook, and interpolating it raised NameError whenever hub_url was unset.
mlconf.hub_url = mlconf.hub_url or base_vcs + '{name}/function.yaml'
# Bare expression so the effective value is displayed as the cell output.
mlconf.hub_url

'/User/repos/functions/{name}/function.yaml'

In [8]:
import os
# Keep an already-configured artifact path; otherwise default to "<V3IO_HOME>/artifacts".
# NOTE(review): the fallback is only evaluated when no artifact path is configured,
# and in that case os.environ["V3IO_HOME"] raises KeyError if the variable is unset.
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["V3IO_HOME"]}/artifacts'
# Bare expression so the effective value is displayed as the cell output.
mlconf.artifact_path

'/User/artifacts'

### save

In [9]:
from mlrun import code_to_function 
# create job function object from notebook code
# (the handler is given by name, matching the default_handler set below,
#  instead of passing the function object itself)
fn = code_to_function('describe_dask', kind='job', with_doc=True,
                      handler='summarize',
                      image='mlrun/ml-models')

# add metadata (for templates and reuse)
fn.spec.default_handler = 'summarize'
fn.spec.description = "describe and visualizes dataset stats"
fn.metadata.categories = ['models', 'visualization']
fn.metadata.labels = {'author': 'yjb'}

# store the function object, then write its spec next to the notebook
fn.save()
fn.export('function.yaml')

[mlrun] 2020-04-25 14:43:45,705 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fb198e92550>

## tests

In [14]:
from mlrun import import_function, NewTask, run_local

func = import_function("hub://describe_dask")

# Pick the volume mount that matches the environment, then apply it once.
has_v3io = "V3IO_HOME" in os.environ
if has_v3io:
    from mlrun import mount_v3io
    volume_modifier = mount_v3io()
else:
    # if you set up mlrun using the instructions at https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    volume_modifier = mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data')
func.apply(volume_modifier)



In [15]:
# Describe the run: the handler is the notebook's own `summarize` function and
# both of its arguments are supplied through the task inputs.
task_inputs = {
    'dask_key': "dask_key",
    "alt_scheduler": "/User/artifacts/scheduler.json",
}
task = NewTask(name="tasks describe dask", handler=summarize, inputs=task_inputs)

# Execute the task through run_local and keep the returned run object.
run = run_local(task)

[mlrun] 2020-04-25 14:48:24,349 starting run tasks describe dask uid=ecb199f94f714f8f994abf8dd5275536  -> http://mlrun-api:8080



python
+-------------------------+---------------+
|                         | version       |
+-------------------------+---------------+
| client                  | 3.6.8.final.0 |
| scheduler               | 3.7.6.final.0 |
| tcp://10.200.0.52:41323 | 3.7.6.final.0 |
| tcp://10.200.0.54:39080 | 3.7.6.final.0 |
| tcp://10.200.0.55:43835 | 3.7.6.final.0 |
| tcp://10.200.0.56:41005 | 3.7.6.final.0 |
+-------------------------+---------------+


KeyboardInterrupt: 

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


Error in callback <function flush_figures at 0x7fb1926c92f0> (for post_execute):


KeyboardInterrupt: 

In [16]:
# Shell escape: invokes the `mlrun clean` CLI subcommand from the notebook.
# NOTE(review): the meaning of the -p / -r flags is not visible here; confirm with `mlrun clean --help`.
!mlrun clean -p -r

[mlrun] 2020-04-25 14:52:39,955 using in-cluster config.
state      started          type     name
Running    Apr 25 14:31:54  dask     mlrun-load-dask-0376f52b-7hr6ss
Running    Apr 25 14:31:47  dask     mlrun-load-dask-0376f52b-7k8tkr
Running    Apr 25 14:31:54  dask     mlrun-load-dask-0376f52b-7p8dnt
Running    Apr 25 14:31:54  dask     mlrun-load-dask-0376f52b-7qbxh2
Running    Apr 25 14:31:55  dask     mlrun-load-dask-0376f52b-7rvxpc
Succeeded  Apr 25 14:32:47  job      tasks-archive-to-parquet-5jwq7
Succeeded  Apr 25 14:44:12  job      tasks-describe-w59ps
Succeeded  Apr 25 14:31:30  job      tasks-generate-classification-data-bfmjm
Succeeded  Apr 25 14:28:58  job      tasks-generate-classification-data-tcx24
Succeeded  Apr 25 14:28:57  job      tasks-load-toy-dataset-ghkbv
Succeeded  Apr 25 14:50:46  job      tasks-train-a-classifier-slcjx
