In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# nuclio: ignore
import nuclio

In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact, TableArtifact
from mlrun.mlutils import gcf_clear

from yellowbrick import ClassBalance

from typing import List

In [4]:
# Render floats with two decimal places whenever pandas displays them.
pd.set_option(
    "display.float_format",
    lambda value: "%.2f" % value,
)

def summarize(
    context: MLClientCtx,
    table: str,
    label_column: str = "labels",
    class_labels: List[str] = [],  # never mutated here, only rebound
    plot_hist: bool = True,
    plots_dest: str = "plots"
) -> None:
    """Summarize a table

    Reads the table from a csv or parquet file and logs summary plots as
    artifacts: a pairplot ("histograms", only when plot_hist is True), a
    class balance chart ("imbalance") and a feature correlation heatmap
    ("correlation"). The ratio of the first two entries of the class balance
    support is logged as the "scale_pos_weight" result when at least two
    classes are present. An Exception is raised when the file extension is
    not one of csv, pq or parquet.

    :param context:         the function context
    :param table:           pandas dataframe (csv/parquet file path)
    :param label_column:    ground truth column label
    :param class_labels:    label for each class in tables and plots
    :param plot_hist:       (True) set this to False for large tables
    :param plots_dest:      destination folder of summary plots (relative to artifact_path)
    """
    # str() is applied first, so any input whose string form is the file
    # path is accepted, not only plain strings.
    table_path = str(table)
    if table_path.endswith("csv"):
        frame = pd.read_csv(table_path)
    elif table_path.endswith(("pq", "parquet")):
        frame = pd.read_parquet(table_path)
    else:
        raise Exception("unknown file extension, should be one of csv, pq or parquet")

    # The pairplot is the expensive step; honour plot_hist so callers can
    # skip it for large tables, as the parameter documentation promises.
    if plot_hist:
        gcf_clear(plt)
        sns.pairplot(frame, hue=label_column, diag_kws={"bw": 1.5})
        context.log_artifact(PlotArtifact("histograms",  body=plt.gcf()), local_path=f"{plots_dest}/hist.html")

    gcf_clear(plt)
    # pop() removes the label column, so only features remain for corr() below
    labels = frame.pop(label_column)
    if not class_labels:
        class_labels = labels.unique()
    class_balance_model = ClassBalance(labels=class_labels)
    class_balance_model.fit(labels)
    support = class_balance_model.support_
    # The ratio needs two entries; a single-class label column would
    # otherwise fail with IndexError before the remaining plots are logged.
    if len(support) >= 2:
        scale_pos_weight = support[0] / support[1]
        context.log_result("scale_pos_weight", f"{scale_pos_weight:0.2f}")
    context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()), local_path=f"{plots_dest}/imbalance.html")

    gcf_clear(plt)
    feature_corr = frame.corr()
    ax = plt.axes()
    sns.heatmap(feature_corr, ax=ax, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    context.log_artifact(PlotArtifact("correlation",  body=plt.gcf()), local_path=f"{plots_dest}/corr.html")

    # otherwise shows last plot:
    gcf_clear(plt)

In [5]:
# nuclio: end-code

### mlconfig

In [6]:
from mlrun import mlconf

In [7]:
# Keep an already-configured DB path; otherwise fall back to './'.
mlconf.dbpath = mlconf.dbpath or './'
mlconf.dbpath

'http://mlrun-api:8080'

In [8]:
# Branch of the mlrun/functions repository used for the default hub URL.
vcs_branch = 'development'
base_vcs = f'https://raw.githubusercontent.com/mlrun/functions/{vcs_branch}/'

# '{name}' is a literal placeholder in the hub URL template (see the value
# displayed below), so this part must be a plain string: as an f-string it
# raised NameError whenever mlconf.hub_url was not already set.
mlconf.hub_url = mlconf.hub_url or base_vcs + '{name}/function.yaml'
mlconf.hub_url

'/User/repos/functions/{name}/function.yaml'

In [9]:
import os
# Keep an already-configured artifact path; otherwise derive it from V3IO_HOME.
# NOTE(review): the fallback indexes os.environ directly, so it raises KeyError
# when artifact_path is unset and V3IO_HOME is not defined.
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["V3IO_HOME"]}/artifacts'
mlconf.artifact_path

'/User/artifacts'

In [10]:
import os
# Image tag for the function image built in the "save" cell; read from
# MLRUN_COMMIT, which must be set (a missing variable raises KeyError).
TAG = os.environ['MLRUN_COMMIT']

### save

In [11]:
from mlrun import code_to_function

# create job function object from notebook code
# (the handler is given by name, consistent with spec.default_handler below,
# rather than as the in-notebook function object)
fn = code_to_function("describe", kind="job", with_doc=True,
                      handler="summarize",
                      image=f"mlrun/ml-models:{TAG}")

# add metadata (for templates and reuse)
fn.spec.default_handler = "summarize"
fn.spec.description = "describe and visualizes dataset stats"
fn.metadata.categories = ["models", "visualization"]
fn.metadata.labels = {"author": "yjb"}

# store the function, then write its spec to function.yaml
fn.save()
fn.export("function.yaml")

[mlrun] 2020-04-27 23:49:48,845 saving function: describe, tag: latest
[mlrun] 2020-04-27 23:49:48,884 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f7e2de710b8>

## tests

In [12]:
from mlrun import import_function

func = import_function("hub://describe")

# attach storage to the function: v3io when V3IO_HOME is defined, otherwise a PVC
if "V3IO_HOME" in os.environ:
    from mlrun import mount_v3io
    func.apply(mount_v3io())
else:
    # if you set up mlrun using the instructions at https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    # NOTE(review): '/home/joyan/data' is a machine-specific absolute path - adjust it for your setup
    from mlrun.platforms import mount_pvc
    func.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))

In [13]:
from mlrun import NewTask, run_local

# Location of the input table under the configured artifact path.
table_path = os.path.join(mlconf.artifact_path, "iris.parquet")

# NOTE(review): task_params is not referenced by any later cell visible here,
# and its "dataset": "wine" does not match the iris table above - confirm
# whether it is still needed.
task_params = {
    "name"   : "tasks describe", 
    "params" : {"dataset"     : "wine"}}

In [14]:
# Task that uses the in-notebook summarize function as its handler and passes
# the parquet path as the "table" input.
task = NewTask(
    name="tasks describe", 
    handler=summarize, 
    inputs={"table": table_path})

### run local where artifact path is fixed 

In [15]:
# Run the task locally; artifacts are logged directly under mlconf.artifact_path
# (see the paths in the recorded output).
run = run_local(task, artifact_path=mlconf.artifact_path)

[mlrun] 2020-04-27 23:49:49,664 starting run tasks describe uid=19812eb3d6f44fb4abb6ef4d802a56b1  -> http://mlrun-api:8080


findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


[mlrun] 2020-04-27 23:49:52,835 log artifact histograms at /User/artifacts/plots/hist.html, size: 177021, db: Y
[mlrun] 2020-04-27 23:49:53,538 log artifact imbalance at /User/artifacts/plots/imbalance.html, size: 7464, db: Y
[mlrun] 2020-04-27 23:49:53,732 log artifact correlation at /User/artifacts/plots/corr.html, size: 20942, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...802a56b1,0,Apr 27 23:49:49,completed,tasks describe,v3io_user=adminkind=handlerowner=adminhost=jupyter-6dc6ff466f-q56kd,table,,scale_pos_weight=1.00,histogramsimbalancecorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run 19812eb3d6f44fb4abb6ef4d802a56b1 --project default , !mlrun logs 19812eb3d6f44fb4abb6ef4d802a56b1 --project default
[mlrun] 2020-04-27 23:49:53,839 run executed, status=completed


### run remote where artifact path includes the run id

In [17]:
# Run the same task through the imported hub function; per the recorded log,
# artifacts land under <artifact_path>/<run uid>/.
# NOTE(review): the relative "iris.parquet" input is presumably resolved
# against workdir - TODO confirm.
func.run(task, inputs={"table": "iris.parquet"},  workdir=mlconf.artifact_path)

[mlrun] 2020-04-27 23:50:14,225 starting run tasks describe uid=66ec4e2116334fd4bf2afba266a7fbd8  -> http://mlrun-api:8080
[mlrun] 2020-04-27 23:50:14,319 Job is running in the background, pod: tasks-describe-x5z26
[mlrun] 2020-04-27 23:50:21,550 log artifact histograms at /User/artifacts/66ec4e2116334fd4bf2afba266a7fbd8/plots/hist.html, size: 284413, db: Y
[mlrun] 2020-04-27 23:50:22,032 log artifact imbalance at /User/artifacts/66ec4e2116334fd4bf2afba266a7fbd8/plots/imbalance.html, size: 11716, db: Y
[mlrun] 2020-04-27 23:50:22,211 log artifact correlation at /User/artifacts/66ec4e2116334fd4bf2afba266a7fbd8/plots/corr.html, size: 30642, db: Y

[mlrun] 2020-04-27 23:50:22,282 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...66a7fbd8,0,Apr 27 23:50:18,completed,tasks describe,host=tasks-describe-x5z26kind=jobowner=adminv3io_user=admin,table,,scale_pos_weight=1.00,histogramsimbalancecorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run 66ec4e2116334fd4bf2afba266a7fbd8  , !mlrun logs 66ec4e2116334fd4bf2afba266a7fbd8 
[mlrun] 2020-04-27 23:50:23,490 run executed, status=completed


<mlrun.model.RunObject at 0x7f7e2e69d9e8>