In [1]:
# nuclio: ignore
import nuclio

In [2]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-models'


In [3]:
import warnings
# Ignore warnings of category FutureWarning raised after this point.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact, TableArtifact
from mlrun.mlutils import gcf_clear

from yellowbrick import ClassBalance

from typing import List

In [None]:
# Show floats with two decimal places wherever pandas applies its float formatter.
pd.set_option("display.float_format", lambda value: "%.2f" % value)

def summarize(
    context: MLClientCtx,
    table: DataItem,
    label_column: str = "labels",
    class_labels: List[str] = [],
    plot_hist: bool = True,
    plots_dest: str = "plots"
) -> None:
    """Summarize a table

    Logs up to three plot artifacts under ``plots_dest`` (a pairplot of the
    table coloured by label, a class-balance chart and a feature-correlation
    heatmap) and, when at least two classes are present, a
    ``scale_pos_weight`` result.

    :param context:         the function context
    :param table:           MLRun input pointing to pandas dataframe (csv/parquet file path)
    :param label_column:    ground truth column label
    :param class_labels:    label for each class in tables and plots; when empty,
                            the unique values of ``label_column`` are used
    :param plot_hist:       (True) set this to False for large tables; skips the
                            pairplot ("histograms") artifact
    :param plots_dest:      destination folder of summary plots (relative to artifact_path)
    """
    df = table.as_df()

    # The pairplot draws one panel per pair of columns, so it is only produced
    # when requested via plot_hist.
    if plot_hist:
        gcf_clear(plt)
        sns.pairplot(df, hue=label_column, diag_kws={"bw": 1.5})
        context.log_artifact(
            PlotArtifact("histograms", body=plt.gcf()),
            local_path=f"{plots_dest}/hist.html",
        )

    gcf_clear(plt)
    # Remove the label column so that only features remain for the correlation below.
    labels = df.pop(label_column)
    # len() rather than truthiness: also accepts array-like class_labels.
    if class_labels is None or len(class_labels) == 0:
        class_labels = labels.unique()
    class_balance_model = ClassBalance(labels=class_labels)
    class_balance_model.fit(labels)

    # Ratio of the first class count to the second; it cannot be computed when
    # fewer than two classes are present, so the result is skipped in that case
    # instead of failing the whole run with an IndexError.
    support = class_balance_model.support_
    if len(support) >= 2:
        scale_pos_weight = support[0] / support[1]
        context.log_result("scale_pos_weight", f"{scale_pos_weight:0.2f}")
    context.log_artifact(
        PlotArtifact("imbalance", body=plt.gcf()),
        local_path=f"{plots_dest}/imbalance.html",
    )

    gcf_clear(plt)
    feature_corr = df.corr()
    ax = plt.axes()
    sns.heatmap(feature_corr, ax=ax, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    context.log_artifact(
        PlotArtifact("correlation", body=plt.gcf()),
        local_path=f"{plots_dest}/corr.html",
    )

    # otherwise shows last plot:
    gcf_clear(plt)

In [None]:
# nuclio: end-code

### mlconfig

In [13]:
from mlrun import mlconf
import os

# Keep any already-configured values; otherwise fall back to the defaults below.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
# expanduser("~") resolves the home directory without raising KeyError when
# the HOME environment variable is not set.
mlconf.artifact_path = mlconf.artifact_path or os.path.join(os.path.expanduser("~"), "artifacts")

### save

In [10]:
from mlrun import code_to_function

# create job function object from notebook code
fn = code_to_function("describe")

# add metadata (for templates and reuse)
# "summarize" is the handler defined earlier in this notebook
fn.spec.default_handler = "summarize"
fn.spec.description = "describe and visualizes dataset stats"
fn.metadata.categories = ["analysis"]
fn.metadata.labels = {"author": "yjb"}
# write the function spec to function.yaml in the current directory
fn.export("function.yaml")

[mlrun] 2020-05-01 21:59:41,028 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fc31fad1f28>

## tests

In [9]:
from mlrun import import_function

# Load the published "describe" function from the function hub.
func = import_function("hub://describe")

# Mount storage into the function: v3io when V3IO_HOME is set, otherwise a PVC.
if "V3IO_HOME" in os.environ:
    from mlrun import mount_v3io
    func.apply(mount_v3io())
else:
    # if you set up mlrun using the instructions at https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    # NOTE(review): the PVC name and the '/home/joyan/data' path are specific to
    # one environment; adjust them for your own cluster.
    from mlrun.platforms import mount_pvc
    func.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))

In [10]:
from mlrun import NewTask, run_local

# Location of the input table inside the configured artifact path.
table_path = os.path.join(mlconf.artifact_path, "iris.parquet")

# NOTE(review): task_params is not referenced by any later cell visible in this
# notebook, and its "dataset" value ("wine") differs from the iris table used
# above. Confirm whether it can be removed.
task_params = {
    "name"   : "tasks describe", 
    "params" : {"dataset"     : "wine"}}

In [11]:
# Task definition shared by the local and remote runs below: it calls the
# notebook's summarize() handler with the table path as its "table" input.
task = NewTask(
    name="tasks describe", 
    handler=summarize, 
    inputs={"table": table_path})

### run local where artifact path is fixed 

In [12]:
# Execute the task in this process; artifacts are written under mlconf.artifact_path.
run = run_local(task, artifact_path=mlconf.artifact_path)

[mlrun] 2020-04-30 20:42:48,492 starting run tasks describe uid=5422a62f09fe4b29892567dae13aef2b  -> http://mlrun-api:8080


findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


[mlrun] 2020-04-30 20:42:51,761 log artifact histograms at /User/artifacts/plots/hist.html, size: 177021, db: Y
[mlrun] 2020-04-30 20:42:52,277 log artifact imbalance at /User/artifacts/plots/imbalance.html, size: 7464, db: Y
[mlrun] 2020-04-30 20:42:52,508 log artifact correlation at /User/artifacts/plots/corr.html, size: 20942, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...e13aef2b,0,Apr 30 20:42:48,completed,tasks describe,v3io_user=adminkind=handlerowner=adminhost=jupyter-6c5fccf844-gxlrw,table,,scale_pos_weight=1.00,histogramsimbalancecorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run 5422a62f09fe4b29892567dae13aef2b --project default , !mlrun logs 5422a62f09fe4b29892567dae13aef2b --project default
[mlrun] 2020-04-30 20:42:52,670 run executed, status=completed


### run remote where artifact path includes the run id

In [13]:
# Run the same task through the imported hub function as a job.
# NOTE(review): the relative "iris.parquet" input is presumably resolved
# against workdir. Confirm against the MLRun docs.
func.run(task, inputs={"table": "iris.parquet"},  workdir=mlconf.artifact_path)

[mlrun] 2020-04-30 20:42:52,684 starting run tasks describe uid=34df62738ee94a9c9db0468a5b931440  -> http://mlrun-api:8080
[mlrun] 2020-04-30 20:42:52,770 Job is running in the background, pod: tasks-describe-ns8bg
[mlrun] 2020-04-30 20:43:00,424 log artifact histograms at /User/artifacts/34df62738ee94a9c9db0468a5b931440/plots/hist.html, size: 284413, db: Y
[mlrun] 2020-04-30 20:43:00,896 log artifact imbalance at /User/artifacts/34df62738ee94a9c9db0468a5b931440/plots/imbalance.html, size: 11716, db: Y
[mlrun] 2020-04-30 20:43:01,094 log artifact correlation at /User/artifacts/34df62738ee94a9c9db0468a5b931440/plots/corr.html, size: 30642, db: Y

[mlrun] 2020-04-30 20:43:01,172 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...5b931440,0,Apr 30 20:42:57,completed,tasks describe,host=tasks-describe-ns8bgkind=jobowner=adminv3io_user=admin,table,,scale_pos_weight=1.00,histogramsimbalancecorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run 34df62738ee94a9c9db0468a5b931440  , !mlrun logs 34df62738ee94a9c9db0468a5b931440 
[mlrun] 2020-04-30 20:43:04,954 run executed, status=completed


<mlrun.model.RunObject at 0x7f1163612fd0>