In [1]:
# nuclio: ignore
import nuclio

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact, TableArtifact

from sklearn.preprocessing import StandardScaler
from yellowbrick import ClassBalance

from typing import List

# render floats with two decimal places whenever pandas displays them
pd.set_option("display.float_format", lambda x: "%.2f" % x)

def _gcf_clear(plt):
    plt.cla()
    plt.clf()
    plt.close() 

def summarize(
    context: MLClientCtx,
    table: str,
    label_column: str = 'labels',
    class_labels: List[str] = [],
    plot_hist: bool = True,
    plots_dest: str = 'plots'
) -> None:
    """Summarize a table

    Loads the table, then logs up to three plot artifacts through the
    context: a pairplot of all columns coloured by label ("histograms",
    optional), a class-balance bar chart ("imbalance") and a heatmap of the
    feature correlation matrix ("correlation").  When the label column holds
    at least two classes, the ratio between the counts of the first two
    classes is logged as the ``scale_pos_weight`` result.

    :param context:         the function context
    :param table:           csv/parquet file path; converted with ``str()``,
                            read as csv when it ends with '.csv' and as
                            parquet otherwise
    :param label_column:    ground truth column label
    :param class_labels:    label for each class in tables and plots; when
                            empty, the unique values of the label column
                            are used
    :param plot_hist:       (True) set this to False for large tables, the
                            pairplot is then skipped
    :param plots_dest:      destination folder of summary plots (relative to artifact_path)
    """
    # NOTE: the `[]` default for class_labels is kept for interface
    # compatibility; it is only read here, never mutated.
    table = str(table)
    if table.endswith('.csv'):
        table = pd.read_csv(table)
    else:
        table = pd.read_parquet(table)

    if plot_hist:
        # the pairplot grows with the square of the column count, which is
        # why it can be switched off for large tables
        _gcf_clear(plt)
        sns.pairplot(table, hue=label_column, diag_kws={'bw': 1.5})
        context.log_artifact(PlotArtifact('histograms',  body=plt.gcf()), local_path=f"{plots_dest}/hist.html")

    _gcf_clear(plt)
    # remove the label column so only features remain for the correlation below
    labels = table.pop(label_column)
    # explicit emptiness test: `not x` is ambiguous for multi-element arrays
    if class_labels is None or len(class_labels) == 0:
        class_labels = labels.unique()
    class_balance_model = ClassBalance(labels=class_labels)
    class_balance_model.fit(labels)
    support = class_balance_model.support_
    if len(support) >= 2:
        # ratio of the first two class counts; presumably intended for
        # binary labels (e.g. as a boosting `scale_pos_weight`) - for more
        # classes only the first two are compared
        scale_pos_weight = support[0]/support[1]
        context.log_result("scale_pos_weight", f"{scale_pos_weight:0.2f}")
    context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()), local_path=f"{plots_dest}/imbalance.html")

    _gcf_clear(plt)
    tblcorr = table.corr()
    ax = plt.axes()
    sns.heatmap(tblcorr, ax=ax, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    context.log_artifact(PlotArtifact("correlation",  body=plt.gcf()), local_path=f"{plots_dest}/corr.html")
    # otherwise shows last plot:
    _gcf_clear(plt)



In [3]:
# nuclio: end-code

In [4]:
from mlrun import run_local, code_to_function , NewTask, mlconf

# point the mlrun client at the API service and set the default artifact root
# NOTE(review): both values are hard-coded for this environment - adjust
# (or read from environment variables) when running elsewhere
mlconf.dbpath = "http://mlrun-api:8080"
mlconf.artifact_path = '/User/artifacts'

In [5]:
# create job function object from notebook code
# NOTE(review): `handler` receives the function object here, while
# `default_handler` below is set to the string name - confirm which form
# code_to_function expects
fn = code_to_function('describe', kind='job', with_doc=True,
                      handler=summarize, image='mlrun/ml-models:0.4.6')

# add metadata (for templates and reuse)
fn.spec.default_handler = 'summarize'
fn.spec.description = "describe and visualizes dataset stats"
fn.metadata.categories = ['models', 'visualization']
fn.metadata.labels = {'author': 'yjb'}

# store the function (logged as "describe", tag "latest") and write its
# spec to function.yaml in the working directory
fn.save()
fn.export('function.yaml')

[mlrun] 2020-03-28 20:18:36,414 saving function: describe, tag: latest
[mlrun] 2020-03-28 20:18:36,448 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f03d1a34d10>

In [6]:
# local test run: call the handler through run_local on the iris parquet
# file located under the configured artifact path
table_path = os.path.join(mlconf.artifact_path, "iris.parquet")
# the same task object is reused for the cluster run further down
task = NewTask(name="tasks describe", handler=summarize, inputs={'table': table_path})
run = run_local(task)

[mlrun] 2020-03-28 20:18:36,465 starting run tasks describe uid=62480d9930e44a35b8d9dc8a14e56107  -> http://mlrun-api:8080


findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


/User/artifacts/iris.parquet
True
[mlrun] 2020-03-28 20:18:40,038 log artifact histograms at /User/artifacts/62480d9930e44a35b8d9dc8a14e56107/plots/hist.html, size: 152737, db: Y
[mlrun] 2020-03-28 20:18:40,568 log artifact imbalance at /User/artifacts/62480d9930e44a35b8d9dc8a14e56107/plots/imbalance.html, size: 7464, db: Y
[mlrun] 2020-03-28 20:18:40,763 log artifact correlation at /User/artifacts/62480d9930e44a35b8d9dc8a14e56107/plots/corr.html, size: 20942, db: Y



uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...e56107,0,Mar 28 20:18:36,completed,tasks describe,kind=handlerowner=adminhost=jupyter-new-5c768764f6-c6l99,table,,scale_pos_weight=1.00,histogramsimbalancecorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run 62480d9930e44a35b8d9dc8a14e56107 --project default , !mlrun logs 62480d9930e44a35b8d9dc8a14e56107 --project default
[mlrun] 2020-03-28 20:18:40,912 run executed, status=completed


In [7]:
from mlrun import mount_v3io

# NOTE(review): presumably mounts the v3io file system into the job pod so
# the /User/... paths resolve there - confirm against the mlrun docs
fn.apply(mount_v3io())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f03d1a34d10>

In [8]:
# run the same task through the job function (executes in a pod, see the
# log below), writing artifacts under /User/artifacts
run = fn.run(task, artifact_path='/User/artifacts')

[mlrun] 2020-03-28 20:18:55,941 starting run tasks describe uid=2db2ece3150e475e917b2e9afb214e76  -> http://mlrun-api:8080
[mlrun] 2020-03-28 20:18:56,059 Job is running in the background, pod: tasks-describe-bkqm7
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
/User/artifacts/iris.parquet
True
[mlrun] 2020-03-28 20:19:12,323 log artifact histograms at /User/artifacts/plots/hist.html, size: 281881, db: Y
[mlrun] 2020-03-28 20:19:12,939 log artifact imbalance at /User/artifacts/plots/imbalance.html, size: 11840, db: Y
[mlrun] 2020-03-28 20:19:13,151 log artifact correlation at /User/artifacts/plots/corr.html, size: 33266, db: Y

[mlrun] 2020-03-28 20:19:13,232 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...214e76,0,Mar 28 20:19:08,completed,tasks describe,host=tasks-describe-bkqm7kind=jobowner=admin,table,,scale_pos_weight=1.00,histogramsimbalancecorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run 2db2ece3150e475e917b2e9afb214e76  , !mlrun logs 2db2ece3150e475e917b2e9afb214e76 
[mlrun] 2020-03-28 20:19:15,286 run executed, status=completed
