In [1]:
# nuclio: ignore
import nuclio

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact, TableArtifact

from sklearn.preprocessing import StandardScaler
from yellowbrick import ClassBalance

from typing import IO, AnyStr, Union, List, Optional



In [3]:
def _format_two_decimals(value):
    """Render a number with exactly two digits after the decimal point."""
    return "%.2f" % value


# displayed frames show floats through the formatter above
pd.set_option("display.float_format", _format_two_decimals)

def _gcf_clear(plt):
    plt.cla()
    plt.clf()
    plt.close() 

def summarize(
    context: MLClientCtx,
    table: str,
    label_column: str = 'labels',
    class_labels: List[str] = None,
    plot_hist: bool = True,
    plots_dest: str = 'plots'
) -> None:
    """Summarize a table

    Loads the table and logs plot artifacts for it: a pair plot of the
    features (only when ``plot_hist`` is set), the class balance of the label
    column and a heatmap of the feature correlations. The ratio between the
    first two class supports is logged as the ``scale_pos_weight`` result.

    :param context:         the function context
    :param table:           csv/parquet file path of the table (any object whose
                            ``str()`` is that path); files not ending in
                            ``.csv`` are read as parquet
    :param label_column:    ground truth column label
    :param class_labels:    label for each class in tables and plots
    :param plot_hist:       (True) set this to False for large tables
    :param plots_dest:      destination folder of summary plots (relative to artifact_path)
    """
    table = str(table)
    if table.endswith('.csv'):
        table = pd.read_csv(table)
    else:
        table = pd.read_parquet(table)

    if plot_hist:
        # pairplot draws one panel per pair of columns, hence the opt-out
        _gcf_clear(plt)
        try:
            sns.pairplot(table, hue=label_column)
        except Exception:
            # presumably the default diagonal density estimate can fail on
            # some data; drop the partial figure and retry with a fixed
            # bandwidth -- TODO confirm which errors this is meant to cover
            _gcf_clear(plt)
            sns.pairplot(table, hue=label_column, diag_kws={'bw': 1.5})
        context.log_artifact(PlotArtifact('histograms',  body=plt.gcf()), local_path=f"{plots_dest}/hist.html")

    _gcf_clear(plt)
    # the label column is removed here so the correlation below covers features only
    labels = table.pop(label_column)
    class_balance_model = ClassBalance(labels=class_labels)
    class_balance_model.fit(labels)
    # NOTE(review): uses only the first two entries of support_, so this needs
    # at least two classes and looks meant for binary labels -- confirm
    scale_pos_weight = class_balance_model.support_[0]/class_balance_model.support_[1]
    context.log_result("scale_pos_weight", f"{scale_pos_weight:0.2f}")
    context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()), local_path=f"{plots_dest}/imbalance.html")

    _gcf_clear(plt)
    tblcorr = table.corr()
    ax = plt.axes()
    sns.heatmap(tblcorr, ax=ax, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    context.log_artifact(PlotArtifact("correlation",  body=plt.gcf()), local_path=f"{plots_dest}/corr.html")
    _gcf_clear(plt)

In [4]:
# nuclio: end-code

In [7]:
# show the artifact path mlrun is configured with before it is overridden below;
# mlconf is imported here so the cell also runs on a fresh kernel
from mlrun import mlconf

mlconf.artifact_path

''

In [10]:
from mlrun import run_local, NewTask, mlconf

# keep whatever mlrun already loaded from its own configuration and only
# fall back to these defaults when a value is unset (empty)
mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080"
mlconf.artifact_path = mlconf.artifact_path or '/User/functions/artifacts'

In [9]:
# run the summarize handler in-process on the iris table under the artifact path
table_path = os.path.join(mlconf.artifact_path, "iris.parquet")
summarize_inputs = {'table': table_path}
task = NewTask(handler=summarize, inputs=summarize_inputs)
run = run_local(task)

[mlrun] 2020-03-24 19:25:39,552 artifact path is not defined or is local, artifacts will not be visible in the UI
[mlrun] 2020-03-24 19:25:39,561 starting run mlrun-30e956-summarize uid=70500577b2554f3c8bc8139fdfbff762  -> http://mlrun-api:8080
[mlrun] 2020-03-24 19:25:39,900 Traceback (most recent call last):
  File "/User/.pythonlibs/jupyter/lib/python3.6/site-packages/mlrun/runtimes/local.py", line 183, in exec_from_params
    val = handler(*args_list)
  File "<ipython-input-3-177239356d5c>", line 30, in summarize
    table = pd.read_parquet(table)
  File "/User/.conda/envs/stable/lib/python3.7/site-packages/pandas/io/parquet.py", line 310, in read_parquet
    return impl.read(path, columns=columns, **kwargs)
  File "/User/.conda/envs/stable/lib/python3.7/site-packages/pandas/io/parquet.py", line 125, in read
    path, columns=columns, **kwargs
  File "/User/.conda/envs/stable/lib/python3.7/site-packages/pyarrow/parquet.py", line 1274, in read_table
    filesystem=filesystem, filter

Passed non-file path: ./iris.parquet


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...bff762,0,Mar 24 19:25:39,error,mlrun-30e956-summarize,host=jupyter-6d69dc994d-h9nv2kind=handlerowner=admin,table,,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run 70500577b2554f3c8bc8139fdfbff762 --project default , !mlrun logs 70500577b2554f3c8bc8139fdfbff762 --project default
[mlrun] 2020-03-24 19:25:39,952 run executed, status=error


RunError: Passed non-file path: ./iris.parquet

In [None]:
import pandas as pd

# read the same file the run was given and preview its first rows only
df = pd.read_parquet(table_path)
df.head()

In [20]:
from mlrun import code_to_function
# create job function object from notebook code;
# the handler is given by name rather than as the function object
fn = code_to_function('describe', kind='job', with_doc=True,
                      handler='summarize', image='mlrun/ml-models')

# add metadata (for templates and reuse)
fn.spec.default_handler = 'summarize'
fn.spec.description = "this function visualize dataset stats"
fn.metadata.categories = ['models', 'visualization']
fn.metadata.labels = {'author': 'yjb'}

In [21]:
# save the function spec to function.yaml
fn.export('function.yaml')

[mlrun] 2020-03-24 18:54:01,979 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fd3ed1665d0>