In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# nuclio: ignore
import nuclio

In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact, TableArtifact
from mlrun.mlutils import gcf_clear

from yellowbrick import ClassBalance

from typing import List

In [68]:
# Render every float that pandas displays with exactly two decimals.
pd.set_option(
    "display.float_format",
    lambda value: "%.2f" % value,
)

def summarize(
    context: MLClientCtx,
    table: str,
    label_column: str = "labels",
    class_labels: List[str] = [],
    plot_hist: bool = True,
    plots_dest: str = "plots"
) -> None:
    """Summarize a table

    Loads the table, then logs up to three plot artifacts through ``context``
    ("histograms", "imbalance" and "correlation") and one result
    ("scale_pos_weight").

    :param context:         the function context
    :param table:           pandas dataframe (csv/parquet file path)
    :param label_column:    ground truth column label
    :param class_labels:    label for each class in tables and plots; when
                            empty, the unique values of ``label_column`` are used
    :param plot_hist:       (True) set this to False for large tables; when
                            False the pair plot ("histograms" artifact) is skipped
    :param plots_dest:      destination folder of summary plots (relative to artifact_path)

    :raises ValueError:     if the path does not end with csv, pq or parquet
    """
    # NOTE: the ``[]`` default for class_labels is kept for interface
    # compatibility; it is never mutated here, only rebound further down.

    # `table` may arrive as a non-str object, so normalise to its string form
    # before looking at the extension.
    table_path = str(table)
    if table_path.endswith("csv"):
        df = pd.read_csv(table_path)
    elif table_path.endswith(("pq", "parquet")):
        df = pd.read_parquet(table_path)
    else:
        raise ValueError("unknown file extension, should be one of csv, pq or parquet")

    # The pair plot is the expensive step, so it is the one gated by plot_hist.
    if plot_hist:
        gcf_clear(plt)
        sns.pairplot(df, hue=label_column, diag_kws={"bw": 1.5})
        context.log_artifact(PlotArtifact("histograms",  body=plt.gcf()), local_path=f"{plots_dest}/hist.html")

    gcf_clear(plt)
    # Remove the label column so that only features remain for the correlation.
    labels = df.pop(label_column)
    if not class_labels:
        class_labels = labels.unique()
    class_balance_model = ClassBalance(labels=class_labels)
    class_balance_model.fit(labels)
    support = class_balance_model.support_
    # The ratio uses the first two support entries; with fewer than two
    # classes there is nothing to divide by, so the result is not logged.
    if len(support) > 1:
        scale_pos_weight = support[0] / support[1]
        context.log_result("scale_pos_weight", f"{scale_pos_weight:0.2f}")
    context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()), local_path=f"{plots_dest}/imbalance.html")

    gcf_clear(plt)
    tblcorr = df.corr()
    ax = plt.axes()
    sns.heatmap(tblcorr, ax=ax, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    context.log_artifact(PlotArtifact("correlation",  body=plt.gcf()), local_path=f"{plots_dest}/corr.html")

    # clear the figure, otherwise the notebook shows the last plot:
    gcf_clear(plt)

In [69]:
# nuclio: end-code

In [70]:
from mlrun import run_local, code_to_function , NewTask, mlconf, mount_v3io

# Point the mlrun client at the API service and choose the root folder under
# which run artifacts are written.
# NOTE(review): both values are hard-coded for this environment.
mlconf.dbpath = "http://mlrun-api:8080"
mlconf.artifact_path = "/User/artifacts"

# create a job function object named "describe" from the notebook code
fn = code_to_function("describe", kind="job", with_doc=True,
                      handler=summarize, image="mlrun/ml-models:test")

# add metadata (for templates and reuse)
fn.spec.default_handler = "summarize"
fn.spec.description = "describe and visualizes dataset stats"
fn.metadata.categories = ["models", "visualization"]
fn.metadata.labels = {"author": "yjb"}

# Store the function in the mlrun db and write its spec to function.yaml.
# The v3io mount is applied only afterwards -- presumably so the saved and
# exported spec does not carry the mount; TODO confirm this ordering is intended.
fn.save()
fn.export("function.yaml")
fn.apply(mount_v3io())

# Run the handler locally (inside the notebook process) on the iris table
# located under the artifact path.
table_path = os.path.join(mlconf.artifact_path, "iris.parquet")
task = NewTask(name="tasks describe", handler=summarize, inputs={"table": table_path})
run = run_local(task)

[mlrun] 2020-04-20 19:23:50,231 saving function: describe, tag: latest
[mlrun] 2020-04-20 19:23:50,268 function spec saved to path: function.yaml
[mlrun] 2020-04-20 19:23:50,278 starting run tasks describe uid=e68393d7447f41ca8cb7f47e38ac9cfd  -> http://mlrun-api:8080
[mlrun] 2020-04-20 19:23:54,029 log artifact histograms at /User/artifacts/e68393d7447f41ca8cb7f47e38ac9cfd/plots/hist.html, size: 177021, db: Y
[mlrun] 2020-04-20 19:23:54,798 log artifact imbalance at /User/artifacts/e68393d7447f41ca8cb7f47e38ac9cfd/plots/imbalance.html, size: 7464, db: Y
[mlrun] 2020-04-20 19:23:54,992 log artifact correlation at /User/artifacts/e68393d7447f41ca8cb7f47e38ac9cfd/plots/corr.html, size: 20942, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...38ac9cfd,0,Apr 20 19:23:50,completed,tasks describe,v3io_user=iguaziokind=handlerowner=iguaziohost=jupyter-iguazio-fbcf6f67b-bcshp,table,,scale_pos_weight=1.00,histogramsimbalancecorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run e68393d7447f41ca8cb7f47e38ac9cfd --project default , !mlrun logs e68393d7447f41ca8cb7f47e38ac9cfd --project default
[mlrun] 2020-04-20 19:23:55,117 run executed, status=completed


In [72]:
# Run the same handler through the "describe" job function instead of locally.
# The table is given as a relative path -- presumably resolved against
# `workdir`; verify. Artifacts are written under `out_path`.
fn.run(
    NewTask(name="tasks describe"), 
    handler=summarize, 
    inputs={"table": "iris.parquet"},  
    workdir="/User/artifacts", 
    out_path="/User/artifacts")

[mlrun] 2020-04-20 19:24:39,237 starting run tasks describe uid=6c035a7cf8254e5caf9704ebd355efca  -> http://mlrun-api:8080
[mlrun] 2020-04-20 19:24:39,353 Job is running in the background, pod: tasks-describe-7br2v
[mlrun] 2020-04-20 19:24:48,077 log artifact histograms at /User/artifacts/plots/hist.html, size: 284413, db: Y
[mlrun] 2020-04-20 19:24:48,671 log artifact imbalance at /User/artifacts/plots/imbalance.html, size: 11716, db: Y
[mlrun] 2020-04-20 19:24:48,878 log artifact correlation at /User/artifacts/plots/corr.html, size: 30642, db: Y

[mlrun] 2020-04-20 19:24:48,964 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...d355efca,0,Apr 20 19:24:44,completed,tasks describe,host=tasks-describe-7br2vkind=jobowner=iguaziov3io_user=iguazio,table,,scale_pos_weight=1.00,histogramsimbalancecorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run 6c035a7cf8254e5caf9704ebd355efca  , !mlrun logs 6c035a7cf8254e5caf9704ebd355efca 
[mlrun] 2020-04-20 19:24:58,577 run executed, status=completed


<mlrun.model.RunObject at 0x7f92e4f04e10>