In [14]:
# nuclio: ignore
import nuclio

In [15]:
# function settings picked up when this notebook is converted to a function:
# the runtime kind ("job") and the container image the job runs in
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models:0.4.8"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-models:0.4.8'


In [16]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [17]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact, TableArtifact
from mlrun.mlutils import gcf_clear

from typing import List

In [20]:
# show floats with two decimal places whenever pandas renders them
pd.set_option("display.float_format", "{:.2f}".format)

def summarize(
    context: MLClientCtx,
    table: DataItem,
    label_column: str = "labels",
    class_labels: List[str] = [],
    plot_hist: bool = True,
    plots_dest: str = "plots"
) -> None:
    """Summarize a table

    Loads the table as a pandas dataframe and logs the following artifacts
    under ``plots_dest``:

    * ``histograms``            - seaborn pairplot colored by the label column
                                  (only when ``plot_hist`` is True)
    * ``imbalance``             - bar plot of the normalized class frequencies
    * ``imbalance-weights-vec`` - the same class frequencies as a table
    * ``correlation-matrix``    - feature correlation matrix (label excluded)
    * ``correlation``           - heatmap of the correlation matrix

    :param context:         the function context
    :param table:           MLRun input pointing to pandas dataframe (csv/parquet file path)
    :param label_column:    ground truth column label
    :param class_labels:    label for each class in tables and plots
                            (accepted for interface compatibility, currently unused)
    :param plot_hist:       (True) set this to False for large tables, the
                            pairplot is skipped in that case
    :param plots_dest:      destination folder of summary plots (relative to artifact_path)
    :raises KeyError:       when ``label_column`` is not a column of the table
    """
    table = table.as_df()

    # fail early with a readable message instead of an error raised from
    # deep inside seaborn/pandas
    if label_column not in table.columns:
        raise KeyError(f"label column '{label_column}' not found in table")

    # pairplots
    # TODO
    # get the underlying data and save it as an artifact, use as features filter
    if plot_hist:
        gcf_clear(plt)
        # pairplot draws on its own new figure, which then becomes the current one
        sns.pairplot(table, hue=label_column, diag_kws={"bw": 1.5})
        context.log_artifact(PlotArtifact("histograms", body=plt.gcf()),
                             local_path=f"{plots_dest}/hist.html")

    # class balance
    gcf_clear(plt)
    # pop() removes the label column, so the correlations below are features only
    labels = table.pop(label_column)
    imbtable = labels.value_counts(normalize=True).sort_index()
    balancebar = imbtable.plot(kind='bar', title='class imbalance - labels')
    balancebar.set_xlabel('class')
    balancebar.set_ylabel("proportion of total")
    context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()),
                         local_path=f"{plots_dest}/imbalance.html")
    context.log_artifact(TableArtifact("imbalance-weights-vec",
                                       df=pd.DataFrame({"weights": imbtable})),
                         local_path=f"{plots_dest}/imbalance-weights-vec.csv")

    # correlation matrix
    # TODO
    # do this by variable types
    tblcorr = table.corr()
    # TODO:
    # add indexing to TableArtifact so we could display labels as first column and use for
    # quick lookup
    context.log_artifact(TableArtifact("correlation-matrix", df=tblcorr),
                         local_path=f"{plots_dest}/correlation-matrix.csv")

    # correlation plots
    # hide the diagonal and the upper triangle, they only duplicate the lower one
    # (builtin bool: the np.bool alias was removed from recent NumPy releases)
    mask = np.triu(np.ones(tblcorr.shape, dtype=bool))

    gcf_clear(plt)
    ax = plt.axes()
    # TODO
    # make prettier
    sns.heatmap(tblcorr, ax=ax, mask=mask, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    context.log_artifact(PlotArtifact("correlation", body=plt.gcf()),
                         local_path=f"{plots_dest}/corr.html")

    gcf_clear(plt)

In [21]:
# nuclio: end-code

### mlconfig

In [22]:
import os
from mlrun import mlconf

# keep any value that is already configured, otherwise fall back to a default
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
# expanduser("~") gives the same path as $HOME when it is set, and still
# resolves the home directory (instead of raising KeyError) when it is not
mlconf.artifact_path = mlconf.artifact_path or os.path.join(os.path.expanduser("~"), "artifacts")

### save

In [23]:
from mlrun import code_to_function

# create job function object from notebook code
# (the code between the nuclio markers above, named "describe")
fn = code_to_function("describe")

# add metadata (for templates and reuse)
# `summarize` is the handler invoked when a run does not name one explicitly
fn.spec.default_handler = "summarize"
fn.spec.description = "describe and visualizes dataset stats"
fn.metadata.categories = ["analysis"]
fn.metadata.labels = {"author": "yjb"}

# write the function spec to function.yaml in the current directory
fn.export("function.yaml")

[mlrun] 2020-05-13 20:23:52,069 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7eff14dbb860>

## tests

In [24]:
# attach storage to the function so the remote job can reach the data:
# v3io when the V3IO_HOME variable is present, a PVC otherwise
if "V3IO_HOME" in os.environ:
    from mlrun import mount_v3io
    fn.apply(mount_v3io())
else:
    # if you set up mlrun using the instructions at
    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/jovyan/data'))

In [25]:
from mlrun import NewTask, run_local

# location of the input table for the test runs: iris.parquet under the
# configured artifact path (assumes the file already exists there - TODO confirm)
table_path = os.path.join(mlconf.artifact_path, "iris.parquet")

In [26]:
# task definition: call the `summarize` handler defined above with the
# iris table as its `table` input; all other parameters keep their defaults
task = NewTask(
    name="tasks describe", 
    handler=summarize, 
    inputs={"table": table_path})

### run locally

In [27]:
# run the task locally and keep the returned run result in `run`
run = run_local(task)

[mlrun] 2020-05-13 20:23:54,488 starting run tasks describe uid=b248c07f87f9457ebe4b5b6ccdd88044  -> http://mlrun-api:8080
[mlrun] 2020-05-13 20:23:57,414 log artifact histograms at /User/artifacts/plots/hist.html, size: 149961, db: Y
[mlrun] 2020-05-13 20:23:58,102 log artifact imbalance at /User/artifacts/plots/imbalance.html, size: 10360, db: Y
[mlrun] 2020-05-13 20:23:58,117 log artifact imbalance-weights-vec at /User/artifacts/plots/imbalance-weights-vec.csv, size: 65, db: Y
[mlrun] 2020-05-13 20:23:58,132 log artifact correlation-matrix at /User/artifacts/plots/correlation-matrix.csv, size: 324, db: Y
[mlrun] 2020-05-13 20:23:58,291 log artifact correlation at /User/artifacts/plots/corr.html, size: 11998, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...cdd88044,0,May 13 20:23:54,completed,tasks describe,v3io_user=adminkind=handlerowner=adminhost=jupyter-5fcc788f98-m762b,table,,,histogramsimbalanceimbalance-weights-veccorrelation-matrixcorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run b248c07f87f9457ebe4b5b6ccdd88044 --project default , !mlrun logs b248c07f87f9457ebe4b5b6ccdd88044 --project default
[mlrun] 2020-05-13 20:23:58,390 run executed, status=completed


### run remotely

In [28]:
# submit the same task through the function object, which runs it as a job
# (see the pod name in the log below); the input is the same iris table
fn.run(task, inputs={"table": table_path})

[mlrun] 2020-05-13 20:23:58,410 starting run tasks describe uid=1534b286ac864430864f110212ddc3c8  -> http://mlrun-api:8080
[mlrun] 2020-05-13 20:23:58,500 Job is running in the background, pod: tasks-describe-ppt88
[mlrun] 2020-05-13 20:24:05,571 log artifact histograms at /User/artifacts/plots/hist.html, size: 255745, db: Y
[mlrun] 2020-05-13 20:24:06,068 log artifact imbalance at /User/artifacts/plots/imbalance.html, size: 22348, db: Y
[mlrun] 2020-05-13 20:24:06,084 log artifact imbalance-weights-vec at /User/artifacts/plots/imbalance-weights-vec.csv, size: 65, db: Y
[mlrun] 2020-05-13 20:24:06,100 log artifact correlation-matrix at /User/artifacts/plots/correlation-matrix.csv, size: 324, db: Y
[mlrun] 2020-05-13 20:24:06,286 log artifact correlation at /User/artifacts/plots/corr.html, size: 26338, db: Y

[mlrun] 2020-05-13 20:24:06,357 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...12ddc3c8,0,May 13 20:24:02,completed,tasks describe,host=tasks-describe-ppt88kind=jobowner=adminv3io_user=admin,table,,,histogramsimbalanceimbalance-weights-veccorrelation-matrixcorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run 1534b286ac864430864f110212ddc3c8  , !mlrun logs 1534b286ac864430864f110212ddc3c8 
[mlrun] 2020-05-13 20:24:07,729 run executed, status=completed


<mlrun.model.RunObject at 0x7efe95821860>