In [1]:
# nuclio: ignore
import nuclio

In [2]:
# Set the function kind ("job") and the container image ("mlrun/ml-models")
# through the nuclio config line magic; each setting is echoed in the cell output.
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-models'


In [3]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [14]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact, TableArtifact
from mlrun.mlutils import gcf_clear

from typing import List

In [15]:
# Render floats with two decimal places whenever pandas displays them.
pd.set_option("display.float_format", "{:.2f}".format)

def summarize(
    context: MLClientCtx,
    table: DataItem,
    label_column: str = "labels",
    class_labels: List[str] = [],
    plot_hist: bool = True,
    plots_dest: str = "plots"
) -> None:
    """Summarize a table

    Logs the following artifacts through ``context.log_artifact`` (all paths
    are relative to ``plots_dest``):

    * ``histograms``            - seaborn pair plot colored by ``label_column``
                                  (``hist.html``), only when ``plot_hist`` is True
    * ``imbalance``             - bar chart of the normalized label counts
                                  (``imbalance.html``)
    * ``imbalance-weights-vec`` - the same proportions as a table
                                  (``imbalance-weights-vec.csv``)
    * ``correlation-matrix``    - correlation matrix of the numeric feature
                                  columns (``correlation-matrix.csv``)
    * ``correlation``           - heatmap of that matrix with the upper triangle
                                  and diagonal masked (``corr.html``)

    :param context:         the function context
    :param table:           MLRun input pointing to pandas dataframe (csv/parquet file path)
    :param label_column:    ground truth column label
    :param class_labels:    label for each class in tables and plots
                            (accepted for interface compatibility; not used by
                            the current implementation)
    :param plot_hist:       (True) set this to False for large tables, the
                            pair plot is then skipped
    :param plots_dest:      destination folder of summary plots (relative to artifact_path)
    :raises KeyError:       from ``table.pop`` when ``label_column`` is not a
                            column of the table
    """
    table = table.as_df()

    # pairplots
    # TODO
    # get the underlying data and save it as an artifact, use as features filter
    if plot_hist:
        # the pair plot is the expensive step on large tables, so it is the
        # one controlled by ``plot_hist``
        gcf_clear(plt)
        # NOTE(review): "bw" is passed through to the diagonal plots; newer
        # seaborn releases may expect "bw_adjust"/"bw_method" instead - confirm
        # against the seaborn version shipped in the function image.
        sns.pairplot(table, hue=label_column, diag_kws={"bw": 1.5})
        context.log_artifact(PlotArtifact("histograms", body=plt.gcf()),
                             local_path=f"{plots_dest}/hist.html")

    # class balance
    gcf_clear(plt)
    # pop() removes the label column, so only features remain in `table`
    labels = table.pop(label_column)
    imbtable = labels.value_counts(normalize=True).sort_index()
    balancebar = imbtable.plot(kind='bar', title='class imbalance - labels')
    balancebar.set_xlabel('class')
    balancebar.set_ylabel("proportion of total")
    context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()),
                         local_path=f"{plots_dest}/imbalance.html")
    context.log_artifact(TableArtifact("imbalance-weights-vec",
                                       df=pd.DataFrame({"weights": imbtable})),
                         local_path=f"{plots_dest}/imbalance-weights-vec.csv")

    # correlation matrix
    # TODO
    # do this by variable types
    # Restrict to numeric/bool columns explicitly: recent pandas releases no
    # longer drop non-numeric columns silently inside corr().
    tblcorr = table.select_dtypes(include=["number", "bool"]).corr()
    # True marks cells hidden in the heatmap (upper triangle incl. diagonal);
    # the builtin `bool` replaces the removed `np.bool` alias.
    mask = np.triu(np.ones_like(tblcorr, dtype=bool))

    # TODO:
    # add indexing to TableArtifact so we could display labels as first column and use for
    # quick lookup
    context.log_artifact(TableArtifact("correlation-matrix", df=tblcorr),
                         local_path=f"{plots_dest}/correlation-matrix.csv")

    # correlation plots
    gcf_clear(plt)
    ax = plt.axes()
    # TODO
    # make prettier
    sns.heatmap(tblcorr, ax=ax, mask=mask, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    context.log_artifact(PlotArtifact("correlation", body=plt.gcf()),
                         local_path=f"{plots_dest}/corr.html")

    # leave a clean figure behind for whatever runs next
    gcf_clear(plt)

In [16]:
# nuclio: end-code

### mlconfig

In [17]:
from mlrun import mlconf
import os
# Keep an already-configured API URL, otherwise fall back to the default below.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
# Same pattern for the artifact path: the fallback is an "artifacts" folder under
# $HOME. os.environ["HOME"] is only evaluated when no artifact path is configured,
# and raises KeyError in that case if HOME is not set.
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'

### save

In [18]:
from mlrun import code_to_function

# create job function object from notebook code
# ("describe" is the function name, `summarize` is the handler it will run)
fn = code_to_function("describe",
                      handler="summarize")

# add metadata (for templates and reuse)
fn.spec.description = "describe and visualizes dataset stats"
fn.metadata.categories = ["analysis"]
fn.metadata.labels = {"author": "yjb"}

# write the function spec to function.yaml (path is echoed in the cell output)
fn.export("function.yaml")

[mlrun] 2020-05-21 12:30:03,012 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f39e2312470>

## tests

In [19]:
if "V3IO_HOME" in os.environ:
    # V3IO_HOME is defined: apply the v3io mount to the function
    from mlrun import mount_v3io
    fn.apply(mount_v3io())
else:
    # if you set up mlrun using the instructions at
    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    # apply the 'nfsvol' PVC mount (at /home/jovyan/data) instead
    from mlrun.platforms import mount_pvc
    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/jovyan/data'))

In [20]:
from mlrun import NewTask, run_local

# CSV passed as the `table` input in the local and remote test runs below
DATA_URL = "https://raw.githubusercontent.com/yjb-ds/testdata/master/data/classifier-data.csv"

In [21]:
# Task definition shared by the local and remote runs: it calls the
# `summarize` handler with the test CSV bound to its `table` input.
task = NewTask(name="tasks describe", handler=summarize, inputs={"table": DATA_URL})

### run locally

In [22]:
# run the task through run_local and keep the returned run object
run = run_local(task)

[mlrun] 2020-05-21 12:30:03,082 starting run tasks describe uid=593756823e4348eda3d4dddb85bb1690  -> http://mlrun-api:8080
[mlrun] 2020-05-21 12:30:05,833 log artifact histograms at /User/artifacts/plots/hist.html, size: 164473, db: Y
[mlrun] 2020-05-21 12:30:06,492 log artifact imbalance at /User/artifacts/plots/imbalance.html, size: 8748, db: Y
[mlrun] 2020-05-21 12:30:06,507 log artifact imbalance-weights-vec at /User/artifacts/plots/imbalance-weights-vec.csv, size: 20, db: Y
[mlrun] 2020-05-21 12:30:06,524 log artifact correlation-matrix at /User/artifacts/plots/correlation-matrix.csv, size: 282, db: Y
[mlrun] 2020-05-21 12:30:06,680 log artifact correlation at /User/artifacts/plots/corr.html, size: 11730, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...85bb1690,0,May 21 12:30:03,completed,tasks describe,v3io_user=adminkind=handlerowner=adminhost=jupyter-67c88b95d4-crdhq,table,,,histogramsimbalanceimbalance-weights-veccorrelation-matrixcorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run 593756823e4348eda3d4dddb85bb1690 --project default , !mlrun logs 593756823e4348eda3d4dddb85bb1690 --project default
[mlrun] 2020-05-21 12:30:06,781 run executed, status=completed


### run remotely

In [23]:
# run the same task through the function object (a job pod, per the log output);
# `inputs` repeats the table input that `task` already carries
fn.run(task, inputs={"table": DATA_URL})

[mlrun] 2020-05-21 12:30:06,793 starting run tasks describe uid=0ab0634894c44103a59cdc035b8b1660  -> http://mlrun-api:8080
[mlrun] 2020-05-21 12:30:06,919 Job is running in the background, pod: tasks-describe-vw575
[mlrun] 2020-05-21 12:30:10,360 starting local run: main.py # summarize
[mlrun] 2020-05-21 12:30:13,724 log artifact histograms at /User/artifacts/plots/hist.html, size: 280641, db: Y
[mlrun] 2020-05-21 12:30:14,213 log artifact imbalance at /User/artifacts/plots/imbalance.html, size: 18948, db: Y
[mlrun] 2020-05-21 12:30:14,228 log artifact imbalance-weights-vec at /User/artifacts/plots/imbalance-weights-vec.csv, size: 20, db: Y
[mlrun] 2020-05-21 12:30:14,247 log artifact correlation-matrix at /User/artifacts/plots/correlation-matrix.csv, size: 282, db: Y
[mlrun] 2020-05-21 12:30:14,412 log artifact correlation at /User/artifacts/plots/corr.html, size: 24634, db: Y

[mlrun] 2020-05-21 12:30:14,480 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...5b8b1660,0,May 21 12:30:10,completed,tasks describe,host=tasks-describe-vw575kind=jobowner=adminv3io_user=admin,table,,,histogramsimbalanceimbalance-weights-veccorrelation-matrixcorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run 0ab0634894c44103a59cdc035b8b1660  , !mlrun logs 0ab0634894c44103a59cdc035b8b1660 
[mlrun] 2020-05-21 12:30:16,082 run executed, status=completed


<mlrun.model.RunObject at 0x7f39e2099f60>