In [1]:
# nuclio: ignore
import nuclio

In [2]:
# Deployment settings for the exported function: run it as a "job"
# using the mlrun/ml-models container image.
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-models'


In [3]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact, TableArtifact
from mlrun.mlutils import gcf_clear

from typing import List

ImportError: cannot import name 'eval_class_model' from 'mlrun.mlutils.models' (/home/jovyan/data/repos/mlrun/mlrun/mlutils/models.py)

In [None]:
def _two_decimals(value):
    """Format a number with exactly two digits after the decimal point."""
    return "%.2f" % value


# Display floats with two decimals whenever pandas renders them.
pd.set_option("display.float_format", _two_decimals)

def summarize(
    context: MLClientCtx,
    table: str,
    label_column: str = "labels",
    class_labels: List[str] = [],
    plot_hist: bool = True,
    plots_dest: str = "plots"
) -> None:
    """Summarize a table with a pair plot and a feature-correlation heatmap.

    The table is loaded from a csv or parquet file, then up to two plot
    artifacts are logged on the context: "histograms" (seaborn pair plot
    coloured by the label column) and "correlation" (heatmap of the
    correlation matrix of all columns except the label column).

    :param context:         the function context
    :param table:           pandas dataframe (csv/parquet file path); the value
                            is converted with ``str()`` before use
    :param label_column:    ground truth column label
    :param class_labels:    label for each class in tables and plots
                            (currently unused: the class-balance report that
                            consumed it is disabled)
    :param plot_hist:       (True) set this to False for large tables; when
                            False the pair plot and its "histograms" artifact
                            are skipped
    :param plots_dest:      destination folder of summary plots (relative to artifact_path)
    :raises ValueError:     if the path does not end with csv, pq or parquet
    """
    source = str(table)
    if source.endswith("csv"):
        frame = pd.read_csv(source)
    elif source.endswith(("pq", "parquet")):
        frame = pd.read_parquet(source)
    else:
        # ValueError is still caught by callers handling the generic Exception
        raise ValueError("unknown file extension, should be one of csv, pq or parquet")

    if plot_hist:
        # the pair plot is the expensive part, so it is the step plot_hist gates
        gcf_clear(plt)
        # NOTE(review): the `bw` key in diag_kws is deprecated in newer seaborn
        # releases -- confirm against the seaborn version shipped in the image
        sns.pairplot(frame, hue=label_column, diag_kws={"bw": 1.5})
        context.log_artifact(
            PlotArtifact("histograms", body=plt.gcf()),
            local_path=f"{plots_dest}/hist.html",
        )

    # remove the label column in place so only feature columns are correlated
    frame.pop(label_column)
    # TODO: class-balance reporting (scale_pos_weight result and "imbalance"
    # plot) used to follow here but is disabled

    gcf_clear(plt)
    ax = plt.axes()
    sns.heatmap(frame.corr(), ax=ax, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    context.log_artifact(
        PlotArtifact("correlation", body=plt.gcf()),
        local_path=f"{plots_dest}/corr.html",
    )

    # otherwise the notebook shows the last plot
    gcf_clear(plt)

In [None]:
# nuclio: end-code

### mlconfig

In [None]:
import os

from mlrun import mlconf

# Keep any configured DB path; otherwise default to the current directory.
mlconf.dbpath = mlconf.dbpath or './'

vcs_branch = 'development'
base_vcs = f'https://raw.githubusercontent.com/mlrun/functions/{vcs_branch}/'

# `{name}` has to stay a literal placeholder in the URL template: the previous
# f-string interpolated an undefined `name` variable and raised NameError
# whenever this fallback was actually evaluated.
# NOTE(review): presumably mlrun fills in `{name}` when resolving `hub://<name>`
# URLs -- confirm against the mlrun configuration docs.
mlconf.hub_url = mlconf.hub_url or base_vcs + '{name}/function.yaml'

# NOTE(review): the fallback below raises KeyError when neither an artifact path
# nor V3IO_HOME is set -- confirm which default non-V3IO setups should get.
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["V3IO_HOME"]}/artifacts'

### save

In [None]:
from mlrun import code_to_function

# build the function object from this notebook's code
fn = code_to_function("describe")

# descriptive metadata (for templates and reuse)
fn.metadata.labels = {"author": "yjb"}
fn.metadata.categories = ["models", "visualization"]
fn.spec.description = "describe and visualizes dataset stats"
fn.spec.default_handler = "summarize"

# save the function object, then export it to function.yaml
fn.save()
fn.export("function.yaml")

## tests

In [None]:
from mlrun import import_function

# load the published "describe" function from the function hub
func = import_function("hub://describe")

# attach storage to the function: v3io when V3IO_HOME is defined in the
# environment, otherwise a PVC-backed volume
if "V3IO_HOME" in os.environ:
    from mlrun import mount_v3io
    func.apply(mount_v3io())
else:
    # if you set up mlrun using the instructions at https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    # mount path fixed from '/home/joyan/data' (typo) to the jovyan home
    # directory that this notebook's own environment uses
    func.apply(mount_pvc('nfsvol', 'nfsvol', '/home/jovyan/data'))

In [None]:
from mlrun import NewTask, run_local

# Input table for the test task: "classifier-data.csv" joined onto the
# configured artifact path.
table_path = os.path.join(mlconf.artifact_path, "classifier-data.csv")

In [None]:
# Task definition shared by the runs below: uses the in-notebook `summarize`
# function as handler and passes table_path as its "table" input.
task = NewTask(
    name="tasks describe", 
    handler=summarize, 
    inputs={"table": table_path})

### run local where artifact path is fixed 

In [None]:
# Run the task through run_local, passing the configured artifact path;
# the returned object is kept in `run`.
run = run_local(task, artifact_path=mlconf.artifact_path)

### run remote where artifact path includes the run id

In [None]:
# Run the same task on the imported hub function, passing "iris.parquet" as
# the "table" input.
# NOTE(review): the section title refers to the artifact path, but only
# `workdir` is passed here -- confirm whether `artifact_path=` was intended.
func.run(task, inputs={"table": "iris.parquet"},  workdir=mlconf.artifact_path)