# Describe

In [2]:
import mlrun
import os

In [32]:
mlrun.mlconf.dbpath='http://mlrun-api:8080'

mlrun.artifact_path = os.path.join(os.getcwd(), 'artifacts')
os.makedirs(mlrun.artifact_path, exist_ok=True)

In [63]:
FN_DIR = '/User/functions/describe'                                  # To save function specs
pq_file = '/User/demo-network-operations/data/aggregate_no_index.pq' # For internal function test

In [8]:
# nuclio: start-code

In [56]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact, TableArtifact

from sklearn.preprocessing import StandardScaler
from yellowbrick import ClassBalance

from typing import IO, AnyStr, Union, List, Optional

In [10]:
pd.set_option("display.float_format", lambda x: "%.2f" % x)

In [11]:
def _gcf_clear(plt):
    plt.cla()
    plt.clf()
    plt.close() 

In [55]:
def describe(
    context: MLClientCtx,
    table: Union[DataItem, str],
    label_column: str,
    class_labels: List[str] = None,
    key: str = "table-summary",
    plot_hist: bool = True,
    plots_dest: str = 'plots'
) -> None:
    """Summarize a table

    TODO: merge with dask version

    :param context:         the function context
    :param table:           pandas dataframe
    :param label_column:    ground truth column label
    :param key:             key of table summary in artifact store
    :param plot_hist:       (True) set this to False for large tables
    :param plots_dest:      destination folder of summary plots (relative to artifact_path)
    """
    base_path = context.artifact_path
    
    table = pd.read_parquet(str(table))
    header = table.columns.values

    _gcf_clear(plt)
    try:
        snsplt = sns.pairplot(table, hue=label_column)
        snsplt.savefig(os.path.join(base_path, f"{plots_dest}/hist.png"))
        context.log_artifact(PlotArtifact("histograms",  body=plt.gcf()), local_path=f"{plots_dest}/hist.html")
    except Exception as e:
        context.logger.warning(f'While trying to run sns.pairplot encountered the following error: {e}')

    sumtbl = table.describe()
    sumtbl = sumtbl.append(len(table.index)-table.count(), ignore_index=True)
    sumtbl.insert(0, "metric", ["count", "mean", "std", "min","25%", "50%", "75%", "max", "nans"])
    
    sumtbl.to_csv(os.path.join(base_path, key+".csv"), index=False)
    context.log_artifact(key, local_path=key+".csv")

    _gcf_clear(plt)
    
    labels = table.pop(label_column)
    class_balance_model = ClassBalance(labels=class_labels)
    class_balance_model.fit(labels)
    
    scale_pos_weight = class_balance_model.support_[0]/class_balance_model.support_[1]
    context.log_artifact("scale_pos_weight", f"{scale_pos_weight:0.2f}")

    class_balance_model.show(outpath=os.path.join(base_path, f"{plots_dest}/imbalance.png"))
    context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()), local_path=f"{plots_dest}/imbalance.html")
    
    _gcf_clear(plt)
    tblcorr = table.corr()
    ax = plt.axes()
    sns.heatmap(tblcorr, ax=ax, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    plt.savefig(os.path.join(base_path, f"{plots_dest}/corr.png"))
    context.log_artifact(PlotArtifact("correlation",  body=plt.gcf()), local_path=f"{plots_dest}corr.html")
    
   
    _gcf_clear(plt)



In [8]:
# nuclio: end-code

## Setup Describe

In [73]:
describe_fn = mlrun.code_to_function('describe',
                                     kind='job',
#                                      code_output=os.path.join(FN_DIR, 'describe.py'),
                                     image='mlrun/ml-models:0.4.5')

In [62]:
summ_task = mlrun.NewTask(
    "sum", 
    handler=describe,  
    params={"key": "summary", 
            "label_column": "is_error", 
            'class_labels': ['0', '1'],
            'plot_hist': True,
            'plot_dest': 'plots'},
    inputs={"table": pq_file},
    artifact_path=mlrun.artifact_path)

## Test
### Local

In [58]:
mlrun.run_local(summ_task)

[mlrun] 2020-03-19 14:01:51,728 starting run sum uid=67ad4af5fd4c40a0ad6a1af3142b62b6  -> http://mlrun-api:8080
[mlrun] 2020-03-19 14:01:59,027 While trying to run sns.pairplot encountered the following error: singular matrix
[mlrun] 2020-03-19 14:01:59,078 log artifact summary at /User/functions/describe/artifacts/summary.csv, size: None, db: Y
[mlrun] 2020-03-19 14:02:02,192 log artifact scale_pos_weight at /User/functions/describe/artifacts/scale_pos_weight, size: 5, db: Y
[mlrun] 2020-03-19 14:02:02,279 log artifact imbalance at /User/functions/describe/artifacts/plots/imbalance.html, size: 12164, db: Y
[mlrun] 2020-03-19 14:02:02,525 log artifact correlation at /User/functions/describe/artifacts/plotscorr.html, size: 32066, db: Y



uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...2b62b6,0,Mar 19 14:01:51,completed,sum,kind=handlerowner=adminhost=jupyter-78ddb8b99c-brphv,table,"key=summarylabel_column=is_errorclass_labels=['0', '1']plot_hist=Trueplot_dest=plots",,summaryscale_pos_weightimbalancecorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run 67ad4af5fd4c40a0ad6a1af3142b62b6 --project default , !mlrun logs 67ad4af5fd4c40a0ad6a1af3142b62b6 --project default
[mlrun] 2020-03-19 14:02:02,614 run executed, status=completed


<mlrun.model.RunObject at 0x7f57b64db9b0>

### Deployed

In [60]:
describe_fn.apply(mlrun.mount_v3io())
describe_fn.run(summ_task)

[mlrun] 2020-03-19 14:03:37,423 starting run sum uid=d535dc6fd4ef41ffa0f5590a82e0a8df  -> http://mlrun-api:8080
[mlrun] 2020-03-19 14:03:37,477 Job is running in the background, pod: sum-xwd5c
Intel(R) Data Analytics Acceleration Library (Intel(R) DAAL) solvers for sklearn enabled: https://intelpython.github.io/daal4py/sklearn.html
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
[mlrun] 2020-03-19 14:03:48,327 While trying to run sns.pairplot encountered the following error: Selected KDE bandwidth is 0. Cannot estimate density.
[mlrun] 2020-03-19 14:03:48,422 log artifact summary at /User/functions/describe/artifacts/summary.csv, size: None, db: Y
[mlrun] 2020-03-19 14:03:50,940 log artifact scale_pos_weight at /User/functions/describe/artifacts/scale_pos_weight, size: 5, db: Y
[mlrun] 2020-03-19 14:03:51,

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...e0a8df,0,Mar 19 14:03:42,completed,sum,host=sum-xwd5ckind=jobowner=admin,table,"class_labels=['0', '1']key=summarylabel_column=is_errorplot_dest=plotsplot_hist=True",,summaryscale_pos_weightimbalancecorrelation


to track results use .show() or .logs() or in CLI: 
!mlrun get run d535dc6fd4ef41ffa0f5590a82e0a8df  , !mlrun logs d535dc6fd4ef41ffa0f5590a82e0a8df 
[mlrun] 2020-03-19 14:03:55,825 run executed, status=completed


<mlrun.model.RunObject at 0x7f5725508fd0>

## Add yaml function definitions

In [74]:
describe_fn.metadata.categories = ['models', 'graphics']
describe_fn.set_label('stage', 'development')
print(describe_fn.to_yaml())

kind: job
metadata:
  name: describe
  tag: ''
  project: ''
  labels:
    stage: development
  categories:
  - models
  - graphics
spec:
  command: ''
  args: []
  image: mlrun/ml-models:0.4.5
  volumes: []
  volume_mounts: []
  env: []
  description: ''
  build:
    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlciBvbiAyMDIwLTAzLTE5IDE0OjExCgppbXBvcnQgb3MKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAppbXBvcnQgc2VhYm9ybiBhcyBzbnMKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IFBsb3RBcnRpZmFjdCwgVGFibGVBcnRpZmFjdAoKZnJvbSBza2xlYXJuLnByZXByb2Nlc3NpbmcgaW1wb3J0IFN0YW5kYXJkU2NhbGVyCmZyb20geWVsbG93YnJpY2sgaW1wb3J0IENsYXNzQmFsYW5jZQoKZnJvbSB0eXBpbmcgaW1wb3J0IElPLCBBbnlTdHIsIFVuaW9uLCBMaXN0LCBPcHRpb25hbAoKcGQuc2V0X29wdGlvbigiZGlzcGxheS5mbG9hdF9mb3JtYXQiLCBsYW1iZGEgeDogIiUuMmYiICUgeCkKCmRlZiBfZ2NmX2NsZWFyKHBsdCk6CiAgICBwbHQ

In [77]:
describe_fn.set_label
describe_fn.export(os.path.join(FN_DIR, 'function.yaml'))

[mlrun] 2020-03-19 14:16:01,147 function spec saved to path: /User/functions/describe/function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f5714264710>