### Pandas Profiling Report

In [1]:
# nuclio: ignore
import nuclio

In [2]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/mlrun"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/mlrun'


In [3]:
%%nuclio cmd -c
pip install pandas_profiling

In [4]:
import pandas as pd
import pandas_profiling

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem

In [5]:
def pandas_profiling_report(
    context: MLClientCtx,
    data: DataItem,
) -> None:
    """Profile a dataset and log the rendered HTML report as an artifact.

    The input is read as a DataFrame, profiled, rendered to HTML and handed
    to ``context.log_artifact`` under the key 'Pandas Profiling Report'.

    :param context:         the function context
    :param data:            Dataset to create report for
    """
    # Materialize the input item as a DataFrame
    dataset = data.as_df()

    # Build the profile.
    # NOTE(review): profile_report is not a stock pandas method; presumably it
    # is made available by the pandas_profiling import at the top of this
    # notebook, so that import has to stay.
    report = dataset.profile_report(title='Pandas Profiling Report')

    # Render once, then log the HTML through the context
    report_html = report.to_html()
    context.log_artifact(
        'Pandas Profiling Report',
        body=report_html,
        local_path='pandas_profiling_report.html',
    )

In [6]:
# nuclio: end-code

### mlconfig

In [7]:
from mlrun import mlconf
import os

# Keep an MLRun API endpoint that is already configured on mlconf; only fall
# back to the in-cluster default when none is set (same pattern as the
# artifact path below).
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
# Default the artifact path to <home>/artifacts. expanduser("~") uses HOME when
# it is set and otherwise falls back to the user database, whereas
# os.environ["HOME"] would raise KeyError if the variable is missing.
mlconf.artifact_path = mlconf.artifact_path or f'{os.path.expanduser("~")}/artifacts'

### save

In [8]:
from mlrun import code_to_function

# Create a job-kind function object named "pandas_profiling_report".
# NOTE(review): no filename is passed, so code_to_function presumably picks up
# the code of this notebook — confirm against the MLRun docs.
fn = code_to_function("pandas_profiling_report", kind="job")

# Add metadata (for templates and reuse).
# default_handler is set to the name of the function defined earlier in this
# notebook.
fn.spec.default_handler = "pandas_profiling_report"
fn.spec.description = "Create Pandas Profiling Report from Dataset"
fn.metadata.categories = ["analysis"]
fn.metadata.labels = {"author": "nicks"}
# Write the function spec to function.yaml. Kept as the last expression of the
# cell, so the returned function object is also displayed as the cell output.
fn.export("function.yaml")

> 2020-10-15 19:21:40,986 [info] function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fe232431610>

## tests

In [9]:
from mlrun.platforms import auto_mount
# Apply the modifier returned by auto_mount() to the function object.
# NOTE(review): presumably this attaches the platform's storage mounts so the
# job can reach shared data and artifacts — confirm against the MLRun docs.
fn.apply(auto_mount())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fe232431610>

In [10]:
from mlrun import NewTask, run_local

# Input CSV used by both the local and the remote test runs
# (the iris dataset, going by the file name).
DATA_URL = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv'

In [11]:
# Describe the run: its name, the handler to call (the function object defined
# in this notebook, passed directly rather than by name), and the dataset URL
# bound to the handler's `data` input.
task = NewTask(name="pandas-profiling-report", 
               handler=pandas_profiling_report, 
               inputs={"data": DATA_URL})

### run locally

In [12]:
# Execute the task with run_local and keep the returned result in `run`.
# The recorded output shows the run on the notebook host (kind=handler).
run = run_local(task)

> 2020-10-15 19:21:41,031 [info] starting run pandas-profiling-report uid=0894aed4f2854d96b776e25bdcaff80e  -> http://mlrun-api:8080


HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=19.0, style=ProgressStyle(descrip…

HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…





project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...dcaff80e,0,Oct 15 19:21:41,completed,pandas-profiling-report,v3io_user=nickskind=handlerowner=nickshost=nicks-jupyter-76668bdd46-g9sxf,data,,,Pandas Profiling Report


to track results use .show() or .logs() or in CLI: 
!mlrun get run 0894aed4f2854d96b776e25bdcaff80e --project default , !mlrun logs 0894aed4f2854d96b776e25bdcaff80e --project default
> 2020-10-15 19:21:52,944 [info] run executed, status=completed


### run remotely

In [13]:
# Create MLRun image (only needs to be run once)
# NOTE(review): presumably the build includes the pip command declared in the
# `%%nuclio cmd` cell near the top of this notebook — confirm.
fn.deploy()

In [14]:
# Run the same task through the function object; the recorded output shows it
# executing in a pod (kind=job). The `inputs` argument repeats what `task`
# already declares.
fn.run(task, inputs={"data": DATA_URL})

> 2020-10-15 19:23:17,199 [info] starting run pandas-profiling-report uid=0ab5c8dbff95471da6018c1a7afd3b22  -> http://mlrun-api:8080
> 2020-10-15 19:23:17,303 [info] Job is running in the background, pod: pandas-profiling-report-xr48m
Summarize dataset: 100%|██████████| 19/19 [00:05<00:00,  3.78it/s, Completed]                         
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
> 2020-10-15 19:23:33,779 [info] run executed, status=completed
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...7afd3b22,0,Oct 15 19:23:25,completed,pandas-profiling-report,v3io_user=nickskind=jobowner=nickshost=pandas-profiling-report-xr48m,data,,,Pandas Profiling Report


to track results use .show() or .logs() or in CLI: 
!mlrun get run 0ab5c8dbff95471da6018c1a7afd3b22 --project default , !mlrun logs 0ab5c8dbff95471da6018c1a7afd3b22 --project default
> 2020-10-15 19:23:36,481 [info] run executed, status=completed


<mlrun.model.RunObject at 0x7fe2297b51d0>