# Generic Scikit-Learn Classifier With Dask

Run any scikit-learn compatible classifier or list of classifiers

In [1]:
# nuclio: ignore
import nuclio

In [2]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-models'


In [3]:
import warnings
warnings.filterwarnings('ignore')

import os
import joblib
import numpy as np
import pandas as pd
import sklearn
from cloudpickle import dumps, load, dump
from typing import List, Optional

from dask.distributed import Client
from dask import dataframe as dd
from dask import array as da
from dask.delayed import delayed
from dask_ml import model_selection
from dask_ml import metrics
from dask_ml.preprocessing import StandardScaler, LabelEncoder

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact
from mlrun.mlutils import (gen_sklearn_model, create_class)

import matplotlib.pyplot as plt
from yellowbrick.classifier import ROCAUC, ClassificationReport, ConfusionMatrix
from yellowbrick.model_selection import FeatureImportances

In [4]:
def train_model(context: MLClientCtx,
                dataset: DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_key: str = "dask_key",
                dask_persist: bool = False,
                scheduler_key: str = '',
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    
    """
    Train a sklearn classifier with Dask
    
    :param context:                 Function context.
    :param dataset:                 Raw data file.
    :param model_pkg_class:         Model to train, e.g, "sklearn.ensemble.RandomForestClassifier", 
                                    or json model config.
    :param label_column:            (label) Ground-truth y labels.
    :param train_validation_size:   (0.75) Train validation set proportion out of the full dataset.
    :param sample:                  (1.0) Select sample from dataset (n-rows/% of total), randomzie rows as default.
    :param models_dest:             (models) Models subfolder on artifact path.
    :param test_set_key:            (test_set) Mlrun db key of held out data in artifact store.
    :param plots_dest:              (plots) Plot subfolder on artifact path.
    :param dask_key:                (dask key) Key of dataframe in dask client "datasets" attribute.
    :param dask_persist:            (False) Should the data be persisted (through the `client.persist`)
    :param scheduler_key:           (scheduler) Dask scheduler configuration, json also logged as an artifact.
    :param file_ext:                (parquet) format for test_set_key hold out data
    :param random_state:            (42) sklearn seed
    """
    
    # set up dask client 
    if scheduler_key:
        client = Client(scheduler_key)
        
    else:
        client = Client()

    context.logger.info("Read Data")
    # read data with dask and mlrun
    df = dataset.as_df(df_module=dd) 

    # take only numrical cols
    context.logger.info("Prep Data")
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)
    
    # dropna
    if df.isna().any().any().compute() == True:
        raise Exception('NAs valus found')
    
    # save cols names
    df_header = df.columns
    
    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates() # no unique values in dask
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=train_validation_size,
                                                                        random_state=random_state)
    
    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)
    
    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": X_train_transformed,
                                "y": y_train})
    
    ClassifierClass = create_class(model_config["META"]["class"])
    
    model = ClassifierClass(**model_config["CLASS"])
    
    # load and fit model
    with joblib.parallel_backend("dask"):
        
        # initialize classifier from sklearn
        model = model.fit(**model_config["FIT"])

    # log artifacts
    artifact_path = context.artifact_subpath(models_dest)
    
    # log plots
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    # create reports
    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):
        
        report_name = str(report.__name__)
        # clear output
        plt.cla()
        plt.clf()
        plt.close()
        
        # genrate report
        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed, y_train)               # Fit the training data to the visualizer
        viz.score(X_test_transformed, y_test.compute())     # Evaluate the model on the test data
        
        # log reports
        plot = context.log_artifact(PlotArtifact(report_name, 
                                                 body=viz.fig,
                                                 title=report_name), 
                                                 db_key=False)
        extra_data_dict[str(report)] = plot
        
        # log results
        if report_name == 'ROCAUC':
            context.log_results({"micro": viz.roc_auc.get("micro"),
                                "macro": viz.roc_auc.get("macro")})
            
        elif report_name == 'ClassificationReport':
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:
                    
                    context.log_results({score_name + "-" + score_class : 
                                         viz.scores_[score_name].get(score_class)})
        
        #viz.show()
    
    # get feature importance
    viz = FeatureImportances(model, classes=classes, per_class=True, 
                             is_fitted=True, labels=df_header.delete(df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train) 
    viz.score(X_test_transformed, y_test)
    #viz.show()
    
    plot = context.log_artifact(PlotArtifact("FeatureImportances", body=viz.fig, 
                                             title="FeatureImportances"), db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot
    
    # clear final output
    plt.cla()
    plt.clf()
    plt.close()

    # log artifacts
    context.logger.info("Log artifacts")
    artifact_path = context.artifact_subpath(models_dest)
    
    # log plots
    plots_path = context.artifact_subpath(models_dest, plots_dest)
    
    # set label
    context.set_label('class', model_pkg_class)
    
    # log models
    context.log_model("model", body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})
    
    # log scalers
    context.log_artifact("standard_scaler", body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")
    
    # log encoder
    context.log_artifact("label_encoder", body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")
    
    # set aside some test data
    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(test_set_key, 
                        df=pd.DataFrame(df_to_save, 
                                        columns=df_header), # improve log dataset ability
                        format=file_ext, index=False, 
                        labels={"data-type": "held-out"},
                        artifact_path=context.artifact_subpath('data'))
    
    context.logger.info("Done!")

In [5]:
# nuclio: end-code

### Save and Config

In [6]:
import mlrun
skf = mlrun.code_to_function('sklearn-classifier-dask', kind='job', code_output=".") .apply(mlrun.mount_v3io())

### Set Environment

In [7]:
artifact_path = mlrun.set_environment(api_path = 'http://mlrun-api:8080',
                                      artifact_path = os.path.abspath('./'))



### Init Dask

#### init a dask cluster and set dask specs

In [8]:
dsf = mlrun.new_function('dask_init', kind='dask', image='mlrun/ml-models')

> 2020-11-23 14:59:40,443 [info] using in-cluster config.


In [9]:
dsf.spec.remote = True
dsf.spec.replicas = 5
dsf.spec.service_type = 'NodePort'
dsf.with_limits(mem="8G")
dsf.spec.nthreads = 6
dsf.export("function.yaml")

> 2020-11-23 14:59:41,313 [info] function spec saved to path: function.yaml


<mlrun.runtimes.daskjob.DaskCluster at 0x7ff39e6ffd10>

#### mount v3io in for file system access

In [10]:
dsf.apply(mlrun.mount_v3io())

<mlrun.runtimes.daskjob.DaskCluster at 0x7ff39e6ffd10>

#### init dask client 
copy the scheduler address to **DASK_CLIENT** param in the following cell, this will make the function use the dask cluster.

In [14]:
dsf.client

> 2020-11-23 15:00:15,924 [info] trying dask client at: tcp://mlrun-dask-init-4fdf1dc3-5.default-tenant:8786
> 2020-11-23 15:00:15,932 [info] using remote dask scheduler (mlrun-dask-init-4fdf1dc3-5) at: tcp://mlrun-dask-init-4fdf1dc3-5.default-tenant:8786


0,1
Client  Scheduler: tcp://mlrun-dask-init-4fdf1dc3-5.default-tenant:8786  Dashboard: http://mlrun-dask-init-4fdf1dc3-5.default-tenant:8787/status,Cluster  Workers: 4  Cores: 24  Memory: 32.00 GB


### Set Parameters

In [15]:
DATA_URL = '/User/iris.csv'
DASK_CLIENT = 'tcp://mlrun-dask-init-4fdf1dc3-5.default-tenant:8786'

task_params = {
    "params" : {
        "sample"             : 1,
        "train_val_split"    : 0.75,
        "random_state"       : 42,
        "n_jobs"             : -1,
        "plots_dest"         : "plots-p",
        "models_dest"        : 'sklearn-clfmodel'}}


models = [
    "sklearn.ensemble.RandomForestClassifier",
    "sklearn.ensemble.AdaBoostClassifier",
    "sklearn.linear_model.LogisticRegression"
]

### Test and Run

In [16]:
outputs = []
for model in models:
    task_copy = task_params.copy()
    task_copy.update(
        {
            "params":{ "model_pkg_class" : model,
                       "label_column"    : "label",
                       "scheduler_key": DASK_CLIENT}
        }
    )
    
    # customize specific model parameters
    if "RandomForestClassifier" in model:
        task_copy["params"].update({"CLASS_max_depth" : 5})

    if "LogisticRegression" in model:
        task_copy["params"].update({"CLASS_solver" : "liblinear"})
    
    if "AdaBoostClassifier" in model:
        task_copy["params"].update({"CLASS_n_estimators"  : 200,
                                    "CLASS_learning_rate" : 0.01
                                   })
    
    name = model.replace('.', '_')
    output = skf.run(mlrun.NewTask(**task_copy),
                             handler=train_model,
                             name=name,
                             inputs={"dataset"      : DATA_URL},
                             artifact_path=os.path.join(artifact_path, model))
    
    outputs.append({name: output.outputs})

> 2020-11-23 15:00:23,780 [info] starting run sklearn_ensemble_RandomForestClassifier uid=a78d70155eb54280a04f1d6c5b42f673 DB=http://mlrun-api:8080
> 2020-11-23 15:00:23,941 [info] Job is running in the background, pod: sklearn-ensemble-randomforestclassifier-cd6bk
> 2020-11-23 15:00:29,218 [info] Read Data
> 2020-11-23 15:00:29,236 [info] Prep Data
> 2020-11-23 15:00:29,665 [info] Split and Train
> 2020-11-23 15:00:32,016 [info] Evaluate
> 2020-11-23 15:00:33,768 [info] Log artifacts
> 2020-11-23 15:00:34,595 [info] Done!
> 2020-11-23 15:00:34,660 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...5b42f673,0,Nov 23 15:00:29,completed,sklearn_ensemble_RandomForestClassifier,v3io_user=adminkind=jobowner=adminhost=sklearn-ensemble-randomforestclassifier-cd6bkclass=sklearn.ensemble.RandomForestClassifier,dataset,model_pkg_class=sklearn.ensemble.RandomForestClassifierlabel_column=labelscheduler_key=tcp://mlrun-dask-init-4fdf1dc3-5.default-tenant:8786CLASS_max_depth=5,micro=0.9941135734072022macro=0.9942943331178625precision-1=1.0precision-2=0.8888888888888888precision-0=0.9166666666666666recall-1=1.0recall-2=0.9411764705882353recall-0=0.8461538461538461f1-1=1.0f1-2=0.9142857142857143f1-0=0.8799999999999999,ROCAUCClassificationReportConfusionMatrixFeatureImportancesmodelstandard_scalerlabel_encodertest_set


to track results use .show() or .logs() or in CLI: 
!mlrun get run a78d70155eb54280a04f1d6c5b42f673 --project default , !mlrun logs a78d70155eb54280a04f1d6c5b42f673 --project default
> 2020-11-23 15:00:43,168 [info] run executed, status=completed
> 2020-11-23 15:00:43,169 [info] starting run sklearn_ensemble_AdaBoostClassifier uid=efac1f79e21f46259c423268d334ae6d DB=http://mlrun-api:8080
> 2020-11-23 15:00:43,335 [info] Job is running in the background, pod: sklearn-ensemble-adaboostclassifier-fd887
> 2020-11-23 15:00:48,569 [info] Read Data
> 2020-11-23 15:00:48,588 [info] Prep Data
> 2020-11-23 15:00:48,796 [info] Split and Train
> 2020-11-23 15:00:49,220 [info] Evaluate
> 2020-11-23 15:00:51,094 [info] Log artifacts
> 2020-11-23 15:00:51,533 [info] Done!
> 2020-11-23 15:00:51,581 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...d334ae6d,0,Nov 23 15:00:48,completed,sklearn_ensemble_AdaBoostClassifier,v3io_user=adminkind=jobowner=adminhost=sklearn-ensemble-adaboostclassifier-fd887class=sklearn.ensemble.AdaBoostClassifier,dataset,model_pkg_class=sklearn.ensemble.AdaBoostClassifierlabel_column=labelscheduler_key=tcp://mlrun-dask-init-4fdf1dc3-5.default-tenant:8786CLASS_n_estimators=200CLASS_learning_rate=0.01,micro=0.9581024930747923macro=0.9808974358974359precision-0=1.0precision-2=0.8precision-1=0.9375recall-0=1.0recall-2=0.9230769230769231recall-1=0.8333333333333334f1-0=1.0f1-2=0.8571428571428571f1-1=0.8823529411764706,ROCAUCClassificationReportConfusionMatrixFeatureImportancesmodelstandard_scalerlabel_encodertest_set


to track results use .show() or .logs() or in CLI: 
!mlrun get run efac1f79e21f46259c423268d334ae6d --project default , !mlrun logs efac1f79e21f46259c423268d334ae6d --project default
> 2020-11-23 15:00:52,501 [info] run executed, status=completed
> 2020-11-23 15:00:52,502 [info] starting run sklearn_linear_model_LogisticRegression uid=e314ab6e5e8546afbf851765e47506b0 DB=http://mlrun-api:8080
> 2020-11-23 15:00:52,675 [info] Job is running in the background, pod: sklearn-linear-model-logisticregression-drxn4
> 2020-11-23 15:00:58,029 [info] Read Data
> 2020-11-23 15:00:58,045 [info] Prep Data
> 2020-11-23 15:00:58,232 [info] Split and Train
> 2020-11-23 15:00:58,420 [info] Evaluate
> 2020-11-23 15:01:00,116 [info] Log artifacts
> 2020-11-23 15:01:00,439 [info] Done!
> 2020-11-23 15:01:00,489 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...e47506b0,0,Nov 23 15:00:57,completed,sklearn_linear_model_LogisticRegression,v3io_user=adminkind=jobowner=adminhost=sklearn-linear-model-logisticregression-drxn4class=sklearn.linear_model.LogisticRegression,dataset,model_pkg_class=sklearn.linear_model.LogisticRegressionlabel_column=labelscheduler_key=tcp://mlrun-dask-init-4fdf1dc3-5.default-tenant:8786CLASS_solver=liblinear,micro=0.9854570637119113macro=0.9832142857142858precision-1=1.0precision-0=0.8461538461538461precision-2=0.8571428571428571recall-1=1.0recall-0=0.8461538461538461recall-2=0.8571428571428571f1-1=1.0f1-0=0.8461538461538461f1-2=0.8571428571428571,ROCAUCClassificationReportConfusionMatrixFeatureImportancesmodelstandard_scalerlabel_encodertest_set


to track results use .show() or .logs() or in CLI: 
!mlrun get run e314ab6e5e8546afbf851765e47506b0 --project default , !mlrun logs e314ab6e5e8546afbf851765e47506b0 --project default
> 2020-11-23 15:01:01,914 [info] run executed, status=completed
