# Generic Scikit-Learn Classifier With Dask

Run any scikit-learn compatible classifier or list of classifiers

In [1]:
import mlrun



In [2]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-models'


In [3]:
import warnings
warnings.filterwarnings('ignore')

import os
import joblib
import numpy as np
import pandas as pd
from cloudpickle import dumps, load, dump

from dask import dataframe as dd
from dask import array as da
from dask.delayed import delayed
from dask_ml import model_selection
from dask_ml import metrics
from dask_ml.preprocessing import StandardScaler, LabelEncoder

from mlrun.artifacts import PlotArtifact
from mlrun.mlutils.models import gen_sklearn_model
from mlrun.utils.helpers import create_class

import matplotlib.pyplot as plt
from yellowbrick.classifier import ROCAUC, ClassificationReport, ConfusionMatrix
from yellowbrick.model_selection import FeatureImportances

In [4]:
def train_model(context,
                dataset: mlrun.DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_function: str = None,
                dask_client=None,
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    
    """
    Train a sklearn classifier with Dask
    
    :param context:                 Function context.
    :param dataset:                 Raw data file.
    :param model_pkg_class:         Model to train, e.g, "sklearn.ensemble.RandomForestClassifier", 
                                    or json model config.
    :param label_column:            (label) Ground-truth y labels.
    :param train_validation_size:   (0.75) Train validation set proportion out of the full dataset.
    :param sample:                  (1.0) Select sample from dataset (n-rows/% of total), randomzie rows as default.
    :param models_dest:             (models) Models subfolder on artifact path.
    :param test_set_key:            (test_set) Mlrun db key of held out data in artifact store.
    :param plots_dest:              (plots) Plot subfolder on artifact path.
    :param dask_function:           dask function url (db://..)
    :param dask_client:             dask client object
    :param file_ext:                (parquet) format for test_set_key hold out data
    :param random_state:            (42) sklearn seed
    """
    
    # set up dask client 
    if dask_function:
        client = mlrun.import_function(dask_function).client
    elif dask_client:
        client = dask_client
    else:
        raise ValueError('dask client was not provided')

    context.logger.info("Read Data")
    # read data with dask and mlrun
    df = dataset.as_df(df_module=dd) 

    # take only numrical cols
    context.logger.info("Prep Data")
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)
    
    # dropna
    if df.isna().any().any().compute() == True:
        raise Exception('NAs valus found')
    
    # save cols names
    df_header = df.columns
    
    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates() # no unique values in dask
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=train_validation_size,
                                                                        random_state=random_state)
    
    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)
    
    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": X_train_transformed,
                                "y": y_train})
    
    ClassifierClass = create_class(model_config["META"]["class"])
    
    model = ClassifierClass(**model_config["CLASS"])
    
    # load and fit model
    with joblib.parallel_backend("dask"):
        
        # initialize classifier from sklearn
        model = model.fit(**model_config["FIT"])

    # create reports
    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):
        
        report_name = str(report.__name__)
        # clear output
        plt.cla()
        plt.clf()
        plt.close()
        
        # genrate report
        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed, y_train)               # Fit the training data to the visualizer
        viz.score(X_test_transformed, y_test.compute())     # Evaluate the model on the test data
        
        # log reports
        plot = context.log_artifact(PlotArtifact(report_name, 
                                                 body=viz.fig,
                                                 title=report_name), 
                                                 db_key=False)
        extra_data_dict[str(report)] = plot
        
        # log results
        if report_name == 'ROCAUC':
            context.log_results({"micro": viz.roc_auc.get("micro"),
                                "macro": viz.roc_auc.get("macro")})
            
        elif report_name == 'ClassificationReport':
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:
                    
                    context.log_results({score_name + "-" + score_class : 
                                         viz.scores_[score_name].get(score_class)})
        
        #viz.show()
    
    # get feature importance
    viz = FeatureImportances(model, classes=classes, per_class=True, 
                             is_fitted=True, labels=df_header.delete(df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train) 
    viz.score(X_test_transformed, y_test)
    #viz.show()
    
    plot = context.log_artifact(PlotArtifact("FeatureImportances", body=viz.fig, 
                                             title="FeatureImportances"), db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot
    
    # clear final output
    plt.cla()
    plt.clf()
    plt.close()

    # log artifacts
    context.logger.info("Log artifacts")
    artifact_path = context.artifact_subpath(models_dest)
    
    # set label
    context.set_label('class', model_pkg_class)
    
    # log models
    context.log_model("model", body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})
    
    # log scalers
    context.log_artifact("standard_scaler", body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")
    
    # log encoder
    context.log_artifact("label_encoder", body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")
    
    # set aside some test data
    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(test_set_key, 
                        df=pd.DataFrame(df_to_save, 
                                        columns=df_header), # improve log dataset ability
                        format=file_ext, index=False, 
                        labels={"data-type": "held-out"},
                        artifact_path=context.artifact_subpath('data'))
    
    context.logger.info("Done!")

In [5]:
# nuclio: end-code

### Save and Config

In [16]:
import mlrun
_, artifact_path = mlrun.set_environment(artifact_path='./')
fn = mlrun.code_to_function('sklearn-classifier-dask', kind='job', handler='train_model', 
                            description="train any classifier using scikit-learn's API over Dask", 
                            categories=["ml", "training", "dask"], 
                            labels={'author': 'yjb', "framework": "sklearn"},
                            code_output='.')

fn.export()

> 2021-01-26 15:31:31,836 [info] function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f25d2bc3a50>

### Init Dask

#### init a dask cluster and set dask specs

In [7]:
fn.apply(mlrun.platforms.auto_mount())
DATA_URL = "/User/iris.csv"

In [10]:
!curl -L "https://s3.wasabisys.com/iguazio/data/iris/iris_dataset.csv" > {DATA_URL}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2776  100  2776    0     0  16722      0 --:--:-- --:--:-- --:--:-- 16622


In [11]:
# create a dask test cluster (dask function)
dask_cluster = mlrun.new_function('dask_tests', kind='dask', image='mlrun/ml-models')
dask_cluster.apply(mlrun.mount_v3io())
dask_cluster.spec.remote = True
dask_cluster.with_requests(mem='8G')
dask_cluster.save()

> 2021-01-26 15:22:37,907 [info] using in-cluster config.


'0f3172545726617a0f38a9661e5d731315f0b4ca'

#### init dask client 
copy the scheduler address to **DASK_CLIENT** param in the following cell, this will make the function use the dask cluster.

In [12]:
dask_cluster.client

> 2021-01-26 15:22:45,162 [info] to get a dashboard link, use NodePort service_type
> 2021-01-26 15:22:45,163 [info] trying dask client at: tcp://mlrun-dask-tests-58d71599-7.default-tenant:8786
> 2021-01-26 15:22:45,197 [info] using remote dask scheduler (mlrun-dask-tests-58d71599-7) at: tcp://mlrun-dask-tests-58d71599-7.default-tenant:8786


0,1
Client  Scheduler: tcp://mlrun-dask-tests-58d71599-7.default-tenant:8786  Dashboard: http://mlrun-dask-tests-58d71599-7.default-tenant:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


### Set Parameters

In [13]:
task_params = {
    "params" : {
        "sample"             : 1,
        "train_val_split"    : 0.75,
        "random_state"       : 42,
        "n_jobs"             : -1,
        "plots_dest"         : "plots-p",
        "models_dest"        : 'sklearn-clfmodel'}}


models = [
    "sklearn.ensemble.RandomForestClassifier",
    "sklearn.ensemble.AdaBoostClassifier",
    "sklearn.linear_model.LogisticRegression"
]

### Test and Run

In [14]:
outputs = []
for model in models:
    task_copy = task_params.copy()
    task_copy.update(
        {
            "params":{ "model_pkg_class" : model,
                       "label_column"    : "label",
                       "dask_function": 'db://default/dask_tests'}
        }
    )
    
    # customize specific model parameters
    if "RandomForestClassifier" in model:
        task_copy["params"].update({"CLASS_max_depth" : 5})

    if "LogisticRegression" in model:
        task_copy["params"].update({"CLASS_solver" : "liblinear"})
    
    if "AdaBoostClassifier" in model:
        task_copy["params"].update({"CLASS_n_estimators"  : 200,
                                    "CLASS_learning_rate" : 0.01
                                   })
    
    name = model.replace('.', '_')
    output = fn.run(mlrun.NewTask(**task_copy),
                             handler=train_model,
                             name=name,
                             inputs={"dataset"      : DATA_URL},
                             artifact_path=os.path.join(artifact_path, model),
                             local=False)
    
    outputs.append({name: output.outputs})

> 2021-01-26 15:22:47,942 [info] starting run sklearn_ensemble_RandomForestClassifier uid=c536e15d7f804262a3b14305fbb8d47b DB=http://mlrun-api:8080
> 2021-01-26 15:22:48,160 [info] Job is running in the background, pod: sklearn-ensemble-randomforestclassifier-r48fw
> 2021-01-26 15:22:53,372 [info] using in-cluster config.
> 2021-01-26 15:22:53,372 [info] to get a dashboard link, use NodePort service_type
> 2021-01-26 15:22:53,372 [info] trying dask client at: tcp://mlrun-dask-tests-58d71599-7.default-tenant:8786
> 2021-01-26 15:22:53,404 [info] using remote dask scheduler (mlrun-dask-tests-58d71599-7) at: tcp://mlrun-dask-tests-58d71599-7.default-tenant:8786
> 2021-01-26 15:22:53,404 [info] Read Data
> 2021-01-26 15:22:53,426 [info] Prep Data
> 2021-01-26 15:22:57,916 [info] Split and Train
> 2021-01-26 15:22:59,187 [info] Evaluate
> 2021-01-26 15:23:00,997 [info] Log artifacts
> 2021-01-26 15:23:01,557 [info] Done!
> 2021-01-26 15:23:01,601 [info] run executed, status=completed
final 

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...fbb8d47b,0,Jan 26 15:22:53,completed,sklearn_ensemble_RandomForestClassifier,v3io_user=adminkind=jobowner=adminhost=sklearn-ensemble-randomforestclassifier-r48fwclass=sklearn.ensemble.RandomForestClassifier,dataset,model_pkg_class=sklearn.ensemble.RandomForestClassifierlabel_column=labeldask_function=db://default/dask_testsCLASS_max_depth=5,micro=1.0macro=1.0precision-2=1.0precision-1=1.0precision-0=1.0recall-2=1.0recall-1=1.0recall-0=1.0f1-2=1.0f1-1=1.0f1-0=1.0,ROCAUCClassificationReportConfusionMatrixFeatureImportancesmodelstandard_scalerlabel_encodertest_set


to track results use .show() or .logs() or in CLI: 
!mlrun get run c536e15d7f804262a3b14305fbb8d47b --project default , !mlrun logs c536e15d7f804262a3b14305fbb8d47b --project default
> 2021-01-26 15:23:03,504 [info] run executed, status=completed
> 2021-01-26 15:23:03,505 [info] starting run sklearn_ensemble_AdaBoostClassifier uid=3ebb600c11d542cda56fceaad8a5f32d DB=http://mlrun-api:8080
> 2021-01-26 15:23:03,704 [info] Job is running in the background, pod: sklearn-ensemble-adaboostclassifier-vhxtc
> 2021-01-26 15:23:08,963 [info] using in-cluster config.
> 2021-01-26 15:23:08,963 [info] to get a dashboard link, use NodePort service_type
> 2021-01-26 15:23:08,963 [info] trying dask client at: tcp://mlrun-dask-tests-58d71599-7.default-tenant:8786
> 2021-01-26 15:23:08,993 [info] using remote dask scheduler (mlrun-dask-tests-58d71599-7) at: tcp://mlrun-dask-tests-58d71599-7.default-tenant:8786
> 2021-01-26 15:23:08,993 [info] Read Data
> 2021-01-26 15:23:09,017 [info] Prep Data
> 2021-0

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...d8a5f32d,0,Jan 26 15:23:08,completed,sklearn_ensemble_AdaBoostClassifier,v3io_user=adminkind=jobowner=adminhost=sklearn-ensemble-adaboostclassifier-vhxtcclass=sklearn.ensemble.AdaBoostClassifier,dataset,model_pkg_class=sklearn.ensemble.AdaBoostClassifierlabel_column=labeldask_function=db://default/dask_testsCLASS_n_estimators=200CLASS_learning_rate=0.01,micro=0.9761080332409972macro=0.9848504273504274precision-2=1.0precision-0=0.8571428571428571precision-1=0.9230769230769231recall-2=1.0recall-0=0.9230769230769231recall-1=0.8571428571428571f1-2=1.0f1-0=0.888888888888889f1-1=0.888888888888889,ROCAUCClassificationReportConfusionMatrixFeatureImportancesmodelstandard_scalerlabel_encodertest_set


to track results use .show() or .logs() or in CLI: 
!mlrun get run 3ebb600c11d542cda56fceaad8a5f32d --project default , !mlrun logs 3ebb600c11d542cda56fceaad8a5f32d --project default
> 2021-01-26 15:23:29,053 [info] run executed, status=completed
> 2021-01-26 15:23:29,054 [info] starting run sklearn_linear_model_LogisticRegression uid=9c623457a4de47b3881cc1ef3dacbd24 DB=http://mlrun-api:8080
> 2021-01-26 15:23:29,247 [info] Job is running in the background, pod: sklearn-linear-model-logisticregression-pr8j9
> 2021-01-26 15:23:34,379 [info] using in-cluster config.
> 2021-01-26 15:23:34,380 [info] to get a dashboard link, use NodePort service_type
> 2021-01-26 15:23:34,380 [info] trying dask client at: tcp://mlrun-dask-tests-58d71599-7.default-tenant:8786
> 2021-01-26 15:23:34,405 [info] using remote dask scheduler (mlrun-dask-tests-58d71599-7) at: tcp://mlrun-dask-tests-58d71599-7.default-tenant:8786
> 2021-01-26 15:23:34,405 [info] Read Data
> 2021-01-26 15:23:34,424 [info] Prep Data


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...3dacbd24,0,Jan 26 15:23:34,completed,sklearn_linear_model_LogisticRegression,v3io_user=adminkind=jobowner=adminhost=sklearn-linear-model-logisticregression-pr8j9class=sklearn.linear_model.LogisticRegression,dataset,model_pkg_class=sklearn.linear_model.LogisticRegressionlabel_column=labeldask_function=db://default/dask_testsCLASS_solver=liblinear,micro=0.9916897506925207macro=0.9985119047619048precision-2=1.0precision-1=1.0precision-0=0.7857142857142857recall-2=1.0recall-1=0.7857142857142857recall-0=1.0f1-2=1.0f1-1=0.88f1-0=0.88,ROCAUCClassificationReportConfusionMatrixFeatureImportancesmodelstandard_scalerlabel_encodertest_set


to track results use .show() or .logs() or in CLI: 
!mlrun get run 9c623457a4de47b3881cc1ef3dacbd24 --project default , !mlrun logs 9c623457a4de47b3881cc1ef3dacbd24 --project default
> 2021-01-26 15:23:44,614 [info] run executed, status=completed
