## split data into train, validation and test sets

In [1]:
import mlrun
import os
import numpy as np
# Point the mlrun client at the API/DB service; the run logs in this notebook
# show each run being reported to this same URL.
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

## parameters

In [14]:
# Directory holding the function sources/specs that this notebook imports.
CODE_BASE = '/User/repos/functions/datagen'

# Size of the simulated dataset: rows and feature columns.
N_SAMPLES = 100_000
M_FEATURES = 28

# Passed to the generator as `weight`.
NEG_WEIGHT = 0.5

# Seed handed to both the generator and the splitter as `random_state`.
RNG = 1

# Where the generated dataset and the resulting splits are written.
TARGET_DATA_PATH = '/User/mlrun/splitter'

# File name and artifact key of the generated dataset.
SRC_FILE = 'simdata.pqt'
KEY = 'simdata'

## generate some binary classification data

In [15]:
# Load the binary-classification data generator from its exported spec and
# apply the v3io mount modifier to it.
binarydatagen = mlrun.import_function(
    os.path.join(CODE_BASE, 'classification', 'binary.yaml')
).apply(mlrun.mount_v3io())

# NOTE(review): skip_deployed=True presumably reuses an already-built image
# instead of rebuilding it -- confirm against the mlrun docs.
binarydatagen.deploy(skip_deployed=True)

# Parameters for the generator handler; the dataset is written under
# TARGET_DATA_PATH and logged as an artifact under KEY.
task1 = mlrun.NewTask()
task1.with_params(
    n_samples=N_SAMPLES,
    m_features=M_FEATURES,
    weight=NEG_WEIGHT,
    target_path=TARGET_DATA_PATH,
    # os.path.join rather than manual '/' concatenation: consistent with the
    # other paths in this notebook and tolerant of a trailing slash.
    filename=os.path.join(TARGET_DATA_PATH, SRC_FILE),
    key=KEY,
    random_state=RNG)

# Run the generator; later cells read the artifact path from tsk1.outputs.
tsk1 = binarydatagen.run(task1, handler='create_binary_classification')

[mlrun] 2020-01-26 12:00:09,764 starting run create_binary_classification uid=561adae458ac4df1a16d7ac371d2e450  -> http://mlrun-api:8080
[mlrun] 2020-01-26 12:00:09,844 Job is running in the background, pod: create-binary-classification-qqm6z
[mlrun] 2020-01-26 12:00:22,759 log artifact simdata at /User/mlrun/splitter/simdata.pqt, size: None, db: Y

  result = infer_dtype(pandas_collection)
[mlrun] 2020-01-26 12:00:22,773 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...d2e450,0,Jan 26 12:00:21,completed,binary,host=create-binary-classification-qqm6zkind=jobowner=admin,,filename=/User/mlrun/splitter/simdata.pqtkey=simdatam_features=28n_samples=100000random_state=1target_path=/User/mlrun/splitterweight=0.5,,simdata


to track results use .show() or .logs() or in CLI: 
!mlrun get run 561adae458ac4df1a16d7ac371d2e450  , !mlrun logs 561adae458ac4df1a16d7ac371d2e450 
[mlrun] 2020-01-26 12:00:29,005 run executed, status=completed


In [16]:
# Display the generator run's outputs: a mapping from artifact key to the
# path where the artifact was written.
tsk1.outputs

{'simdata': '/User/mlrun/splitter/simdata.pqt'}

## split the data

In [28]:
# Export the splitter's function spec once: when the yaml is already on disk
# this cell does nothing, otherwise the spec is built from the python source.
yaml_name = os.path.join(CODE_BASE, 'splitters', 'train_valid_test.yaml')
if not os.path.isfile(yaml_name):
    splitter_source = os.path.join(CODE_BASE, 'splitters', 'train_valid_test.py')
    testfn = mlrun.code_to_function(
        filename=splitter_source,
        kind='job',
        image='yjbds/mlrun-ds:latest',
    )
    # No extra build commands; the base image is the same one the job runs on.
    testfn.build_config(commands=[], base_image='yjbds/mlrun-ds:latest')
    testfn.export(yaml_name)

[mlrun] 2020-01-26 13:01:10,237 function spec saved to path: /User/repos/functions/datagen/splitters/train_valid_test.yaml


In [18]:
# Load the splitter function from its exported spec and apply the v3io mount.
splitter_spec_path = os.path.join(CODE_BASE, 'splitters', 'train_valid_test.yaml')
splitter = mlrun.import_function(splitter_spec_path).apply(mlrun.mount_v3io())

In [19]:
# Deploy the splitter; the call's return value is displayed as the cell output.
# NOTE(review): skip_deployed=True presumably avoids rebuilding an existing
# image, and with_mlrun=False presumably skips installing mlrun into it
# (the base image name suggests it is already there) -- confirm.
splitter.deploy(skip_deployed=True, with_mlrun=False)

'ready'

In [20]:
# Feed the dataset generated by the previous run into the splitter, writing
# the splits to the same target directory with the same seed.
task2 = mlrun.NewTask()
splitter_params = dict(
    src_file=tsk1.outputs['simdata'],
    target_path=TARGET_DATA_PATH,
    random_state=RNG,
)
task2.with_params(**splitter_params)

# Later cells read the split artifact paths from tsk2.outputs.
tsk2 = splitter.run(task2, handler='train_valid_test_splitter')

[mlrun] 2020-01-26 12:00:29,121 starting run train_valid_test_splitter uid=5be904e5c6a14a9ba59e016e212f4499  -> http://mlrun-api:8080
[mlrun] 2020-01-26 12:00:29,206 Job is running in the background, pod: train-valid-test-splitter-kqfdd
[mlrun] 2020-01-26 12:00:39,440 log artifact header at /User/mlrun/splitter/header.pkl, size: None, db: Y
[mlrun] 2020-01-26 12:00:39,753 log artifact xtrain at /User/mlrun/splitter/xtrain.pqt, size: None, db: Y
[mlrun] 2020-01-26 12:00:39,889 log artifact xvalid at /User/mlrun/splitter/xvalid.pqt, size: None, db: Y
[mlrun] 2020-01-26 12:00:40,042 log artifact xtest at /User/mlrun/splitter/xtest.pqt, size: None, db: Y
[mlrun] 2020-01-26 12:00:40,073 log artifact ytrain at /User/mlrun/splitter/ytrain.pqt, size: None, db: Y
[mlrun] 2020-01-26 12:00:40,100 log artifact yvalid at /User/mlrun/splitter/yvalid.pqt, size: None, db: Y
[mlrun] 2020-01-26 12:00:40,124 log artifact ytest at /User/mlrun/splitter/ytest.pqt, size: None, db: Y

[mlrun] 2020-01-26 12:00

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...2f4499,0,Jan 26 12:00:39,completed,train-valid-test,host=train-valid-test-splitter-kqfddkind=jobowner=admin,,random_state=1src_file=/User/mlrun/splitter/simdata.pqttarget_path=/User/mlrun/splitter,,headerxtrainxvalidxtestytrainyvalidytest


to track results use .show() or .logs() or in CLI: 
!mlrun get run 5be904e5c6a14a9ba59e016e212f4499  , !mlrun logs 5be904e5c6a14a9ba59e016e212f4499 
[mlrun] 2020-01-26 12:00:48,372 run executed, status=completed


In [21]:
# Display the splitter run's outputs: the header pickle plus the x/y parquet
# files for the train, validation and test splits, keyed by artifact name.
tsk2.outputs

{'header': '/User/mlrun/splitter/header.pkl',
 'xtrain': '/User/mlrun/splitter/xtrain.pqt',
 'xvalid': '/User/mlrun/splitter/xvalid.pqt',
 'xtest': '/User/mlrun/splitter/xtest.pqt',
 'ytrain': '/User/mlrun/splitter/ytrain.pqt',
 'yvalid': '/User/mlrun/splitter/yvalid.pqt',
 'ytest': '/User/mlrun/splitter/ytest.pqt'}

## tests

In [22]:
import pandas as pd

In [23]:
# The split sizes come out one sample short of the nominal figure, so the
# expected train row count is adjusted by ERROR (rounding error of one sample).
ERROR = -1

xtrain_shape = pd.read_parquet(tsk2.outputs['xtrain'], engine='pyarrow').shape
ytrain_shape = pd.read_parquet(tsk2.outputs['ytrain'], engine='pyarrow').shape

# 10% of the samples are held out for test; 75% of what remains is expected
# in the training set (presumably the rest is the validation split).
expected_xtrain_shape = (int(.75*(N_SAMPLES*(1-.1)))+ERROR, M_FEATURES)
assert xtrain_shape == expected_xtrain_shape, "xtrain doesn't have the expected shape"

# Labels must line up row-for-row with the features and be a single column.
ytrain_rows, ytrain_cols = ytrain_shape
assert ytrain_rows == xtrain_shape[0], "ytrain and xtrain have different shapes"
assert ytrain_cols == 1, "ytrain (labels) has more than 1 column"

In [24]:
# The test split is expected to hold 10% of the samples with every feature.
xtest_shape = pd.read_parquet(tsk2.outputs['xtest'], engine='pyarrow').shape
ytest_shape = pd.read_parquet(tsk2.outputs['ytest'], engine='pyarrow').shape

expected_xtest_shape = (int(N_SAMPLES*.1), M_FEATURES)
assert xtest_shape == expected_xtest_shape,  "xtest doesn't have the expected shape"

# Labels must line up row-for-row with the features and be a single column.
ytest_rows, ytest_cols = ytest_shape
assert ytest_rows == xtest_shape[0], "ytest and xtest have different shapes"
assert ytest_cols == 1, "ytest (test labels) has more than 1 column"

In [25]:
from cloudpickle import load

In [26]:
# The pickled header should contain one entry per feature.
# Open the file in a `with` block so the handle is closed deterministically
# rather than leaked until garbage collection.
# NOTE: unpickling executes arbitrary code -- only load files written by
# trusted runs such as the splitter job above.
with open(tsk2.outputs['header'], 'rb') as header_file:
    header = load(header_file)
assert len(header) == M_FEATURES, "header length doesn't match the number of features"