In [9]:
from azureml.core import Workspace, Experiment
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.train.sklearn import SKLearn

In [10]:
# Sign in interactively against a specific Azure AD tenant, then bind to the
# Azure ML workspace that every later cell talks to.
interactive_auth = InteractiveLoginAuthentication(
    tenant_id="39288a38-ff19-432c-8011-1cd9d0dff445",
)
ws = Workspace(
    subscription_id="793146d9-d4dc-4a73-9728-76c4ffd0cc0d",
    resource_group="rg_dynamics_test",
    workspace_name="resdynml1test",
    auth=interactive_auth,
)

In [11]:
%%writefile ./src/train.py

from azureml.core import Run
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import recall_score, precision_score, hamming_loss, zero_one_loss
import os
import joblib

run = Run.get_context()

# Load data.
# The dataset is passed to this script as a named, mounted input, so the value
# looked up here is expected to be a filesystem path on the compute target.
data = run.input_datasets['oneperid_data']

# A mount point is the file itself for a single-file dataset but the enclosing
# folder otherwise. joblib.load() cannot open a folder, so resolve a folder to
# the single file it contains and fail loudly if that is ambiguous.
if os.path.isdir(data):
    candidates = sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(data)
        for name in names
    )
    if len(candidates) != 1:
        raise ValueError(
            "Expected exactly one file in dataset 'oneperid_data' mounted at "
            "{!r}, found {}".format(data, len(candidates))
        )
    data = candidates[0]

# NOTE(review): joblib.load() unpickles arbitrary objects; only point this at a
# dataset whose contents are trusted.
# The bundle unpacks into three objects; `pipes` is not used below.
train_data, test_data, pipes = joblib.load(data)

models = {}  # one fitted multi-output classifier per key of train_data
y_pred = []  # predicted label rows, pooled over all keys
y_true = []  # true label rows, in the same order as y_pred
for k in train_data:
    # Split train/test frames into features and targets by column-name prefix.
    X_train = train_data[k][[col for col in train_data[k].columns if col.startswith('feat')]]
    y_train = train_data[k][[col for col in train_data[k].columns if col.startswith('target')]]
    X_test = test_data[k][[col for col in test_data[k].columns if col.startswith('feat')]]
    y_test = test_data[k][[col for col in test_data[k].columns if col.startswith('target')]]

    # Train one classifier per key (one XGBoost model per target column).
    models[k] = MultiOutputClassifier(
        XGBClassifier(n_jobs=-1, max_depth=3, learning_rate=0.1, n_estimators=100, reg_alpha=0, reg_lambda=1)
    )
    # Fit the estimator that was just stored; there is no separate `model` name.
    models[k].fit(X_train, y_train)

    # Use extend() rather than `+=`: with an ndarray or DataFrame on the right,
    # `list += x` dispatches to element-wise addition instead of appending rows.
    y_pred.extend(models[k].predict(X_test))
    y_true.extend(y_test.values)

# Log test-set metrics computed over the pooled predictions of all models.
run.log('precision_macro', precision_score(y_true, y_pred, average='macro'))
run.log('precision_samples', precision_score(y_true, y_pred, average='samples'))
run.log('recall_macro', recall_score(y_true, y_pred, average='macro'))
# Sample-averaged recall gets its own metric name; logging it as 'recall_macro'
# a second time would mix two different averages under one name.
run.log('recall_samples', recall_score(y_true, y_pred, average='samples'))
run.log('hamming_loss', hamming_loss(y_true, y_pred))
run.log('zero_one_loss', zero_one_loss(y_true, y_pred))

# # evaluate train data
# y_pred = model.predict(X_train)
# run.log('precision_macro_train', precision_score(y_train, y_pred, average='macro'))
# run.log('precision_samples_train', precision_score(y_train, y_pred, average='samples'))
# run.log('recall_macro_train', recall_score(y_train, y_pred, average='macro'))
# run.log('recall_samples_train', recall_score(y_train, y_pred, average='samples'))
# run.log('hamming_loss_train', hamming_loss(y_train, y_pred))
# run.log('zero_one_loss_train', zero_one_loss(y_train, y_pred))

# Save the fitted models.
# `models` is the dict of per-key classifiers built by the training loop; the
# name `model` is never defined in this script, so dumping it raised NameError.
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=models, filename='outputs/model.pkl')

# Mark the run as finished.
run.complete()

Overwriting ./src/train.py


In [12]:
# Configure the training job: run src/train.py locally with the registered
# 'oneperid_data' dataset mounted as a named input.
# NOTE(review): the recorded run of this notebook failed with "Cannot mount
# dataset. Source of the dataset is either not accessible or does not contain
# any data." -- verify the dataset's source path before resubmitting.
est = SKLearn(
    entry_script='train.py',
    source_directory='src',
    inputs=[ws.datasets['oneperid_data'].as_named_input('oneperid_data').as_mount()],
    # train.py imports xgboost and joblib directly; request them explicitly
    # instead of relying on the estimator's default environment to have them.
    pip_packages=['xgboost', 'joblib'],
    compute_target='local',
)

In [13]:
# Submit the estimator under the 'ProductPredictionOnePerID' experiment and
# block until the run finishes, streaming its logs into the cell output.
exp = Experiment(workspace=ws, name='ProductPredictionOnePerID')
run = exp.submit(config=est)
run.wait_for_completion(show_output=True)

RunId: ProductPredictionOnePerID_1592245422_52d7151e
Web View: https://ml.azure.com/experiments/ProductPredictionOnePerID/runs/ProductPredictionOnePerID_1592245422_52d7151e?wsid=/subscriptions/793146d9-d4dc-4a73-9728-76c4ffd0cc0d/resourcegroups/rg_dynamics_test/workspaces/resdynml1test

Streaming azureml-logs/70_driver_log.txt

Entering context manager injector. Current time:2020-06-15T18:23:44.936384
Initialize DatasetContextManager.
Starting the daemon thread to refresh tokens in background for process with pid = 8
Set Dataset oneperid_data's target path to /tmp/tmpkjltj5rc
Enter __enter__ of DatasetContextManager
SDK version: azureml-core==1.7.0 azureml-dataprep==1.7.0
Processing 'oneperid_data'
Processing dataset FileDataset
{
  "source": [
    "https://resdynml1test6456542521.blob.core.windows.net/azureml/ExperimentRun/dcid.ProductPredictionOnePerID_1592243223_7bd77c20/outputs/data"
  ],
  "definition": [
    "GetFiles"
  ],
  "registration": {
    "id": "1b7b59d6-743d-492d-a5bd-2

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "ServiceError",
        "message": "Dataset initialization failed: Cannot mount dataset. Source of the dataset is either not accessible or does not contain any data.",
        "details": [],
        "debugInfo": {
            "type": "RuntimeError",
            "message": "Cannot mount dataset. Source of the dataset is either not accessible or does not contain any data.",
            "stackTrace": "  File \"azureml-setup/context_manager_injector.py\", line 62, in __enter__\n    self.context_manager.__enter__()\n  File \"/azureml-run/azureml-setup/context_managers.py\", line 237, in __enter__\n    self.datasets.__enter__()\n  File \"/azureml-envs/azureml_12c51bdabb987f6db1eeb8e263909841/lib/python3.6/site-packages/azureml/data/context_managers.py\", line 172, in __enter__\n    context_manager = dataset.mount(mount_point=target_path, mount_options=mount_options)\n  File \"/azureml-envs/azureml_12c51bdabb987f6db1eeb8e263909841/lib/python3.6/site-packages/azureml/data/_loggerfactory.py\", line 106, in wrapper\n    return func(*args, **kwargs)\n  File \"/azureml-envs/azureml_12c51bdabb987f6db1eeb8e263909841/lib/python3.6/site-packages/azureml/data/file_dataset.py\", line 203, in mount\n    'or does not contain any data.')\n"
        },
        "messageParameters": {}
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"ServiceError\",\n        \"message\": \"Dataset initialization failed: Cannot mount dataset. Source of the dataset is either not accessible or does not contain any data.\",\n        \"details\": [],\n        \"debugInfo\": {\n            \"type\": \"RuntimeError\",\n            \"message\": \"Cannot mount dataset. Source of the dataset is either not accessible or does not contain any data.\",\n            \"stackTrace\": \"  File \\\"azureml-setup/context_manager_injector.py\\\", line 62, in __enter__\\n    self.context_manager.__enter__()\\n  File \\\"/azureml-run/azureml-setup/context_managers.py\\\", line 237, in __enter__\\n    self.datasets.__enter__()\\n  File \\\"/azureml-envs/azureml_12c51bdabb987f6db1eeb8e263909841/lib/python3.6/site-packages/azureml/data/context_managers.py\\\", line 172, in __enter__\\n    context_manager = dataset.mount(mount_point=target_path, mount_options=mount_options)\\n  File \\\"/azureml-envs/azureml_12c51bdabb987f6db1eeb8e263909841/lib/python3.6/site-packages/azureml/data/_loggerfactory.py\\\", line 106, in wrapper\\n    return func(*args, **kwargs)\\n  File \\\"/azureml-envs/azureml_12c51bdabb987f6db1eeb8e263909841/lib/python3.6/site-packages/azureml/data/file_dataset.py\\\", line 203, in mount\\n    'or does not contain any data.')\\n\"\n        },\n        \"messageParameters\": {}\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}

In [6]:
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import recall_score, precision_score, hamming_loss, zero_one_loss
import os
import joblib

# Look up the registered dataset in the workspace and describe how a run
# should consume it: as a named input, delivered by mounting.
oneperid_dataset = ws.datasets['oneperid_data']
data = oneperid_dataset.as_named_input('oneperid_data').as_mount()

In [14]:
# Display the compute-side path recorded on the dataset consumption config
# (NOTE(review): no path was passed to as_mount(), so this may be unset -- confirm).
data.path_on_compute

In [None]:
# Load the pickled bundle into the notebook kernel so the cells below work on a
# fresh kernel; nothing earlier in the notebook defines these three names.
local_files = ws.datasets['oneperid_data'].download()
if len(local_files) != 1:
    raise ValueError(
        "Expected dataset 'oneperid_data' to contain exactly one file, "
        "found {}".format(len(local_files))
    )
# NOTE(review): joblib.load() unpickles arbitrary objects; only use it on a
# dataset whose contents are trusted.
train_data, test_data, pipes = joblib.load(local_files[0])
train_data, test_data, pipes

In [None]:
models = {}  # one fitted multi-output classifier per key of train_data
y_pred = []  # predicted label rows, pooled over all keys
y_true = []  # true label rows, in the same order as y_pred

for k in train_data:
    # Split train/test frames into features and targets by column-name prefix.
    X_train = train_data[k][[col for col in train_data[k].columns if col.startswith('feat')]]
    y_train = train_data[k][[col for col in train_data[k].columns if col.startswith('target')]]
    X_test = test_data[k][[col for col in test_data[k].columns if col.startswith('feat')]]
    y_test = test_data[k][[col for col in test_data[k].columns if col.startswith('target')]]

    # Train one classifier per key (one XGBoost model per target column).
    models[k] = MultiOutputClassifier(
        XGBClassifier(n_jobs=-1, max_depth=3, learning_rate=0.1, n_estimators=100, reg_alpha=0, reg_lambda=1)
    )
    # Fit the estimator that was just stored; there is no separate `model` name.
    models[k].fit(X_train, y_train)

    # Use extend() rather than `+=`: with an ndarray or DataFrame on the right,
    # `list += x` dispatches to element-wise addition instead of appending rows.
    y_pred.extend(models[k].predict(X_test))
    y_true.extend(y_test.values)

In [None]:
# Report test-set metrics computed over the pooled predictions of all models.
print('precision_macro', precision_score(y_true, y_pred, average='macro'))
print('precision_samples', precision_score(y_true, y_pred, average='samples'))
print('recall_macro', recall_score(y_true, y_pred, average='macro'))
# The sample-averaged recall was mislabelled 'recall_macro'; give it its own label.
print('recall_samples', recall_score(y_true, y_pred, average='samples'))
print('hamming_loss', hamming_loss(y_true, y_pred))
print('zero_one_loss', zero_one_loss(y_true, y_pred))