In [1]:
from azureml.core import Workspace, Experiment, Environment, Dataset
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.train.estimator import Estimator
import json
import os

In [2]:
interactive_auth = InteractiveLoginAuthentication(tenant_id="39288a38-ff19-432c-8011-1cd9d0dff445")
ws = Workspace(subscription_id="793146d9-d4dc-4a73-9728-76c4ffd0cc0d", resource_group="rg_dynamics_test", workspace_name="resdynml1test", auth=interactive_auth)

In [3]:
# load experiment cfg
with open("experiment_cfg.json", "r") as cfg_file:
    cfg = json.load(cfg_file)

## Preprocess Scripts

In [4]:
%%writefile ./src/pipe.py

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
import numpy as np

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names, dtype):
        self.attribute_names = attribute_names
        self.dtype = dtype
    def fit(self, X, y=None):
        return self        
    def transform(self, X):
        return X[self.attribute_names].astype(self.dtype).values

class MultiHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, delimiter=None):
        self.delimiter = delimiter
    def fit(self, X, y=None):
        self.col_cats = {}
        for col in range(X.shape[1]):
            cats = set()
            for row in range(X.shape[0]):
                if self.delimiter:
                    for cat in X[row,col].split(self.delimiter):
                        if not cat.strip() == '':
                            cats.add(cat.strip())
                else:
                    cats.add(X[row,col])
            self.col_cats[col] = list(cats)
        return self
    def transform(self, X):
        X_tr = []
        for col in range(X.shape[1]):
            X_enc = np.zeros([X.shape[0], len(self.col_cats[col])])
            for row in range(X.shape[0]):
                if self.delimiter:
                    cats = str(X[row,col]).split(self.delimiter)
                    for col_cat_idx in range(len(self.col_cats[col])):
                        if self.col_cats[col][col_cat_idx] in cats:
                            X_enc[row, col_cat_idx] = 1
                else:
                    for col_cat_idx in range(len(self.col_cats[col])):
                        if self.col_cats[col][col_cat_idx] == X[row,col]:
                            X_enc[row, col_cat_idx] = 1
            X_enc = np.array(X_enc)
            X_tr.append(X_enc)
        X_tr = np.concatenate(X_tr, axis=1)
        return X_tr
    
def create_pipelines(cfg):
    
    # Pipeline for multilabel features
    multi_pipe = Pipeline([
        ('multi_feat_select', DataFrameSelector(cfg['multi_cols'], str)),
        ('multi_encode', MultiHotEncoder(delimiter=' '))
    ])
    
    cat_pipe = Pipeline([
        ('cat_feat_select', DataFrameSelector(cfg['cat_cols'], str)),
        ('cat_encode', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    ##############################################
    
    # combine features
    feat_union = FeatureUnion([
        ('multi_features', multi_pipe),
        ('cat_features', cat_pipe)
    ])
    
    # preprocess all features
    all_feat_pipe = Pipeline([
        ('all_features_pipe', feat_union),
#         ('all_feautres_pca', PCA(n_components=0.8, svd_solver = 'full'))
    ])
    
    ###########################################################################################
    
    # Pipeline for multi target cols
    multi_target_pipe = Pipeline([
        ('target_select', DataFrameSelector(cfg['multi_target_cols'], str)),
        ('target_encode', MultiHotEncoder(delimiter=' '))
    ])

    # Pipeline for numerical target cols
    num_target_pipe = Pipeline([
        ('num_feature_select', DataFrameSelector(cfg['num_target_cols'], float))
    ])
    
    ##############################################
    
    target_union = FeatureUnion([
        ('num_targets', num_target_pipe),
        ('multi_targets', multi_target_pipe)
    ])
    
    all_target_pipe = Pipeline([
        ('all_targets_pipe', target_union)
    ])
    
    ###########################################################################################

    return { 'feature_pipe': all_feat_pipe, 'target_pipe': all_target_pipe }

Overwriting ./src/pipe.py


In [14]:
%%writefile ./src/preprocess.py

from azureml.core import Run

import datetime
import os
import pandas as pd
import joblib
from argparse import ArgumentParser

from pipe import create_pipelines

run = Run.get_context()

parser = ArgumentParser()
parser.add_argument('--input', dest='prepared_data')
parser.add_argument('--output', dest='preprocessed_data')
args = parser.parse_args()

# load datasets
if args.prepared_data:
    df = pd.read_csv(args.prepared_data + '/prepared_data.csv', sep=';', header=0)
else:
    df = run.input_datasets['df_prepared'].to_pandas_dataframe()

print('\n#######################################################################')
print('data loaded')
print('\n#######################################################################')
    
##############################################################################

# split data (test data from last t_test years)
t_test = 0.5
df_train = df[df['Start']<(datetime.datetime.today() - datetime.timedelta(days=t_test*365))]
df_test = df[df['Start']>=(datetime.datetime.today() - datetime.timedelta(days=t_test*365))]

print('\n#######################################################################')
print('train test split')
print('\n#######################################################################')

##############################################################################

# select columns for training
cfg = {}
cfg['multi_cols'] = ['Symptoms', 'ProductId']
cfg['cat_cols'] = ['Country', 'City', 'LocationType', 'PostalCode', 'Month', 'Daytime', 'Weekday']
cfg['num_target_cols'] = ['duration']
cfg['multi_target_cols'] = ['ProductNr']

# create pipeline
pipelines = create_pipelines(cfg)

# fit pipelines and transform data
X_train = pipelines['feature_pipe'].fit_transform(df_train)
print('\n#######################################################################')
print('features fitted')
print('\n#######################################################################')
y_train = pipelines['target_pipe'].fit_transform(df_train)
print('\n#######################################################################')
print('targets fitted')
print('\n#######################################################################')
X_test = pipelines['feature_pipe'].transform(df_test)
y_test = pipelines['target_pipe'].transform(df_test)
print('\n#######################################################################')
print('pipelines created')
print('\n#######################################################################')

##############################################################################

# rename columns
feature_columns = [ 'feat_'+ str(i) for i in range(X_train.shape[1])]
target_columns = [ 'target_'+ str(i) for i in range(y_train.shape[1])]

print('\n#######################################################################')
print(X_train.shape)
print(len(feature_columns))
print('\n#######################################################################')

df_train = pd.concat([
    pd.DataFrame(X_train, columns=feature_columns),
    pd.DataFrame(y_train, columns=target_columns)
], axis=1)

df_test = pd.concat([
    pd.DataFrame(X_test, columns=feature_columns),
    pd.DataFrame(y_test, columns=target_columns)
], axis=1)

##############################################################################

# save train and test data
path = args.preprocessed_data if args.preprocessed_data else './outputs'
os.makedirs(path, exist_ok=True)
df_train.to_csv(path + '/train_data.csv', sep=';', header=True, index=False)
df_test.to_csv(path + '/test_data.csv', sep=';', header=True, index=False)

# save pipelines
os.makedirs('outputs', exist_ok=True)
joblib.dump(pipelines, './outputs/pipelines.pkl')

run.complete()

Overwriting ./src/preprocess.py


## Create Estimator

In [15]:
est = Estimator(entry_script='preprocess.py', source_directory='src', 
              inputs=[ws.datasets[cfg['prepared_data_dataset']].as_named_input('df_prepared')],
              compute_target=cfg['compute_target'], environment_definition=ws.environments[cfg['env_name']])

## Run Experiment

In [16]:
exp = Experiment(ws, cfg['experiment_name'])
run = exp.submit(est)
run.wait_for_completion(show_output=True)

RunId: KerasPrediction_1592898919_1225b647
Web View: https://ml.azure.com/experiments/KerasPrediction/runs/KerasPrediction_1592898919_1225b647?wsid=/subscriptions/793146d9-d4dc-4a73-9728-76c4ffd0cc0d/resourcegroups/rg_dynamics_test/workspaces/resdynml1test

Streaming azureml-logs/55_azureml-execution-tvmps_b0edb5e64fa322497e56ffeafee6df167f403074c5edd6f5a147978e6c1e2da8_d.txt

2020-06-23T08:00:07Z Executing 'Copy ACR Details file' on 10.0.0.4
2020-06-23T08:00:07Z Copy ACR Details file succeeded on 10.0.0.4. Output: 
>>>   
>>>   
2020-06-23T08:00:07Z Starting output-watcher...
2020-06-23T08:00:07Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_c14e68a5a54beac144cd751fe11b91c5
a1298f4ce990: Pulling fs layer
04a3282d9c4b: Pulling fs layer
9b0d3db6dc03: Pulling fs layer
8269c605f3f1: Pulling fs layer
6504d449e70c: Pulling fs layer
4e38f320d0d4: Pulling fs layer
b0a763e8ee03: Pulling fs layer
6504

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "User program failed with ValueError: Shape of passed values is (134561, 1), indices imply (134561, 34503)",
        "detailsUri": "https://aka.ms/azureml-known-errors",
        "details": [],
        "debugInfo": {
            "type": "ValueError",
            "message": "Shape of passed values is (134561, 1), indices imply (134561, 34503)",
            "stackTrace": "  File \"/mnt/batch/tasks/shared/LS_root/jobs/resdynml1test/azureml/kerasprediction_1592898919_1225b647/mounts/workspaceblobstore/azureml/KerasPrediction_1592898919_1225b647/azureml-setup/context_manager_injector.py\", line 148, in execute_with_context\n    runpy.run_path(sys.argv[0], globals(), run_name=\"__main__\")\n  File \"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/runpy.py\", line 263, in run_path\n    pkg_name=pkg_name, script_name=fname)\n  File \"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/runpy.py\", line 96, in _run_module_code\n    mod_name, mod_spec, pkg_name, script_name)\n  File \"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/runpy.py\", line 85, in _run_code\n    exec(code, run_globals)\n  File \"preprocess.py\", line 79, in <module>\n    pd.DataFrame(X_train, columns=feature_columns),\n  File \"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/site-packages/pandas/core/frame.py\", line 488, in __init__\n    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)\n  File \"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/site-packages/pandas/core/internals/construction.py\", line 210, in init_ndarray\n    return create_block_manager_from_blocks(block_values, [columns, index])\n  File \"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/site-packages/pandas/core/internals/managers.py\", line 1664, in create_block_manager_from_blocks\n    construction_error(tot_items, blocks[0].shape[1:], axes, e)\n  File \"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/site-packages/pandas/core/internals/managers.py\", line 1694, in construction_error\n    raise ValueError(f\"Shape of passed values is {passed}, indices imply {implied}\")\n"
        },
        "messageParameters": {}
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"User program failed with ValueError: Shape of passed values is (134561, 1), indices imply (134561, 34503)\",\n        \"detailsUri\": \"https://aka.ms/azureml-known-errors\",\n        \"details\": [],\n        \"debugInfo\": {\n            \"type\": \"ValueError\",\n            \"message\": \"Shape of passed values is (134561, 1), indices imply (134561, 34503)\",\n            \"stackTrace\": \"  File \\\"/mnt/batch/tasks/shared/LS_root/jobs/resdynml1test/azureml/kerasprediction_1592898919_1225b647/mounts/workspaceblobstore/azureml/KerasPrediction_1592898919_1225b647/azureml-setup/context_manager_injector.py\\\", line 148, in execute_with_context\\n    runpy.run_path(sys.argv[0], globals(), run_name=\\\"__main__\\\")\\n  File \\\"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/runpy.py\\\", line 263, in run_path\\n    pkg_name=pkg_name, script_name=fname)\\n  File \\\"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/runpy.py\\\", line 96, in _run_module_code\\n    mod_name, mod_spec, pkg_name, script_name)\\n  File \\\"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/runpy.py\\\", line 85, in _run_code\\n    exec(code, run_globals)\\n  File \\\"preprocess.py\\\", line 79, in <module>\\n    pd.DataFrame(X_train, columns=feature_columns),\\n  File \\\"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/site-packages/pandas/core/frame.py\\\", line 488, in __init__\\n    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)\\n  File \\\"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/site-packages/pandas/core/internals/construction.py\\\", line 210, in init_ndarray\\n    return create_block_manager_from_blocks(block_values, [columns, index])\\n  File \\\"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/site-packages/pandas/core/internals/managers.py\\\", line 1664, in create_block_manager_from_blocks\\n    construction_error(tot_items, blocks[0].shape[1:], axes, e)\\n  File \\\"/azureml-envs/azureml_42df74e95cf2de1f301b9fba9e8035c0/lib/python3.6/site-packages/pandas/core/internals/managers.py\\\", line 1694, in construction_error\\n    raise ValueError(f\\\"Shape of passed values is {passed}, indices imply {implied}\\\")\\n\"\n        },\n        \"messageParameters\": {}\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}