In [1]:
from azureml.core import Workspace, Experiment, Environment, RunConfiguration
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.train.sklearn import SKLearn
import datetime
import numpy as np
import pandas as pd

In [2]:
# Interactive login against a fixed Azure AD tenant.
TENANT_ID = "39288a38-ff19-432c-8011-1cd9d0dff445"
interactive_auth = InteractiveLoginAuthentication(tenant_id=TENANT_ID)

# Workspace handle; every later cell refers to it as `ws`.
workspace_settings = {
    "subscription_id": "793146d9-d4dc-4a73-9728-76c4ffd0cc0d",
    "resource_group": "rg_dynamics_test",
    "workspace_name": "resdynml1test",
}
ws = Workspace(auth=interactive_auth, **workspace_settings)

## Scripts

In [3]:
%%writefile ./src/pipe.py

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
import numpy as np

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts columns from a DataFrame.

    Parameters
    ----------
    attribute_names : list
        Column labels to select from the incoming frame.
    dtype : type
        Type the selected columns are cast to before conversion to an array.
    """

    def __init__(self, attribute_names, dtype):
        self.attribute_names = attribute_names
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X``, cast to ``dtype``, as an array."""
        selected = X[self.attribute_names]
        converted = selected.astype(self.dtype)
        return converted.values

class MultiHotEncoder(BaseEstimator, TransformerMixin):
    """Encode each column of a 2-D array as a block of 0/1 indicator columns.

    With a ``delimiter`` every cell is treated as a delimited list of
    categories (multi-hot); without one the whole cell value is a single
    category (one-hot).

    Parameters
    ----------
    delimiter : str or None
        Separator between categories inside one cell. A falsy value
        (``None`` or ``''``) disables splitting.

    Attributes
    ----------
    col_cats : dict
        Set by ``fit``. Maps the column position to the list of categories
        learned for it; the list order is the order of the output columns.
    """

    def __init__(self, delimiter=None):
        self.delimiter = delimiter

    def _tokens(self, value):
        """Return the categories contained in a single cell.

        Used by both ``fit`` and ``transform`` so that the two always agree
        on how a cell is split (string conversion, whitespace stripping and
        dropping of empty tokens).
        """
        if self.delimiter:
            stripped = (tok.strip() for tok in str(value).split(self.delimiter))
            return [tok for tok in stripped if tok != '']
        return [value]

    def fit(self, X, y=None):
        """Learn the categories of every column of ``X``.

        ``y`` is ignored. Returns ``self``.
        """
        self.col_cats = {}
        for col in range(X.shape[1]):
            cats = set()
            for row in range(X.shape[0]):
                cats.update(self._tokens(X[row, col]))
            # Sort so the output column order does not depend on set
            # iteration order (which varies between interpreter runs for
            # strings); key=str keeps mixed-type categories sortable.
            self.col_cats[col] = sorted(cats, key=str)
        return self

    def transform(self, X):
        """Return the indicator matrix for ``X``.

        The result has one row per row of ``X`` and, per input column, one
        output column for each category learned in ``fit``. Categories not
        seen during ``fit`` are ignored.
        """
        encoded_blocks = []
        for col in range(X.shape[1]):
            # category -> position lookup instead of scanning the list per cell
            position = {cat: idx for idx, cat in enumerate(self.col_cats[col])}
            X_enc = np.zeros([X.shape[0], len(position)])
            for row in range(X.shape[0]):
                for tok in self._tokens(X[row, col]):
                    idx = position.get(tok)
                    if idx is not None:
                        X_enc[row, idx] = 1
            encoded_blocks.append(X_enc)
        if not encoded_blocks:
            # np.concatenate rejects an empty list; a zero-column input
            # simply produces a zero-column output.
            return np.zeros([X.shape[0], 0])
        return np.concatenate(encoded_blocks, axis=1)
    
def create_pipelines(cfg):
    """Build the unfitted preprocessing pipelines for features and targets.

    Parameters
    ----------
    cfg : dict
        Must provide the column lists ``'multi_cols'``,
        ``'multi_target_cols'`` and ``'num_target_cols'``.

    Returns
    -------
    tuple
        ``(all_feat_pipe, all_target_pipe)``. In the target union the
        numerical targets come first, followed by the multi-hot targets.
    """
    # --- features: space-delimited multi-label columns -> multi-hot ---
    multi_feature_steps = [
        ('multi_feat_select', DataFrameSelector(cfg['multi_cols'], str)),
        ('multi_encode', MultiHotEncoder(delimiter=' ')),
    ]
    multi_pipe = Pipeline(multi_feature_steps)

    # A union of a single branch for now.
    feat_union = FeatureUnion([('multi_features', multi_pipe)])

    # Final feature pipeline (an optional PCA step is currently disabled):
    #   ('all_features_pca', PCA(n_components=0.8, svd_solver='full'))
    all_feat_pipe = Pipeline([('all_features_pipe', feat_union)])

    # --- targets ---
    multi_target_steps = [
        ('target_select', DataFrameSelector(cfg['multi_target_cols'], str)),
        ('target_encode', MultiHotEncoder(delimiter=' ')),
    ]
    multi_target_pipe = Pipeline(multi_target_steps)

    num_target_steps = [
        ('num_feature_select', DataFrameSelector(cfg['num_target_cols'], float)),
    ]
    num_target_pipe = Pipeline(num_target_steps)

    target_branches = [
        ('num_targets', num_target_pipe),
        ('multi_targets', multi_target_pipe),
    ]
    all_target_pipe = FeatureUnion(target_branches)

    return all_feat_pipe, all_target_pipe

Overwriting ./src/pipe.py


In [4]:
%%writefile ./src/prep.py

from azureml.core import Run
import pandas as pd
import datetime
from pipe import create_pipelines
import os
import numpy as np
import joblib
from argparse import ArgumentParser

# Time windows, expressed as fractions of a year.
t = 0.5        # how far back job cards are considered at all
t_test = 0.1   # most recent slice that is held out as test data

run = Run.get_context()

# Command-line arguments of this step.
parser = ArgumentParser()
for flag, destination in (('--output', 'prepared_data'),
                          ('--pipeline_data', 'pipeline_data')):
    parser.add_argument(flag, dest=destination)
args = parser.parse_args()

# load datasets (looked up by the names they were given as step inputs)
named_inputs = run.input_datasets
df_symptoms = named_inputs['symptomcodes'].to_pandas_dataframe()
df = named_inputs['df'].to_pandas_dataframe()

###########################################################

# get only data from last t years
history_start = datetime.datetime.today() - datetime.timedelta(days=t*365)
is_recent = df['Job Card.Date Start Work'] > history_start
df = df[is_recent]

############################################################

# clean data: placeholder strings count as missing, incomplete rows are dropped
placeholder_values = ['', '0', '-', '000','N/A']
df = (
    df.replace(placeholder_values, np.nan)
      .dropna()
      .reset_index(drop=True)
)

#############################################################################

# combine Component/Failure Code in train data into one tuple-valued key column
comp_fail_keys = df.apply(lambda x: (x['Job Card.ComponentCode'],x['Job Card.FailureCode']), axis=1)
df = pd.concat([df, pd.DataFrame(comp_fail_keys, columns=['CompFail'])], axis=1)

# combine Component/Failure Code in symptom table the same way
symptom_cols = ['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']
df_symptoms = df_symptoms[['ComponentCode', 'FailureCode'] + symptom_cols]
symptom_keys = df_symptoms.apply(lambda x: (x['ComponentCode'],x['FailureCode']),axis=1)
df_symptoms = pd.concat([df_symptoms, pd.DataFrame(symptom_keys, columns=['CompFail'])],axis=1)

# merge train data on symptoms (left join: job rows without a matching
# component/failure pair are kept and get missing symptom values)
df = pd.merge(df, df_symptoms, on='CompFail', how='left')


def _present_symptoms(row):
    """Return the non-missing symptom values of one row as a tuple.

    Besides the 'None' placeholder this also skips NaN: the left join above
    fills unmatched rows with NaN, which would otherwise end up as a literal
    'nan' symptom once the values are joined into a string.
    """
    return tuple(
        row[col] for col in symptom_cols
        if pd.notna(row[col]) and str(row[col]) != 'None'
    )


symptom_tuples = df[symptom_cols].apply(_present_symptoms, axis=1)
df = pd.concat([df, pd.DataFrame(symptom_tuples, columns=['Symptoms'])], axis=1)

# merge into one row per case (job card):
#   ProductNr - space-joined distinct product numbers of the job
#   Symptoms  - space-joined distinct symptoms (the per-row tuples are
#               concatenated by .sum() and de-duplicated via set)
#   Start/End - earliest start and latest end of work
df = df.groupby('Job Card.JobCard Number').apply(lambda x: pd.Series({
    'ProductNr': ' '.join(x['Product.Product Number'].unique()),
    'Symptoms': ' '.join(map(str, list(set(x['Symptoms'].sum())))),
    'Start': x['Job Card.Date Start Work'].min(),
    'End': x['Job Card.Date End Work'].max()
  })).reset_index()

# duration of each job in hours
df = pd.concat([df, pd.DataFrame((df['End'] - df['Start']), columns=['duration'])],axis=1)
# total_seconds() covers the full span; `.seconds` is only the sub-day
# component of a timedelta, so jobs longer than 24h were reported too short.
df['duration'] = df['duration'].apply(lambda x: x.total_seconds() / 3600)

##############################################################################

# split data (test data = jobs started within the last t_test years)
# The cut-off is computed once: evaluating today() separately for each side
# yields two slightly different timestamps, and a row starting between them
# would belong to neither the train nor the test set.
test_cutoff = datetime.datetime.today() - datetime.timedelta(days=t_test*365)
df_train = df[df['Start'] < test_cutoff]
df_test = df[df['Start'] >= test_cutoff]

##############################################################################

# select columns for training
cfg = {
    'multi_cols': ['Symptoms'],            # multi-label feature column(s)
    'num_target_cols': ['duration'],       # numerical target(s)
    'multi_target_cols': ['ProductNr'],    # multi-label target(s)
}

# Both pipelines are kept in one dict so they can be stored as a single artifact.
feature_pipe, target_pipe = create_pipelines(cfg)
pipelines = dict(feature_pipe=feature_pipe, target_pipe=target_pipe)

##############################################################################

# Fit on the training rows only; the test rows are merely transformed.
X_train = pipelines['feature_pipe'].fit_transform(df_train)
y_train = pipelines['target_pipe'].fit_transform(df_train)
X_test = pipelines['feature_pipe'].transform(df_test)
y_test = pipelines['target_pipe'].transform(df_test)

# rename columns: positional names with a prefix telling features and targets apart
feature_columns = []
for i in range(X_train.shape[1]):
    feature_columns.append('feat_' + str(i))
target_columns = []
for i in range(y_train.shape[1]):
    target_columns.append('target_' + str(i))


def _as_frame(features, targets):
    """Put a feature matrix and a target matrix side by side in one frame."""
    feature_frame = pd.DataFrame(features, columns=feature_columns)
    target_frame = pd.DataFrame(targets, columns=target_columns)
    return pd.concat([feature_frame, target_frame], axis=1)


df_train = _as_frame(X_train, y_train)
df_test = _as_frame(X_test, y_test)

##############################################################################

# All CSV files are written with the same options.
csv_options = {'sep': ';', 'header': True, 'index': False}

# save train and test data to run output
os.makedirs('outputs', exist_ok=True)
df_train.to_csv('./outputs/train_data.csv', **csv_options)
df_test.to_csv('./outputs/test_data.csv', **csv_options)

# and save train and test data to PipelineData output
prepared_dir = args.prepared_data
os.makedirs(prepared_dir, exist_ok=True)
df_train.to_csv(prepared_dir + '/train_data.csv', **csv_options)
df_test.to_csv(prepared_dir + '/test_data.csv', **csv_options)

# save pipelines in run output ...
joblib.dump(pipelines, './outputs/pipelines.pkl')

# ... and in PipelineData output (the argument is used as the file path
# itself, not as a directory)
joblib.dump(pipelines, args.pipeline_data)

run.complete()

Overwriting ./src/prep.py


In [5]:
%%writefile ./src/train.py

from azureml.core import Run
import os
import pandas as pd
import joblib
from sklearn.dummy import DummyClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.dummy import DummyRegressor
from sklearn.metrics import recall_score, precision_score, hamming_loss, zero_one_loss, mean_absolute_error, mean_squared_error, r2_score
from argparse import ArgumentParser

run = Run.get_context()

# Command-line arguments of this step.
parser = ArgumentParser()
for flag, destination in (('--input', 'prepared_data'),
                          ('--trained_classifier', 'trained_classifier'),
                          ('--trained_regressor', 'trained_regressor')):
    parser.add_argument(flag, dest=destination)
args = parser.parse_args()

# load data from the prepared-data directory
# (reading via run.input_datasets['train_data'/'test_data'] is not used here)
file_path = args.prepared_data
csv_options = {'sep': ';', 'header': 0}
train_data = pd.read_csv(file_path + '/train_data.csv', **csv_options)
test_data = pd.read_csv(file_path + '/test_data.csv', **csv_options)

# split train/test and feat/target
def _columns_with_prefix(frame, prefix):
    """Return the column labels of ``frame`` that start with ``prefix``."""
    return [col for col in frame.columns if col.startswith(prefix)]


# target_0 is left out here; it is used separately as the regression target.
X_train = train_data[_columns_with_prefix(train_data, 'feat')]
y_train = train_data[_columns_with_prefix(train_data, 'target')].drop(columns=['target_0'])
X_test = test_data[_columns_with_prefix(test_data, 'feat')]
y_test = test_data[_columns_with_prefix(test_data, 'target')].drop(columns=['target_0'])

############################################################

# train classifier
# The 'stratified' dummy strategy draws random labels, so the seed is fixed;
# without it every run logs different baseline metrics.
RANDOM_STATE = 0
model = MultiOutputClassifier(DummyClassifier(strategy='stratified', random_state=RANDOM_STATE))
model.fit(X_train, y_train)


def log_classification_metrics(y_true, y_pred, suffix=''):
    """Log the multilabel metrics of one prediction to the current run.

    ``suffix`` is appended to every metric name, so the same set of metrics
    can be logged for several data splits without name clashes.
    """
    run.log('precision_macro' + suffix, precision_score(y_true, y_pred, average='macro'))
    run.log('precision_samples' + suffix, precision_score(y_true, y_pred, average='samples'))
    run.log('recall_macro' + suffix, recall_score(y_true, y_pred, average='macro'))
    run.log('recall_samples' + suffix, recall_score(y_true, y_pred, average='samples'))
    run.log('hamming_loss' + suffix, hamming_loss(y_true, y_pred))
    run.log('zero_one_loss' + suffix, zero_one_loss(y_true, y_pred))


# evaluate test data
y_pred = model.predict(X_test)
log_classification_metrics(y_test, y_pred)

# evaluate train data
y_pred = model.predict(X_train)
log_classification_metrics(y_train, y_pred, suffix='_train')

# save model: once in the run's outputs folder, once at the path given via
# --trained_classifier (used as the file path itself)
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/model.pkl')
joblib.dump(value=model, filename=args.trained_classifier)

############################################################

# train regressor
# Same features as for the classifier; the target is the single column target_0.
feature_cols_train = [col for col in train_data.columns if col.startswith('feat')]
feature_cols_test = [col for col in test_data.columns if col.startswith('feat')]
X_train = train_data[feature_cols_train]
y_train = train_data[['target_0']]
X_test = test_data[feature_cols_test]
y_test = test_data[['target_0']]

model_regressor = DummyRegressor(strategy="mean")
model_regressor.fit(X_train, y_train)


def _log_regression_metrics(y_true, y_hat, suffix):
    """Log MAE, MSE and R2 of one prediction, with ``suffix`` appended to the names."""
    run.log('mae' + suffix, mean_absolute_error(y_true, y_hat))
    run.log('mse' + suffix, mean_squared_error(y_true, y_hat))
    run.log('r2' + suffix, r2_score(y_true, y_hat))


# test split first, then train split
y_pred = model_regressor.predict(X_test)
_log_regression_metrics(y_test, y_pred, '')

y_pred = model_regressor.predict(X_train)
_log_regression_metrics(y_train, y_pred, '_train')

# save regressor model: once in the run's outputs folder, once at the path
# given via --trained_regressor (used as the file path itself)
joblib.dump(value=model_regressor, filename='outputs/model_regressor.pkl')
joblib.dump(value=model_regressor, filename=args.trained_regressor)


run.complete()

Overwriting ./src/train.py


In [6]:
%%writefile ./src/deploy.py

from azureml.core import Run, Model
import os
import pandas as pd
import joblib
from argparse import ArgumentParser

run = Run.get_context()
ws = run.experiment.workspace

# Command-line arguments of this step. They are parsed but not used below:
# the models are registered from the sibling step runs instead.
parser = ArgumentParser()
for flag, destination in (('--pipeline_data', 'pipeline_data'),
                          ('--trained_classifier', 'trained_classifier'),
                          ('--trained_regressor', 'trained_regressor')):
    parser.add_argument(flag, dest=destination)
args = parser.parse_args()

# Alternative that registers straight from the argument paths (disabled):
# Model.register(args.pipeline_data, 'DummyPipe', ws)
# Model.register(args.trained_classifier, 'DummyModel', ws)
# Model.register(args.trained_regressor, 'DummyModelRegressor', ws)

# step run name -> (model name, file inside that step run) pairs to register
registrations = {
    'prep.py': [
        ('DummyPipe', 'outputs/pipelines.pkl'),
    ],
    'train.py': [
        ('DummyModel', 'outputs/model.pkl'),
        ('DummyModelRegressor', 'outputs/model_regressor.pkl'),
    ],
}

# Sibling step runs are reached through the parent (pipeline) run; steps with
# any other name are skipped.
for child in run.parent.get_children():
    for model_name, model_file in registrations.get(child.name, []):
        child.register_model(model_name, model_file)

run.complete()

Overwriting ./src/deploy.py


In [7]:
%%writefile ./src/score.py

import json
import numpy as np
import os
from azureml.core.model import Model
import joblib
# pipe.py defines `create_pipelines` (plural); importing the singular name
# raised ImportError before init() could run.
from pipe import create_pipelines
import pandas as pd

def init():
    """Load the registered classifier, regressor and preprocessing pipelines.

    The loaded objects are stored in module-level globals so that ``run``
    can use them for every request.
    """
    global model
    global regressor
    global pipelines
    model_path = Model.get_model_path('DummyModel')
    model = joblib.load(model_path)
    regressor_path = Model.get_model_path('DummyModelRegressor')
    regressor = joblib.load(regressor_path)
    pipeline_path = Model.get_model_path('DummyPipe')
    pipelines = joblib.load(pipeline_path)

def run(raw_data):
    """Score one request.

    Parameters
    ----------
    raw_data : str
        JSON document that ``pandas.DataFrame`` can be built from and that
        contains the feature columns expected by the feature pipeline.

    Returns
    -------
    list of dict
        One entry per input row with the keys ``'Products'`` (list of the
        predicted product categories) and ``'Duration'`` (predicted
        duration as a float).
    """
    # get input data
    data = json.loads(raw_data)

    # transform with pipeline
    X = pipelines['feature_pipe'].transform(pd.DataFrame(data))

    # make prediction
    y = model.predict(X)

    # predict duration; depending on the scikit-learn version a single-target
    # regressor returns shape (n,) or (n, 1), so reduce to one value per row
    y_dur = np.asarray(regressor.predict(X))
    if y_dur.ndim > 1:
        y_dur = y_dur[:, 0]

    # The second branch of the target union is the multi-hot product encoder;
    # its learned categories give the label of every classifier output column.
    product_labels = pipelines['target_pipe'].transformer_list[1][1].named_steps['target_encode'].col_cats[0]

    response = [
        {
            'Products': [product_labels[i] for i in range(y.shape[1]) if y[j, i] == 1],
            # plain float so the response is JSON-serialisable
            'Duration': float(y_dur[j])
        }
        for j in range(y.shape[0])
    ]

    return response

Overwriting ./src/score.py


## Conda Environment

In [8]:
# define dependencies

# cd = CondaDependencies()
# cd.add_pip_package("azureml-defaults")
# cd.add_pip_package('pyarrow==0.12.0')
# cd.add_pip_package('joblib')
# cd.add_pip_package('scikit-learn==0.20.3')

In [9]:
# register environment

# if 'WILO_POC' in ws.environments:
#     env = ws.environments['WILO_POC']
# else:
#     env = Environment('WILO_POC')
# env.python.conda_dependencies = cd
# env.register(ws)

In [10]:
# create run config: all steps run in the registered 'WILO_POC' environment
wilo_environment = ws.environments['WILO_POC']
run_config = RunConfiguration()
run_config.environment = wilo_environment

## Define Data Objects

In [11]:
# input data: registered datasets, exposed to the scripts under short names
registered_datasets = ws.datasets
symptoms_data = registered_datasets['symptomcodes.csv'].as_named_input('symptomcodes')
raw_input_data = registered_datasets['ItemResourceData.csv'].as_named_input('df')

# All intermediate data lives on the workspace blob store.
blob_store = ws.datastores['workspaceblobstore']

# prepared data (a directory holding the train and test CSV files)
prepared_data = PipelineData("prepared_data", datastore=blob_store, is_directory=True)

# output (single files)
pipeline_data = PipelineData("pipeline_data", datastore=blob_store, is_directory=False)
trained_classifier = PipelineData("trained_classifier", datastore=blob_store, is_directory=False)
trained_regressor = PipelineData("trained_regressor", datastore=blob_store, is_directory=False)

## Define Steps

In [12]:
# Step 1: prep.py consumes the two datasets and produces the prepared data
# directory and the pickled preprocessing pipelines.
prep_arguments = ['--pipeline_data', pipeline_data, '--output', prepared_data]
prep_step = PythonScriptStep(
    script_name='prep.py',
    source_directory='src',
    inputs=[symptoms_data, raw_input_data],
    outputs=[pipeline_data, prepared_data],
    arguments=prep_arguments,
    compute_target=ws.compute_targets['mlcompute'],
    runconfig=run_config,
    allow_reuse=True,
)

In [13]:
# Step 2: train.py consumes the prepared data and produces the classifier
# and regressor files.
train_arguments = [
    '--input', prepared_data,
    '--trained_classifier', trained_classifier,
    '--trained_regressor', trained_regressor,
]
train_step = PythonScriptStep(
    script_name='train.py',
    source_directory='src',
    inputs=[prepared_data],
    outputs=[trained_classifier, trained_regressor],
    arguments=train_arguments,
    compute_target=ws.compute_targets['mlcompute'],
    runconfig=run_config,
    allow_reuse=True,
)

In [14]:
# Step 3: deploy.py takes the outputs of both previous steps as inputs, so
# it is scheduled after them; it declares no outputs of its own.
deploy_arguments = [
    '--pipeline_data', pipeline_data,
    '--trained_classifier', trained_classifier,
    '--trained_regressor', trained_regressor,
]
deploy_step = PythonScriptStep(
    script_name='deploy.py',
    source_directory='src',
    inputs=[pipeline_data, trained_classifier, trained_regressor],
    arguments=deploy_arguments,
    compute_target=ws.compute_targets['mlcompute'],
    runconfig=run_config,
    allow_reuse=True,
)

## Create Pipeline

In [15]:
# Assemble the three steps into one pipeline.
pipeline_steps = [prep_step, train_step, deploy_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)

## Run Experiment

In [None]:
# Submit the pipeline under the 'DummyPrediction' experiment and wait for it
# to finish while showing the run output in this cell.
exp = Experiment(workspace=ws, name='DummyPrediction')
run = exp.submit(pipeline)
run.wait_for_completion(show_output=True)

Created step prep.py [6a3546b0][89473af0-956d-4ac8-825a-0b47c37e520c], (This step will run and generate new outputs)
Created step train.py [9331e624][c6a7951d-7d3d-4a4c-9ce8-4cce2a21c4d8], (This step will run and generate new outputs)
Created step deploy.py [2ce0593a][46d94c06-f4b6-4b5b-8dbd-6ed5d88c0219], (This step will run and generate new outputs)
Submitted PipelineRun a937fb2b-34da-41b8-bdb1-f87a65310e19
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/DummyPrediction/runs/a937fb2b-34da-41b8-bdb1-f87a65310e19?wsid=/subscriptions/793146d9-d4dc-4a73-9728-76c4ffd0cc0d/resourcegroups/rg_dynamics_test/workspaces/resdynml1test
PipelineRunId: a937fb2b-34da-41b8-bdb1-f87a65310e19
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/DummyPrediction/runs/a937fb2b-34da-41b8-bdb1-f87a65310e19?wsid=/subscriptions/793146d9-d4dc-4a73-9728-76c4ffd0cc0d/resourcegroups/rg_dynamics_test/workspaces/resdynml1test
PipelineRun Status: NotStarted
PipelineRun Stat




StepRunId: fd63693c-909c-44d0-be1b-6497608e91fc
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/DummyPrediction/runs/fd63693c-909c-44d0-be1b-6497608e91fc?wsid=/subscriptions/793146d9-d4dc-4a73-9728-76c4ffd0cc0d/resourcegroups/rg_dynamics_test/workspaces/resdynml1test
StepRun( train.py ) Status: NotStarted
StepRun( train.py ) Status: Queued

Streaming azureml-logs/55_azureml-execution-tvmps_2f1de38d79ebbfc529747028dee486c67ec69c7c0947d63ee4f3f8a59fef922a_d.txt
2020-06-17T18:31:16Z Starting output-watcher...
2020-06-17T18:31:16Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
df1513763a3d64d39928d67aa989f81ac8ac40c81f5936be86c9336d11d359c1

Streaming azureml-logs/65_job_prep-tvmps_2f1de38d79ebbfc529747028dee486c67ec69c7c0947d63ee4f3f8a59fef922a_d.txt
Entering job preparation. Current time:2020-06-17T18:31:18.433045
StepRun( train.py ) Status: Running
Starting job preparation. Current time:2020-06-17T18:31:19.118546
Extracting the control code.
fet




StepRunId: 0551acac-5c23-486e-8edc-b75ececf4811
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/DummyPrediction/runs/0551acac-5c23-486e-8edc-b75ececf4811?wsid=/subscriptions/793146d9-d4dc-4a73-9728-76c4ffd0cc0d/resourcegroups/rg_dynamics_test/workspaces/resdynml1test
StepRun( deploy.py ) Status: Queued


In [None]:
# Re-open the experiment, e.g. in a fresh session, to inspect earlier runs.
exp = Experiment(workspace=ws, name='DummyPrediction')

In [None]:
# Keep only the first run yielded by get_runs() and stop iterating.
# NOTE(review): presumably this is the most recent run — confirm the
# ordering of get_runs() in the SDK documentation.
for r in exp.get_runs():
    run = r
    break

In [None]:
# Iterate over all child runs of the selected run; after the loop `child`
# refers to the last one yielded.
for c in run.get_children():
    child = c

In [None]:
# Display the file names reported for the selected child run.
child.get_file_names()

In [None]:
# Display the properties reported for the selected child run.
child.get_properties()