In [None]:
from azureml.core import Workspace, Experiment, Environment, Dataset
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.train.estimator import Estimator
import json
import os

In [None]:
# Azure identifiers of the tenant and the ML workspace this notebook works against.
TENANT_ID = "39288a38-ff19-432c-8011-1cd9d0dff445"
SUBSCRIPTION_ID = "793146d9-d4dc-4a73-9728-76c4ffd0cc0d"
RESOURCE_GROUP = "rg_dynamics_test"
WORKSPACE_NAME = "resdynml1test"

# Log in interactively against the tenant, then open the workspace handle that
# the later cells reference as `ws`.
interactive_auth = InteractiveLoginAuthentication(tenant_id=TENANT_ID)
ws = Workspace(
    subscription_id=SUBSCRIPTION_ID,
    resource_group=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
    auth=interactive_auth,
)

In [None]:
# Load the experiment configuration from the working directory. Later cells read
# the keys 'prepared_data_dataset', 'env_name', 'experiment_name', 'storage',
# 'train_dataset', 'test_dataset' and 'PreprocessPipeline' from it.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open("experiment_cfg.json", "r", encoding="utf-8") as cfg_file:
    cfg = json.load(cfg_file)

## Preprocess Scripts

In [None]:
%%writefile ./src/pipe.py

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
import numpy as np

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that pulls selected columns out of its input.

    Parameters
    ----------
    attribute_names : key(s) used to index the input in ``transform``
        (``X[attribute_names]``).
    dtype : type handed to ``astype`` on the selected data.
    """

    def __init__(self, attribute_names, dtype):
        self.dtype = dtype
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names]`` cast to ``dtype``, as its ``.values``."""
        selected = X[self.attribute_names]
        converted = selected.astype(self.dtype)
        return converted.values

class MultiHotEncoder(BaseEstimator, TransformerMixin):
    """Encode every column of a 2-D array as a block of 0/1 indicator columns.

    Parameters
    ----------
    delimiter : str or None, default None
        When truthy, each cell is converted to ``str`` and split on this
        delimiter; every non-empty, whitespace-stripped piece is a category, so
        one cell can switch on several indicator columns (multi-hot).
        When falsy, the cell value itself is the single category (one-hot).

    Attributes
    ----------
    col_cats : dict
        Set by ``fit``. Maps the column index to the list of categories seen in
        that column. The list is sorted by ``str(category)`` so the layout of
        the encoded output is reproducible between runs.
    """

    def __init__(self, delimiter=None):
        self.delimiter = delimiter

    def _tokens(self, value):
        """Return the categories contained in a single cell.

        Shared by ``fit`` and ``transform`` so both tokenise a cell in exactly
        the same way (string conversion, splitting, stripping, dropping empties).
        """
        if self.delimiter:
            pieces = (piece.strip() for piece in str(value).split(self.delimiter))
            return [piece for piece in pieces if piece != '']
        return [value]

    def fit(self, X, y=None):
        """Collect the categories of every column of ``X``.

        ``X`` must expose ``shape`` and ``X[row, col]`` indexing. ``y`` is
        ignored. Returns ``self``.
        """
        self.col_cats = {}
        for col in range(X.shape[1]):
            cats = set()
            for row in range(X.shape[0]):
                cats.update(self._tokens(X[row, col]))
            # A set has no stable iteration order for strings across processes;
            # sorting pins the column order of the encoded output.
            self.col_cats[col] = sorted(cats, key=str)
        return self

    def transform(self, X):
        """Return the indicator matrix for ``X``.

        The result has ``X.shape[0]`` rows and one column per fitted category,
        with the blocks of the individual input columns concatenated in column
        order. Categories that were not seen during ``fit`` are ignored, so such
        cells contribute only zeros.
        """
        n_rows = X.shape[0]
        blocks = []
        for col in range(X.shape[1]):
            categories = self.col_cats[col]
            # category -> output column, replaces a linear scan per cell
            position = {cat: idx for idx, cat in enumerate(categories)}
            encoded = np.zeros([n_rows, len(categories)])
            for row in range(n_rows):
                for token in self._tokens(X[row, col]):
                    idx = position.get(token)
                    if idx is not None:
                        encoded[row, idx] = 1
            blocks.append(encoded)
        if not blocks:
            # np.concatenate rejects an empty list; zero input columns simply
            # encode to zero output columns.
            return np.zeros([n_rows, 0])
        return np.concatenate(blocks, axis=1)
    
def create_pipelines(cfg):
    """Assemble the un-fitted preprocessing pipelines for features and targets.

    Parameters
    ----------
    cfg : mapping with the column lists ``'multi_cols'`` (features),
        ``'multi_target_cols'`` and ``'num_target_cols'`` (targets).

    Returns
    -------
    dict
        ``'feature_pipe'``: a ``Pipeline`` wrapping the feature union.
        ``'target_pipe'``: a ``FeatureUnion`` that places the numerical target
        columns before the multi-hot encoded target columns.
    """
    # Features: the multi-label columns are read as strings and multi-hot
    # encoded, splitting each cell on single spaces.
    multi_feature_steps = [
        ('multi_feat_select', DataFrameSelector(cfg['multi_cols'], str)),
        ('multi_encode', MultiHotEncoder(delimiter=' ')),
    ]
    feature_union = FeatureUnion([
        ('multi_features', Pipeline(multi_feature_steps)),
    ])
    # Only the union is active in the outer pipeline; a PCA step
    # (PCA(n_components=0.8, svd_solver='full')) is currently disabled.
    feature_pipe = Pipeline([('all_features_pipe', feature_union)])

    # Targets: multi-label columns get the same space-delimited encoding ...
    multilabel_targets = Pipeline([
        ('target_select', DataFrameSelector(cfg['multi_target_cols'], str)),
        ('target_encode', MultiHotEncoder(delimiter=' ')),
    ])
    # ... while numerical targets are only selected and cast to float.
    numeric_targets = Pipeline([
        ('num_feature_select', DataFrameSelector(cfg['num_target_cols'], float)),
    ])
    target_pipe = FeatureUnion([
        ('num_targets', numeric_targets),
        ('multi_targets', multilabel_targets),
    ])

    return {'feature_pipe': feature_pipe, 'target_pipe': target_pipe}

In [None]:
%%writefile ./src/preprocess.py

from azureml.core import Run

import datetime
import os
import pandas as pd
import joblib
from argparse import ArgumentParser

from pipe import create_pipelines

# Handle to the Azure ML run this script is executing in.
run = Run.get_context()

# Optional command-line arguments: --input is a folder containing
# prepared_data.csv, --output is the folder the preprocessed data is written to.
parser = ArgumentParser()
for flag, dest_name in (('--input', 'prepared_data'), ('--output', 'preprocessed_data')):
    parser.add_argument(flag, dest=dest_name)
args = parser.parse_args()

# load datasets: without --input, fall back to the run's named input dataset
if not args.prepared_data:
    df = run.input_datasets['df_prepared'].to_pandas_dataframe()
else:
    df = pd.read_csv(args.prepared_data + '/prepared_data.csv', sep=';', header=0)

# ##############################################################################

# # split data (test data from last t_test years)
# t_test = 0.5
# df_train = df[df['Start']<(datetime.datetime.today() - datetime.timedelta(days=t_test*365))]
# df_test = df[df['Start']>=(datetime.datetime.today() - datetime.timedelta(days=t_test*365))]

# ##############################################################################

# # select columns for training
# cfg = {}
# cfg['multi_cols'] = ['Symptoms']
# cfg['num_target_cols'] = ['duration']
# cfg['multi_target_cols'] = ['ProductNr']

# # create pipeline
# pipelines = create_pipelines(cfg)

# # fit pipelines and transform data
# X_train = pipelines['feature_pipe'].fit_transform(df_train)
# y_train = pipelines['target_pipe'].fit_transform(df_train)
# X_test = pipelines['feature_pipe'].transform(df_test)
# y_test = pipelines['target_pipe'].transform(df_test)

# ##############################################################################

# # rename columns
# feature_columns = [ 'feat_'+ str(i) for i in range(X_train.shape[1])]
# target_columns = [ 'target_'+ str(i) for i in range(y_train.shape[1])]

# df_train = pd.concat([
#     pd.DataFrame(X_train, columns=feature_columns),
#     pd.DataFrame(y_train, columns=target_columns)
# ], axis=1)

# df_test = pd.concat([
#     pd.DataFrame(X_test, columns=feature_columns),
#     pd.DataFrame(y_test, columns=target_columns)
# ], axis=1)

# ##############################################################################

# # save train and test data
# path = args.preprocessed_data if args.preprocessed_data else './outputs'
# os.makedirs(path, exist_ok=True)
# df_train.to_csv(path + '/train_data.csv', sep=';', header=True, index=False)
# df_test.to_csv(path + '/test_data.csv', sep=';', header=True, index=False)

# # save pipelines
# os.makedirs('outputs', exist_ok=True)
# joblib.dump(pipelines, './outputs/pipelines.pkl')

##############################################################################

# select columns for training
cfg = {
    'multi_cols': ['Symptoms'],
    'num_target_cols': ['duration'],
    'multi_target_cols': ['ProductNr'],
}

##############################################################################

# Products with fewer rows than this are skipped entirely.
MIN_ROWS_PER_PRODUCT = 20
# Percentage of each product's newest rows (by 'Start') held out as test data.
TEST_PERCENT = 25

# Per-product results, keyed by ProductId.
prod_pipes = {}
prod_train_data = {}
prod_test_data = {}

# Single pass over the frame instead of one boolean filter per product.
# sort=False keeps products in order of first appearance; groupby leaves out
# rows whose ProductId is missing.
for _, df_prod in df.groupby('ProductId', sort=False):

    if len(df_prod) < MIN_ROWS_PER_PRODUCT:
        continue

    prodid = df_prod['ProductId'].iloc[0]

    ##############################################################################

    # train test split: sort newest first once, the leading TEST_PERCENT % of the
    # rows become the test set and the remainder the training set
    df_sorted = df_prod.sort_values(by=['Start'], ascending=False)
    n_test = int(len(df_sorted) * (TEST_PERCENT / 100))
    df_test = df_sorted.iloc[:n_test]
    df_train = df_sorted.iloc[n_test:]

    ##############################################################################

    # create a fresh pipeline per product
    pipelines = create_pipelines(cfg)

    # fit on the training rows only, then apply the fitted pipelines to the test rows
    X_train = pipelines['feature_pipe'].fit_transform(df_train)
    y_train = pipelines['target_pipe'].fit_transform(df_train)
    X_test = pipelines['feature_pipe'].transform(df_test)
    y_test = pipelines['target_pipe'].transform(df_test)

    ##############################################################################

    # generic column names: feat_0..feat_k and target_0..target_m
    feature_columns = ['feat_' + str(idx) for idx in range(X_train.shape[1])]
    target_columns = ['target_' + str(idx) for idx in range(y_train.shape[1])]

    ##############################################################################

    prod_pipes[prodid] = pipelines
    prod_train_data[prodid] = pd.concat([
        pd.DataFrame(X_train, columns=feature_columns),
        pd.DataFrame(y_train, columns=target_columns)
    ], axis=1)
    prod_test_data[prodid] = pd.concat([
        pd.DataFrame(X_test, columns=feature_columns),
        pd.DataFrame(y_test, columns=target_columns)
    ], axis=1)

##############################################################################

# save train and test data (dicts of DataFrames keyed by ProductId, hence pickles)
path = args.preprocessed_data if args.preprocessed_data else './outputs'
os.makedirs(path, exist_ok=True)
joblib.dump(prod_train_data, os.path.join(path, 'train_data.pkl'))
joblib.dump(prod_test_data, os.path.join(path, 'test_data.pkl'))

# save pipelines (always under ./outputs, independent of --output)
os.makedirs('outputs', exist_ok=True)
joblib.dump(prod_pipes, './outputs/pipelines.pkl')

run.complete()

## Create Estimator

In [None]:
# Look up the registered input dataset and environment named in the config.
prepared_input = ws.datasets[cfg['prepared_data_dataset']].as_named_input('df_prepared')
run_environment = ws.environments[cfg['env_name']]

# Estimator that runs src/preprocess.py on the local compute target.
est = Estimator(
    source_directory='src',
    entry_script='preprocess.py',
    compute_target='local',
    inputs=[prepared_input],
    environment_definition=run_environment,
)

## Run Experiment

In [None]:
# Submit the estimator under the configured experiment name and wait for the
# run to finish while showing its output.
exp = Experiment(workspace=ws, name=cfg['experiment_name'])
run = exp.submit(config=est)
run.wait_for_completion(show_output=True)

## Register Datasets

In [None]:
# Fetch the pickled training data written by the run.
train_pkl_local = 'artifacts/train_data.pkl'
run.download_file('outputs/train_data.pkl', output_file_path=train_pkl_local)

# Upload it to the configured datastore, below a folder named after the experiment.
ds = ws.datastores[cfg['storage']]
data_ref = ds.upload_files(
    [train_pkl_local],
    target_path='./' + cfg['experiment_name'],
    overwrite=True,
)

# The artifact is a pickle rather than a delimited text file, so it is registered
# as a File dataset.
# NOTE(review): the test-data cell uploads into the same target folder; confirm
# that the dataset built from data_ref resolves only to train_data.pkl.
prepared_data_dataset = Dataset.File.from_files(path=data_ref)
prepared_data_dataset.register(workspace=ws, name=cfg['train_dataset'], create_new_version=True)

In [None]:
# Fetch the pickled test data written by the run.
test_pkl_local = 'artifacts/test_data.pkl'
run.download_file('outputs/test_data.pkl', output_file_path=test_pkl_local)

# Upload it to the configured datastore, below a folder named after the experiment.
ds = ws.datastores[cfg['storage']]
data_ref = ds.upload_files(
    [test_pkl_local],
    target_path='./' + cfg['experiment_name'],
    overwrite=True,
)

# The artifact is a pickle rather than a delimited text file, so it is registered
# as a File dataset.
# NOTE(review): the train-data cell uploads into the same target folder; confirm
# that the dataset built from data_ref resolves only to test_data.pkl.
prepared_data_dataset = Dataset.File.from_files(path=data_ref)
prepared_data_dataset.register(workspace=ws, name=cfg['test_dataset'], create_new_version=True)

## Register Pipelines (as Model)

In [None]:
# Register the pickled per-product pipelines from the run's outputs as a model,
# under the name configured as 'PreprocessPipeline'.
run.register_model(
    model_name=cfg['PreprocessPipeline'],
    model_path='outputs/pipelines.pkl',
)