In [6]:
from azureml.core import Workspace, Experiment
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.train.estimator import Estimator
import json

In [7]:
# Sign in interactively against a fixed Azure AD tenant, then open the Azure ML
# workspace whose datasets and environments are used by the cells below.
interactive_auth = InteractiveLoginAuthentication(
    tenant_id="39288a38-ff19-432c-8011-1cd9d0dff445"
)
ws = Workspace(
    subscription_id="793146d9-d4dc-4a73-9728-76c4ffd0cc0d",
    resource_group="rg_dynamics_test",
    workspace_name="resdynml1test",
    auth=interactive_auth,
)

In [15]:
# Load the experiment configuration from the working directory. The cells below
# read these keys from it: 'train_dataset', 'test_dataset', 'compute_target',
# 'env_name', 'experiment_name' and 'TrainedClassifier'.
# JSON is UTF-8 by specification, so the encoding is pinned instead of relying
# on the platform default encoding.
with open("experiment_cfg.json", "r", encoding="utf-8") as cfg_file:
    cfg = json.load(cfg_file)

## Training Script

In [9]:
%%writefile ./src/lookup.py

import numpy as np
import pandas as pd

class LookUpClassifier():
    """Lookup baseline that suggests product numbers for a service case.

    ``fit`` memorises, for every (ProductId, Country, Symptom) combination in
    the training frame, the product numbers that occurred on the same case
    (duplicates are kept). ``predict`` gathers the memorised product numbers
    for the symptoms of a case and keeps an occurrence only when a uniform
    random draw from [0, 1) exceeds ``threshold``; a product number that was
    memorised several times therefore gets several chances to be kept.

    Parameters
    ----------
    threshold : float, default 0.5
        A candidate occurrence is kept when its random draw is strictly
        greater than this value.
    random_state : int or None, default None
        Seed for the random draws in ``predict``. ``None`` uses NumPy's global
        random state (the historical behaviour); an integer makes every
        ``predict`` call reproducible.

    Attributes
    ----------
    product_transform : list of str
        Sorted vocabulary of product numbers seen during ``fit``; defines the
        column order used by ``transform_products``.
    model : dict
        Nested mapping ``ProductId -> Country -> Symptom -> [ProductNr, ...]``.
    """

    # Column that identifies a case in the training frame.
    _CASE_KEY = 'Job Card.JobCard Number'

    def __init__(self, threshold=0.5, random_state=None):
        self.threshold = threshold
        self.random_state = random_state

    @classmethod
    def _tokens_per_case(cls, df, column, token_name):
        """Return one row per (case, token) for a space-separated text column."""
        long_df = (
            pd.DataFrame(df[column].str.split(' ').tolist(), index=df[cls._CASE_KEY])
            .stack()
            # Drop the positional level created by stack(); only the case key is needed.
            .reset_index(level=1, drop=True)
            .reset_index()
        )
        long_df.columns = [cls._CASE_KEY, token_name]
        return long_df

    def fit(self, df):
        """Memorise which product numbers occurred with which symptoms.

        Parameters
        ----------
        df : pandas.DataFrame
            Training cases with the columns 'Job Card.JobCard Number',
            'ProductId', 'Country', 'Symptoms' and 'ProductNrs'. 'Symptoms'
            and 'ProductNrs' hold space-separated tokens.

        Returns
        -------
        LookUpClassifier
            ``self``, so calls can be chained.
        """
        # Sorted so the column order of transform_products() does not depend
        # on set iteration order.
        self.product_transform = sorted(
            {prod for prods in df['ProductNrs'] for prod in prods.split()}
        )
        self.model = {}
        if len(df) == 0:
            return self

        symptoms_per_case_df = self._tokens_per_case(df, 'Symptoms', 'Symptom')
        prodnr_per_case_df = self._tokens_per_case(df, 'ProductNrs', 'ProductNr')

        # Two left merges on the case key pair every symptom of a case with
        # every product number of that case.
        pairs = pd.merge(symptoms_per_case_df, df, on=self._CASE_KEY, how='left')
        pairs = pd.merge(prodnr_per_case_df, pairs, on=self._CASE_KEY, how='left')

        # Empty tokens (from repeated spaces) and missing values carry no information.
        pairs = (
            pairs[['ProductId', 'Country', 'Symptom', 'ProductNr']]
            .replace('', np.nan)
            .dropna()
            .reset_index(drop=True)
        )

        model = {}
        for product_id, country, symptom, product_nr in zip(
            pairs['ProductId'], pairs['Country'], pairs['Symptom'], pairs['ProductNr']
        ):
            (
                model.setdefault(product_id, {})
                .setdefault(country, {})
                .setdefault(symptom, [])
                .append(product_nr)
            )
        self.model = model
        return self

    def predict(self, X):
        """Suggest product numbers for each case in ``X``.

        Parameters
        ----------
        X : list of [product_id, country, symptoms]
            ``symptoms`` is a single string of space-separated symptom codes,
            e.g. ``[['<prodid>', '<country>', '<symptom1> <symptom2>']]``.

        Returns
        -------
        list of str
            One string per case with the kept product numbers, de-duplicated,
            sorted and joined by a single space ('' when nothing is kept).
        """
        # getattr keeps models pickled before ``random_state`` existed usable.
        seed = getattr(self, 'random_state', None)
        rng = np.random if seed is None else np.random.RandomState(seed)

        y = []
        for row in X:
            by_symptom = self.model.get(row[0], {}).get(row[1], {})
            candidates = []
            for symptom in row[2].split(' '):
                candidates.extend(by_symptom.get(symptom, []))

            # One uniform draw per memorised occurrence.
            draws = rng.random_sample(len(candidates))
            kept = {
                str(prod)
                for prod, draw in zip(candidates, draws)
                if draw > self.threshold
            }
            y.append(' '.join(sorted(kept)))

        return y

    def transform_products(self, y):
        """Encode space-separated product-number strings as a 0/1 indicator matrix.

        Parameters
        ----------
        y : list of str
            e.g. ``['<prod1> <prod2>', '<prod1> <prod3>']``.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(y), len(self.product_transform))`` with 1
            where the product number of that column occurs in the row's
            string. Product numbers not seen during ``fit`` are ignored.
        """
        # First occurrence wins, matching list.index() semantics.
        column_of = {}
        for col, prod in enumerate(self.product_transform):
            column_of.setdefault(prod, col)

        y_tr = np.zeros([len(y), len(self.product_transform)])
        for row, products in enumerate(y):
            for prod in products.split(' '):
                col = column_of.get(prod)
                if col is not None:
                    y_tr[row, col] = 1
        return y_tr

Overwriting ./src/lookup.py


In [10]:
%%writefile ./src/train.py

from azureml.core import Run

import os
import joblib
import pandas as pd  # needed by the --input branch below (pd.read_csv)
from argparse import ArgumentParser
from lookup import LookUpClassifier
from sklearn.metrics import recall_score, precision_score, hamming_loss, zero_one_loss, mean_absolute_error, mean_squared_error, r2_score


def _banner(message):
    """Print ``message`` framed by separator lines so each stage stands out in the run log."""
    separator = '\n#####################################################'
    print(separator)
    print(message)
    print(separator)


# Training entry point: loads the train/test data, fits a LookUpClassifier on
# the training data and stores the fitted model as outputs/model.pkl.
run = Run.get_context()

parser = ArgumentParser()
# Optional directory containing train_data.csv / test_data.csv; when omitted
# the datasets attached to the run as named inputs are used instead.
parser.add_argument('--input', dest='prepared_data')
args = parser.parse_args()

############################################################

_banner('loaded')

# load data
if args.prepared_data:
    train_data = pd.read_csv(args.prepared_data + '/train_data.csv', sep=';', header=0)
    test_data = pd.read_csv(args.prepared_data + '/test_data.csv', sep=';', header=0)
else:
    train_data = run.input_datasets['train_data'].to_pandas_dataframe()
    test_data = run.input_datasets['test_data'].to_pandas_dataframe()

# Rows with any missing value are discarded before training/evaluation.
train_data = train_data.dropna().reset_index(drop=True)
test_data = test_data.dropna().reset_index(drop=True)

#################################################################

_banner('train')

# train classifier
model = LookUpClassifier(threshold=0.2)
model.fit(train_data)

_banner('trained')

############################################################

# Feature/target lists in the layout expected by LookUpClassifier.predict and
# transform_products; currently only referenced by the disabled evaluation below.
X_test = test_data[['ProductId', 'Country', 'Symptoms']].values.tolist()
y_test = test_data['ProductNrs'].values.tolist()

X_train = train_data[['ProductId', 'Country', 'Symptoms']].values.tolist()
y_train = train_data['ProductNrs'].values.tolist()

############################################################

# NOTE: evaluation is currently disabled; kept for reference.

# # evaluate test data
# y_pred = model.predict(X_test)
# y_pred_tr = model.transform_products(y_pred)
# y_test_tr = model.transform_products(y_test)
# run.log_table(
#     'test_evaluation_classification',
#     {
#         'precision_macro': [precision_score(y_test_tr, y_pred_tr, average='macro')],
#         'precision_samples': [precision_score(y_test_tr, y_pred_tr, average='samples')],
#         'recall_macro': [recall_score(y_test_tr, y_pred_tr, average='macro')],
#         'recall_samples': [recall_score(y_test_tr, y_pred_tr, average='samples')],
#         'hamming_loss': [hamming_loss(y_test_tr, y_pred_tr)],
#         'zero_one_loss': [zero_one_loss(y_test_tr, y_pred_tr)]
#     }
# )

# # evaluate train data
# y_pred = model.predict(X_train)
# y_pred_tr = model.transform_products(y_pred)
# y_train_tr = model.transform_products(y_train)
# run.log_table(
#     'train_evaluation_classification',
#     {
#         'precision_macro_train': [precision_score(y_train_tr, y_pred_tr, average='macro')],
#         'precision_samples_train': [precision_score(y_train_tr, y_pred_tr, average='samples')],
#         'recall_macro_train': [recall_score(y_train_tr, y_pred_tr, average='macro')],
#         'recall_samples_train': [recall_score(y_train_tr, y_pred_tr, average='samples')],
#         'hamming_loss_train': [hamming_loss(y_train_tr, y_pred_tr)],
#         'zero_one_loss_train': [zero_one_loss(y_train_tr, y_pred_tr)]
#     }
# )

############################################################

# save model
# The notebook registers the model from this exact path ('outputs/model.pkl').
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/model.pkl')

run.complete()

Overwriting ./src/train.py


## Create Estimator

In [11]:
# Attach the configured datasets under the names train.py looks up in
# run.input_datasets ('train_data' and 'test_data').
train_input = ws.datasets[cfg['train_dataset']].as_named_input('train_data')
test_input = ws.datasets[cfg['test_dataset']].as_named_input('test_data')

# Run src/train.py on the configured compute target inside the registered environment.
est = Estimator(
    source_directory='src',
    entry_script='train.py',
    inputs=[train_input, test_input],
    compute_target=cfg['compute_target'],
    environment_definition=ws.environments[cfg['env_name']],
)

## Run Experiment

In [12]:
# Submit the estimator as a run of the configured experiment, then block until
# the run finishes while streaming its logs into the cell output.
exp = Experiment(workspace=ws, name=cfg['experiment_name'])
run = exp.submit(est)
run.wait_for_completion(show_output=True)

RunId: LookUpPrediction_1592833657_8f12ad73
Web View: https://ml.azure.com/experiments/LookUpPrediction/runs/LookUpPrediction_1592833657_8f12ad73?wsid=/subscriptions/793146d9-d4dc-4a73-9728-76c4ffd0cc0d/resourcegroups/rg_dynamics_test/workspaces/resdynml1test

Streaming azureml-logs/55_azureml-execution-tvmps_068cdeb5d4e5a75752476994dc6ded2eda9f98d6faa2bebe1480cd2f2071660c_d.txt

2020-06-22T13:51:41Z Executing 'Copy ACR Details file' on 10.0.0.5
2020-06-22T13:51:42Z Copy ACR Details file succeeded on 10.0.0.5. Output: 
>>>   
>>>   
2020-06-22T13:51:42Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
2020-06-22T13:51:42Z Starting output-watcher...
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_c14e68a5a54beac144cd751fe11b91c5
a1298f4ce990: Pulling fs layer
04a3282d9c4b: Pulling fs layer
9b0d3db6dc03: Pulling fs layer
8269c605f3f1: Pulling fs layer
6504d449e70c: Pulling fs layer
4e38f320d0d4: Pulling fs layer
b0a763e8ee03: Pulling fs layer
1

{'runId': 'LookUpPrediction_1592833657_8f12ad73',
 'target': 'mlcompute',
 'status': 'Completed',
 'startTimeUtc': '2020-06-22T13:51:37.814442Z',
 'endTimeUtc': '2020-06-22T13:55:17.551748Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'bf1c23b4-a85d-4437-a8d0-e8ce41efda23',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '1cf29584-c0d6-4a92-b14d-3bc8bcd83723'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'test_data', 'mechanism': 'Direct'}}, {'dataset': {'id': '7b05e6a0-bdcb-4dac-badd-43da21ebfb81'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'train_data', 'mechanism': 'Direct'}}],
 'runDefinition': {'script': 'train.py',
  'useAbsolutePath': False,
  'arguments': [],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'mlcompute',
  'dataReferences': {},
  'data': {'test_

## Register Models

In [16]:
# Register the pickled classifier that train.py wrote to 'outputs/model.pkl',
# using the model name stored under cfg['TrainedClassifier'].
run.register_model(
    model_name=cfg['TrainedClassifier'],
    model_path='outputs/model.pkl',
)

Model(workspace=Workspace.create(name='resdynml1test', subscription_id='793146d9-d4dc-4a73-9728-76c4ffd0cc0d', resource_group='rg_dynamics_test'), name=LookUpModel, id=LookUpModel:1, version=1, tags={}, properties={})