In [1]:
from azureml.core import Workspace, Experiment, Environment, Dataset
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.train.estimator import Estimator
import json

In [2]:
interactive_auth = InteractiveLoginAuthentication(tenant_id="39288a38-ff19-432c-8011-1cd9d0dff445")
ws = Workspace(subscription_id="793146d9-d4dc-4a73-9728-76c4ffd0cc0d", resource_group="rg_dynamics_test", workspace_name="resdynml1test", auth=interactive_auth)

In [3]:
# load experiment cfg
with open("experiment_cfg.json", "r") as cfg_file:
    cfg = json.load(cfg_file)

In [4]:
%%writefile ./src/prepare.py

from azureml.core import Run
from argparse import ArgumentParser
import numpy as np
import pandas as pd
import datetime
import os

run = Run.get_context()

parser = ArgumentParser()
parser.add_argument('--output', dest='prepared_data')
args = parser.parse_args()

# load datasets
df_symptoms = run.input_datasets['symptomcodes'].to_pandas_dataframe()
df = run.input_datasets['df_raw'].to_pandas_dataframe()

###########################################################

# get only data from last t years
t = 5
df = df[df['Job Card.Date Start Work']>(datetime.datetime.today() - datetime.timedelta(days=t*365))]

############################################################

# clean data
df = df.replace(['', '0', '-', '000','N/A'], np.nan)
df = df.dropna().reset_index(drop=True)

#############################################################################

# combine Component/Failure Code in train data
df = pd.concat([df, pd.DataFrame(df.apply(lambda x: (x['Job Card.ComponentCode'],x['Job Card.FailureCode']), axis=1), columns=['CompFail'])], axis=1)

# combine Component/Failure Code in symptom table
df_symptoms = df_symptoms[['ComponentCode', 'FailureCode', 'Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']]
df_symptoms = pd.concat([df_symptoms, pd.DataFrame(df_symptoms.apply(lambda x: (x['ComponentCode'],x['FailureCode']),axis=1), columns=['CompFail'])],axis=1)

# merge train data on symptoms
df = pd.merge(df, df_symptoms, on='CompFail', how='left')
df = pd.concat([df, pd.DataFrame(df[['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']].apply(lambda x: tuple([ x[col] for col in ['Symptom1','Symptom2','Symptom3','Symptom4'] if str(x[col]) != 'None' ]), axis=1), columns=['Symptoms'])], axis=1)

##############################################################################

# merge into one row per case
df = df.groupby('Job Card.JobCard Number').apply(lambda x: pd.Series({
    'ProductNr': ' '.join(x['Product.Product Number'].unique()),
    'Symptoms': ' '.join(map(str, list(set(x['Symptoms'].sum())))),
    'Start': x['Job Card.Date Start Work'].min(),
    'End': x['Job Card.Date End Work'].max()
  })).reset_index()

##############################################################################

# compute duration column
df = pd.concat([df, pd.DataFrame((df['End'] - df['Start']), columns=['duration'])],axis=1)
df['duration'] = df['duration'].apply(lambda x: x.seconds / 3600)

##############################################################################

# save train and test data
path = arg.prepared_data if args.prepared_data else './outputs'
os.makedirs(path, exist_ok=True)
df.to_csv(path + '/prepared_data.csv', sep=';', header=True, index=False)

run.complete()

Overwriting ./src/prepare.py


In [5]:
est = Estimator(entry_script='prepare.py', source_directory='src', 
              inputs=[   ws.datasets[cfg['symptomcodes_dataset']].as_named_input('symptomcodes'), 
                         ws.datasets[cfg['raw_data_dataset']].as_named_input('df_raw')       ],
              compute_target='local', environment_definition=ws.environments[cfg['env_name']])

In [6]:
exp = Experiment(ws, cfg['experiment_name'])
run = exp.submit(est)
run.wait_for_completion(show_output=True)

RunId: SKLearnPrediction_1592832031_b9e30462
Web View: https://ml.azure.com/experiments/SKLearnPrediction/runs/SKLearnPrediction_1592832031_b9e30462?wsid=/subscriptions/793146d9-d4dc-4a73-9728-76c4ffd0cc0d/resourcegroups/rg_dynamics_test/workspaces/resdynml1test

Streaming azureml-logs/70_driver_log.txt

Entering context manager injector. Current time:2020-06-22T13:20:34.757240
Starting the daemon thread to refresh tokens in background for process with pid = 8
Entering Run History Context Manager.
Preparing to call script [ prepare.py ] with arguments: []
After variable expansion, calling script [ prepare.py ] with arguments: []

Starting the daemon thread to refresh tokens in background for process with pid = 8


The experiment completed successfully. Finalizing run...
Logging experiment finalizing status in history service.
Cleaning up all outstanding Run operations, waiting 300.0 seconds
2 items cleaning up...
Cleanup took 0.3278477191925049 seconds

Execution Summary
RunId: SKLearn

{'runId': 'SKLearnPrediction_1592832031_b9e30462',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-06-22T13:20:33.939913Z',
 'endTimeUtc': '2020-06-22T13:25:17.184322Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': 'c7f31f14-d426-4dbc-9034-5637b2b1ea8c'},
 'inputDatasets': [{'dataset': {'id': '02e6cb83-4d0c-42b2-bbef-e103c74b3a3c'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'df_raw', 'mechanism': 'Direct'}}, {'dataset': {'id': '88af5740-1a1b-4e09-8129-d3c538680909'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'symptomcodes', 'mechanism': 'Direct'}}],
 'runDefinition': {'script': 'prepare.py',
  'useAbsolutePath': False,
  'arguments': [],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {'df_raw': {'dataLocation': {'dataset': {'id': '02e6cb83-4d0c-42b2-bbef-e103c74b3a3c',
      'name': None,
      'version': None},
 

In [13]:
run.download_file('outputs/prepared_data.csv', output_file_path='artifacts/prepared_data.csv')
ds = ws.datastores[cfg['storage']]
data_ref = ds.upload_files(['artifacts/prepared_data.csv'], target_path='./'+cfg['experiment_name'], overwrite=True)
prepared_data_dataset = Dataset.Tabular.from_delimited_files(data_ref, separator=';', header=True, infer_column_types=True)
prepared_data_dataset.register(ws, cfg['prepared_data_dataset'], create_new_version=True)

Uploading an estimated of 1 files
Uploading artifacts/prepared_data.csv
Uploaded artifacts/prepared_data.csv, 1 files out of an estimated total of 1
Uploaded 1 files


In [18]:
ws.datasets['SklearnPreparedData'].to_pandas_dataframe()

First partition columns (ordered): ['Job Card.JobCard Number', 'ProductNr', 'Symptoms', 'Start', 'End', 'duration']
Found Partition has columns (ordered): ['Job Card.JobCard Number', 'ProductNr', 'Symptoms', 'Start', 'End', 'duration', 'Column7', 'Column8', 'Column9', 'Column10', 'Column11', 'Column12', 'Column13', 'Column14']


Unnamed: 0,Job Card.JobCard Number,ProductNr,Symptoms,Start,End,duration,Column7,Column8,Column9,Column10,Column11,Column12,Column13,Column14
0,JC03.00,EXT001 2028272,L001,2018-03-27 06:00:00,2018-03-27 13:30:00,7.50,,,,,,,,
1,C-AT000002-HAFOL JC01.00,501033799 501034197 2028261 2050239 2028269 60...,,2016-08-09 04:45:00,2016-08-09 09:15:00,4.50,,,,,,,,
2,C-AT000052-SCHOL JC01.00,501033799 501034197 2028261 2046930,P001,2016-08-05 05:00:00,2016-08-05 06:00:00,1.00,,,,,,,,
3,C-AT000058-HERRO JC01.00,501031 501033799 2054129 501033891 2028261 202...,A002,2016-08-02 04:00:00,2016-08-02 07:00:00,3.00,,,,,,,,
4,C-AT000059-SCHOL JC01.00,501033799 501033891 2028261 2519927 2050239 21...,F002 F001 F003,2016-08-16 07:30:00,2016-08-16 10:00:00,2.50,,,,,,,,
5,C-AT000061-SCHOL JC01.00,501033799 501034197 2028261 2050239 2028269,W001,2016-08-04 10:00:00,2016-08-04 14:30:00,4.50,,,,,,,,
6,C-AT000065-SCHOL JC01.00,501033799 501034197 2028261,,2016-08-08 10:30:00,2016-08-08 11:45:00,1.25,,,,,,,,
7,C-AT000072-SCHOL JC01.00,501033799 2027533 501034197 2028261 2027535 20...,F003 F001 K002,2016-08-01 15:30:00,2016-08-01 16:30:00,1.00,,,,,,,,
8,C-AT000073-SCHOL JC02.00,501033799 501034197 2028261,I001,2016-08-18 08:30:00,2016-08-18 15:00:00,6.50,,,,,,,,
9,C-AT000078-SCHOL JC01.00,501033799 501034197 2028261 2050239,F002 F001 F003,2016-08-05 07:00:00,2016-08-05 10:00:00,3.00,,,,,,,,
