In [1]:
from azureml.core import Workspace, Experiment
from azureml.train.sklearn import SKLearn
from azureml.core.authentication import InteractiveLoginAuthentication

import pandas as pd
import numpy as np
import datetime 
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

In [2]:
# Azure identifiers used to authenticate and to locate the workspace.
TENANT_ID = "39288a38-ff19-432c-8011-1cd9d0dff445"
SUBSCRIPTION_ID = "793146d9-d4dc-4a73-9728-76c4ffd0cc0d"
RESOURCE_GROUP = "rg_dynamics_test"
WORKSPACE_NAME = "resdynml1test"

# Authenticate with InteractiveLoginAuthentication for the tenant above and
# pass the resulting credentials to the workspace handle.
interactive_auth = InteractiveLoginAuthentication(tenant_id=TENANT_ID)
ws = Workspace(
    subscription_id=SUBSCRIPTION_ID,
    resource_group=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
    auth=interactive_auth,
)

In [3]:
%%writefile ./src/pipe.py

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
import numpy as np

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that pulls a fixed set of columns out of a DataFrame.

    The selected columns are cast to ``dtype`` and returned through
    ``.values``, i.e. without the pandas wrapper.

    Parameters
    ----------
    attribute_names : list
        Column labels to select from the incoming frame.
    dtype : type or str
        Target type passed to ``astype`` for the selected columns.
    """

    def __init__(self, attribute_names, dtype):
        self.attribute_names = attribute_names
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names]`` cast to ``dtype`` as ``.values``."""
        selected = X[self.attribute_names]
        converted = selected.astype(self.dtype)
        return converted.values

class MultiHotEncoder(BaseEstimator, TransformerMixin):
    """Multi-hot encode every column of a 2-D array.

    With a ``delimiter`` each cell is read as a delimited list of tokens and
    one output column is produced per distinct non-empty token seen during
    ``fit``. Without a delimiter each distinct cell value is one category
    (plain one-hot encoding). Values not seen during ``fit`` are ignored by
    ``transform`` (their row simply gets no 1 for that value).

    Parameters
    ----------
    delimiter : str or None
        Separator between tokens inside a cell. ``None`` (or any falsy
        value) treats the whole cell as a single category.

    Attributes
    ----------
    col_cats : dict
        Set by ``fit``. Maps column position to the list of categories
        learned for that column; the list order is the output column order.
    """

    def __init__(self, delimiter=None):
        self.delimiter = delimiter

    def _tokens(self, value):
        """Split one cell into its set of stripped, non-empty tokens.

        Used by both ``fit`` and ``transform`` so that the two always agree
        on string conversion and whitespace handling.
        """
        pieces = (piece.strip() for piece in str(value).split(self.delimiter))
        return {piece for piece in pieces if piece != ''}

    def fit(self, X, y=None):
        """Learn the categories of every column of ``X``.

        ``X`` is indexed as ``X[row, col]`` and must expose ``shape``.
        Returns ``self``.
        """
        self.col_cats = {}
        for col in range(X.shape[1]):
            cats = set()
            for row in range(X.shape[0]):
                if self.delimiter:
                    cats.update(self._tokens(X[row, col]))
                else:
                    cats.add(X[row, col])
            # Set iteration order is not guaranteed to be the same between
            # interpreter runs; sorting makes the output column layout
            # reproducible. key=str keeps mixed-type categories sortable.
            self.col_cats[col] = sorted(cats, key=str)
        return self

    def transform(self, X):
        """Encode ``X`` with the categories learned in ``fit``.

        Returns a float array with one row per input row and, for each input
        column in order, one 0/1 column per learned category.
        """
        n_rows = X.shape[0]
        encoded_blocks = []
        for col in range(X.shape[1]):
            cats = self.col_cats[col]
            encoded = np.zeros([n_rows, len(cats)])
            if self.delimiter:
                # Tokens are always strings here, so a position lookup
                # replaces the scan over every category for every row.
                position = {cat: idx for idx, cat in enumerate(cats)}
                for row in range(n_rows):
                    for token in self._tokens(X[row, col]):
                        idx = position.get(token)
                        if idx is not None:
                            encoded[row, idx] = 1
            else:
                # Plain equality is kept (no hashing) so values such as NaN
                # compare exactly as they did before.
                for row in range(n_rows):
                    value = X[row, col]
                    for idx, cat in enumerate(cats):
                        if cat == value:
                            encoded[row, idx] = 1
            encoded_blocks.append(encoded)
        if not encoded_blocks:
            # np.concatenate rejects an empty list; zero input columns give
            # zero output columns.
            return np.zeros([n_rows, 0])
        return np.concatenate(encoded_blocks, axis=1)
    
def create_pipeline(cfg):
    """Assemble the preprocessing pipeline for features and targets.

    Parameters
    ----------
    cfg : dict
        Must provide ``'multi_cols'`` and ``'target_cols'``; each holds the
        column names fed to the corresponding branch.

    Returns
    -------
    FeatureUnion
        Two members, in this order: ``"all_feat_pipe"`` (multi-hot encoded
        ``multi_cols`` followed by PCA) and ``"target_pipe"`` (multi-hot
        encoded ``target_cols``). Their outputs are placed side by side.
    """
    # Target branch: select the target columns as strings and multi-hot
    # encode their space-separated tokens.
    target_steps = [
        ('target_select', DataFrameSelector(cfg['target_cols'], str)),
        ('target_encode', MultiHotEncoder(delimiter=' ')),
    ]
    target_pipe = Pipeline(target_steps)

    # Feature branch: same treatment for the multi-label feature columns.
    multi_steps = [
        ('multi_feat_select', DataFrameSelector(cfg['multi_cols'], str)),
        ('multi_encode', MultiHotEncoder(delimiter=' ')),
    ]
    multi_pipe = Pipeline(multi_steps)

    # Only the multi-label branch is part of the feature union. A categorical
    # branch (SimpleImputer + OneHotEncoder) and a numerical branch
    # (SimpleImputer + StandardScaler) are not wired in, and no missing-value
    # imputation step is applied in either active branch.
    feat_union = FeatureUnion([
        ('multi_features', multi_pipe),
    ])

    # PCA with a fractional n_components on top of the encoded features.
    # The step names (including the spelling of 'all_feautres_pca') are
    # lookup keys and are therefore left exactly as they are.
    reducer = PCA(n_components=0.8, svd_solver='full')
    all_feat_pipe = Pipeline([
        ('all_features_pipe', feat_union),
        ('all_feautres_pca', reducer),
    ])

    # Reduced features first, encoded targets second.
    return FeatureUnion([
        ("all_feat_pipe", all_feat_pipe),
        ("target_pipe", target_pipe),
    ])

Overwriting ./src/pipe.py


In [4]:
%%writefile ./src/prep.py

from azureml.core import Run
import pandas as pd
import datetime
from pipe import create_pipeline
import os
import numpy as np

# Handle to the current run; used for the input datasets and for logging.
run = Run.get_context()

# Load both named input datasets as pandas DataFrames.
df_symptoms = run.input_datasets['symptomcodes'].to_pandas_dataframe()
df = run.input_datasets['df'].to_pandas_dataframe()
run.log('# rows before', len(df))

# Keep only rows whose work started within the last t years
# (a year is counted as 365 days).
t = 2
oldest_start = datetime.datetime.today() - datetime.timedelta(days=t*365)
df = df[df['Job Card.Date Start Work'] > oldest_start]
run.log('# rows for last {} years'.format(t), len(df))

# Turn placeholder strings into NaN, then drop every row that has a NaN in
# any column and renumber the remaining rows.
placeholders = ['', '0', '-', '000', 'N/A']
df = df.replace(placeholders, np.nan).dropna().reset_index(drop=True)
run.log('# rows after cleaning', len(df))

#############################################################################

# Build a (component code, failure code) tuple per row of the job data; it is
# the join key against the symptom table.
df = df.assign(CompFail=list(zip(df['Job Card.ComponentCode'], df['Job Card.FailureCode'])))

# Reduce the symptom table to the code and symptom columns and give it the
# same tuple key.
symptom_cols = ['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']
df_symptoms = df_symptoms[['ComponentCode', 'FailureCode'] + symptom_cols]
df_symptoms = df_symptoms.assign(CompFail=list(zip(df_symptoms['ComponentCode'], df_symptoms['FailureCode'])))

# Left join: every job row is kept; rows without a matching code pair get
# missing values in the symptom columns.
df = pd.merge(df, df_symptoms, on='CompFail', how='left')

# Collect the symptoms that are actually present into one tuple per row.
# Missing entries are skipped whether they arrive as None or as NaN (the
# latter is what an unmatched left join produces); checking only for the
# text 'None' let NaN through, which later surfaced as a 'nan' symptom token.
df = df.assign(Symptoms=[
    tuple(value for value in row if pd.notna(value) and str(value) != 'None')
    for row in df[symptom_cols].itertuples(index=False, name=None)
])

def _joined_uniques(values):
    """Space-join the distinct entries of a column."""
    return ' '.join(values.unique())


def _first_unique(values):
    """Return the first distinct entry of a column."""
    return values.unique()[0]


def _collapse_case(case_rows):
    """Condense all rows belonging to one job card into a single record.

    Multi-valued fields are stored as space-joined strings; single-valued
    fields take the first distinct value found in the group.
    """
    return pd.Series({
        'ProductGroup': _joined_uniques(case_rows['Installed Base.Product Group']),
        'ProductId': _joined_uniques(case_rows['Installed Base.InstalledBase ProductID']),
        'Country': _first_unique(case_rows['Location.Country']),
        'City': _first_unique(case_rows['Location.City']),
        'LocationType': _first_unique(case_rows['Location.Location Type']),
        'PostalCode': _first_unique(case_rows['Location.Postal Code']),
        'ProductName': _joined_uniques(case_rows['Product.Product Name']),
        'ProductNr': _joined_uniques(case_rows['Product.Product Number']),
        'Start': _first_unique(case_rows['Job Card.Date Start Work']),
        'End': _first_unique(case_rows['Job Card.Date End Work']),
        # Summing the per-row tuples concatenates them; the set removes
        # duplicates before the symptoms are joined into one string.
        'Symptoms': ' '.join(map(str, set(case_rows['Symptoms'].sum()))),
    })


# One row per job card number; the group key comes back as a regular column.
cases_by_number = df.groupby('Job Card.JobCard Number')
df = cases_by_number.apply(_collapse_case).reset_index()

run.log('# rows after merging cases', len(df))

##########################################################################

# split data (test data from last t_test years, a year counted as 365 days)
t_test = 0.5

# Read the clock exactly once. Evaluating today() separately for each subset
# yields two slightly different cut-offs, so a row whose 'Start' falls between
# them would end up in neither the train nor the test data.
split_time = datetime.datetime.today() - datetime.timedelta(days=t_test*365)
df_train = df[df['Start'] < split_time]
df_test = df[df['Start'] >= split_time]

run.log('# rows in train data', len(df_train))
run.log('# rows in test data', len(df_test))

#############################################################################

# select columns for training
# (create_pipeline reads only 'multi_cols' and 'target_cols')
cfg = {}
cfg['multi_cols'] = ['ProductGroup', 'Symptoms'] #['ProductGroup', 'ProductId', 'Symptoms']
cfg['cat_cols'] = ['Country', 'City', 'LocationType', 'PostalCode']
cfg['date_cols'] = ['Start', 'End']
cfg['num_cols'] = []
cfg['target_cols'] = ['ProductNr']

# create pipeline
pipe = create_pipeline(cfg)

# transform data: fit on the train split only, then apply to the test split
df_train = pipe.fit_transform(df_train)
df_test = pipe.transform(df_test)

# rename columns
# The target block sits at the right-hand end of the transformed array. Its
# width is the number of categories summed over ALL target columns; using only
# the first column's categories mislabels columns as soon as
# cfg['target_cols'] holds more than one entry.
target_encoder = dict(pipe.transformer_list)['target_pipe'].named_steps['target_encode']
n_target_cols = sum(len(cats) for cats in target_encoder.col_cats.values())
n_feat_cols = df_train.shape[1] - n_target_cols
# Target names keep the absolute column position as their suffix, so the first
# target column is 'target_<n_feat_cols>' rather than 'target_0'.
columns = [ 'feat_' + str(i) if i < n_feat_cols else 'target_' + str(i) for i in range(df_train.shape[1]) ]
df_train = pd.DataFrame(df_train, columns=columns)
df_test = pd.DataFrame(df_test, columns=columns)

############################################################################

# Write both prepared splits as ';'-separated CSV files (with header, without
# the index column) into the 'outputs' folder, creating the folder if needed.
os.makedirs('outputs', exist_ok=True)
for csv_path, prepared in (
    ('./outputs/train_data.csv', df_train),
    ('./outputs/test_data.csv', df_test),
):
    prepared.to_csv(csv_path, sep=';', header=True, index=False)

# Mark the run as finished.
run.complete()

Overwriting ./src/prep.py


In [5]:
# Registered datasets passed to the script under the names that prep.py looks
# up in run.input_datasets ('symptomcodes' and 'df').
prep_inputs = [
    ws.datasets['symptomcodes.csv'].as_named_input('symptomcodes'),
    ws.datasets['ItemResourceData.csv'].as_named_input('df'),
]

# Estimator that runs src/prep.py on the local compute target.
# NOTE(review): the pyarrow requirement string ends with a space; it is kept
# verbatim here - confirm whether that is intended.
est = SKLearn(
    source_directory='src',
    entry_script='prep.py',
    inputs=prep_inputs,
    pip_packages=['pyarrow==0.12.0 '],
    compute_target='local',
)

In [6]:
# Submit the estimator under the 'ProductPrediction' experiment and block
# until the run has finished, streaming its log output into the notebook.
EXPERIMENT_NAME = 'ProductPrediction'
exp = Experiment(workspace=ws, name=EXPERIMENT_NAME)
run = exp.submit(config=est)
run.wait_for_completion(show_output=True)

RunId: ProductPrediction_1591974819_8f3aaf84
Web View: https://ml.azure.com/experiments/ProductPrediction/runs/ProductPrediction_1591974819_8f3aaf84?wsid=/subscriptions/793146d9-d4dc-4a73-9728-76c4ffd0cc0d/resourcegroups/rg_dynamics_test/workspaces/resdynml1test

Streaming azureml-logs/70_driver_log.txt

Entering context manager injector. Current time:2020-06-12T15:13:42.193980
Starting the daemon thread to refresh tokens in background for process with pid = 9
Entering Run History Context Manager.
Preparing to call script [ prep.py ] with arguments: []
After variable expansion, calling script [ prep.py ] with arguments: []

Starting the daemon thread to refresh tokens in background for process with pid = 9


The experiment completed successfully. Finalizing run...
Logging experiment finalizing status in history service.
Cleaning up all outstanding Run operations, waiting 300.0 seconds
2 items cleaning up...
Cleanup took 0.42168569564819336 seconds

Execution Summary
RunId: ProductPredi

{'runId': 'ProductPrediction_1591974819_8f3aaf84',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-06-12T15:13:41.302125Z',
 'endTimeUtc': '2020-06-12T15:26:39.941816Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': 'c77142cf-4715-4974-88c7-a2f5b7a955ba'},
 'inputDatasets': [{'dataset': {'id': '02e6cb83-4d0c-42b2-bbef-e103c74b3a3c'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'df', 'mechanism': 'Direct'}}, {'dataset': {'id': '88af5740-1a1b-4e09-8129-d3c538680909'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'symptomcodes', 'mechanism': 'Direct'}}],
 'runDefinition': {'script': 'prep.py',
  'useAbsolutePath': False,
  'arguments': [],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {'df': {'dataLocation': {'dataset': {'id': '02e6cb83-4d0c-42b2-bbef-e103c74b3a3c',
      'name': None,
      'version': None},
     'dataPa