In [1]:
from azureml.core import Workspace, Experiment
from azureml.train.sklearn import SKLearn
from azureml.core.authentication import InteractiveLoginAuthentication

import pandas as pd
import numpy as np
import datetime 
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

In [2]:
interactive_auth = InteractiveLoginAuthentication(tenant_id="39288a38-ff19-432c-8011-1cd9d0dff445")
ws = Workspace(subscription_id="793146d9-d4dc-4a73-9728-76c4ffd0cc0d", resource_group="rg_dynamics_test", workspace_name="resdynml1test", auth=interactive_auth)

In [56]:
%%writefile ./src/pipe.py

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
import numpy as np

class MultiHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, delimiter=None):
        self.delimiter = delimiter
    def fit(self, X, y=None):
        self.col_cats = {}
        for col in range(X.shape[1]):
            cats = set()
            for row in range(X.shape[0]):
                if self.delimiter:
                    for cat in X[row,col].split(self.delimiter):
                        if not cat.strip() == '':
                            cats.add(cat.strip())
                else:
                    cats.add(X[row,col])
            self.col_cats[col] = list(cats)
        return self
    def transform(self, X):
        X_tr = []
        for col in range(X.shape[1]):
            X_enc = np.zeros([X.shape[0], len(self.col_cats[col])])
            for row in range(X.shape[0]):
                if self.delimiter:
                    cats = str(X[row,col]).split(self.delimiter)
                    for col_cat_idx in range(len(self.col_cats[col])):
                        if self.col_cats[col][col_cat_idx] in cats:
                            X_enc[row, col_cat_idx] = 1
                else:
                    for col_cat_idx in range(len(self.col_cats[col])):
                        if self.col_cats[col][col_cat_idx] == X[row,col]:
                            X_enc[row, col_cat_idx] = 1
            X_enc = np.array(X_enc)
            X_tr.append(X_enc)
        X_tr = np.concatenate(X_tr, axis=1)
        return X_tr
    
class LookUpPredictor():
    def __init__(self, look_up_table):
        self.look_up_table = look_up_table
    
    def fit(self,X,y=None):
        pass
    
    def transform(self,X):
        y = []
        
        
        symptoms = X['Symptoms']
        
        for i in range(len(X)):
            prods_df = self.look_up_table[self.look_up_table['ProductId']==X['ProductId'][i]].reset_index(drop=True)
            prods_df = prods_df[prods_df['Symptoms']==X['Symptoms'][i]].reset_index(drop=True).iloc[:,2:].sum()
            y.append(prods_df.values)
            
        return np.array(y)

Overwriting ./src/pipe.py


In [60]:
%%writefile ./src/prep.py

from azureml.core import Run
import pandas as pd
import datetime
from pipe import MultiHotEncoder, LookUpPredictor
import os
import numpy as np
from sklearn.model_selection import train_test_split

run = Run.get_context()

# load datasets
df_symptoms = run.input_datasets['symptomcodes'].to_pandas_dataframe()
df = run.input_datasets['df'].to_pandas_dataframe()

run.log('# rows before', len(df))

###########################################################

# get only data from last t years
t = 2
df = df[df['Job Card.Date Start Work']>(datetime.datetime.today() - datetime.timedelta(days=t*365))]

run.log('# rows for last ' +str(t) + ' years', len(df))

############################################################

# clean data
df = df.replace(['', '0', '-', '000','N/A'], np.nan)
df = df.dropna().reset_index(drop=True)

run.log('# rows after cleaning', len(df))

#############################################################################

# combine Component/Failure Code in train data
df = pd.concat([df, pd.DataFrame(df.apply(lambda x: (x['Job Card.ComponentCode'],x['Job Card.FailureCode']), axis=1), columns=['CompFail'])], axis=1)

# combine Component/Failure Code in symptom table
df_symptoms = df_symptoms[['ComponentCode', 'FailureCode', 'Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']]
df_symptoms = pd.concat([df_symptoms, pd.DataFrame(df_symptoms.apply(lambda x: (x['ComponentCode'],x['FailureCode']),axis=1), columns=['CompFail'])],axis=1)

# merge train data on symptoms
df = pd.merge(df, df_symptoms, on='CompFail', how='left')
df = pd.concat([df, pd.DataFrame(df[['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']].apply(lambda x: tuple([ x[col] for col in ['Symptom1','Symptom2','Symptom3','Symptom4'] if str(x[col]) != 'None' ]), axis=1), columns=['Symptoms'])], axis=1)

# merge into one row per case
df = df.groupby('Job Card.JobCard Number').apply(lambda x: pd.Series({
    'ProductId': ' '.join(x['Installed Base.InstalledBase ProductID'].unique()),
    'ProductNr': ' '.join(x['Product.Product Number'].unique()),
    'Symptoms': ' '.join(map(str, list(set(x['Symptoms'].sum()))))
  })).reset_index()

df = df.groupby(['ProductId', 'Symptoms'])['ProductNr'].apply(lambda x : list(x)).reset_index()
df = pd.concat([df, df['ProductNr'].apply(lambda x: x[0])], axis=1).iloc[:,[0,1,3]].reset_index(drop=True)

run.log('# rows after merging cases', len(df))

#########################################################################

mhe = MultiHotEncoder(delimiter=' ')
df = pd.concat([df.drop(['ProductNr'], axis=1), pd.DataFrame(mhe.fit_transform(df[['ProductNr']].values))], axis=1)

##########################################################################

# split data (test data from last t_test years)
# t_test = 0.5
# df_train = df[df['Start']<(datetime.datetime.today() - datetime.timedelta(days=t_test*365))]
# df_test = df[df['Start']>=(datetime.datetime.today() - datetime.timedelta(days=t_test*365))]
df_train = df.iloc[:round(len(df)*0.7)]
df_test = df.iloc[round(len(df)*0.7):]

run.log('# rows in train data', len(df_train))
run.log('# rows in test data', len(df_test))

#############################################################################

# split train/test and feat/target
X_train = df_train[['ProductId', 'Symptoms']]
y_train = df_train.drop(['ProductId', 'Symptoms'], axis=1)
X_test = df_test[['ProductId', 'Symptoms']]
y_test = df_test.drop(['ProductId', 'Symptoms'], axis=1)

# train classifier
model = LookUpPredictor(look_up_table=df_train)

# evaluate test data
y_pred = model.transform(X_test)
run.log('precision_macro', precision_score(y_test, y_pred, average='macro'))
run.log('precision_samples', precision_score(y_test, y_pred, average='samples'))
run.log('recall_macro', recall_score(y_test, y_pred, average='macro'))
run.log('recall_macro', recall_score(y_test, y_pred, average='samples'))
run.log('hamming_loss', hamming_loss(y_test, y_pred))
run.log('zero_one_loss', zero_one_loss(y_test, y_pred))

# evaluate train data
y_pred = model.predict(X_train)
run.log('precision_macro_train', precision_score(y_train, y_pred, average='macro'))
run.log('precision_samples_train', precision_score(y_train, y_pred, average='samples'))
run.log('recall_macro_train', recall_score(y_train, y_pred, average='macro'))
run.log('recall_macro_train', recall_score(y_train, y_pred, average='samples'))
run.log('hamming_loss_train', hamming_loss(y_train, y_pred))
run.log('zero_one_loss_train', zero_one_loss(y_train, y_pred))

# save model
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/model.pkl')

run.complete()

run.complete()

Overwriting ./src/prep.py


In [61]:
est = SKLearn(entry_script='prep.py', source_directory='src', 
              inputs=[   ws.datasets['symptomcodes.csv'].as_named_input('symptomcodes'), 
                         ws.datasets['ItemResourceData.csv'].as_named_input('df')       ],
              pip_packages=['pyarrow==0.12.0 '], compute_target='local')

In [62]:
exp = Experiment(ws, 'ProductPredictionLookUp')
run = exp.submit(est)
run.wait_for_completion(show_output=True)

RunId: ProductPredictionLookUp_1592209312_8ba7e0e7
Web View: https://ml.azure.com/experiments/ProductPredictionLookUp/runs/ProductPredictionLookUp_1592209312_8ba7e0e7?wsid=/subscriptions/793146d9-d4dc-4a73-9728-76c4ffd0cc0d/resourcegroups/rg_dynamics_test/workspaces/resdynml1test

Streaming azureml-logs/70_driver_log.txt

Entering context manager injector. Current time:2020-06-15T08:21:54.471649
Starting the daemon thread to refresh tokens in background for process with pid = 8
Entering Run History Context Manager.
Preparing to call script [ prep.py ] with arguments: []
After variable expansion, calling script [ prep.py ] with arguments: []

Starting the daemon thread to refresh tokens in background for process with pid = 8


The experiment failed. Finalizing run...
Logging experiment finalizing status in history service.
Cleaning up all outstanding Run operations, waiting 300.0 seconds
2 items cleaning up...
Cleanup took 0.4453582763671875 seconds
Traceback (most recent call last):
  

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "User program failed with KeyError: 0",
        "detailsUri": "https://aka.ms/azureml-known-errors",
        "details": [],
        "debugInfo": {
            "type": "KeyError",
            "message": "0",
            "stackTrace": "  File \"azureml-setup/context_manager_injector.py\", line 148, in execute_with_context\n    runpy.run_path(sys.argv[0], globals(), run_name=\"__main__\")\n  File \"/azureml-envs/azureml_6da8db82c0a6a27195a6a6ae29218268/lib/python3.6/runpy.py\", line 263, in run_path\n    pkg_name=pkg_name, script_name=fname)\n  File \"/azureml-envs/azureml_6da8db82c0a6a27195a6a6ae29218268/lib/python3.6/runpy.py\", line 96, in _run_module_code\n    mod_name, mod_spec, pkg_name, script_name)\n  File \"/azureml-envs/azureml_6da8db82c0a6a27195a6a6ae29218268/lib/python3.6/runpy.py\", line 85, in _run_code\n    exec(code, run_globals)\n  File \"prep.py\", line 88, in <module>\n    y_pred = model.transform(X_test)\n  File \"/azureml-run/pipe.py\", line 56, in transform\n    prods_df = self.look_up_table[self.look_up_table['ProductId']==X['ProductId'][i]].reset_index(drop=True)\n  File \"/azureml-envs/azureml_6da8db82c0a6a27195a6a6ae29218268/lib/python3.6/site-packages/pandas/core/series.py\", line 871, in __getitem__\n    result = self.index.get_value(self, key)\n  File \"/azureml-envs/azureml_6da8db82c0a6a27195a6a6ae29218268/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 4405, in get_value\n    return self._engine.get_value(s, k, tz=getattr(series.dtype, \"tz\", None))\n  File \"pandas/_libs/index.pyx\", line 80, in pandas._libs.index.IndexEngine.get_value\n  File \"pandas/_libs/index.pyx\", line 90, in pandas._libs.index.IndexEngine.get_value\n  File \"pandas/_libs/index.pyx\", line 138, in pandas._libs.index.IndexEngine.get_loc\n  File \"pandas/_libs/hashtable_class_helper.pxi\", line 998, in pandas._libs.hashtable.Int64HashTable.get_item\n  File \"pandas/_libs/hashtable_class_helper.pxi\", line 1005, in pandas._libs.hashtable.Int64HashTable.get_item\n"
        },
        "messageParameters": {}
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"User program failed with KeyError: 0\",\n        \"detailsUri\": \"https://aka.ms/azureml-known-errors\",\n        \"details\": [],\n        \"debugInfo\": {\n            \"type\": \"KeyError\",\n            \"message\": \"0\",\n            \"stackTrace\": \"  File \\\"azureml-setup/context_manager_injector.py\\\", line 148, in execute_with_context\\n    runpy.run_path(sys.argv[0], globals(), run_name=\\\"__main__\\\")\\n  File \\\"/azureml-envs/azureml_6da8db82c0a6a27195a6a6ae29218268/lib/python3.6/runpy.py\\\", line 263, in run_path\\n    pkg_name=pkg_name, script_name=fname)\\n  File \\\"/azureml-envs/azureml_6da8db82c0a6a27195a6a6ae29218268/lib/python3.6/runpy.py\\\", line 96, in _run_module_code\\n    mod_name, mod_spec, pkg_name, script_name)\\n  File \\\"/azureml-envs/azureml_6da8db82c0a6a27195a6a6ae29218268/lib/python3.6/runpy.py\\\", line 85, in _run_code\\n    exec(code, run_globals)\\n  File \\\"prep.py\\\", line 88, in <module>\\n    y_pred = model.transform(X_test)\\n  File \\\"/azureml-run/pipe.py\\\", line 56, in transform\\n    prods_df = self.look_up_table[self.look_up_table['ProductId']==X['ProductId'][i]].reset_index(drop=True)\\n  File \\\"/azureml-envs/azureml_6da8db82c0a6a27195a6a6ae29218268/lib/python3.6/site-packages/pandas/core/series.py\\\", line 871, in __getitem__\\n    result = self.index.get_value(self, key)\\n  File \\\"/azureml-envs/azureml_6da8db82c0a6a27195a6a6ae29218268/lib/python3.6/site-packages/pandas/core/indexes/base.py\\\", line 4405, in get_value\\n    return self._engine.get_value(s, k, tz=getattr(series.dtype, \\\"tz\\\", None))\\n  File \\\"pandas/_libs/index.pyx\\\", line 80, in pandas._libs.index.IndexEngine.get_value\\n  File \\\"pandas/_libs/index.pyx\\\", line 90, in pandas._libs.index.IndexEngine.get_value\\n  File \\\"pandas/_libs/index.pyx\\\", line 138, in pandas._libs.index.IndexEngine.get_loc\\n  File \\\"pandas/_libs/hashtable_class_helper.pxi\\\", line 998, in pandas._libs.hashtable.Int64HashTable.get_item\\n  File \\\"pandas/_libs/hashtable_class_helper.pxi\\\", line 1005, in pandas._libs.hashtable.Int64HashTable.get_item\\n\"\n        },\n        \"messageParameters\": {}\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}

In [3]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
import numpy as np

class MultiHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, delimiter=None):
        self.delimiter = delimiter
    def fit(self, X, y=None):
        self.col_cats = {}
        for col in range(X.shape[1]):
            cats = set()
            for row in range(X.shape[0]):
                if self.delimiter:
                    for cat in X[row,col].split(self.delimiter):
                        if not cat.strip() == '':
                            cats.add(cat.strip())
                else:
                    cats.add(X[row,col])
            self.col_cats[col] = list(cats)
        return self
    def transform(self, X):
        X_tr = []
        for col in range(X.shape[1]):
            X_enc = np.zeros([X.shape[0], len(self.col_cats[col])])
            for row in range(X.shape[0]):
                if self.delimiter:
                    cats = str(X[row,col]).split(self.delimiter)
                    for col_cat_idx in range(len(self.col_cats[col])):
                        if self.col_cats[col][col_cat_idx] in cats:
                            X_enc[row, col_cat_idx] = 1
                else:
                    for col_cat_idx in range(len(self.col_cats[col])):
                        if self.col_cats[col][col_cat_idx] == X[row,col]:
                            X_enc[row, col_cat_idx] = 1
            X_enc = np.array(X_enc)
            X_tr.append(X_enc)
        X_tr = np.concatenate(X_tr, axis=1)
        return X_tr
    
class LookUpPredictor():
    def __init__(self, look_up_table):
        self.look_up_table = look_up_table
    
    def fit(self,X,y=None):
        pass
    
    def transform(self,X):
        y = []
        
        
        symptoms = X['Symptoms']
        
        for i in range(len(X)):
            prods_df = self.look_up_table[self.look_up_table['ProductId']==X['ProductId'][i]].reset_index(drop=True)
            prods_df = prods_df[prods_df['Symptoms']==X['Symptoms'][i]].reset_index(drop=True).iloc[:,2:].sum()
            y.append(prods_df.values)
            
        return np.array(y)

In [4]:
df_symptoms = ws.datasets['symptomcodes.csv'].to_pandas_dataframe()
df = ws.datasets['ItemResourceData.csv'].to_pandas_dataframe()

In [6]:
###########################################################

# get only data from last t years
t = 2
df = df[df['Job Card.Date Start Work']>(datetime.datetime.today() - datetime.timedelta(days=t*365))]



############################################################

# clean data
df = df.replace(['', '0', '-', '000'], np.nan)
df = df.dropna().reset_index(drop=True)



#############################################################################

# combine Component/Failure Code in train data
df = pd.concat([df, pd.DataFrame(df.apply(lambda x: (x['Job Card.ComponentCode'],x['Job Card.FailureCode']), axis=1), columns=['CompFail'])], axis=1)

# combine Component/Failure Code in symptom table
df_symptoms = df_symptoms[['ComponentCode', 'FailureCode', 'Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']]
df_symptoms = pd.concat([df_symptoms, pd.DataFrame(df_symptoms.apply(lambda x: (x['ComponentCode'],x['FailureCode']),axis=1), columns=['CompFail'])],axis=1)

# merge train data on symptoms
df = pd.merge(df, df_symptoms, on='CompFail', how='left')
df = pd.concat([df, pd.DataFrame(df[['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']].apply(lambda x: tuple([ x[col] for col in ['Symptom1','Symptom2','Symptom3','Symptom4'] if str(x[col]) != 'None' ]), axis=1), columns=['Symptoms'])], axis=1)

# merge into one row per case
df = df.groupby('Job Card.JobCard Number').apply(lambda x: pd.Series({
    'ProductId': ' '.join(x['Installed Base.InstalledBase ProductID'].unique()),
    'ProductNr': ' '.join(x['Product.Product Number'].unique()),
    'Symptoms': ' '.join(map(str, list(set(x['Symptoms'].sum()))))
  })).reset_index()

df = df.groupby(['ProductId', 'Symptoms'])['ProductNr'].apply(lambda x : list(x)).reset_index()
df = pd.concat([df, df['ProductNr'].apply(lambda x: x[0])], axis=1).iloc[:,[0,1,3]].reset_index(drop=True)



#########################################################################

mhe = MultiHotEncoder(delimiter=' ')
df = pd.concat([df.drop(['ProductNr'], axis=1), pd.DataFrame(mhe.fit_transform(df[['ProductNr']].values))], axis=1)

##########################################################################

# split data (test data from last t_test years)
# t_test = 0.5
# df_train = df[df['Start']<(datetime.datetime.today() - datetime.timedelta(days=t_test*365))]
# df_test = df[df['Start']>=(datetime.datetime.today() - datetime.timedelta(days=t_test*365))]
df_train = df.iloc[:round(len(df)*0.7)]
df_test = df.iloc[round(len(df)*0.7):]



#############################################################################

# split train/test and feat/target
X_train = df_train[['ProductId', 'Symptoms']]
y_train = df_train.drop(['ProductId', 'Symptoms'], axis=1)
X_test = df_test[['ProductId', 'Symptoms']]
y_test = df_test.drop(['ProductId', 'Symptoms'], axis=1)



In [7]:
# train classifier
model = LookUpPredictor(look_up_table=df_train)

In [8]:
y_pred = model.transform(X_test)

KeyError: 0