In [1]:
from azureml.core import Workspace, Experiment
from azureml.train.sklearn import SKLearn
from azureml.core.authentication import InteractiveLoginAuthentication

import pandas as pd
import numpy as np
import datetime 
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

In [2]:
# Azure identifiers of the workspace this notebook connects to.
TENANT_ID = "39288a38-ff19-432c-8011-1cd9d0dff445"
SUBSCRIPTION_ID = "793146d9-d4dc-4a73-9728-76c4ffd0cc0d"
RESOURCE_GROUP = "rg_dynamics_test"
WORKSPACE_NAME = "resdynml1test"

# Build the interactive-login auth object for the tenant, then bind `ws` to the workspace.
interactive_auth = InteractiveLoginAuthentication(tenant_id=TENANT_ID)
ws = Workspace(
    subscription_id=SUBSCRIPTION_ID,
    resource_group=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
    auth=interactive_auth,
)

In [None]:
%%writefile ./src/prep.py

from azureml.core import Run
import pandas as pd
import datetime
from pipe import create_pipeline
import os
import numpy as np

# Run handle: provides the named input datasets and receives the logged metrics.
run = Run.get_context()

# Materialise both named inputs as pandas DataFrames.
input_datasets = run.input_datasets
df_symptoms = input_datasets['symptomcodes'].to_pandas_dataframe()
df = input_datasets['df'].to_pandas_dataframe()

# Row count of the raw job-card data, before any filtering.
run.log('# rows before', df.shape[0])

###########################################################

# Keep only rows whose work started within the last t years
# (a year is counted as 365 days).
t = 2
oldest_start = datetime.datetime.today() - datetime.timedelta(days=t*365)
started_recently = df['Job Card.Date Start Work'] > oldest_start
df = df[started_recently]

run.log('# rows for last ' +str(t) + ' years', len(df))

############################################################

# Treat placeholder strings as missing values, then drop every row that
# still has a gap in any column and renumber the index.
placeholder_values = ['', '0', '-', '000','N/A']
df = (
    df.replace(placeholder_values, np.nan)
      .dropna()
      .reset_index(drop=True)
)

run.log('# rows after cleaning', len(df))

#############################################################################

SYMPTOM_COLS = ['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']


def _present_symptoms(values):
    """Return the non-missing entries of one row's symptom columns as a tuple.

    Args:
        values: iterable with the row's Symptom1..Symptom4 values.

    Returns:
        Tuple of the values that are neither missing (None/NaN) nor the literal
        string 'None', in column order.
    """
    # Rows without a match in the symptom table come out of the left merge with
    # NaN, so a plain "!= 'None'" check is not enough to filter them.
    return tuple(v for v in values if pd.notna(v) and str(v) != 'None')


# combine Component/Failure Code in train data into one (component, failure) key
df = df.assign(CompFail=list(zip(df['Job Card.ComponentCode'], df['Job Card.FailureCode'])))

# combine Component/Failure Code in symptom table into the same kind of key
df_symptoms = df_symptoms[['ComponentCode', 'FailureCode'] + SYMPTOM_COLS]
df_symptoms = df_symptoms.assign(CompFail=list(zip(df_symptoms['ComponentCode'], df_symptoms['FailureCode'])))

# merge train data on symptoms (left join: job-card rows without a match are kept)
df = pd.merge(df, df_symptoms, on='CompFail', how='left')

# collect the symptoms that are actually present for each row
df = df.assign(Symptoms=[
    _present_symptoms(row)
    for row in df[SYMPTOM_COLS].itertuples(index=False, name=None)
])

# merge into one row per case
def _summarise_job_card(rows):
    """Collapse all rows that share one job card number into a single record.

    Args:
        rows: DataFrame holding every row of one 'Job Card.JobCard Number' group.

    Returns:
        pd.Series with one entry per output column. Product fields hold the
        group's distinct values joined by a space; location and date fields take
        the first distinct value in the group; 'Symptoms' is the sorted,
        space-joined set of all symptom values found in the group's rows.
    """
    # Multi-valued fields are stored as space-joined strings (not tuples).
    return pd.Series({
        'ProductGroup': ' '.join(rows['Installed Base.Product Group'].unique()),
        'ProductId': ' '.join(rows['Installed Base.InstalledBase ProductID'].unique()),
        'Country': rows['Location.Country'].unique()[0],
        'City': rows['Location.City'].unique()[0],
        'LocationType': rows['Location.Location Type'].unique()[0],
        'PostalCode': rows['Location.Postal Code'].unique()[0],
        'ProductName': ' '.join(rows['Product.Product Name'].unique()),
        'ProductNr': ' '.join(rows['Product.Product Number'].unique()),
        'Start': rows['Job Card.Date Start Work'].unique()[0],
        'End': rows['Job Card.Date End Work'].unique()[0],
        # Sorted so the string does not depend on set iteration order, which is
        # not stable across interpreter runs for strings.
        'Symptoms': ' '.join(sorted({
            str(symptom)
            for symptoms in rows['Symptoms']
            for symptom in symptoms
        })),
    })


df = df.groupby('Job Card.JobCard Number').apply(_summarise_job_card).reset_index()

run.log('# rows after merging cases', len(df))

##########################################################################

# split data (test data from last t_test years; a year is counted as 365 days)
t_test = 0.5
# Compute the cut-off once: calling datetime.today() separately for each mask
# yields two slightly different timestamps, so a row starting between them
# would end up in neither split.
test_cutoff = datetime.datetime.today() - datetime.timedelta(days=t_test*365)
df_train = df[df['Start'] < test_cutoff]
df_test = df[df['Start'] >= test_cutoff]

run.log('# rows in train data', len(df_train))
run.log('# rows in test data', len(df_test))

#############################################################################

# Column roles handed to create_pipeline.
cfg = {
    'multi_cols': ['ProductGroup', 'Symptoms'],  # 'ProductId' is currently left out
    'cat_cols': ['Country', 'City', 'LocationType', 'PostalCode'],
    'date_cols': ['Start', 'End'],
    'num_cols': [],
    'target_cols': ['ProductNr'],
}

# Build the preprocessing pipeline from the column configuration.
pipe = create_pipeline(cfg)

# Fit on the training split only, then apply the fitted pipeline to the test split.
df_train = pipe.fit_transform(df_train)
df_test = pipe.transform(df_test)

# Name the output columns: the trailing len(col_cats[0]) columns are labelled
# 'target_<i>', everything before them 'feat_<i>'. <i> is the overall column
# position, so target numbering does not restart at 0.
# NOTE(review): col_cats[0] is presumably the category list of the single
# target column -- confirm against pipe.py.
target_encoder = pipe.transformer_list[1][1].named_steps['target_encode']
n_columns = df_train.shape[1]
n_features = n_columns - len(target_encoder.col_cats[0])
columns = [
    ('feat_' if position < n_features else 'target_') + str(position)
    for position in range(n_columns)
]
df_train = pd.DataFrame(df_train, columns=columns)
df_test = pd.DataFrame(df_test, columns=columns)

############################################################################

# Write both prepared splits as ';'-separated CSV files (with header, without
# the index) into the 'outputs' folder, creating the folder if needed.
os.makedirs('outputs', exist_ok=True)
for prepared, csv_path in (
    (df_train, './outputs/train_data.csv'),
    (df_test, './outputs/test_data.csv'),
):
    prepared.to_csv(csv_path, sep=';', header=True, index=False)

############################################################################

# Mark the run as completed.
run.complete()

In [None]:
from azureml.core import Run
import pandas as pd
import datetime
from pipe import create_pipeline
import os
import numpy as np

# This cell calls run.log(...) and run.complete() but never defined `run`
# (only ./src/prep.py does), which raised a NameError on a fresh kernel.
# Outside a submitted run, Run.get_context() falls back to an offline run
# object by default (allow_offline=True), so the logging calls still work.
run = Run.get_context()

# load datasets registered in the workspace
df_symptoms = ws.datasets['symptomcodes.csv'].to_pandas_dataframe()
df = ws.datasets['ItemResourceData.csv'].to_pandas_dataframe()

###########################################################

# get only data from last t years (a year is counted as 365 days)
t = 5
oldest_start = datetime.datetime.today() - datetime.timedelta(days=t*365)
df = df[df['Job Card.Date Start Work'] > oldest_start]

############################################################

# clean data: treat placeholder strings as missing, then drop every row that
# still has a gap in any column
df = df.replace(['', '0', '-', '000','N/A'], np.nan)
df = df.dropna().reset_index(drop=True)

run.log('# rows after cleaning', len(df))

#############################################################################

SYMPTOM_COLS = ['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']


def _present_symptoms(values):
    """Return the non-missing entries of one row's symptom columns as a tuple.

    Args:
        values: iterable with the row's Symptom1..Symptom4 values.

    Returns:
        Tuple of the values that are neither missing (None/NaN) nor the literal
        string 'None', in column order.
    """
    # Rows without a match in the symptom table come out of the left merge with
    # NaN, so a plain "!= 'None'" check is not enough to filter them.
    return tuple(v for v in values if pd.notna(v) and str(v) != 'None')


# combine Component/Failure Code in train data into one (component, failure) key
df = df.assign(CompFail=list(zip(df['Job Card.ComponentCode'], df['Job Card.FailureCode'])))

# combine Component/Failure Code in symptom table into the same kind of key
df_symptoms = df_symptoms[['ComponentCode', 'FailureCode'] + SYMPTOM_COLS]
df_symptoms = df_symptoms.assign(CompFail=list(zip(df_symptoms['ComponentCode'], df_symptoms['FailureCode'])))

# merge train data on symptoms (left join: job-card rows without a match are kept)
df = pd.merge(df, df_symptoms, on='CompFail', how='left')

# collect the symptoms that are actually present for each row
df = df.assign(Symptoms=[
    _present_symptoms(row)
    for row in df[SYMPTOM_COLS].itertuples(index=False, name=None)
])

# merge into one row per case
def _summarise_job_card(rows):
    """Collapse all rows that share one job card number into a single record.

    Args:
        rows: DataFrame holding every row of one 'Job Card.JobCard Number' group.

    Returns:
        pd.Series with one entry per output column. Product fields hold the
        group's distinct values joined by a space; location and date fields take
        the first distinct value in the group; 'Symptoms' is the sorted,
        space-joined set of all symptom values found in the group's rows.
    """
    # Multi-valued fields are stored as space-joined strings (not tuples).
    return pd.Series({
        'ProductGroup': ' '.join(rows['Installed Base.Product Group'].unique()),
        'ProductId': ' '.join(rows['Installed Base.InstalledBase ProductID'].unique()),
        'Country': rows['Location.Country'].unique()[0],
        'City': rows['Location.City'].unique()[0],
        'LocationType': rows['Location.Location Type'].unique()[0],
        'PostalCode': rows['Location.Postal Code'].unique()[0],
        'ProductName': ' '.join(rows['Product.Product Name'].unique()),
        'ProductNr': ' '.join(rows['Product.Product Number'].unique()),
        'Start': rows['Job Card.Date Start Work'].unique()[0],
        'End': rows['Job Card.Date End Work'].unique()[0],
        # Sorted so the string does not depend on set iteration order, which is
        # not stable across interpreter runs for strings.
        'Symptoms': ' '.join(sorted({
            str(symptom)
            for symptoms in rows['Symptoms']
            for symptom in symptoms
        })),
    })


df = df.groupby('Job Card.JobCard Number').apply(_summarise_job_card).reset_index()

run.log('# rows after merging cases', len(df))

##########################################################################

# split data (test data from last t_test years; a year is counted as 365 days)
t_test = 0.5
# Compute the cut-off once: calling datetime.today() separately for each mask
# yields two slightly different timestamps, so a row starting between them
# would end up in neither split.
test_cutoff = datetime.datetime.today() - datetime.timedelta(days=t_test*365)
df_train = df[df['Start'] < test_cutoff]
df_test = df[df['Start'] >= test_cutoff]

run.log('# rows in train data', len(df_train))
run.log('# rows in test data', len(df_test))

#############################################################################

# Column roles handed to create_pipeline.
cfg = {
    'multi_cols': ['ProductGroup', 'Symptoms'],  # 'ProductId' is currently left out
    'cat_cols': ['Country', 'City', 'LocationType', 'PostalCode'],
    'date_cols': ['Start', 'End'],
    'num_cols': [],
    'target_cols': ['ProductNr'],
}

# Build the preprocessing pipeline from the column configuration.
pipe = create_pipeline(cfg)

# Fit on the training split only, then apply the fitted pipeline to the test split.
df_train = pipe.fit_transform(df_train)
df_test = pipe.transform(df_test)

# Name the output columns: the trailing len(col_cats[0]) columns are labelled
# 'target_<i>', everything before them 'feat_<i>'. <i> is the overall column
# position, so target numbering does not restart at 0.
# NOTE(review): col_cats[0] is presumably the category list of the single
# target column -- confirm against pipe.py.
target_encoder = pipe.transformer_list[1][1].named_steps['target_encode']
n_columns = df_train.shape[1]
n_features = n_columns - len(target_encoder.col_cats[0])
columns = [
    ('feat_' if position < n_features else 'target_') + str(position)
    for position in range(n_columns)
]
df_train = pd.DataFrame(df_train, columns=columns)
df_test = pd.DataFrame(df_test, columns=columns)

############################################################################

# Write both prepared splits as ';'-separated CSV files (with header, without
# the index) into the 'outputs' folder, creating the folder if needed.
os.makedirs('outputs', exist_ok=True)
for prepared, csv_path in (
    (df_train, './outputs/train_data.csv'),
    (df_test, './outputs/test_data.csv'),
):
    prepared.to_csv(csv_path, sep=';', header=True, index=False)

############################################################################

# Mark the run as completed.
run.complete()