In [75]:
from azureml.core import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

import pandas as pd
import numpy as np
import datetime 

In [76]:
# Log in interactively against the given Azure AD tenant, then open the
# Azure ML workspace whose registered datasets are read further down.
interactive_auth = InteractiveLoginAuthentication(
    tenant_id="39288a38-ff19-432c-8011-1cd9d0dff445",
)
ws = Workspace(
    subscription_id="793146d9-d4dc-4a73-9728-76c4ffd0cc0d",
    resource_group="rg_dynamics_test",
    workspace_name="resdynml1test",
    auth=interactive_auth,
)

In [77]:
# Load the symptom lookup table from the workspace's registered datasets.
# (Local alternative: pd.read_csv('symptomcodes.csv', sep=';', header=0))
symptom_dataset = ws.datasets['symptomcodes.csv']
df_symptoms_init = symptom_dataset.to_pandas_dataframe()

# Stack the non-null values of the four symptom columns into one Series.
symptom_columns = ['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']
stacked_symptoms = pd.concat(
    [df_symptoms_init[column].dropna() for column in symptom_columns],
    axis=0,
)

# Keep every distinct code once and prefix it with 'symptom_'.
symptoms_codes = ['symptom_' + str(code) for code in stacked_symptoms.unique()]

In [78]:
# Read the item/resource records from the workspace and show the row count.
item_resource_dataset = ws.datasets['ItemResourceData.csv']
df = item_resource_dataset.to_pandas_dataframe()
len(df)

752544

In [79]:
# Keep only the rows whose work started within the last `n` years.
# A year is approximated as 365 days, and the cutoff is taken relative to the
# moment the cell runs, so the resulting row count changes from run to run.
n = 5
start_cutoff = datetime.datetime.today() - datetime.timedelta(days=n * 365)
# The former `df_tr = df` line was a dead store (overwritten right away) and is gone.
df_tr = df[df['Job Card.Date Start Work'] > start_cutoff]
len(df_tr)

605640

In [80]:
# 1.1.1900 -- count the rows whose start date lies before the n-year cutoff;
# the rows previewed in the following cell all carry the date 1900-01-01.
older_than_cutoff = df['Job Card.Date Start Work'] < (
    datetime.datetime.today() - datetime.timedelta(days=n * 365)
)
len(df[older_than_cutoff])

146904

In [81]:
# Preview the first rows whose start date lies before the n-year cutoff.
stale_cutoff = datetime.datetime.today() - datetime.timedelta(days=n * 365)
stale_rows = df[df['Job Card.Date Start Work'] < stale_cutoff]
stale_rows.head()

Unnamed: 0,Installed Base.Product Group,Installed Base.InstalledBase ProductID,Job Card.JobCard Number,Location.Country,Location.City,Location.Location Type,Location.Postal Code,Product.Product Name,Product.Product Number,ItemResourceAppliedQuantity,Job Card.Date Start Work,Job Card.Date End Work,Job Card.ComponentCode,Job Card.FailureCode,Job Card.Symptom Description
23,,-,CAS-105628-J5W5Y8 JC01.00,GB,Suffolk,Building - Commercial,IP1 1DT,Labour Supervisor Basic Time (15 min.),2028262,0,1900-01-01,1900-01-01,,,
24,,-,CAS-105628-J5W5Y8 JC01.00,GB,Suffolk,Building - Commercial,IP1 1DT,Travel Time (15 min.),2028261,0,1900-01-01,1900-01-01,,,
25,,-,CAS-105628-J5W5Y8 JC02.00,GB,Suffolk,Building - Commercial,IP1 1DT,Labour Supervisor Basic Time (15 min.),2028262,0,1900-01-01,1900-01-01,,,
26,,-,CAS-105628-J5W5Y8 JC02.00,GB,Suffolk,Building - Commercial,IP1 1DT,Travel Time (15 min.),2028261,0,1900-01-01,1900-01-01,,,
49,,-,CAS-77855-B1M2F5 JC02.00,GB,London,Building - Residential,SW20 8JG,Labour Supervisor Basic Time (15 min.),2028262,0,1900-01-01,1900-01-01,,,


In [82]:
# Turn placeholder tokens into NaN. The replacement runs over every column,
# not only 'Installed Base.InstalledBase ProductID', where '-' shows up as a
# placeholder in the preview of the raw data.
placeholder_tokens = ['', '0', '-', '000', 'N/A']
df_tr = df_tr.replace(to_replace=placeholder_tokens, value=np.nan)
len(df_tr)

605640

In [83]:
# Restrict the frame to the columns analysed below, drop every row that has a
# missing value in any of them, and renumber the index from zero.
selected_columns = [
    'Installed Base.Product Group',
    'Installed Base.InstalledBase ProductID',
    'Location.Country',
    'Location.City',
    'Location.Location Type',
    'Location.Postal Code',
    'Job Card.JobCard Number',
    'Product.Product Number',
    'Product.Product Name',
    'Job Card.Date Start Work',
    'Job Card.Date End Work',
    'Job Card.ComponentCode',
    'Job Card.FailureCode',
    'Job Card.Symptom Description',
]
df_tr = df_tr.loc[:, selected_columns].dropna().reset_index(drop=True)
len(df_tr)

385346

In [84]:
# Show the column labels of the filtered training frame.
df_tr.columns

Index(['Installed Base.Product Group',
       'Installed Base.InstalledBase ProductID', 'Location.Country',
       'Location.City', 'Location.Location Type', 'Location.Postal Code',
       'Job Card.JobCard Number', 'Product.Product Number',
       'Product.Product Name', 'Job Card.Date Start Work',
       'Job Card.Date End Work', 'Job Card.ComponentCode',
       'Job Card.FailureCode', 'Job Card.Symptom Description'],
      dtype='object')

# Features

## Product Group / Product ID

In [13]:
# Problem: one product ID can have several groups, and one group has several IDs.
group_id_pairs = df_tr[['Installed Base.Product Group', 'Installed Base.InstalledBase ProductID']].drop_duplicates()

# Number of IDs per group (mean, then median)
ids_per_group = group_id_pairs.groupby(['Installed Base.Product Group'])['Installed Base.InstalledBase ProductID'].count()
print(ids_per_group.mean())
print(ids_per_group.median())

# Number of groups per ID (mean, then median)
groups_per_id = group_id_pairs.groupby(['Installed Base.InstalledBase ProductID'])['Installed Base.Product Group'].count()
print(groups_per_id.mean())
print(groups_per_id.median())

74.18857142857142
10.0
1.0110583287905925
1.0


In [17]:
# Example: a single product group that spans several product IDs.
in_group_22101 = df_tr['Installed Base.Product Group'] == '22101'
df_tr.loc[in_group_22101, ['Installed Base.InstalledBase ProductID', 'Installed Base.Product Group']].drop_duplicates().head()

Unnamed: 0,Installed Base.InstalledBase ProductID,Installed Base.Product Group
80,111319193,22101
83,111319296,22101
9038,2001345,22101
9040,2001350,22101
9047,2001351,22101


In [69]:
# Count the product IDs that are assigned to more than one product group.
# NOTE(review): these statements were commented out although the cell still
# carries a stored output (128), presumably produced by them; they are active
# again so that a fresh "Run All" reproduces the number. Confirm this is wanted.
df_count = (
    df_tr[['Installed Base.Product Group', 'Installed Base.InstalledBase ProductID']]
    .drop_duplicates()
    .groupby(['Installed Base.InstalledBase ProductID'])['Installed Base.Product Group']
    .count()
)
len(df_count[df_count > 1])

128

In [16]:
# Example: a single product ID that is listed under several product groups.
is_product_2155017 = df_tr['Installed Base.InstalledBase ProductID'] == '2155017'
df_tr.loc[is_product_2155017, ['Installed Base.InstalledBase ProductID', 'Installed Base.Product Group']].drop_duplicates()

Unnamed: 0,Installed Base.InstalledBase ProductID,Installed Base.Product Group
93078,2155017,93108
93262,2155017,43601
93282,2155017,43503
93323,2155017,41402


In [None]:
# In general:
# large number of IDs => problematic feature
# => one classifier per ID/instance?            <<< yes, ID = article type; above all, a fixed set of spare parts/services per article
# => or one more general classifier?

In [73]:
# Number of distinct product IDs (a missing value would count as one value).
df_tr['Installed Base.InstalledBase ProductID'].nunique(dropna=False)

12841

In [74]:
# Number of distinct product groups (a missing value would count as one value).
df_tr['Installed Base.Product Group'].nunique(dropna=False)

175

## Location

In [21]:
# Problem: one product ID can have several locations (depends on the case ID).

# Mean number of distinct values per product ID, one line per location attribute.
for location_column in ['Location.Country', 'Location.City', 'Location.Location Type', 'Location.Postal Code']:
    distinct_pairs = df_tr[[location_column, 'Installed Base.InstalledBase ProductID']].drop_duplicates()
    values_per_id = distinct_pairs.groupby(['Installed Base.InstalledBase ProductID'])[location_column].count()
    print(values_per_id.mean())

# Example: every row recorded for one product ID, with its location fields.
example_rows = df_tr[df_tr['Installed Base.InstalledBase ProductID'] == '111151799']
print(example_rows[['Job Card.JobCard Number', 'Installed Base.InstalledBase ProductID', 'Location.Country', 'Location.City', 'Location.Location Type', 'Location.Postal Code']])

1.2106533759053033
4.205124211510007
2.0441554396075072
4.795420917374036
      Job Card.JobCard Number Installed Base.InstalledBase ProductID  \
42      C-BE005805-HA JC01.00                              111151799   
43      C-BE005805-HA JC01.00                              111151799   
44   CAS-45039-B8Q3F3 JC01.00                              111151799   
45   CAS-45039-B8Q3F3 JC01.00                              111151799   
46   CAS-45039-B8Q3F3 JC01.00                              111151799   
47  CAS-142034-M9P4B2 JC01.00                              111151799   
48  CAS-142034-M9P4B2 JC01.00                              111151799   
49  CAS-142034-M9P4B2 JC01.00                              111151799   
50   CAS-65124-G7C0X4 JC01.00                              111151799   
51   CAS-65124-G7C0X4 JC01.00                              111151799   
52   CAS-65124-G7C0X4 JC01.00                              111151799   
53   CAS-65124-G7C0X4 JC01.00                              111

In [22]:
# Problem: one product ID can have several locations (depends on the case ID).

# Same count as before, but per (product ID, product group) pair.
id_group_keys = ['Installed Base.InstalledBase ProductID', 'Installed Base.Product Group']
for location_column in ['Location.Country', 'Location.City', 'Location.Location Type', 'Location.Postal Code']:
    distinct_rows = df_tr[[location_column] + id_group_keys].drop_duplicates()
    values_per_key = distinct_rows.groupby(id_group_keys)[location_column].count()
    print(values_per_key.mean())

# Example: every row recorded for one product ID, with group and location fields.
example_rows = df_tr[df_tr['Installed Base.InstalledBase ProductID'] == '111151799']
print(example_rows[['Job Card.JobCard Number', 'Installed Base.InstalledBase ProductID', 'Installed Base.Product Group', 'Location.Country', 'Location.City', 'Location.Location Type', 'Location.Postal Code']])

1.2014172379265193
4.16067164753909
2.0317338057459753
4.744435030424401
      Job Card.JobCard Number Installed Base.InstalledBase ProductID  \
42      C-BE005805-HA JC01.00                              111151799   
43      C-BE005805-HA JC01.00                              111151799   
44   CAS-45039-B8Q3F3 JC01.00                              111151799   
45   CAS-45039-B8Q3F3 JC01.00                              111151799   
46   CAS-45039-B8Q3F3 JC01.00                              111151799   
47  CAS-142034-M9P4B2 JC01.00                              111151799   
48  CAS-142034-M9P4B2 JC01.00                              111151799   
49  CAS-142034-M9P4B2 JC01.00                              111151799   
50   CAS-65124-G7C0X4 JC01.00                              111151799   
51   CAS-65124-G7C0X4 JC01.00                              111151799   
52   CAS-65124-G7C0X4 JC01.00                              111151799   
53   CAS-65124-G7C0X4 JC01.00                              1111

## Component and Failure Codes

In [23]:
# There are component/failure code combinations that are missing from the symptom table.

df_failcodes = df_tr[['Job Card.JobCard Number', 'Job Card.ComponentCode', 'Job Card.FailureCode']].rename(columns={'Job Card.JobCard Number': 'CaseId'})
df_symptoms = df_symptoms_init[['ComponentCode', 'FailureCode', 'Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']]

# Create the (component code, failure code) combination column for the training data.
# zip() over the two columns yields the same tuples as a row-wise
# DataFrame.apply(..., axis=1), without invoking a Python lambda for every row.
df_failcodes_combo = pd.DataFrame(
    {'CompFail': list(zip(df_failcodes['Job Card.ComponentCode'], df_failcodes['Job Card.FailureCode']))},
    index=df_failcodes.index,
)
df_failcodes = pd.concat([df_failcodes[['CaseId']], df_failcodes_combo], axis=1)

# Create the (component code, failure code) combination column for the symptom table.
symptoms_combo = pd.DataFrame(
    {'CompFail': list(zip(df_symptoms['ComponentCode'], df_symptoms['FailureCode']))},
    index=df_symptoms.index,
)
df_symptoms = pd.concat([df_symptoms[['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']], symptoms_combo], axis=1)

In [37]:
# Preview the symptom table together with its combined CompFail column.
print(df_symptoms.head())
# Row 43: in the stored output all four symptom fields of this row are None.
print(df_symptoms.iloc[43])

  Symptom1 Symptom2 Symptom3 Symptom4      CompFail
0     G001     P002     K001     K002  (CS1A, CSW9)
1     P002     M002     None     None  (CS1A, CSX3)
2     P002     M002     None     None  (CS1A, CSY0)
3     A002     M002     None     None  (CS1B, CSY5)
4     G001     None     None     None  (CS1C, CSZ9)
Symptom1            None
Symptom2            None
Symptom3            None
Symptom4            None
CompFail    (CSXE, CSX9)
Name: 43, dtype: object


In [25]:
# Preview the training-data frame reduced to CaseId and the CompFail tuple.
df_failcodes.head()

Unnamed: 0,CaseId,CompFail
0,CAS-34621-Z7R3S7 JC18.00,"(CSXT, CSY7)"
1,CAS-57422-L8N1P3 JC07.00,"(CSXT, CSY7)"
2,CAS-93677-L2H9S3 JC01.00,"(CSXT, CSY7)"
3,CAS-120240-N5Z8G3 JC01.00,"(CSXT, CSY7)"
4,CAS-162708-W1W3T3 JC01.00,"(CSXT, CSY7)"


In [26]:
symptom_combos = set(df_symptoms['CompFail'])
training_combos = set(df_failcodes['CompFail'])

# Combos in the symptom table but not in the training data (not that bad, but worth a look?)
print(len(symptom_combos - training_combos))

# Combos in the training data but not in the symptom table (no symptoms can be derived)
print(len(training_combos - symptom_combos))

33
3


In [None]:
# List the combos that occur in the training data but not in the symptom table.
combos_in_training = set(df_failcodes['CompFail'])
combos_in_training - set(df_symptoms['CompFail'])

In [None]:
# Show every distinct combo contained in the symptom table.
# (The closing parenthesis was missing, which made this cell a SyntaxError.)
set(df_symptoms['CompFail'])

In [27]:
# Combos in the symptom table that have no symptom at all
# (no symptoms can be derived for them)

all_symptoms_missing = df_symptoms[['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']].isnull().all(axis=1)
df_s = df_symptoms[all_symptoms_missing].reset_index(drop=True)
len(df_s)

30

## Description

In [None]:
# Could be analysed as well, but rather at a later stage.
# Open question: does this field contain anything meaningful?

df_tr['Job Card.Symptom Description'].head(30)

# Targets

## ProductName / ProductNr

In [39]:
# Show the available column labels again before looking at the target columns.
df_tr.columns

Index(['Installed Base.Product Group',
       'Installed Base.InstalledBase ProductID', 'Location.Country',
       'Location.City', 'Location.Location Type', 'Location.Postal Code',
       'Job Card.JobCard Number', 'Product.Product Number',
       'Product.Product Name', 'Job Card.Date Start Work',
       'Job Card.Date End Work', 'Job Card.ComponentCode',
       'Job Card.FailureCode', 'Job Card.Symptom Description'],
      dtype='object')

In [40]:
# Problem: one product number can have several names, and one name several numbers.
number_name_pairs = df_tr[['Product.Product Number', 'Product.Product Name']].drop_duplicates()

# Number of names per product number (mean)
names_per_number = number_name_pairs.groupby(['Product.Product Number'])['Product.Product Name'].count()
print(names_per_number.mean())

# Number of product numbers per name (mean)
numbers_per_name = number_name_pairs.groupby(['Product.Product Name'])['Product.Product Number'].count()
print(numbers_per_name.mean())

1.207628256782165
1.0177702320316921


In [41]:
# Example: the distinct names recorded for one product number.
has_number_110586593 = df_tr['Product.Product Number'] == '110586593'
df_tr.loc[has_number_110586593, 'Product.Product Name'].drop_duplicates()

26423    PASSSTÜCK FL DN40 X 30 PN6 KPL.F1
59108           F1 Passstück DN40 30mm PN6
97414                            SPACER F1
Name: Product.Product Name, dtype: object

In [42]:
# Example: the distinct product numbers recorded for one product name.
is_flange_gasket = df_tr['Product.Product Name'] == 'Flanschdichtung DN65 PN10'
df_tr.loc[is_flange_gasket, 'Product.Product Number'].drop_duplicates()

37040     2830018-10
102964       2830091
Name: Product.Product Number, dtype: object

In [None]:
# Which is the correct target column: name or number?    <<<< use the number
# Is any grouping/reduction possible?                     <<<< no

In [None]:
# Should we split the products into products AND services?
#
# ==> one classifier for products
# ==> one classifier for services
#
# How could we differentiate between products and services?

In [67]:
# Preview job card number, product number and product name of the first rows.
df_tr[['Job Card.JobCard Number', 'Product.Product Number', 'Product.Product Name']].head(20)

Unnamed: 0,Job Card.JobCard Number,Product.Product Number,Product.Product Name
0,CAS-34621-Z7R3S7 JC18.00,2028266,Lohnkosten Wartung Techniker
1,CAS-57422-L8N1P3 JC07.00,2028266,Lohnkosten Wartung Techniker
2,CAS-93677-L2H9S3 JC01.00,2028266,Lohnkosten Wartung Techniker
3,CAS-120240-N5Z8G3 JC01.00,2028266,Lohnkosten Wartung Techniker
4,CAS-162708-W1W3T3 JC01.00,2028266,Lohnkosten Wartung Techniker
5,CAS-41672-N0N6X7 JC01.01,2038370,Einsatzbasisleistung
6,CAS-41672-N0N6X7 JC01.01,2028262,Lohnkosten Spezialist
7,CAS-41672-N0N6X7 JC01.01,2132712,Tagespauschale Messgeräte
8,CAS-41672-N0N6X7 JC01.02,2038370,Einsatzbasisleistung
9,CAS-41672-N0N6X7 JC01.02,2028262,Lohnkosten Spezialist


### Duration

In [43]:
# Several timestamp pairs per case?
# Mean number of distinct start/end values per job card.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# names (`[...]['a', 'b']`) is deprecated since pandas 1.0 and raises in pandas 2.0.
timestamp_columns = ['Job Card.Date Start Work', 'Job Card.Date End Work']
print(df_tr[['Job Card.JobCard Number'] + timestamp_columns].drop_duplicates().groupby(['Job Card.JobCard Number'])[timestamp_columns].count().mean())

Job Card.Date Start Work    1.000031
Job Card.Date End Work      1.000031
dtype: float64


In [46]:
# Number of distinct timestamp pairs per job card, then the job cards that have
# more than one. The GroupBy columns are selected with a list, because the
# tuple form (`[...]['a', 'b']`) is deprecated since pandas 1.0 and raises in 2.0.
timestamp_columns = ['Job Card.Date Start Work', 'Job Card.Date End Work']
x = (
    df_tr[['Job Card.JobCard Number'] + timestamp_columns]
    .drop_duplicates()
    .groupby(['Job Card.JobCard Number'])[timestamp_columns]
    .count()
    .reset_index()
)
x[x['Job Card.Date Start Work'] > 1]

Unnamed: 0,Job Card.JobCard Number,Job Card.Date Start Work,Job Card.Date End Work
33316,CAS-124239-C6C8Y1 JC01.00,2,2
94091,CAS-34560-S4Y6Q9 JC01.05,2,2
117219,CAS-54837-Q4D1N1 JC01.00,3,3
122533,CAS-59460-T9G6C2 JC01.01,2,2


In [47]:
# Example: the timestamp rows of one job card that has more than one pair.
is_example_card = df_tr['Job Card.JobCard Number'] == 'CAS-124239-C6C8Y1 JC01.00'
df_tr.loc[is_example_card, ['Job Card.Date Start Work', 'Job Card.Date End Work']]

Unnamed: 0,Job Card.Date Start Work,Job Card.Date End Work
3991,2018-10-25 09:45:00,2018-10-25 10:30:00
273457,2018-10-25 08:30:00,2018-10-25 09:00:00


In [56]:
# How should the duration be computed?
# e.g. here
# 1) from 6:30 (min_start) to 14:00 (max_end)                          = 10.5h
# or
# 2) sum: 6:30-7:00 + 11:30-12:00 + 12:00-12:30 + 13:30-14:00          = 2h          <=== probably this one
# or
# 3) sum over all entries                                              = 2.5
# NOTE(review): the times quoted above do not appear in the output for this
# job card, and 6:30 to 14:00 spans 7.5h -- confirm which case they refer to.

job_card_times = df_tr.loc[
    df_tr['Job Card.JobCard Number'] == 'CAS-59460-T9G6C2 JC01.01',
    ['Job Card.Date Start Work', 'Job Card.Date End Work'],
]
print(job_card_times.drop_duplicates())
print(job_card_times)

       Job Card.Date Start Work Job Card.Date End Work
270307      2016-10-18 08:30:00    2016-10-18 09:00:00
270328      2016-10-28 13:00:00    2016-10-28 14:30:00
       Job Card.Date Start Work Job Card.Date End Work
270307      2016-10-18 08:30:00    2016-10-18 09:00:00
270308      2016-10-18 08:30:00    2016-10-18 09:00:00
270309      2016-10-18 08:30:00    2016-10-18 09:00:00
270328      2016-10-28 13:00:00    2016-10-28 14:30:00
270329      2016-10-28 13:00:00    2016-10-28 14:30:00
270330      2016-10-28 13:00:00    2016-10-28 14:30:00
270331      2016-10-28 13:00:00    2016-10-28 14:30:00


In [90]:
# Same job card, now with the product name booked on each timestamp row.
is_example_card = df_tr['Job Card.JobCard Number'] == 'CAS-59460-T9G6C2 JC01.01'
df_tr.loc[is_example_card, ['Product.Product Name', 'Job Card.Date Start Work', 'Job Card.Date End Work']]

Unnamed: 0,Product.Product Name,Job Card.Date Start Work,Job Card.Date End Work
270265,Einsatzbasisleistung,2016-10-18 08:30:00,2016-10-18 09:00:00
270266,Lohnkosten Techniker,2016-10-18 08:30:00,2016-10-18 09:00:00
270267,Schmutzzulage,2016-10-18 08:30:00,2016-10-18 09:00:00
270286,Einsatzbasisleistung,2016-10-28 13:00:00,2016-10-28 14:30:00
270287,Lohnkosten Techniker,2016-10-28 13:00:00,2016-10-28 14:30:00
270288,Pumpenaufsatz RexaLift L/16 SET,2016-10-28 13:00:00,2016-10-28 14:30:00
270289,Schmutzzulage,2016-10-28 13:00:00,2016-10-28 14:30:00


In [None]:
# ==> exactly one timestamp pair per job card
# possibly drop the job cards that have several timestamp pairs

In [57]:
# Job cards for which `x` holds a start-timestamp count above 1.
x[x['Job Card.Date Start Work']>1]

Unnamed: 0,Job Card.JobCard Number,Job Card.Date Start Work,Job Card.Date End Work
33316,CAS-124239-C6C8Y1 JC01.00,2,2
94091,CAS-34560-S4Y6Q9 JC01.05,2,2
117219,CAS-54837-Q4D1N1 JC01.00,3,3
122533,CAS-59460-T9G6C2 JC01.01,2,2


In [58]:
# Distinct (job card number, start, end) combinations.
y = df_tr[['Job Card.JobCard Number', 'Job Card.Date Start Work', 'Job Card.Date End Work']].drop_duplicates()