In [1]:
from azureml.core import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

import pandas as pd
import numpy as np
import datetime 

In [2]:
# Authenticate interactively against the given tenant, then open the Azure ML workspace.
interactive_auth = InteractiveLoginAuthentication(
    tenant_id="39288a38-ff19-432c-8011-1cd9d0dff445",
)
ws = Workspace(
    subscription_id="793146d9-d4dc-4a73-9728-76c4ffd0cc0d",
    resource_group="rg_dynamics_test",
    workspace_name="resdynml1test",
    auth=interactive_auth,
)

In [3]:
# Load the symptom lookup table from the workspace datasets.
# (Local alternative: pd.read_csv('symptomcodes.csv', sep=';', header=0))
df_symptoms_init = ws.datasets['symptomcodes.csv'].to_pandas_dataframe()

# Stack the non-null entries of the four symptom columns and keep each code once,
# in order of first appearance.
symptoms_codes = pd.concat(
    [df_symptoms_init[column].dropna()
     for column in ('Symptom1', 'Symptom2', 'Symptom3', 'Symptom4')],
    axis=0,
).unique()

# Prefix every code with 'symptom_'.
symptoms_codes = ['symptom_' + str(code) for code in symptoms_codes]

In [4]:
# Fetch the item/resource dataset from the workspace as a pandas DataFrame
# and show its row count.
df = ws.datasets['ItemResourceData.csv'].to_pandas_dataframe()
len(df)

752544

In [5]:
# Keep only rows whose work started within the last n years
# (a year is approximated as 365 days, counted back from the current time).
n = 5
cutoff = datetime.datetime.today() - datetime.timedelta(days=n * 365)
started_recently = df['Job Card.Date Start Work'] > cutoff
df_tr = df[started_recently]
len(df_tr)

606229

In [6]:
# Counterpart of the n-year filter: number of rows whose start date lies before the cutoff.
# NOTE(review): the original remark here was just "1.1.1900" — presumably a placeholder
# start date found in these rows; confirm against the data.
older_than_cutoff = df['Job Card.Date Start Work'] < (
    datetime.datetime.today() - datetime.timedelta(days=n * 365)
)
len(df[older_than_cutoff])

146315

In [7]:
# Turn placeholder strings into NaN in every column
# (e.g. '-' occurs as Installed Base.InstalledBase ProductID).
placeholder_values = ['', '0', '-', '000', 'N/A']
df_tr = df_tr.replace(to_replace=placeholder_values, value=np.nan)
len(df_tr)

606229

In [8]:
# Show every column of the unfiltered dataset.
df.columns

Index(['Installed Base.Product Group',
       'Installed Base.InstalledBase ProductID', 'Job Card.JobCard Number',
       'Location.Country', 'Location.City', 'Location.Location Type',
       'Location.Postal Code', 'Product.Product Name',
       'Product.Product Number', 'ItemResourceAppliedQuantity',
       'Job Card.Date Start Work', 'Job Card.Date End Work',
       'Job Card.ComponentCode', 'Job Card.FailureCode',
       'Job Card.Symptom Description'],
      dtype='object')

In [9]:
# Restrict to the columns listed here (ItemResourceAppliedQuantity is left out),
# then drop every row that still has a missing value and renumber the index.
selected_columns = [
    'Installed Base.Product Group',
    'Installed Base.InstalledBase ProductID',
    'Location.Country',
    'Location.City',
    'Location.Location Type',
    'Location.Postal Code',
    'Job Card.JobCard Number',
    'Product.Product Number',
    'Product.Product Name',
    'Job Card.Date Start Work',
    'Job Card.Date End Work',
    'Job Card.ComponentCode',
    'Job Card.FailureCode',
    'Job Card.Symptom Description',
]
df_tr = df_tr[selected_columns].dropna().reset_index(drop=True)
len(df_tr)

385877

In [10]:
# Show the columns that remain in the filtered frame.
df_tr.columns

Index(['Installed Base.Product Group',
       'Installed Base.InstalledBase ProductID', 'Location.Country',
       'Location.City', 'Location.Location Type', 'Location.Postal Code',
       'Job Card.JobCard Number', 'Product.Product Number',
       'Product.Product Name', 'Job Card.Date Start Work',
       'Job Card.Date End Work', 'Job Card.ComponentCode',
       'Job Card.FailureCode', 'Job Card.Symptom Description'],
      dtype='object')

# Features

## Product Group / Product ID

In [11]:
# Issue: one product ID can belong to several groups, and one group covers several IDs.
group_col = 'Installed Base.Product Group'
id_col = 'Installed Base.InstalledBase ProductID'
group_id_pairs = df_tr[[group_col, id_col]].drop_duplicates()

# number of IDs per group (mean, then median)
ids_per_group = group_id_pairs.groupby([group_col])[id_col].count()
print(ids_per_group.mean())
print(ids_per_group.median())

# number of groups per ID (mean, then median)
groups_per_id = group_id_pairs.groupby([id_col])[group_col].count()
print(groups_per_id.mean())
print(groups_per_id.median())

74.21142857142857
10.0
1.0110548851693266
1.0


In [12]:
# Example: a single group ('22101') that spans several product IDs.
is_group_22101 = df_tr['Installed Base.Product Group'] == '22101'
df_tr.loc[
    is_group_22101,
    ['Installed Base.InstalledBase ProductID', 'Installed Base.Product Group'],
].drop_duplicates()

Unnamed: 0,Installed Base.InstalledBase ProductID,Installed Base.Product Group
80,111319193,22101
83,111319296,22101
9052,2001345,22101
9054,2001350,22101
9061,2001351,22101
9550,2006931,22101
28533,2044009,22101
28559,2044010,22101
28588,2044011,22101
28590,2044012,22101


In [13]:
# Example intended to show a single product ID that maps to several groups.
# NOTE(review): the saved output of this cell is empty, so '9140962' appears not to be
# present in df_tr after filtering — pick an ID that still exists to illustrate the point.
is_product_9140962 = df_tr['Installed Base.InstalledBase ProductID'] == '9140962'
df_tr.loc[
    is_product_9140962,
    ['Installed Base.InstalledBase ProductID', 'Installed Base.Product Group'],
].drop_duplicates()

Unnamed: 0,Installed Base.InstalledBase ProductID,Installed Base.Product Group


## Location

In [14]:
# Issue: one product ID can have several locations (depends on the case ID).

# For each location attribute: mean number of different values per product ID.
for location_col in ['Location.Country', 'Location.City', 'Location.Location Type', 'Location.Postal Code']:
    pairs = df_tr[[location_col, 'Installed Base.InstalledBase ProductID']].drop_duplicates()
    print(pairs.groupby(['Installed Base.InstalledBase ProductID'])[location_col].count().mean())

# Example: all job-card rows and their locations for one product ID.
is_example_product = df_tr['Installed Base.InstalledBase ProductID'] == '111151799'
print(df_tr[is_example_product][['Job Card.JobCard Number',  'Installed Base.InstalledBase ProductID', 'Location.Country', 'Location.City', 'Location.Location Type', 'Location.Postal Code']])

1.2106656286492798
4.208407940833009
2.0445309458933436
4.799299338263916
      Job Card.JobCard Number Installed Base.InstalledBase ProductID  \
42      C-BE005805-HA JC01.00                              111151799   
43      C-BE005805-HA JC01.00                              111151799   
44   CAS-45039-B8Q3F3 JC01.00                              111151799   
45   CAS-45039-B8Q3F3 JC01.00                              111151799   
46   CAS-45039-B8Q3F3 JC01.00                              111151799   
47  CAS-142034-M9P4B2 JC01.00                              111151799   
48  CAS-142034-M9P4B2 JC01.00                              111151799   
49  CAS-142034-M9P4B2 JC01.00                              111151799   
50   CAS-65124-G7C0X4 JC01.00                              111151799   
51   CAS-65124-G7C0X4 JC01.00                              111151799   
52   CAS-65124-G7C0X4 JC01.00                              111151799   
53   CAS-65124-G7C0X4 JC01.00                              111

## Component and Failure Codes

In [15]:
# Note: some component/failure code combinations are missing from the symptom table.

df_failcodes = df_tr[['Job Card.JobCard Number', 'Job Card.ComponentCode','Job Card.FailureCode']].rename(columns={'Job Card.JobCard Number':'CaseId'})
df_symptoms = df_symptoms_init[['ComponentCode', 'FailureCode', 'Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']]

# Build the (component code, failure code) tuple column for the training data.
# zip over the two columns replaces the row-wise apply(axis=1), which calls a Python
# lambda once per row and is slow on a frame of this size; the resulting tuples are the same.
df_failcodes_combo = pd.DataFrame(
    {'CompFail': list(zip(df_failcodes['Job Card.ComponentCode'], df_failcodes['Job Card.FailureCode']))},
    index=df_failcodes.index,
)
df_failcodes = pd.concat([df_failcodes[['CaseId']], df_failcodes_combo], axis=1)

# Build the same (component code, failure code) tuple column for the symptom table.
symptoms_combo = pd.DataFrame(
    {'CompFail': list(zip(df_symptoms['ComponentCode'], df_symptoms['FailureCode']))},
    index=df_symptoms.index,
)
df_symptoms = pd.concat([df_symptoms[['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']], symptoms_combo], axis=1)

In [16]:
# Preview the symptom table with its CompFail tuple column.
df_symptoms.head()

Unnamed: 0,Symptom1,Symptom2,Symptom3,Symptom4,CompFail
0,G001,P002,K001,K002,"(CS1A, CSW9)"
1,P002,M002,,,"(CS1A, CSX3)"
2,P002,M002,,,"(CS1A, CSY0)"
3,A002,M002,,,"(CS1B, CSY5)"
4,G001,,,,"(CS1C, CSZ9)"


In [17]:
# Preview the per-case CompFail tuples of the training data.
df_failcodes.head()

Unnamed: 0,CaseId,CompFail
0,CAS-34621-Z7R3S7 JC18.00,"(CSXT, CSY7)"
1,CAS-57422-L8N1P3 JC07.00,"(CSXT, CSY7)"
2,CAS-93677-L2H9S3 JC01.00,"(CSXT, CSY7)"
3,CAS-120240-N5Z8G3 JC01.00,"(CSXT, CSY7)"
4,CAS-162708-W1W3T3 JC01.00,"(CSXT, CSY7)"


In [18]:
symptom_combos = set(df_symptoms['CompFail'])
train_combos = set(df_failcodes['CompFail'])

# Combos listed in the symptom table but absent from the training data (not critical, but why?)
print(len(symptom_combos - train_combos))

# Combos in the training data but absent from the symptom table (cannot be translated into symptoms)
print(len(train_combos - symptom_combos))

33
3


In [19]:
# Combos in the symptom table that carry no symptom at all
# (cannot be translated into symptoms).
symptom_cols = ['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']
has_no_symptom = df_symptoms[symptom_cols].isnull().all(axis=1)
df_s = df_symptoms[has_no_symptom].reset_index(drop=True)
len(df_s)

30

## Description

In [21]:
# Could be analysed as well, but rather at a later stage.
# Open question: does this field actually contain anything useful?

df_tr['Job Card.Symptom Description'].head()

0    Vertragswartung
1    Vertragswartung
2    Vertragswartung
3    Vertragswartung
4    Vertragswartung
Name: Job Card.Symptom Description, dtype: object

# Targets

## ProductName / ProductNr

In [22]:
# Show the columns of the filtered frame again before looking at the target columns.
df_tr.columns

Index(['Installed Base.Product Group',
       'Installed Base.InstalledBase ProductID', 'Location.Country',
       'Location.City', 'Location.Location Type', 'Location.Postal Code',
       'Job Card.JobCard Number', 'Product.Product Number',
       'Product.Product Name', 'Job Card.Date Start Work',
       'Job Card.Date End Work', 'Job Card.ComponentCode',
       'Job Card.FailureCode', 'Job Card.Symptom Description'],
      dtype='object')

In [23]:
# Issue: one product number can carry several names, and one name several numbers.
number_name_pairs = df_tr[['Product.Product Number', 'Product.Product Name']].drop_duplicates()

# mean number of names per product number
names_per_number = number_name_pairs.groupby(['Product.Product Number'])['Product.Product Name'].count()
print(names_per_number.mean())

# mean number of product numbers per name
numbers_per_name = number_name_pairs.groupby(['Product.Product Name'])['Product.Product Number'].count()
print(numbers_per_name.mean())

1.2075725026852846
1.0177662102523481


In [24]:
# Example: all names recorded for a single product number.
has_number = df_tr['Product.Product Number'] == '110586593'
df_tr.loc[has_number, 'Product.Product Name'].drop_duplicates()

26467    PASSSTÜCK FL DN40 X 30 PN6 KPL.F1
59225           F1 Passstück DN40 30mm PN6
97593                            SPACER F1
Name: Product.Product Name, dtype: object

In [25]:
# Example: all product numbers recorded for a single product name.
has_name = df_tr['Product.Product Name'] == 'Flanschdichtung DN65 PN10'
df_tr.loc[has_name, 'Product.Product Number'].drop_duplicates()

37106     2830018-10
103143       2830091
Name: Product.Product Number, dtype: object

### Duration

In [None]:
# Several timestamps per case: mean number of different (start, end) rows per job card.
# The two columns are selected with a list; selecting several columns from a GroupBy
# with a bare tuple (['a', 'b']) is deprecated and raises in current pandas versions.
print(
    df_tr[['Job Card.JobCard Number', 'Job Card.Date Start Work', 'Job Card.Date End Work']]
    .drop_duplicates()
    .groupby(['Job Card.JobCard Number'])[['Job Card.Date Start Work', 'Job Card.Date End Work']]
    .count()
    .mean()
)

In [None]:
# Per job card: how many different (start, end) rows exist, as a flat frame.
# Columns are selected with a list; a bare tuple on a GroupBy is deprecated and
# raises in current pandas versions.
x = (
    df_tr[['Job Card.JobCard Number', 'Job Card.Date Start Work', 'Job Card.Date End Work']]
    .drop_duplicates()
    .groupby(['Job Card.JobCard Number'])[['Job Card.Date Start Work', 'Job Card.Date End Work']]
    .count()
    .reset_index()
)

In [None]:
# Example: all start/end timestamps recorded for one job card.
is_case = df_tr['Job Card.JobCard Number'] == 'CAS-124239-C6C8Y1 JC01.00'
df_tr.loc[is_case, ['Job Card.Date Start Work', 'Job Card.Date End Work']]

In [None]:
# How should the duration be computed? E.g. for this case:
# 1) from 6:30 to 14:00                                                = 10.5h
# or
# 2) sum: 6:30-7:00 + 11:30-12:00 + 12:00-12:30 + 13:30-14:00          = 2h          <=== probably this one
# or
# 3) sum over all entries                                              = 2.5
# NOTE(review): 6:30 to 14:00 spans 7.5h by the clock, not 10.5h — check option 1
# against the rows shown below.

is_case = df_tr['Job Card.JobCard Number'] == 'CAS-77817-X1H3H1 JC01.00'
df_tr.loc[is_case, ['Job Card.Date Start Work', 'Job Card.Date End Work']]

In [None]:
# Job cards whose 'Job Card.Date Start Work' count in x is greater than one
# (x is the per-job-card count frame built in an earlier cell).
x[x['Job Card.Date Start Work']>1]

In [None]:
# Deduplicated (job card, start, end) rows.
timestamp_columns = ['Job Card.JobCard Number', 'Job Card.Date Start Work', 'Job Card.Date End Work']
y = df_tr[timestamp_columns].drop_duplicates()