# Explore and Clean Data 2

In [1]:
import math as mt

import numpy as np

import pandas as pd

import pandas_profiling

import mpu.io

# Enables interactive figures
# import mpld3

%matplotlib notebook
import matplotlib.pyplot as plt

#mpld3.enable_notebook()
plt.rcParams['figure.figsize'] = [9.5, 6]

## Import previously saved data

Data was saved in a pickled format. Let's reimport it.

The dataframe "df5": 
- consists of all the Features (=columns, =variables) that are not constant and that are not all NA
- it has 347 columns (27 were dropped)
- the datetime and datetimetz variables were read with the correct dtype

The dictionary "constants":
- consists of all columns that are actually constant

The columns 'qc_entnahme_datum', 'quiver_url' and 'quiver_encoded_query' were dropped since they only contained missing values ('NaN').

In [2]:
# Reload the cleaned feature table that was saved earlier as a pickle.
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
df5 = pd.read_pickle('./Data/process_data_df5.pkl')
# Reload the dictionary of constant columns from its JSON file.
constants = mpu.io.read('./Data/process_constants.json')

## Divide and conquer
Let's check the different dtypes: 

In [3]:
# List every distinct dtype present in the frame, one per line
print('\n'.join(str(dtype) for dtype in df5.dtypes.unique()))

datetime64[ns]
float64
object
bool
int64
datetime64[ns, UTC]


I believe we can trust that the 'datetime' dtypes were correctly recognized.

Issues:
- [ ] Some 'float' variables might actually be 'integers', 'booleans' or 'categorical'
- [ ] Some 'object' variables might actually also be better interpreted as 'categorical'
- [ ] We should check what the 'integer' variables are. (Checked. They are the indexes!)

Minor issues:
- [ ] Variable names (column names) should be consistent (either all in German or all in English?) 
    - [ ] '_%' instead of '_in_percentage', 
    - [ ] '_datum' instead of '_date'
    - [ ] windung instead of winding 
- [ ] Be careful with typos, e.g. 'windung' instead of 'winding'

### constants
'constants' provides important information about reference values:

In [4]:
# Show the dictionary of constant columns and the single value each one holds
constants

{'component': ['MEB'],
 'p_netting': [0],
 'pa_amin-trockner_temperatur_cofely_ref_c': [20.0],
 'pa_ce-capro_lactam_ref_%': [0.0],
 'pa_ctmc_richtwert_ref_%': [0.12],
 'pa_raum_temperatur_start_ref_high_c': [27.0],
 'pa_raum_temperatur_start_ref_low_c': [18.0],
 'pa_staub-sauger_ref_low_0_aus_>0_an': [0.0],
 'pa_temperatur_chlorbad_start_ref_c': [20.0],
 'pa_temperatur_m-pda-bad_ref_high': [22.0],
 'pa_temperatur_m-pda-bad_ref_low': [17.0],
 'ps_auftragsbank_temperatur_start_ref_c': [19.0],
 'ps_auftragswerk_feuchtigkeit_ref_high_%': [65.0],
 'ps_auftragswerk_feuchtigkeit_ref_low_%': [45.0],
 'ps_bad_temperatur_ref_high_c': [23.0],
 'ps_bad_temperatur_ref_low_c': [17.0],
 'ps_c_losung_ref_wt_%': [31.0],
 'ps_dicke_as_ref_high_micro_m': [150.0],
 'ps_dicke_as_ref_low_micro_m': [140.0],
 'ps_gap_ref_high_micro_m': [280.0],
 'ps_gap_ref_low_micro_m': [250.0],
 'ps_raum-feuchtigkeit_start_ref_high_%': [100.0],
 'ps_raum-feuchtigkeit_start_ref_low_%': [45.0],
 'ps_raum_temperatur_start_ref_

### boolean

In [5]:
# Preview the first rows of the boolean features
bool_features = df5.select_dtypes(include=bool)
bool_features.head()

Unnamed: 0,pa_bad-wechsel_mpda_nach,pa_bad-wechsel_chlor_nach
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False


In [6]:
# Summarise dtype, non-null count and memory usage of the boolean features
df5.select_dtypes(include=bool).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118687 entries, 0 to 118686
Data columns (total 2 columns):
pa_bad-wechsel_mpda_nach     118687 non-null bool
pa_bad-wechsel_chlor_nach    118687 non-null bool
dtypes: bool(2)
memory usage: 231.9 KB


### datetime and datetimetz

In [7]:
# Look at rows labelled 50000 through 50006 of the naive and timezone-aware datetime columns
datetime_features = df5.select_dtypes(include=['datetime', 'datetimetz'])
datetime_features.loc[50000:50006]

Unnamed: 0,pa_datum,ps_datum,ps_date_coating,pa_date_coating,qt_datum,pp_plan_actual_date_coating,pp_plan_end_date_winding,windung_begin_date,winding_end_date,qc_erfassungs_datum,...,qc_einlager_datum,sc_d_datum,sc_datum_generate,sc_l_datum_auto,sc_l_datum_hand,qc_datum_leak_test_values,qc_datum_product_properties,reaction_start,reaction_end,derived_date
50000,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:10:47+00:00,...,2018-11-16 17:17:40+00:00,2018-11-15 12:10:47+00:00,2018-11-15 05:32:36+00:00,2018-11-15 05:32:36+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50001,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:18:49+00:00,...,2018-11-16 13:07:59+00:00,2018-11-15 12:18:49+00:00,2018-11-15 06:24:36+00:00,2018-11-15 06:24:36+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50002,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:18:45+00:00,...,2018-11-16 15:34:10+00:00,2018-11-15 12:18:45+00:00,2018-11-15 06:29:04+00:00,2018-11-15 06:29:04+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50003,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:25:10+00:00,...,2018-11-16 13:07:59+00:00,2018-11-15 12:25:10+00:00,2018-11-15 06:26:50+00:00,2018-11-15 06:26:50+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50004,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:25:06+00:00,...,2018-11-16 15:34:10+00:00,2018-11-15 12:25:06+00:00,2018-11-15 06:33:32+00:00,2018-11-15 06:33:32+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50005,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:30:15+00:00,...,2018-11-16 19:51:32+00:00,2018-11-15 12:30:15+00:00,2018-11-15 06:31:18+00:00,2018-11-15 06:31:18+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50006,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:30:09+00:00,...,2018-11-16 19:51:32+00:00,2018-11-15 12:30:09+00:00,2018-11-15 06:38:00+00:00,2018-11-15 06:38:00+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01


In [8]:
# Summarise the datetime columns. `info()` prints its report itself and
# returns None, so it is called directly instead of being wrapped in
# `print()` (which would append a stray "None" line to the output).
df5.select_dtypes(include=['datetime', 'datetimetz']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118687 entries, 0 to 118686
Data columns (total 21 columns):
pa_datum                       118687 non-null datetime64[ns]
ps_datum                       118687 non-null datetime64[ns]
ps_date_coating                118661 non-null datetime64[ns]
pa_date_coating                118687 non-null datetime64[ns]
qt_datum                       116303 non-null datetime64[ns]
pp_plan_actual_date_coating    118687 non-null datetime64[ns]
pp_plan_end_date_winding       118666 non-null datetime64[ns]
windung_begin_date             117824 non-null datetime64[ns]
winding_end_date               118621 non-null datetime64[ns]
qc_erfassungs_datum            118670 non-null datetime64[ns, UTC]
qc_verpackungs_datum           115035 non-null datetime64[ns, UTC]
qc_einlager_datum              114941 non-null datetime64[ns, UTC]
sc_d_datum                     118670 non-null datetime64[ns, UTC]
sc_datum_generate              90978 non-null datetime64[ns, UT

In [9]:
# Descriptive statistics (count, unique, top, freq, range) of the datetime columns
df5.select_dtypes(include=['datetime', 'datetimetz']).describe()

Unnamed: 0,pa_datum,ps_datum,ps_date_coating,pa_date_coating,qt_datum,pp_plan_actual_date_coating,pp_plan_end_date_winding,windung_begin_date,winding_end_date,qc_erfassungs_datum,...,qc_einlager_datum,sc_d_datum,sc_datum_generate,sc_l_datum_auto,sc_l_datum_hand,qc_datum_leak_test_values,qc_datum_product_properties,reaction_start,reaction_end,derived_date
count,118687,118687,118661,118687,116303,118687,118666,117824,118621,118670,...,114941,118670,90978,89422,76,90978,118670,118687,118687,118670
unique,578,594,596,593,578,593,608,607,603,111951,...,6217,111951,46739,88769,71,22521,10,578,578,24
top,2018-01-28 00:00:00,2018-03-03 00:00:00,2019-01-28 00:00:00,2019-03-05 00:00:00,2018-09-06 00:00:00,2019-03-05 00:00:00,2019-02-05 00:00:00,2018-06-25 00:00:00,2018-03-12 00:00:00,2019-09-05 16:19:42+00:00,...,2018-09-21 08:28:36+00:00,2019-09-05 16:19:42+00:00,2019-06-10 02:22:50+00:00,2019-08-25 21:09:54+00:00,2019-09-24 19:08:53+00:00,2019-06-01 01:10:07+00:00,2013-01-08 00:00:00+00:00,2018-01-28 00:00:00+00:00,2018-01-28 00:00:00+00:00,2019-03-01 00:00:00
freq,900,834,964,984,1093,984,790,519,601,4,...,49,4,24,2,2,663,30701,900,900,7428
first,2018-01-04 00:00:00,2018-01-01 00:00:00,2018-01-01 00:00:00,2018-01-03 00:00:00,2018-01-04 00:00:00,2018-01-03 00:00:00,2018-01-11 00:00:00,2017-05-31 00:00:00,2018-01-10 00:00:00,2018-01-09 23:14:40+00:00,...,2018-01-12 05:22:27+00:00,2018-01-09 23:14:40+00:00,2018-06-07 10:04:35+00:00,2018-06-07 12:17:35+00:00,2018-06-07 10:04:35+00:00,2018-10-15 15:21:03+00:00,2012-03-20 00:00:00+00:00,2018-01-04 00:00:00+00:00,2018-01-04 00:00:00+00:00,2018-01-01 00:00:00
last,2019-12-05 00:00:00,2019-11-16 00:00:00,2019-11-17 00:00:00,2019-12-05 00:00:00,2019-12-05 00:00:00,2019-12-05 00:00:00,2019-12-10 00:00:00,2019-12-06 00:00:00,2019-12-06 00:00:00,2019-12-06 10:47:57+00:00,...,2019-12-06 05:27:12+00:00,2019-12-06 10:47:57+00:00,2019-12-05 14:59:30+00:00,2019-12-06 09:07:23+00:00,2019-12-05 22:18:26+00:00,2019-12-06 09:05:01+00:00,2018-10-22 00:00:00+00:00,2019-12-05 00:00:00+00:00,2019-12-05 00:00:00+00:00,2019-12-01 00:00:00


### object

In [10]:
# Preview the first rows of the object-typed features
df5.select_dtypes(include=object).head()

Unnamed: 0,ps,nr,ps_lsg,pp_product_short_name,pp_plan_product,pp_actual_product_short_name,pp_actual_product,pp_actual_usage,winding_product_short_name,winding_product_type,winding_product_line,pa_tmc_gehalt_in_percentage,pa_ref,qc_serien_nummer,qc_barcode_leak_test_values,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr
0,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,6f5dd5e75de0,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
1,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,e83198853aa3,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
2,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,0c6c47811c04,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
3,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,6b51542380df,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
4,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,58df9ba0a603,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W


In [11]:
# Preview the last rows of the object-typed features
df5.select_dtypes(include=object).tail()

Unnamed: 0,ps,nr,ps_lsg,pp_product_short_name,pp_plan_product,pp_actual_product_short_name,pp_actual_product,pp_actual_usage,winding_product_short_name,winding_product_type,winding_product_line,pa_tmc_gehalt_in_percentage,pa_ref,qc_serien_nummer,qc_barcode_leak_test_values,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr
118682,3fa606fdd9e8,2892,231,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,6aa9aee40c62,,0a0d4ada494a,55315e14346a,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
118683,3fa606fdd9e8,2892,231,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,6aa9aee40c62,,0a0d4ada494a,5e840146da5b,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
118684,3fa606fdd9e8,2892,231,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,6aa9aee40c62,,0a0d4ada494a,05ac5c0533e3,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
118685,3fa606fdd9e8,2892,231,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,6aa9aee40c62,,0a0d4ada494a,b629f239aa69,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
118686,3fa606fdd9e8,2892,231,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,6aa9aee40c62,,0a0d4ada494a,44994043c050,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W


In [12]:
# Look at rows labelled 50000 through 50006 of the object-typed features
object_features = df5.select_dtypes(include=object)
object_features.loc[50000:50006]

Unnamed: 0,ps,nr,ps_lsg,pp_product_short_name,pp_plan_product,pp_actual_product_short_name,pp_actual_product,pp_actual_usage,winding_product_short_name,winding_product_type,winding_product_line,pa_tmc_gehalt_in_percentage,pa_ref,qc_serien_nummer,qc_barcode_leak_test_values,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr
50000,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,79d813e4c6bf,2405711870910W0606WK31876,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W
50001,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,1ade0cad8b89,2405711870910W0606WK31877,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W
50002,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,cd97d5690539,2405711870910W0606WK31880,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W
50003,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,9f9ed3771287,2405711870910W0606WK31881,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W
50004,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,707deae1e2e0,2405711870910W0606WK31882,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W
50005,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,8f2bef211dd1,2405711870910W0606WK31883,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W
50006,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,482ad93b3873,2405711870910W0606WK31884,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W


Issues:
1. 'nr', 'ps_lsg', 'pa_tmc_gehalt_in_percentage', 'qc_barcode_leak_test_values' were read as 'object', but apparently they're numerical.
    - 'nr' has empty strings of different lengths
    - 'ps_lsg' ValueError: Unable to parse string "061/062" at position 2399
    - 'pa_tmc_gehalt_in_percentage' ValueError: Unable to parse string "0,075/0,07" at position 46788
    - 'qc_barcode_leak_test_values' is actually 'object'
1. 'pp_actual_usage' appears to be a good categorical variable.
1. The other 'object' variables that have fewer than 15 unique values could also be categorical.
1. 'qc_pa_beschichtungsjahr' has also some weird values: 'W', 'X' and NaN's

In [13]:
# Snapshot of the object-typed columns; later cells reuse `objs`.
objs = df5.select_dtypes(object)

# Probe whether 'nr' parses as numeric. Only the conversion errors are
# caught, and the reason is reported rather than silently discarded.
try:
    pd.to_numeric(objs['nr'])
except (ValueError, TypeError) as err:
    print("'nr' could not be converted to numeric: {}".format(err))


In [14]:
# Change 'nr' to numeric. errors='coerce' turns every entry that cannot be
# parsed (e.g. blank strings) into NaN instead of raising.
df5['nr'] = pd.to_numeric(df5['nr'], errors='coerce')

# Object-typed rows whose 'nr' entry was already missing in the snapshot
# `objs` taken before the conversion.
df5.select_dtypes(object)[objs.nr.isna()]

Unnamed: 0,ps,ps_lsg,pp_product_short_name,pp_plan_product,pp_actual_product_short_name,pp_actual_product,pp_actual_usage,winding_product_short_name,winding_product_type,winding_product_line,pa_tmc_gehalt_in_percentage,pa_ref,qc_serien_nummer,qc_barcode_leak_test_values,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr


In [15]:
# 'ps_lsg' is expected to fail the numeric conversion. Catch the error and
# show its message so the notebook still runs top to bottom.
try:
    pd.to_numeric(objs['ps_lsg'])
except ValueError as err:
    print('ValueError: {}'.format(err))


ValueError: Unable to parse string "061/062" at position 2399

In [16]:
# Distinct raw entries of the TMC content column (mixed floats and strings)
df5.loc[:, 'pa_tmc_gehalt_in_percentage'].unique()

array([nan, 0.06, 0.07, 0.075, 0.045, 0.08, 0.16, '0.07', '0.06', '0.16',
       '0,075/0,07', 0.05, 0.15], dtype=object)

In [17]:
# The column mixes floats with strings, so the numeric conversion is expected
# to fail. Catch the error and show its message so the notebook still runs
# top to bottom.
try:
    pd.to_numeric(df5['pa_tmc_gehalt_in_percentage'])
except ValueError as err:
    print('ValueError: {}'.format(err))

ValueError: Unable to parse string "0,075/0,07" at position 46788

In [18]:
# Summarise the object-typed columns. `info()` prints its report itself and
# returns None, so it is called directly instead of being wrapped in
# `print()` (which would append a stray "None" line to the output).
df5.select_dtypes(include=object).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118687 entries, 0 to 118686
Data columns (total 19 columns):
ps                              118687 non-null object
ps_lsg                          118348 non-null object
pp_product_short_name           118687 non-null object
pp_plan_product                 118686 non-null object
pp_actual_product_short_name    118641 non-null object
pp_actual_product               118682 non-null object
pp_actual_usage                 118685 non-null object
winding_product_short_name      118687 non-null object
winding_product_type            118687 non-null object
winding_product_line            118687 non-null object
pa_tmc_gehalt_in_percentage     43045 non-null object
pa_ref                          118687 non-null object
qc_serien_nummer                118670 non-null object
qc_barcode_leak_test_values     90978 non-null object
p_product                       118670 non-null object
p_product_full_name             118670 non-null object
p_product_g

In [19]:
# Count, cardinality and most frequent value of each object-typed feature
df5.select_dtypes(include=object).describe()

Unnamed: 0,ps,ps_lsg,pp_product_short_name,pp_plan_product,pp_actual_product_short_name,pp_actual_product,pp_actual_usage,winding_product_short_name,winding_product_type,winding_product_line,pa_tmc_gehalt_in_percentage,pa_ref,qc_serien_nummer,qc_barcode_leak_test_values,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr
count,118687,118348,118687,118686,118641,118682,118685,118687,118687,118687,43045.0,118687,118670,90978,118670,118670,118670,118670,118670
unique,1236,947,7,22,9,22,4,9,7,3,12.0,1263,117862,90322,22,22,2,8,2
top,d6dca86ea5f0,121,5200bdfc01a1,fb83fd553ff7,6d2830b1e76d,fb83fd553ff7,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,0.06,f75dcbde8fe4,5ddbe01d7e36,2405717321110X0516XI32166,fb83fd553ff7,22227b31350f,4d6474389c69,6d2830b1e76d,W
freq,426,548,41352,24384,51354,24381,118537,51354,46474,61534,19059.0,426,2,2,24380,24380,112980,51353,62936


### float

In [20]:
# Preview the first rows of the float features
df5.select_dtypes(include=float).head()

Unnamed: 0,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwin-digkeit_m/min,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_raum_temperatur_start_c,pa_temperatur_m-pda-bad_c,...,sum_of_scrap,roll_position,pda_median,chlor_con_median,f_danfugt_bs_median,f_danfugt_as_median,f_codfely_median,temp_median,ps_f_auftragswerk_median,ps_dicke_median
0,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,19.0,...,0.0,58.181818,1.23,826.0,58.0,58.0,80.1,22.0,78.0,134.5
1,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,19.0,...,0.0,116.363636,1.23,826.0,58.0,58.0,80.1,22.0,78.0,134.5
2,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,19.0,...,0.0,174.545455,1.23,826.0,58.0,58.0,80.1,22.0,78.0,134.5
3,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,19.0,...,0.0,232.727273,1.23,826.0,58.0,58.0,80.1,22.0,78.0,134.5
4,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,19.0,...,0.0,290.909091,1.23,826.0,58.0,58.0,80.1,22.0,78.0,134.5


In [21]:
# Summary statistics of the 'nr' column
df5['nr'].describe()

count    118583.000000
mean       3290.563106
std         406.985990
min          92.000000
25%        2973.000000
50%        3248.000000
75%        3612.000000
max        3990.000000
Name: nr, dtype: float64

In [22]:
# L = []
# for s in df5.columns:
#     pre = s[0:3]
#     #if pre == "sc":
#     L = L + [pre]

In [23]:
# print(np.unique(np.array(L)))

In [24]:
# "feuch" in s 
# for index name in enumerate(df5.columns)
    
# #df5.loc[:,]

# Iterator over a snapshot list of the column names, consumed step by step below
it = iter(df5.columns.tolist())

In [25]:
# Print the next ten column names from the iterator `it`
for _ in range(10):
    column_name = next(it)
    print(column_name)

pa_datum
pa_ps_beschichtete_rollenlange_m
pa_beschichtete_rollenlange_m
pa_bahn-geschwin-digkeit_m/min
pa_badwechsel_m-pda
pa_badwechsel_hw1
pa_badwechsel_hw2
pa_badwechsel_chlor
pa_badwechsel_hw3
pa_raum_temperatur_start_c


In [26]:
# Top-left corner of the frame: first six rows of the first ten columns
df5.iloc[:6, :10]

Unnamed: 0,pa_datum,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwin-digkeit_m/min,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_raum_temperatur_start_c
0,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0
1,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0
2,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0
3,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0
4,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0
5,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0


Check for columns with fewer than 10 unique values in order to find categoricals and booleans.

In [27]:
# Columns with only a handful of distinct values are candidates for
# categorical or boolean dtypes.
MAX_UNIQUE = 10  # report columns with fewer distinct values than this

# The column count is taken from the frame itself, so the scan keeps working
# when columns are added or dropped.
for colNr in range(df5.shape[1]):
    # `unique()` keeps NaN as its own entry, so missing values count as a value.
    unList = df5.iloc[:, colNr].unique()
    nrUn = len(unList)
    if nrUn < MAX_UNIQUE:
        print('Column {} has {} unique values: {}'.format(colNr, nrUn, unList))


Column 1 has 2 unique values: [250. 280.]
Column 3 has 3 unique values: [ 9. 12.  6.]
Column 4 has 2 unique values: [0. 1.]
Column 5 has 2 unique values: [0. 1.]
Column 6 has 2 unique values: [1. 0.]
Column 7 has 2 unique values: [0. 1.]
Column 8 has 2 unique values: [0. 1.]
Column 20 has 7 unique values: [ 0.   16.6   2.4   4.    0.67  0.16  0.12]
Column 22 has 2 unique values: [ nan 0.12]
Column 25 has 8 unique values: [20.  21.  22.  19.  18.   0.  17.  22.4]
Column 52 has 9 unique values: [30. 35. 65. 25. 33.  0. 36. 27. 32.]
Column 73 has 9 unique values: [  0.  80.  85. 100.  65.   5.  88.  30.  92.]
Column 77 has 2 unique values: [  0. 100.]
Column 78 has 2 unique values: [  0. 100.]
Column 79 has 2 unique values: [  0. 100.]
Column 80 has 2 unique values: [  0. 100.]
Column 81 has 2 unique values: [  0. 100.]
Column 82 has 2 unique values: [9. 6.]
Column 84 has 6 unique values: [2.3 2.9 4.4 3.3 0.  5.6]
Column 85 has 7 unique values: [2.5  3.1  4.8  3.5  0.   6.   2.52]
Column 

In [28]:
# Sanity check: an integer compares equal to the float with the same value
1 == 1.0

True

In [29]:
# A non-integral float, shown for contrast with the check above
2.4

2.4

In [30]:
# Distinct values of the 'derived_date' column
df5['derived_date'].unique()

array(['2018-05-01T00:00:00.000000000', '2018-07-01T00:00:00.000000000',
       '2018-10-01T00:00:00.000000000', '2018-11-01T00:00:00.000000000',
       '2019-03-01T00:00:00.000000000', '2019-05-01T00:00:00.000000000',
       '2019-06-01T00:00:00.000000000', '2019-07-01T00:00:00.000000000',
       '2018-01-01T00:00:00.000000000', '2018-02-01T00:00:00.000000000',
       '2018-04-01T00:00:00.000000000', '2018-06-01T00:00:00.000000000',
       '2018-09-01T00:00:00.000000000', '2019-02-01T00:00:00.000000000',
       '2019-04-01T00:00:00.000000000', '2019-09-01T00:00:00.000000000',
       '2018-12-01T00:00:00.000000000', '2019-10-01T00:00:00.000000000',
       '2018-03-01T00:00:00.000000000', '2019-01-01T00:00:00.000000000',
       '2019-11-01T00:00:00.000000000', '2018-08-01T00:00:00.000000000',
                                 'NaT', '2019-08-01T00:00:00.000000000',
       '2019-12-01T00:00:00.000000000'], dtype='datetime64[ns]')

In [53]:
# Check all the prefixes in our data.
# The prefix is the text before the first '_' of a column name. A set
# comprehension removes duplicates and `sorted` returns plain Python strings
# in alphabetical order, with no round trip through a NumPy array.
sorted({name.split('_', 1)[0] for name in df5.columns})
    

['assembled',
 'assembling',
 'chlor',
 'ct',
 'ct1',
 'derived',
 'element',
 'f',
 'leak',
 'nr',
 'p',
 'pa',
 'pda',
 'pp',
 'ps',
 'qc',
 'qt',
 'reaction',
 'roll',
 'sc',
 'sum',
 'temp',
 'total',
 'winding',
 'windung']

Issues:

- [ ] Column with name "windung" instead of "winding"

In [52]:
# Keep only the columns that contain at least one missing value
naCols = df5.isna().any(axis=0)
df5_nans = df5.loc[:, naCols]

# Fraction of missing entries per column, largest first
missing_share = df5_nans.isna().sum().div(len(df5))
missing_share.sort_values(ascending=False)


sc_l_datum_hand                 0.999360
pa_ctmc_richtwert_%             0.998972
pa_tmc_gehalt_in_percentage     0.637323
sc_l_datum_auto                 0.246573
qc_datum_leak_test_values       0.233463
sc_datum_generate               0.233463
qc_barcode_leak_test_values     0.233463
qc_einlager_datum               0.031562
qc_verpackungs_datum            0.030770
qt_datum                        0.020086
windung_begin_date              0.007271
ps_lsg                          0.002856
nr                              0.000876
winding_end_date                0.000556
pp_actual_product_short_name    0.000388
ps_date_coating                 0.000219
pp_plan_end_date_winding        0.000177
derived_date                    0.000143
qc_serien_nummer                0.000143
qc_pa_beschichtungsjahr         0.000143
sc_d_datum                      0.000143
qc_datum_product_properties     0.000143
p_product                       0.000143
p_product_full_name             0.000143
p_product_group 

In [None]:

sc_l_datum_hand                 0.999360
pa_ctmc_richtwert_%             0.998972
pa_tmc_gehalt_in_percentage     0.637323
sc_l_datum_auto                 0.246573
qc_datum_leak_test_values       0.233463
sc_datum_generate               0.233463
qc_barcode_leak_test_values     0.233463

In [45]:
# Summarise dtype and non-null count of every column that contains missing values
df5_nans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118687 entries, 0 to 118686
Data columns (total 31 columns):
pa_ctmc_richtwert_%             122 non-null float64
nr                              118583 non-null float64
ps_date_coating                 118661 non-null datetime64[ns]
ps_lsg                          118348 non-null object
qt_datum                        116303 non-null datetime64[ns]
pp_plan_product                 118686 non-null object
pp_plan_ausbeute_elemente       118678 non-null float64
pp_plan_end_date_winding        118666 non-null datetime64[ns]
pp_actual_product_short_name    118641 non-null object
pp_actual_product               118682 non-null object
pp_actual_usage                 118685 non-null object
windung_begin_date              117824 non-null datetime64[ns]
winding_end_date                118621 non-null datetime64[ns]
pa_tmc_gehalt_in_percentage     43045 non-null object
qc_serien_nummer                118670 non-null object
qc_erfassungs_datum      