# Explore and Clean Data 1

In [1]:
# import math as mt

import re

import mpu.io
import numpy as np
import pandas as pd

# Enables interactive figures
# import mpld3

%matplotlib notebook
import matplotlib.pyplot as plt

#mpld3.enable_notebook()
plt.rcParams['figure.figsize'] = [9.5, 6]


## First things first: Read data <a name="first_things"></a>

* We read the data in the .csv file using pd.read_csv()
* Then we saved it as a "pickled" dataframe using the to_pickle() method, since reading the pickle is faster than parsing the .csv file every time.
* This was done with the following snippet of code, which is now commented out.

In [2]:
# Read the raw process data from the CSV export.
#
# * `true_values`: some columns use 'x' / 'X' as a marker, which we treat as
#   a boolean indicator (True).
# * `infer_datetime_format` was removed: without `parse_dates` it has no effect
#   (and it is deprecated in pandas >= 2.0). Date-like columns are converted
#   explicitly later on with pd.to_datetime().
df = pd.read_csv(
    "./Data/tu_berlin_data_analytics.csv",
    true_values=['x', 'X'],
)

# NOTE: reading the file emits a warning about columns 159, 161, 234 and 256
# (presumably pandas' mixed-dtype DtypeWarning -- TODO confirm).
# TODO: inspect these columns and, if needed, pass an explicit `dtype=` for them.

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Save data as pickled dataframe. Which is faster to read
# df.to_pickle("./Data/processData.pkl")

In [4]:
# Read data in the pickled format
# df = pd.read_pickle("./Data/processData.pkl")

## Check general structure

In [5]:
# Check how many rows (observations) and columns (variables) the data frame has
df.shape
# 118687 rows and 374 columns

(118687, 374)

In [6]:
# Check more detailed information about df (index range, number of columns, dtype counts, memory usage)
df.info()
# It has 325 float variables, 5 integers (some of which might actually be boolean) and 44 columns of dtype object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118687 entries, 0 to 118686
Columns: 374 entries, pa_datum to derived_date
dtypes: float64(325), int64(5), object(44)
memory usage: 338.7+ MB


In [7]:
df.dtypes

pa_datum                                                object
pa_ps_beschichtete_rollenlange_m                       float64
pa_beschichtete_rollenlange_m                          float64
pa_bahn-geschwin-digkeit_m/min                         float64
pa_badwechsel_m-pda                                    float64
pa_badwechsel_hw1                                      float64
pa_badwechsel_hw2                                      float64
pa_badwechsel_chlor                                    float64
pa_badwechsel_hw3                                      float64
pa_raum_temperatur_start_c                             float64
pa_temperatur_m-pda-bad_c                              float64
pa_cm-pda_0m                                           float64
pa_cm-pda_500m                                         float64
pa_cm-pda_1000m                                        float64
pa_cm-pda_1500m                                        float64
pa_cm-pda_2000m                                        

In [8]:
df.head()

Unnamed: 0,pa_datum,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwin-digkeit_m/min,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_raum_temperatur_start_c,...,f_danfugt_as_median,f_codfely_median,temp_median,ps_f_auftragswerk_median,ps_dicke_median,reaction_start,reaction_end,quiver_url,quiver_encoded_query,derived_date
0,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01
1,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01
2,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01
3,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01
4,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01


## Going a bit deeper

In [9]:
# Check variable (column) names in the df
variables = list(df.columns)
# variables

Get all columns with constant values and save them in a separate dictionary:

In [10]:
# A column counts as constant when every row equals the value in the first row.
# Note that a column starting with NaN never qualifies, because NaN != NaN.
# Approach: https://stackoverflow.com/questions/20209600/pandas-dataframe-remove-constant-column
first_row = df.iloc[0]
is_constant = df.eq(first_row).all()
df_constant = df.loc[:, is_constant]

# One representative value per constant column: {column name: [value]}
constants = df_constant.iloc[:1].to_dict(orient='list')

In [11]:
constants

{'pa_raum_temperatur_start_ref_low_c': [18.0],
 'pa_raum_temperatur_start_ref_high_c': [27.0],
 'pa_temperatur_m-pda-bad_ref_low': [17.0],
 'pa_temperatur_m-pda-bad_ref_high': [22.0],
 'pa_ce-capro_lactam_ref_%': [0.0],
 'pa_ctmc_richtwert_ref_%': [0.12],
 'pa_temperatur_chlorbad_start_ref_c': [20.0],
 'pa_amin-trockner_temperatur_cofely_ref_c': [20.0],
 'pa_staub-sauger_ref_low_0_aus_>0_an': [0.0],
 'ps_c_losung_ref_wt_%': [31.0],
 'ps_gap_ref_low_micro_m': [250.0],
 'ps_gap_ref_high_micro_m': [280.0],
 'ps_dicke_as_ref_low_micro_m': [140.0],
 'ps_dicke_as_ref_high_micro_m': [150.0],
 'ps_raum_temperatur_start_ref_c': [24.0],
 'ps_auftragsbank_temperatur_start_ref_c': [19.0],
 'ps_bad_temperatur_ref_low_c': [17.0],
 'ps_bad_temperatur_ref_high_c': [23.0],
 'ps_raum-feuchtigkeit_start_ref_low_%': [45.0],
 'ps_raum-feuchtigkeit_start_ref_high_%': [100.0],
 'ps_auftragswerk_feuchtigkeit_ref_low_%': [45.0],
 'ps_auftragswerk_feuchtigkeit_ref_high_%': [65.0],
 'component': ['MEB'],
 'p_net

Now we take all the remaining variables, find the columns that contain only NA's, and save them in a separate dictionary:

In [12]:
# Keep every column in which at least one row differs from the first row
# (all-NaN columns are kept here as well, since NaN compares unequal to everything).
df2 = df.loc[:, df.ne(df.iloc[0]).any(axis=0)]

In [13]:
# Boolean Series indexed by column name: True where every value in the column is missing
mask = df2.isna().all()

# Select those columns with the boolean mask
# (https://stackoverflow.com/questions/29281815/pandas-select-dataframe-columns-using-boolean)
# and keep a single representative row as {column name: [nan]}.
onlyNA = df2.loc[:, mask].head(1).to_dict(orient='list')
onlyNA

{'qc_entnahme_datum': [nan],
 'quiver_url': [nan],
 'quiver_encoded_query': [nan]}

In [14]:
# Remove every column that consists solely of missing values
df3 = df2.dropna(axis='columns', how='all')
df3.head()

Unnamed: 0,pa_datum,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwin-digkeit_m/min,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_raum_temperatur_start_c,...,chlor_con_median,f_danfugt_bs_median,f_danfugt_as_median,f_codfely_median,temp_median,ps_f_auftragswerk_median,ps_dicke_median,reaction_start,reaction_end,derived_date
0,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
1,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
2,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
3,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
4,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01


Let's check the 43 variables with dtype object:

In [15]:
df3.select_dtypes(object).head()
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.select_dtypes.html

Unnamed: 0,pa_datum,ps,ps_datum,nr,ps_date_coating,ps_lsg,pa_date_coating,pa_bad-wechsel_mpda_nach,pa_bad-wechsel_chlor_nach,qt_datum,...,qc_datum_leak_test_values,qc_datum_product_properties,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr,reaction_start,reaction_end,derived_date
0,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
1,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
2,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
3,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
4,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01


Apparently columns "pa_bad-wechsel_mpda_nach", "pa_bad-wechsel_chlor_nach", "qc_datum_leak_test_values" are also full of NA's:

In [16]:
df3.loc[:,['pa_bad-wechsel_mpda_nach', "pa_bad-wechsel_chlor_nach", "qc_datum_leak_test_values"]].describe()

Unnamed: 0,pa_bad-wechsel_mpda_nach,pa_bad-wechsel_chlor_nach,qc_datum_leak_test_values
count,34075,34092,90978
unique,1,1,22521
top,True,True,2019-06-01T01:10:07.000Z
freq,34075,34092,663


We can't drop these columns!
Let's instead change the NA's to False:

In [17]:
# Both columns only ever hold True or a missing value (see the describe() output
# above), so a missing value is interpreted as False.
flag_cols = ['pa_bad-wechsel_mpda_nach', 'pa_bad-wechsel_chlor_nach']
df4 = (
    df3
    .fillna(value={name: False for name in flag_cols})
    # Cast explicitly: relying on fillna() to downcast an object column to bool
    # is deprecated in recent pandas versions, and later cells select these
    # columns by their bool dtype.
    .astype({name: bool for name in flag_cols})
)
df4.loc[:, flag_cols].dtypes

pa_bad-wechsel_mpda_nach     bool
pa_bad-wechsel_chlor_nach    bool
dtype: object

In [18]:
df4.head()

Unnamed: 0,pa_datum,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwin-digkeit_m/min,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_raum_temperatur_start_c,...,chlor_con_median,f_danfugt_bs_median,f_danfugt_as_median,f_codfely_median,temp_median,ps_f_auftragswerk_median,ps_dicke_median,reaction_start,reaction_end,derived_date
0,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
1,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
2,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
3,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
4,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01


In [19]:
df4.select_dtypes(object).head()

Unnamed: 0,pa_datum,ps,ps_datum,nr,ps_date_coating,ps_lsg,pa_date_coating,qt_datum,pp_product_short_name,pp_plan_actual_date_coating,...,qc_datum_leak_test_values,qc_datum_product_properties,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr,reaction_start,reaction_end,derived_date
0,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
1,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
2,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
3,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
4,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01


## Change objects to datetime
Now we should transform the data into the appropriate type:

In [21]:
df5 = df4.copy()

# Try to parse every object column as datetime. Columns that cannot be parsed
# are left untouched and collected in `not_converted` for inspection.
not_converted = []
for col in df4.select_dtypes(object).columns:
    try:
        # Assign through df5[col] so the column is replaced and takes the
        # datetime dtype. `df5.loc[:, col] = ...` writes into the existing
        # object column in pandas >= 2.0 and would leave the dtype as object.
        df5[col] = pd.to_datetime(df4[col])
    except (ValueError, TypeError, OverflowError):
        # Column is not date-like: keep it as it is.
        not_converted.append(col)

In [22]:
df5.select_dtypes(object).head()

Unnamed: 0,ps,nr,ps_lsg,pp_product_short_name,pp_plan_product,pp_actual_product_short_name,pp_actual_product,pp_actual_usage,winding_product_short_name,winding_product_type,winding_product_line,pa_tmc_gehalt_in_percentage,pa_ref,qc_serien_nummer,qc_barcode_leak_test_values,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr
0,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,6f5dd5e75de0,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
1,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,e83198853aa3,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
2,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,0c6c47811c04,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
3,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,6b51542380df,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
4,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,58df9ba0a603,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W


In [23]:
df5.select_dtypes(['datetime', 'datetimetz']).iloc[50001:50006,:]

Unnamed: 0,pa_datum,ps_datum,ps_date_coating,pa_date_coating,qt_datum,pp_plan_actual_date_coating,pp_plan_end_date_winding,windung_begin_date,winding_end_date,qc_erfassungs_datum,...,qc_einlager_datum,sc_d_datum,sc_datum_generate,sc_l_datum_auto,sc_l_datum_hand,qc_datum_leak_test_values,qc_datum_product_properties,reaction_start,reaction_end,derived_date
50001,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:18:49+00:00,...,2018-11-16 13:07:59+00:00,2018-11-15 12:18:49+00:00,2018-11-15 06:24:36+00:00,2018-11-15 06:24:36+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50002,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:18:45+00:00,...,2018-11-16 15:34:10+00:00,2018-11-15 12:18:45+00:00,2018-11-15 06:29:04+00:00,2018-11-15 06:29:04+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50003,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:25:10+00:00,...,2018-11-16 13:07:59+00:00,2018-11-15 12:25:10+00:00,2018-11-15 06:26:50+00:00,2018-11-15 06:26:50+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50004,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:25:06+00:00,...,2018-11-16 15:34:10+00:00,2018-11-15 12:25:06+00:00,2018-11-15 06:33:32+00:00,2018-11-15 06:33:32+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50005,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:30:15+00:00,...,2018-11-16 19:51:32+00:00,2018-11-15 12:30:15+00:00,2018-11-15 06:31:18+00:00,2018-11-15 06:31:18+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01


In [24]:
df5.sc_l_datum_hand.describe() 
# Check whether this column has only missing data

count                            76
unique                           71
top       2019-09-24 19:08:53+00:00
freq                              2
first     2018-06-07 10:04:35+00:00
last      2019-12-05 22:18:26+00:00
Name: sc_l_datum_hand, dtype: object

The columns sc_datum_generate, sc_l_datum_auto, sc_l_datum_hand and qc_datum_leak_test_values have a lot of NA's. sc_l_datum_hand seems to have only NA's, but it actually contains some data (76 non-missing values)!

In [25]:
# Save data for future work in pickled format
df5.to_pickle("./Data/process_data_df5.pkl")

# Store the constant (reference) values as JSON next to the pickled dataframe.
# `mpu.io` is imported together with the other imports at the top of the notebook.
mpu.io.write("./Data/process_constants.json", constants)

{'pa_raum_temperatur_start_ref_low_c': [18.0],
 'pa_raum_temperatur_start_ref_high_c': [27.0],
 'pa_temperatur_m-pda-bad_ref_low': [17.0],
 'pa_temperatur_m-pda-bad_ref_high': [22.0],
 'pa_ce-capro_lactam_ref_%': [0.0],
 'pa_ctmc_richtwert_ref_%': [0.12],
 'pa_temperatur_chlorbad_start_ref_c': [20.0],
 'pa_amin-trockner_temperatur_cofely_ref_c': [20.0],
 'pa_staub-sauger_ref_low_0_aus_>0_an': [0.0],
 'ps_c_losung_ref_wt_%': [31.0],
 'ps_gap_ref_low_micro_m': [250.0],
 'ps_gap_ref_high_micro_m': [280.0],
 'ps_dicke_as_ref_low_micro_m': [140.0],
 'ps_dicke_as_ref_high_micro_m': [150.0],
 'ps_raum_temperatur_start_ref_c': [24.0],
 'ps_auftragsbank_temperatur_start_ref_c': [19.0],
 'ps_bad_temperatur_ref_low_c': [17.0],
 'ps_bad_temperatur_ref_high_c': [23.0],
 'ps_raum-feuchtigkeit_start_ref_low_%': [45.0],
 'ps_raum-feuchtigkeit_start_ref_high_%': [100.0],
 'ps_auftragswerk_feuchtigkeit_ref_low_%': [45.0],
 'ps_auftragswerk_feuchtigkeit_ref_high_%': [65.0],
 'component': ['MEB'],
 'p_net

# Explore and Clean Data 2

## Import previously saved data

Data was saved in a pickled format. Let's reimport it.

The dataframe "df5": 
- consists of all the Features (=columns, =variables) that are not constant and that are not all NA
- it has 347 columns (27 were dropped)
- the datetime and datetimetz variables were read with the correct dtype

The dictionary "constants":
- consists of all columns that are actually constant

The columns 'qc_entnahme_datum', 'quiver_url' and 'quiver_encoded_query' were dropped since they only contained missing values ('NaN').

In [26]:
# df5 = pd.read_pickle('./Data/process_data_df5.pkl')
# constants = mpu.io.read('./Data/process_constants.json')
# variables = list(df5.columns)

## Divide and conquer
Let's check the different dtypes: 

In [27]:
print(*df5.dtypes.unique(),sep = '\n')

datetime64[ns]
float64
object
bool
int64
datetime64[ns, UTC]


I believe we can trust that the 'datetime' dtypes were correctly recognized.

Issues:
- [ ] Some 'float' variables might actually be 'integers', 'booleans' or 'categorical'
- [ ] Some 'object' variables might actually also be better interpreted as 'categorical'
- [x] We should check what the 'integer' variables are.
- [ ] Check whether all '%' values are consistent (e.g. between 0 and 100%).

Minor issues:
- [x] Variable names (column names) should be consistent (either all in German or all in English?) 
    - [x] '_%' instead of '_in_percentage', 
    - [x] '_date' instead of '_datum'
    - [x] 'bad-wechsel' and 'badwechsel'
- [x] Be careful with typos e.g 'windung' instead of 'winding'

### constants
'constants' provide important information from reference values:

In [28]:
constants

{'pa_raum_temperatur_start_ref_low_c': [18.0],
 'pa_raum_temperatur_start_ref_high_c': [27.0],
 'pa_temperatur_m-pda-bad_ref_low': [17.0],
 'pa_temperatur_m-pda-bad_ref_high': [22.0],
 'pa_ce-capro_lactam_ref_%': [0.0],
 'pa_ctmc_richtwert_ref_%': [0.12],
 'pa_temperatur_chlorbad_start_ref_c': [20.0],
 'pa_amin-trockner_temperatur_cofely_ref_c': [20.0],
 'pa_staub-sauger_ref_low_0_aus_>0_an': [0.0],
 'ps_c_losung_ref_wt_%': [31.0],
 'ps_gap_ref_low_micro_m': [250.0],
 'ps_gap_ref_high_micro_m': [280.0],
 'ps_dicke_as_ref_low_micro_m': [140.0],
 'ps_dicke_as_ref_high_micro_m': [150.0],
 'ps_raum_temperatur_start_ref_c': [24.0],
 'ps_auftragsbank_temperatur_start_ref_c': [19.0],
 'ps_bad_temperatur_ref_low_c': [17.0],
 'ps_bad_temperatur_ref_high_c': [23.0],
 'ps_raum-feuchtigkeit_start_ref_low_%': [45.0],
 'ps_raum-feuchtigkeit_start_ref_high_%': [100.0],
 'ps_auftragswerk_feuchtigkeit_ref_low_%': [45.0],
 'ps_auftragswerk_feuchtigkeit_ref_high_%': [65.0],
 'component': ['MEB'],
 'p_net

### integer

In [29]:
df5.select_dtypes('integer').head()

Unnamed: 0,pa_rollen_seit_letztem_badwechsel_mpda,pa_rollen_seit_letztem_badwechsel_chlor,qc_faktorkonzentration,p_product_size
0,2,2,1,8
1,2,2,1,8
2,2,2,1,8
3,2,2,1,8
4,2,2,1,8


In [30]:
df5.select_dtypes('integer').nunique()

pa_rollen_seit_letztem_badwechsel_mpda     7
pa_rollen_seit_letztem_badwechsel_chlor    7
qc_faktorkonzentration                     2
p_product_size                             3
dtype: int64

### boolean

In [31]:
df5.select_dtypes(bool).head()

Unnamed: 0,pa_bad-wechsel_mpda_nach,pa_bad-wechsel_chlor_nach
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False


In [32]:
df5.select_dtypes(bool).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118687 entries, 0 to 118686
Data columns (total 2 columns):
pa_bad-wechsel_mpda_nach     118687 non-null bool
pa_bad-wechsel_chlor_nach    118687 non-null bool
dtypes: bool(2)
memory usage: 231.9 KB


Let's check all columns containing 'wechsel'.
Some columns have only '0's and '1's. We believe they are booleans!

In [33]:
df6 = df5.copy()

# Get all columns containing the string 'wechsel'.
# The names are taken from df6 itself rather than from `variables` (which was
# built from the raw dataframe), so only columns that still exist are selected.
wechselColumns = [name for name in df6.columns if 'wechsel' in name]

# Check which of these columns contain only '0's and '1's and change their dtype to boolean.
# A column with missing values never qualifies, because NaN is not in {0, 1}.
for col in wechselColumns:
    if set(pd.unique(df5[col])) <= {0, 1}:
        df6[col] = df5[col].astype('bool')


In [34]:
df5.loc[:,wechselColumns].head()

Unnamed: 0,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_bad-wechsel_mpda_nach,pa_bad-wechsel_chlor_nach,pa_rollen_seit_letztem_badwechsel_mpda,pa_rollen_seit_letztem_badwechsel_chlor
0,0.0,0.0,1.0,0.0,0.0,False,False,2,2
1,0.0,0.0,1.0,0.0,0.0,False,False,2,2
2,0.0,0.0,1.0,0.0,0.0,False,False,2,2
3,0.0,0.0,1.0,0.0,0.0,False,False,2,2
4,0.0,0.0,1.0,0.0,0.0,False,False,2,2


In [35]:
df6.loc[:,wechselColumns].head()

Unnamed: 0,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_bad-wechsel_mpda_nach,pa_bad-wechsel_chlor_nach,pa_rollen_seit_letztem_badwechsel_mpda,pa_rollen_seit_letztem_badwechsel_chlor
0,False,False,True,False,False,False,False,2,2
1,False,False,True,False,False,False,False,2,2
2,False,False,True,False,False,False,False,2,2
3,False,False,True,False,False,False,False,2,2
4,False,False,True,False,False,False,False,2,2


Let's do the same with the whole dataframe. Let's check whether we find variables whose only values are '0's and '1's:

In [36]:
# Report every non-boolean column whose values are all drawn from {0, 1}
zero_one = {0, 1}
for col in df6.select_dtypes(exclude=bool).columns:
    observed = set(pd.unique(df5[col]))
    if observed.issubset(zero_one):
        print(col)

sc_l_ergebnis_p_nio
qc_faktorkonzentration
qc_nachkommadurchfluss


The variables 'sc_l_ergebnis_p_nio', 'qc_faktorkonzentration' and 'qc_nachkommadurchfluss' contain only '0's and '1's, but we cannot infer from their names whether they should be treated as booleans or not. Therefore we'll leave them as they are for now.

### datetime and datetimetz

In [37]:
df5.select_dtypes(['datetime', 'datetimetz']).loc[50000:50006]

Unnamed: 0,pa_datum,ps_datum,ps_date_coating,pa_date_coating,qt_datum,pp_plan_actual_date_coating,pp_plan_end_date_winding,windung_begin_date,winding_end_date,qc_erfassungs_datum,...,qc_einlager_datum,sc_d_datum,sc_datum_generate,sc_l_datum_auto,sc_l_datum_hand,qc_datum_leak_test_values,qc_datum_product_properties,reaction_start,reaction_end,derived_date
50000,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:10:47+00:00,...,2018-11-16 17:17:40+00:00,2018-11-15 12:10:47+00:00,2018-11-15 05:32:36+00:00,2018-11-15 05:32:36+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50001,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:18:49+00:00,...,2018-11-16 13:07:59+00:00,2018-11-15 12:18:49+00:00,2018-11-15 06:24:36+00:00,2018-11-15 06:24:36+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50002,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:18:45+00:00,...,2018-11-16 15:34:10+00:00,2018-11-15 12:18:45+00:00,2018-11-15 06:29:04+00:00,2018-11-15 06:29:04+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50003,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:25:10+00:00,...,2018-11-16 13:07:59+00:00,2018-11-15 12:25:10+00:00,2018-11-15 06:26:50+00:00,2018-11-15 06:26:50+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50004,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:25:06+00:00,...,2018-11-16 15:34:10+00:00,2018-11-15 12:25:06+00:00,2018-11-15 06:33:32+00:00,2018-11-15 06:33:32+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50005,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:30:15+00:00,...,2018-11-16 19:51:32+00:00,2018-11-15 12:30:15+00:00,2018-11-15 06:31:18+00:00,2018-11-15 06:31:18+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50006,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:30:09+00:00,...,2018-11-16 19:51:32+00:00,2018-11-15 12:30:09+00:00,2018-11-15 06:38:00+00:00,2018-11-15 06:38:00+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01


In [38]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
df5.select_dtypes(['datetime', 'datetimetz']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118687 entries, 0 to 118686
Data columns (total 21 columns):
pa_datum                       118687 non-null datetime64[ns]
ps_datum                       118687 non-null datetime64[ns]
ps_date_coating                118661 non-null datetime64[ns]
pa_date_coating                118687 non-null datetime64[ns]
qt_datum                       116303 non-null datetime64[ns]
pp_plan_actual_date_coating    118687 non-null datetime64[ns]
pp_plan_end_date_winding       118666 non-null datetime64[ns]
windung_begin_date             117824 non-null datetime64[ns]
winding_end_date               118621 non-null datetime64[ns]
qc_erfassungs_datum            118670 non-null datetime64[ns, UTC]
qc_verpackungs_datum           115035 non-null datetime64[ns, UTC]
qc_einlager_datum              114941 non-null datetime64[ns, UTC]
sc_d_datum                     118670 non-null datetime64[ns, UTC]
sc_datum_generate              90978 non-null datetime64[ns, UT

Issue:

- [x] Variable 'sc_l_datum_hand' has only 76 non-null observations
    - Apparently it shows the dates and times when the side cut operation was done by hand?
    - If so, then we shouldn't discard this column, since it provides valuable quality related information.
    - [ ] We should check with the 'domain experts' what this column means

In [39]:
pd.unique(df6['sc_l_datum_hand'])

<DatetimeArray>
[                      'NaT', '2018-10-19 15:02:29+00:00',
 '2019-02-05 14:46:03+00:00', '2019-03-08 12:39:21+00:00',
 '2018-07-24 14:33:17+00:00', '2018-07-24 15:13:26+00:00',
 '2018-11-09 09:31:59+00:00', '2019-03-28 17:22:23+00:00',
 '2019-03-28 17:25:03+00:00', '2019-03-28 17:30:03+00:00',
 '2019-03-28 17:33:23+00:00', '2018-12-09 00:54:11+00:00',
 '2019-03-27 13:45:41+00:00', '2019-06-23 13:56:36+00:00',
 '2019-06-26 02:19:04+00:00', '2018-10-25 22:59:34+00:00',
 '2018-10-26 01:02:28+00:00', '2018-10-26 01:17:34+00:00',
 '2018-10-26 15:02:04+00:00', '2018-10-18 14:05:51+00:00',
 '2018-10-18 13:50:56+00:00', '2018-10-18 15:29:09+00:00',
 '2018-10-18 17:09:54+00:00', '2018-10-28 06:11:06+00:00',
 '2018-10-28 06:13:42+00:00', '2018-10-28 06:23:02+00:00',
 '2018-10-28 06:26:09+00:00', '2018-10-28 06:45:13+00:00',
 '2018-10-28 08:14:05+00:00', '2018-10-28 17:28:54+00:00',
 '2018-10-28 19:19:41+00:00', '2019-02-01 07:15:37+00:00',
 '2019-02-01 08:58:36+00:00', '2019-08-2

Let's check the other columns with 'hand' and 'auto' in their names:

In [40]:
# Names of all columns that mention 'hand' or 'auto'
HandAutocols = [name for name in df6.columns
                if any(tag in name for tag in ('hand', 'auto'))]
print(HandAutocols, end="\n\n")

# True for the rows where 'sc_l_datum_hand' is missing
onlyNas = df6['sc_l_datum_hand'].isna()

# Summarise the hand/auto columns on the rows that do have a 'sc_l_datum_hand' value
hand_rows = df6.loc[~onlyNas, HandAutocols]
hand_rows.describe(include='all')

['pa_defects_hand_#', 'sc_l_leak_auto', 'sc_l_datum_auto', 'sc_l_leak_hand', 'sc_l_datum_hand']



Unnamed: 0,pa_defects_hand_#,sc_l_leak_auto,sc_l_datum_auto,sc_l_leak_hand,sc_l_datum_hand
count,76.0,76.0,69,76.0,76
unique,,,64,,71
top,,,2019-09-24 18:42:21+00:00,,2019-09-24 19:08:53+00:00
freq,,,2,,2
first,,,2018-06-08 02:19:36+00:00,,2018-06-07 10:04:35+00:00
last,,,2019-12-05 21:58:21+00:00,,2019-12-05 22:18:26+00:00
mean,0.0,0.057086,,1.309453,
std,0.0,0.032152,,7.295852,
min,0.0,-0.004397,,0.0,
25%,0.0,0.039569,,0.083,


- Variable 'sc_l_leak_hand' is 0 when 'sc_l_datum_hand' is missing.
- Variable 'pa_defects_hand_#' appears to have an outlier (max = 7125)

In [41]:
df6.select_dtypes(['datetime', 'datetimetz']).describe()

Unnamed: 0,pa_datum,ps_datum,ps_date_coating,pa_date_coating,qt_datum,pp_plan_actual_date_coating,pp_plan_end_date_winding,windung_begin_date,winding_end_date,qc_erfassungs_datum,...,qc_einlager_datum,sc_d_datum,sc_datum_generate,sc_l_datum_auto,sc_l_datum_hand,qc_datum_leak_test_values,qc_datum_product_properties,reaction_start,reaction_end,derived_date
count,118687,118687,118661,118687,116303,118687,118666,117824,118621,118670,...,114941,118670,90978,89422,76,90978,118670,118687,118687,118670
unique,578,594,596,593,578,593,608,607,603,111951,...,6217,111951,46739,88769,71,22521,10,578,578,24
top,2018-01-28 00:00:00,2018-03-03 00:00:00,2019-01-28 00:00:00,2019-03-05 00:00:00,2018-09-06 00:00:00,2019-03-05 00:00:00,2019-02-05 00:00:00,2018-06-25 00:00:00,2018-03-12 00:00:00,2019-09-05 16:19:42+00:00,...,2018-09-21 08:28:36+00:00,2019-09-05 16:19:42+00:00,2019-06-10 02:22:50+00:00,2019-08-25 21:09:54+00:00,2019-09-24 19:08:53+00:00,2019-06-01 01:10:07+00:00,2013-01-08 00:00:00+00:00,2018-01-28 00:00:00+00:00,2018-01-28 00:00:00+00:00,2019-03-01 00:00:00
freq,900,834,964,984,1093,984,790,519,601,4,...,49,4,24,2,2,663,30701,900,900,7428
first,2018-01-04 00:00:00,2018-01-01 00:00:00,2018-01-01 00:00:00,2018-01-03 00:00:00,2018-01-04 00:00:00,2018-01-03 00:00:00,2018-01-11 00:00:00,2017-05-31 00:00:00,2018-01-10 00:00:00,2018-01-09 23:14:40+00:00,...,2018-01-12 05:22:27+00:00,2018-01-09 23:14:40+00:00,2018-06-07 10:04:35+00:00,2018-06-07 12:17:35+00:00,2018-06-07 10:04:35+00:00,2018-10-15 15:21:03+00:00,2012-03-20 00:00:00+00:00,2018-01-04 00:00:00+00:00,2018-01-04 00:00:00+00:00,2018-01-01 00:00:00
last,2019-12-05 00:00:00,2019-11-16 00:00:00,2019-11-17 00:00:00,2019-12-05 00:00:00,2019-12-05 00:00:00,2019-12-05 00:00:00,2019-12-10 00:00:00,2019-12-06 00:00:00,2019-12-06 00:00:00,2019-12-06 10:47:57+00:00,...,2019-12-06 05:27:12+00:00,2019-12-06 10:47:57+00:00,2019-12-05 14:59:30+00:00,2019-12-06 09:07:23+00:00,2019-12-05 22:18:26+00:00,2019-12-06 09:05:01+00:00,2018-10-22 00:00:00+00:00,2019-12-05 00:00:00+00:00,2019-12-05 00:00:00+00:00,2019-12-01 00:00:00


### object

In [42]:
# First rows of the columns that were read as 'object'
df5.select_dtypes(include=object).head()

Unnamed: 0,ps,nr,ps_lsg,pp_product_short_name,pp_plan_product,pp_actual_product_short_name,pp_actual_product,pp_actual_usage,winding_product_short_name,winding_product_type,winding_product_line,pa_tmc_gehalt_in_percentage,pa_ref,qc_serien_nummer,qc_barcode_leak_test_values,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr
0,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,6f5dd5e75de0,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
1,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,e83198853aa3,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
2,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,0c6c47811c04,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
3,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,6b51542380df,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
4,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,58df9ba0a603,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W


In [43]:
# Last rows of the columns that were read as 'object'
df5.select_dtypes(include=object).tail()

Unnamed: 0,ps,nr,ps_lsg,pp_product_short_name,pp_plan_product,pp_actual_product_short_name,pp_actual_product,pp_actual_usage,winding_product_short_name,winding_product_type,winding_product_line,pa_tmc_gehalt_in_percentage,pa_ref,qc_serien_nummer,qc_barcode_leak_test_values,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr
118682,3fa606fdd9e8,2892,231,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,6aa9aee40c62,,0a0d4ada494a,55315e14346a,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
118683,3fa606fdd9e8,2892,231,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,6aa9aee40c62,,0a0d4ada494a,5e840146da5b,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
118684,3fa606fdd9e8,2892,231,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,6aa9aee40c62,,0a0d4ada494a,05ac5c0533e3,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
118685,3fa606fdd9e8,2892,231,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,6aa9aee40c62,,0a0d4ada494a,b629f239aa69,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
118686,3fa606fdd9e8,2892,231,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,6aa9aee40c62,,0a0d4ada494a,44994043c050,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W


In [44]:
# A slice from the middle of the data (index labels 50000 to 50006, both included)
df5.select_dtypes(include=object).loc[50000:50006]

Unnamed: 0,ps,nr,ps_lsg,pp_product_short_name,pp_plan_product,pp_actual_product_short_name,pp_actual_product,pp_actual_usage,winding_product_short_name,winding_product_type,winding_product_line,pa_tmc_gehalt_in_percentage,pa_ref,qc_serien_nummer,qc_barcode_leak_test_values,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr
50000,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,79d813e4c6bf,2405711870910W0606WK31876,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W
50001,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,1ade0cad8b89,2405711870910W0606WK31877,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W
50002,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,cd97d5690539,2405711870910W0606WK31880,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W
50003,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,9f9ed3771287,2405711870910W0606WK31881,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W
50004,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,707deae1e2e0,2405711870910W0606WK31882,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W
50005,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,8f2bef211dd1,2405711870910W0606WK31883,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W
50006,aad34f59b337,3239,581,f8ba29e9058b,9b7853269545,6d2830b1e76d,9b7853269545,use,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,,e0687f3265c3,482ad93b3873,2405711870910W0606WK31884,9b7853269545,73928f2577b5,4d6474389c69,6d2830b1e76d,W


Issues:
- [ ] 'nr', 'ps_lsg', 'pa_tmc_gehalt_in_percentage', 'qc_barcode_leak_test_values' were read as 'object', but apparently they're numerical.
    - [ ] 'nr' has empty strings of different lengths
    - [ ] 'ps_lsg' ValueError: Unable to parse string "061/062" at position 2399
    - [ ] 'pa_tmc_gehalt_%' ValueError: Unable to parse string "0,075/0,07" at position 46788
    - [ ] 'qc_barcode_leak_test_values' is actually 'object'
- [ ] 'pp_actual_usage' appears to be a good categorical variable.
- [ ] The other 'object' variables that have less than 15 unique values could be also categorical. 
- [ ] 'qc_pa_beschichtungsjahr' also has some weird values: 'W', 'X' and NaNs

In [45]:
# Normalise the case of 'pp_actual_usage' and store it as a categorical variable
df6['pp_actual_usage'] = (
    df6['pp_actual_usage']
    .str.lower()
    .astype('category')
)

# Show the resulting dtype (and with it the categories)
df6['pp_actual_usage'].dtype

CategoricalDtype(categories=['trash', 'use'], ordered=False)

In [46]:
# Work on a copy so that df6 stays available unchanged
df7 = df6.copy()
# Change 'nr' to numeric. Entries that cannot be parsed as a number become NaN.
# The keyword must be spelled 'coerce': current pandas validates this argument
# and raises a ValueError for any other spelling.
df7['nr'] = pd.to_numeric(df6['nr'], errors='coerce')
df7.loc[:,'nr'].dtype


dtype('float64')

We get an error with the following code:
    
    pd.to_numeric(objs['ps_lsg'])

In [47]:
# df7['ps_lsg'].unique()

'ps_lsg' seems to be useless. It has several strings of the form a/a+1 that cannot be parsed as numerical.

Possible solutions:
- [x] drop column
- [ ] take the average of these values (a+0,5)

'pa_tmc_gehalt_%' has the same problem, but with only one value: '0,075/0,07'
    

In [48]:
# List the distinct raw values to spot entries that cannot be parsed as numbers
df7.loc[:, 'pa_tmc_gehalt_in_percentage'].unique()

array([nan, 0.06, 0.07, 0.075, 0.045, 0.08, 0.16, '0.07', '0.06', '0.16',
       '0,075/0,07', 0.05, 0.15], dtype=object)

In [49]:
# Fix column 'pa_tmc_gehalt_in_percentage': the single entry written as
# '0,075/0,07' is replaced by the string '0.0725' so the column can be parsed
df7['pa_tmc_gehalt_in_percentage'] = df6['pa_tmc_gehalt_in_percentage'].replace(
    {'0,075/0,07': '0.0725'}
)

In [50]:
# Convert the cleaned column to a numeric dtype; anything still unparsable raises
df7['pa_tmc_gehalt_in_percentage'] = pd.to_numeric(
    df7['pa_tmc_gehalt_in_percentage'],
    errors='raise',
)

In [132]:
# Removes useless columns from our dataframe and keeps a copy of them in 'trashColsdf'
# Maybe a better option is to leave them in the dataframe, but with a subindex trashCol?

trashColsList = ['qc_pa_beschichtungsjahr', 'ps_lsg', 'derived_date']

# Split the requested columns into those present in df7 and those that are not.
# An explicit membership test is used instead of a bare 'except', which would
# also hide unrelated errors.
trashColsFound = [col_name for col_name in trashColsList if col_name in df7.columns]
trashColsMissing = [col_name for col_name in trashColsList if col_name not in df7.columns]

if trashColsMissing:
    print('One or more columns in trashColList could not be found in df7 \n')
    for col in trashColsMissing:
        print('Column {} could not be found. Thus it could not be removed'.format(col))

# Keep the removed columns around; this is defined even when some are missing
trashColsdf = df7.loc[:, trashColsFound]

# drop() returns a new dataframe, so df7 itself is left untouched
df8 = df7.drop(columns=trashColsFound)

In [133]:
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray 'None' to the output.
df8.select_dtypes(include=object).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118687 entries, 0 to 118686
Data columns (total 15 columns):
ps                              118687 non-null object
pp_product_short_name           118687 non-null object
pp_plan_product                 118686 non-null object
pp_actual_product_short_name    118641 non-null object
pp_actual_product               118682 non-null object
winding_product_short_name      118687 non-null object
winding_product_type            118687 non-null object
winding_product_line            118687 non-null object
pa_ref                          118687 non-null object
qc_serien_nummer                118670 non-null object
qc_barcode_leak_test_values     90978 non-null object
p_product                       118670 non-null object
p_product_full_name             118670 non-null object
p_product_group                 118670 non-null object
p_product_type                  118670 non-null object
dtypes: object(15)
memory usage: 13.6+ MB
None


In [134]:
# Count, number of distinct values, most frequent value and its frequency per 'object' column
df8.select_dtypes(include=object).describe()

Unnamed: 0,ps,pp_product_short_name,pp_plan_product,pp_actual_product_short_name,pp_actual_product,winding_product_short_name,winding_product_type,winding_product_line,pa_ref,qc_serien_nummer,qc_barcode_leak_test_values,p_product,p_product_full_name,p_product_group,p_product_type
count,118687,118687,118686,118641,118682,118687,118687,118687,118687,118670,90978,118670,118670,118670,118670
unique,1236,7,22,9,22,9,7,3,1263,117862,90322,22,22,2,8
top,d6dca86ea5f0,5200bdfc01a1,fb83fd553ff7,6d2830b1e76d,fb83fd553ff7,6d2830b1e76d,6d2830b1e76d,9bec1f36ec0d,f75dcbde8fe4,0a62530c5dca,2405717323910X0519XJ20819,fb83fd553ff7,22227b31350f,4d6474389c69,6d2830b1e76d
freq,426,41352,24384,51354,24381,51354,46474,61534,426,2,2,24380,24380,112980,51353


### float

In [135]:
# Last rows of the float columns
df8.select_dtypes(include=float).tail()

Unnamed: 0,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwin-digkeit_m/min,pa_raum_temperatur_start_c,pa_temperatur_m-pda-bad_c,pa_cm-pda_0m,pa_cm-pda_500m,pa_cm-pda_1000m,pa_cm-pda_1500m,pa_cm-pda_2000m,pa_cm-pda_2500m,pa_cm-pda_3000m,pa_cm-pda_3500m,pa_cm-pda_4000m,pa_ce-capro_lactam_%,pa_temperatur_n-decan-lsg_chem_vorbereitung_start_c,pa_ctmc_richtwert_%,pa_ctmc_%,pa_temperatur_alkali-lsg_chem_vorbereitung_start_c,pa_temperatur_chlorbad_start_c,pa_chlorkonzentration_0m_ppm,pa_chlorkonzentration_500m_ppm,pa_chlorkonzentration_1000m_ppm,pa_chlorkonzentration_1500m_ppm,pa_chlorkonzentration_2000m_ppm,pa_chlorkonzentration_2500m_ppm,pa_chlorkonzentration_3000m_ppm,pa_chlorkonzentration_3500m_ppm,pa_chlorkonzentration_4000m_ppm,pa_air-knife_vor_amin_bad_0m_%,pa_air-knife_vor_amin_bad_600m_%,pa_air-knife_vor_amin_bad_1200m_%,pa_air-knife_vor_amin_bad_1800m_%,pa_air-knife_vor_amin_bad_2400m_%,pa_air-knife_vor_amin_bad_3000m_%,pa_air-knife_vor_amin_bad_3600m_%,pa_air-knife_vor_amin_bad_4200m_%,pa_airknife_vor_amin_trockner_0m_%,pa_airknife_vor_amin_trockner_600m_%,pa_airknife_vor_amin_trockner_1200m_%,pa_airknife_vor_amin_trockner_1800m_%,pa_airknife_vor_amin_trockner_2400m_%,pa_airknife_vor_amin_trockner_3000m_%,pa_airknife_vor_amin_trockner_3600m_%,pa_airknife_vor_amin_trockner_4200m_%,pa_raum_feuchte_start_%,pa_amin-trockner_temperatur_danfugt_c,pa_amin-trockner_feuchtigkeit_danfugt_bs_0m_%,pa_amin-trockner_feuchtigkeit_danfugt_bs_1000m_%,pa_amin-trockner_feuchtigkeit_danfugt_bs_2000m_%,pa_amin-trockner_feuchtigkeit_danfugt_bs_3000m_%,pa_amin-trockner_feuchtigkeit_danfugt_bs_4000m_%,pa_amin-trockner_feuchtigkeit_danfugt_as_0m,pa_amin-trockner_feuchtigkeit_danfugt_as_1000m,pa_amin-trockner_feuchtigkeit_danfugt_as_2000m,pa_amin-trockner_feuchtigkeit_danfugt_as_3000m,pa_amin-trockner_feuchtigkeit_danfugt_as_4000m,pa_amin-trockner_temperatur_cofely_c,pa_amin-trockner_feuchtigkeit_cofely_0m_%,pa_amin-trockner_feuchtigkeit_cofely_600m_%,pa_ami
n-trockner_feuchtigkeit_cofely_1200m_%,pa_amin-trockner_feuchtigkeit_cofely_1800m_%,pa_amin-trockner_feuchtigkeit_cofely_2400m_%,pa_amin-trockner_feuchtigkeit_cofely_3000m_%,pa_amin-trockner_feuchtigkeit_cofely_3600m_%,pa_amin-trockner_feuchtigkeit_cofely_4200m_%,pa_decan-trockner_geblase_start_%,pa_decan-trockner_geblase_ende_%,pa_vertikale_feuchte_oben_start_%,pa_vertikale_feuchte_oben_mitte_%,pa_vertikale_feuchte_oben_ende_%,pa_staub-sauger_1_vor_aminbad_0_aus_>0_an,pa_staub-sauger_2_nach_aminbad,pa_staub-sauger_3_zw5_vor_hw2,pa_staub-sauger_4_nach_hw2,pa_staub-sauger_5,pa_bahn-geschwin-digkeit_ref_m/min,pa_coating,pa_cm-pda_gewichts-%_richtwert_ref_low,pa_cm-pda_gewichts-%_richtwert_ref_high,pa_temperatur_n-decan-lsg_chem_vorbereitung_start_ref_low_c,pa_temperatur_n-decan-lsg_chem_vorbereitung_start_ref_high_c,pa_temperatur_alkali-lsg_chem_vorbereitung_start_ref_low_c,pa_temperatur_alkali-lsg_chem_vorbereitung_start_ref_high_c,pa_airknife_vor_amin_bad_ref_%,pa_airknife_vor_amin_trockner_ref_low_%,pa_airknife_vor_amin_trockner_ref_high_%,pa_amin-trockner_temperatur_danfugt_ref_c,pa_amin-trockner_feuchtigkeit_danfugt_bs_ref_low_%,pa_amin-trockner_feuchtigkeit_danfugt_bs_ref_high_%,pa_amin-trockner_feuchtigkeit_danfugt_as_ref_low,pa_amin-trockner_feuchtigkeit_danfugt_as_ref_high,pa_amin-trockner_feuchtigkeit_cofely_ref_low_%,pa_amin-trockner_feuchtigkeit_cofely_ref_high_%,pa_decan-trockner_geblase_ref_%,pa_vertikale_feuchte_oben_ref_low_%,pa_vertikale_feuchte_oben_ref_high_%,pa_staub-sauger_ref_high_0_aus_>0_an,pa_chlorkonzentration_richtwert_low_ppm,pa_chlorkonzentration_richtwert_high_ppm,ps_beschichtete_rollenlange_m,ps_c_losung_wt_%,ps_gap_micro_m,ps_dicke_as_micro_m,ps_dicke_1,ps_dicke_2,ps_dicke_3,ps_dicke_4,ps_dicke_bs,ps_raum_temperatur_start_c,ps_auftragsbank_temperatur_start_c,ps_bad_temperatur_0m_c,ps_bad_temperatur_500m_c,ps_bad_temperatur_1000m_c,ps_bad_temperatur_1500m_c,ps_bad_temperatur_2000m_c,ps_bad_temperatur_2500m_c,ps_bad_temperatur_3000m_c,ps_
bad_temperatur_3500m_c,ps_bad_temperatur_4000m_c,ps_raum-feuchtigkeit_start_%,ps_auftragswerk_feuchtigkeit_0m_%,ps_auftragswerk_feuchtigkeit_600m_%,ps_auftragswerk_feuchtigkeit_1200m_%,ps_auftragswerk_feuchtigkeit_1800m_%,ps_auftragswerk_feuchtigkeit_2400m_%,ps_auftragswerk_feuchtigkeit_3000m_%,ps_auftragswerk_feuchtigkeit_3600m_%,ps_auftragswerk_feuchtigkeit_4200m_%,nr,ps_out_m,ps_scrap_%,pa_mpda_lsg,pa_ndecan/tmc,pa_chlorlsg,pa_sbs_lsg,pa_alkalilsg,pa_out_m,pa_scrap_allgemein_m,pa_defects_semket_#,pa_defects_dr_schenk_#,pa_defects_hand_#,pa_defects_total_#,pa_scrap_%,ct1_anfang_0m_%,ct1_anfang_flux_m3/m2/d,ct1_mitte_1_1400m_rej_%,ct1_mitte_1_1400m_flux,ct1_mitte_2_2900m_rej_%,ct1_mitte_2_2900m_flux,ct1_ende_4300m_rej_%,ct1_ende_flux_m3/m2/d,ct_retest_anfang_0m_%,ct_retest_anfang_flux_m3/m2/d,ct_retest_mitte_1_1400m_rej_%,ct_retest_mitte_1_1400m_flux,ct_retest_mitte_2_2900m_rej_%,ct_retest_mitte_2_2900m_rej_%_2,ct_retest_ende_4300m_rej_%,ct_retest_ende_flux_m3/m2/d,qt_roll_after_m,qt_roll_minus_scrap_meters_m,qt_scrap_%,qt_coating_scrap_%,pp_plan_ausbeute_elemente,pp_scrap_first_outer_dia_m,pp_scrap_last_core_m,pp_sequence_aw1_winding,pp_sequence_aw2_winding,winding_pa_m,winding_scrap_first_m,winding_scrap_last_m,winding_product_size,winding_number_of_leaves,assembled_in_elements_#,assembled_in_elements_m,assembling_scrap_%,leak_test_ok_#,leak_test_ok_m,leak_test_scrap_%,element_test_ok_#,element_test_ok_m,element_test_scrap_%,winding_scrap_%,total_scrap_%,total_yield_%,pa_tmc_gehalt_in_percentage,ct1_test_ref_low,ct1_test_ref_high,qc_lasttest,qc_salzrckhalt,qc_durchfluss,qc_p_position,sc_l_leak_auto,sc_l_leak_hand,sc_l_ergebnis_p_nio,sc_l_pressure_start,sc_l_pressure_prozent,sc_l_parameter_pressure_absolut,sc_l_parameter_pressure_prozent,qc_minimalersalzrueckhalt,qc_minimalerdurchfluss,qc_durchminimalersalzrueckhalt,qc_durchminimalerdurchfluss,qc_minimalerdurchflussgpd,qc_maximalersalzrueckhalt,qc_maximalerdurchfluss,qc_konzentration,qc_druckpsi,qc_temperaturc,qc_
rueckhalt,qc_ph,qc_nachkommadurchfluss,qc_nachkommasalzrueckhalt,qc_rundunggpd,qc_konzentrationflow,qc_lagen,qc_mstaerke,qc_breite,sc_d_parameter_min,sc_d_parameter_max,sc_d_parameter_anz_werte,sc_d_parameter_1_min,sc_d_parameter_1_max,sc_d_parameter_deltamax,sc_d_ergebnis_min_ok,sc_d_ergebnis_max_ok,sc_d_ergebnis_anz_min_ok,sc_d_ergebnis_anz_max_ok,sc_d_ergebnis_deltamax_ok,sc_d_ergebnis_nio,sc_d_links_werte01,sc_d_links_werte02,sc_d_links_werte03,sc_d_links_werte04,sc_d_links_werte05,sc_d_links_werte06,sc_d_links_werte07,sc_d_links_werte08,sc_d_links_werte09,sc_d_links_werte10,sc_d_links_werte11,sc_d_links_werte12,sc_d_links_werte13,sc_d_links_werte14,sc_d_links_werte15,sc_d_links_werte16,sc_d_links_werte17,sc_d_links_werte18,sc_d_links_werte19,sc_d_links_werte20,sc_d_links_min,sc_d_links_max,sc_d_links_anz_min,sc_d_links_anz_max,sc_d_links_deltamax,sc_d_rechts_werte01,sc_d_rechts_werte02,sc_d_rechts_werte03,sc_d_rechts_werte04,sc_d_rechts_werte05,sc_d_rechts_werte06,sc_d_rechts_werte07,sc_d_rechts_werte08,sc_d_rechts_werte09,sc_d_rechts_werte10,sc_d_rechts_werte11,sc_d_rechts_werte12,sc_d_rechts_werte13,sc_d_rechts_werte14,sc_d_rechts_werte15,sc_d_rechts_werte16,sc_d_rechts_werte17,sc_d_rechts_werte18,sc_d_rechts_werte19,sc_d_rechts_werte20,sc_d_rechts_min,sc_d_rechts_max,sc_d_rechts_anz_min,sc_d_rechts_anz_max,sc_d_rechts_deltamax,p_effective_area_per_leaf_m2,p_effective_pressure_bar,qc_flux_l/m2/h,qc_a-value_l/m2/h/bar,qc_b-value_m/s_10-6,qc_flux_m3/m2/day,sum_of_scrap,roll_position,pda_median,chlor_con_median,f_danfugt_bs_median,f_danfugt_as_median,f_codfely_median,temp_median,ps_f_auftragswerk_median,ps_dicke_median
118682,250.0,1715.0,9.0,23.4,20.0,3.4,2.18,2.26,2.38,2.4,0.0,0.0,0.0,0.0,0.0,26.7,,0.12,22.6,21.0,1684.0,1640.0,1610.0,1640.0,1648.0,0.0,0.0,0.0,0.0,80.0,80.0,80.0,80.0,0.0,0.0,0.0,0.0,40.0,35.0,35.0,40.0,0.0,0.0,0.0,0.0,34.0,30.0,59.0,60.0,60.0,0.0,0.0,57.0,57.0,58.0,0.0,0.0,20.0,79.6,79.6,79.5,79.6,0.0,0.0,0.0,0.0,80.0,0.0,45.0,46.0,46.0,0.0,0.0,0.0,0.0,0.0,9.0,833.851027,2.3,2.5,24.0,28.0,21.0,22.5,80.0,0.0,70.0,30.0,57.0,58.0,57.0,58.0,79.0,81.0,80.0,40.0,42.0,100.0,1560.0,1700.0,2202.0,31.0,250.0,137.0,135.0,133.0,133.0,132.0,132.0,24.0,19.0,21.7,22.2,22.3,22.2,22.3,0.0,0.0,0.0,0.0,46.0,52.0,54.0,58.0,57.0,0.0,0.0,0.0,0.0,2892.0,2202.0,4.26087,38.0,444.0,841786.0,833464.0,192.0,1715.0,0.0,3.0,4.0,0.0,3.0,22.116258,99.51,1.37,99.49,1.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2202.0,2202.0,-28.396501,4.26087,0.0,0.0,0.0,113.0,0.0,904.0,0.0,0.0,400.0,26.0,19.0,864.5,99.137148,18.0,819.0,5.263158,18.0,819.0,0.0,99.182561,99.217391,0.782609,,1.12,99.1,1.0,99.404884,33.03703,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.2,28.9,99.5,36.2,7600.0,99.99,52.2,2000.0,150.0,25.0,15.0,7.0,1.0,1.0,10.0,4800.0,26.0,0.13,965.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.4646,8.71,36.149091,4.150297,3.606951,0.867578,0.0,0.0,1.09,805.0,59.0,57.0,39.75,21.7,26.0,133.0
118683,250.0,1715.0,9.0,23.4,20.0,3.4,2.18,2.26,2.38,2.4,0.0,0.0,0.0,0.0,0.0,26.7,,0.12,22.6,21.0,1684.0,1640.0,1610.0,1640.0,1648.0,0.0,0.0,0.0,0.0,80.0,80.0,80.0,80.0,0.0,0.0,0.0,0.0,40.0,35.0,35.0,40.0,0.0,0.0,0.0,0.0,34.0,30.0,59.0,60.0,60.0,0.0,0.0,57.0,57.0,58.0,0.0,0.0,20.0,79.6,79.6,79.5,79.6,0.0,0.0,0.0,0.0,80.0,0.0,45.0,46.0,46.0,0.0,0.0,0.0,0.0,0.0,9.0,771.886979,2.3,2.5,24.0,28.0,21.0,22.5,80.0,0.0,70.0,30.0,57.0,58.0,57.0,58.0,79.0,81.0,80.0,40.0,42.0,100.0,1560.0,1700.0,2202.0,31.0,250.0,137.0,135.0,133.0,133.0,132.0,132.0,24.0,19.0,21.7,22.2,22.3,22.2,22.3,0.0,0.0,0.0,0.0,46.0,52.0,54.0,58.0,57.0,0.0,0.0,0.0,0.0,2892.0,2202.0,4.26087,38.0,444.0,841786.0,833464.0,192.0,1715.0,0.0,3.0,4.0,0.0,3.0,22.116258,99.51,1.37,99.49,1.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2202.0,2202.0,-28.396501,4.26087,0.0,0.0,0.0,113.0,0.0,904.0,0.0,0.0,400.0,26.0,19.0,864.5,99.137148,18.0,819.0,5.263158,18.0,819.0,0.0,99.182561,99.217391,0.782609,,1.12,99.1,1.0,99.453192,32.577132,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.2,28.9,99.5,36.2,7600.0,99.99,52.2,2000.0,150.0,25.0,15.0,7.0,1.0,1.0,10.0,4800.0,26.0,0.13,965.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.4646,8.71,35.64587,4.092522,3.266436,0.855501,0.0,0.0,1.09,805.0,59.0,57.0,39.75,21.7,26.0,133.0
118684,250.0,1715.0,9.0,23.4,20.0,3.4,2.18,2.26,2.38,2.4,0.0,0.0,0.0,0.0,0.0,26.7,,0.12,22.6,21.0,1684.0,1640.0,1610.0,1640.0,1648.0,0.0,0.0,0.0,0.0,80.0,80.0,80.0,80.0,0.0,0.0,0.0,0.0,40.0,35.0,35.0,40.0,0.0,0.0,0.0,0.0,34.0,30.0,59.0,60.0,60.0,0.0,0.0,57.0,57.0,58.0,0.0,0.0,20.0,79.6,79.6,79.5,79.6,0.0,0.0,0.0,0.0,80.0,0.0,45.0,46.0,46.0,0.0,0.0,0.0,0.0,0.0,9.0,768.458178,2.3,2.5,24.0,28.0,21.0,22.5,80.0,0.0,70.0,30.0,57.0,58.0,57.0,58.0,79.0,81.0,80.0,40.0,42.0,100.0,1560.0,1700.0,2202.0,31.0,250.0,137.0,135.0,133.0,133.0,132.0,132.0,24.0,19.0,21.7,22.2,22.3,22.2,22.3,0.0,0.0,0.0,0.0,46.0,52.0,54.0,58.0,57.0,0.0,0.0,0.0,0.0,2892.0,2202.0,4.26087,38.0,444.0,841786.0,833464.0,192.0,1715.0,0.0,3.0,4.0,0.0,3.0,22.116258,99.51,1.37,99.49,1.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2202.0,2202.0,-28.396501,4.26087,0.0,0.0,0.0,113.0,0.0,904.0,0.0,0.0,400.0,26.0,19.0,864.5,99.137148,18.0,819.0,5.263158,18.0,819.0,0.0,99.182561,99.217391,0.782609,,1.12,99.1,1.0,99.407655,31.884684,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.2,28.9,99.5,36.2,7600.0,99.99,52.2,2000.0,150.0,25.0,15.0,7.0,1.0,1.0,10.0,4800.0,26.0,0.13,965.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.4646,8.71,34.888194,4.005533,3.464832,0.837317,0.0,0.0,1.09,805.0,59.0,57.0,39.75,21.7,26.0,133.0
118685,250.0,1715.0,9.0,23.4,20.0,3.4,2.18,2.26,2.38,2.4,0.0,0.0,0.0,0.0,0.0,26.7,,0.12,22.6,21.0,1684.0,1640.0,1610.0,1640.0,1648.0,0.0,0.0,0.0,0.0,80.0,80.0,80.0,80.0,0.0,0.0,0.0,0.0,40.0,35.0,35.0,40.0,0.0,0.0,0.0,0.0,34.0,30.0,59.0,60.0,60.0,0.0,0.0,57.0,57.0,58.0,0.0,0.0,20.0,79.6,79.6,79.5,79.6,0.0,0.0,0.0,0.0,80.0,0.0,45.0,46.0,46.0,0.0,0.0,0.0,0.0,0.0,9.0,911.572183,2.3,2.5,24.0,28.0,21.0,22.5,80.0,0.0,70.0,30.0,57.0,58.0,57.0,58.0,79.0,81.0,80.0,40.0,42.0,100.0,1560.0,1700.0,2202.0,31.0,250.0,137.0,135.0,133.0,133.0,132.0,132.0,24.0,19.0,21.7,22.2,22.3,22.2,22.3,0.0,0.0,0.0,0.0,46.0,52.0,54.0,58.0,57.0,0.0,0.0,0.0,0.0,2892.0,2202.0,4.26087,38.0,444.0,841786.0,833464.0,192.0,1715.0,0.0,3.0,4.0,0.0,3.0,22.116258,99.51,1.37,99.49,1.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2202.0,2202.0,-28.396501,4.26087,0.0,0.0,0.0,113.0,0.0,904.0,0.0,0.0,400.0,26.0,19.0,864.5,99.137148,18.0,819.0,5.263158,18.0,819.0,0.0,99.182561,99.217391,0.782609,,1.12,99.1,1.0,99.48482,34.474326,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.2,28.9,99.5,36.2,7600.0,99.99,52.2,2000.0,150.0,25.0,15.0,7.0,1.0,1.0,10.0,4800.0,26.0,0.13,965.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.4646,8.71,37.721778,4.330859,3.255691,0.905323,0.0,0.0,1.09,805.0,59.0,57.0,39.75,21.7,26.0,133.0
118686,250.0,1715.0,9.0,23.4,20.0,3.4,2.18,2.26,2.38,2.4,0.0,0.0,0.0,0.0,0.0,26.7,,0.12,22.6,21.0,1684.0,1640.0,1610.0,1640.0,1648.0,0.0,0.0,0.0,0.0,80.0,80.0,80.0,80.0,0.0,0.0,0.0,0.0,40.0,35.0,35.0,40.0,0.0,0.0,0.0,0.0,34.0,30.0,59.0,60.0,60.0,0.0,0.0,57.0,57.0,58.0,0.0,0.0,20.0,79.6,79.6,79.5,79.6,0.0,0.0,0.0,0.0,80.0,0.0,45.0,46.0,46.0,0.0,0.0,0.0,0.0,0.0,9.0,1046.582519,2.3,2.5,24.0,28.0,21.0,22.5,80.0,0.0,70.0,30.0,57.0,58.0,57.0,58.0,79.0,81.0,80.0,40.0,42.0,100.0,1560.0,1700.0,2202.0,31.0,250.0,137.0,135.0,133.0,133.0,132.0,132.0,24.0,19.0,21.7,22.2,22.3,22.2,22.3,0.0,0.0,0.0,0.0,46.0,52.0,54.0,58.0,57.0,0.0,0.0,0.0,0.0,2892.0,2202.0,4.26087,38.0,444.0,841786.0,833464.0,192.0,1715.0,0.0,3.0,4.0,0.0,3.0,22.116258,99.51,1.37,99.49,1.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2202.0,2202.0,-28.396501,4.26087,0.0,0.0,0.0,113.0,0.0,904.0,0.0,0.0,400.0,26.0,19.0,864.5,99.137148,18.0,819.0,5.263158,18.0,819.0,0.0,99.182561,99.217391,0.782609,,1.12,99.1,1.0,99.450385,34.713551,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.2,28.9,99.5,36.2,7600.0,99.99,52.2,2000.0,150.0,25.0,15.0,7.0,1.0,1.0,10.0,4800.0,26.0,0.13,965.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.4646,8.71,37.983538,4.360911,3.498615,0.911605,0.0,0.0,1.09,805.0,59.0,57.0,39.75,21.7,26.0,133.0


## Check missing data

In [136]:
# Boolean mask with one entry per column: True where the column holds at least one NaN
naCols = df8.isna().any()
# The part of the dataframe made of exactly those columns
df8_nans = df8.loc[:, naCols]

# Per-column summary: dtype, number of distinct non-missing values,
# and percentage of missing values
dfUNAN = pd.DataFrame({
    'dtype': df8.dtypes,
    'nunique': df8.nunique(dropna=True),
    'nan_%': df8.isna().sum() * 100 / df8.shape[0],
})

# Columns containing missing values, from the most to the least incomplete
dfUNAN.loc[dfUNAN['nan_%'] > 0].sort_values('nan_%', ascending=False)


Unnamed: 0,dtype,nunique,nan_%
sc_l_datum_hand,"datetime64[ns, UTC]",71,99.935966
pa_ctmc_richtwert_%,float64,1,99.897209
pa_tmc_gehalt_in_percentage,float64,9,63.732338
sc_l_datum_auto,"datetime64[ns, UTC]",88769,24.657292
qc_barcode_leak_test_values,object,90322,23.346281
qc_datum_leak_test_values,"datetime64[ns, UTC]",22521,23.346281
sc_datum_generate,"datetime64[ns, UTC]",46739,23.346281
qc_einlager_datum,"datetime64[ns, UTC]",6217,3.156201
qc_verpackungs_datum,"datetime64[ns, UTC]",112100,3.077001
qt_datum,datetime64[ns],578,2.008645


In [137]:
# Same listing, restricted to the float64 columns
dfUNAN.loc[
    (dfUNAN['nan_%'] > 0) & (dfUNAN['dtype'] == 'float64')
].sort_values('nan_%', ascending=False)

Unnamed: 0,dtype,nunique,nan_%
pa_ctmc_richtwert_%,float64,1,99.897209
pa_tmc_gehalt_in_percentage,float64,9,63.732338
nr,float64,1255,0.087625
pp_plan_ausbeute_elemente,float64,7,0.007583


Issue:
- pa_ctmc_richtwert_% is also constant, since it has only 1 unique value. 99,8972% of its values are missing. Is that relevant for our data analysis or can we simply drop this column?

Issues:

- [x] Column with name "windung" instead of "winding"

# Fix column names

In [138]:
# Current column names, in dataframe order
oldNames = list(df8.columns)

# Substitutions applied to every column name (pattern -> replacement).
# Note that 'date' is rewritten to the German 'datum', and the misspelled
# 'windung' to 'winding'.
dictRename = {
    'date': 'datum',
    'badwechsel': 'bad-wechsel',
    'in_percentage': '%',
    'windung': 'winding',
    'geschwin-digkeit': 'geschwindigkeit',
}

# Start from the old names and apply the substitutions one after another,
# in dictionary order; each pass builds a fresh list.
newNames = oldNames
for old, new in dictRename.items():
    newNames = [re.sub(old, new, name) for name in newNames]

# Renamed copy of the dataframe (old name -> new name, paired by position)
df9 = df8.rename(columns=dict(zip(oldNames, newNames)))

# Group Columns

In this section we transform the dataset into one with a MultiIndex on the columns. 

## Sort the columns by process step

* Process step 1: Coating with ps
* Process step 2: Coating with pa
* Process step 3: Assembling
* Process step 4: Quality control

In [142]:
# Column labels of df9 as a plain list
col_names = df9.columns.tolist()

## Define the process steps

In [143]:
# Keys of the process steps; the leading digit keeps them in process order when sorted
process_steps = [
    '1ps',
    '2pa',
    '3ass',
    '4qc',
]

In [144]:
# Mapping column name -> process-step key, filled in by the grouping loop
order_by_process_dict = dict()

## Grouping columns using a dictionary

In [145]:
# Assign every column to a process step based on its name. The rules are tried
# in order and the first match wins; a column matching none of them gets no
# entry in the dictionary.
for col in col_names:

    # Process step 1: coating with ps
    if col.startswith('ps'):
        order_by_process_dict[col] = process_steps[0]

    # Process step 2: coating with pa
    elif col.startswith(('pa_', 'ct', 'pp', 'qt', 'reaction')) or col.endswith('median'):
        order_by_process_dict[col] = process_steps[1]

    # Process step 3: assembling
    elif col.startswith(('wind', 'sc', 'leak', 'assembl')):
        order_by_process_dict[col] = process_steps[2]

    # Process step 4: quality control
    elif col.startswith(('qc_', 'element', 'sum', 'roll', 'total', 'p_', 'nr', 'component', 'derived')):
        order_by_process_dict[col] = process_steps[3]

In [146]:
# For every process-step key, collect the df9 columns assigned to it.
# Keys are visited in sorted order ('1ps', '2pa', ...) and columns keep their
# df9 order; columns without an entry in order_by_process_dict are left out.
# This replaces df9.groupby(order_by_process_dict, 1): grouping along the
# column axis is deprecated in recent pandas.
columns_by_step = {
    step: [col_name for col_name in df9.columns
           if order_by_process_dict.get(col_name) == step]
    for step in sorted(set(order_by_process_dict.values()))
}

# Concatenating a dict of frames along axis=1 puts the dict keys on the outer
# column level, giving (process step, column name) column labels.
df10 = pd.concat(
    {step: df9.loc[:, step_cols] for step, step_cols in columns_by_step.items()},
    axis=1,
)

# Swap the sortable keys for plain labels. set_levels returns a new index (its
# in-place variant was removed in pandas 2.0), so the result is assigned back.
# The original cell applied this relabelling twice; once is enough.
df10.columns = df10.columns.set_levels(['ps','pa','ass','qc'], level=0)

In [147]:
# First five rows of the frame with grouped (two-level) columns
df10.head(5)

Unnamed: 0_level_0,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,ps,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,ass,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc,qc
Unnamed: 0_level_1,ps,ps_datum,ps_beschichtete_rollenlange_m,ps_c_losung_wt_%,ps_gap_micro_m,ps_dicke_as_micro_m,ps_dicke_1,ps_dicke_2,ps_dicke_3,ps_dicke_4,ps_dicke_bs,ps_raum_temperatur_start_c,ps_auftragsbank_temperatur_start_c,ps_bad_temperatur_0m_c,ps_bad_temperatur_500m_c,ps_bad_temperatur_1000m_c,ps_bad_temperatur_1500m_c,ps_bad_temperatur_2000m_c,ps_bad_temperatur_2500m_c,ps_bad_temperatur_3000m_c,ps_bad_temperatur_3500m_c,ps_bad_temperatur_4000m_c,ps_raum-feuchtigkeit_start_%,ps_auftragswerk_feuchtigkeit_0m_%,ps_auftragswerk_feuchtigkeit_600m_%,ps_auftragswerk_feuchtigkeit_1200m_%,ps_auftragswerk_feuchtigkeit_1800m_%,ps_auftragswerk_feuchtigkeit_2400m_%,ps_auftragswerk_feuchtigkeit_3000m_%,ps_auftragswerk_feuchtigkeit_3600m_%,ps_auftragswerk_feuchtigkeit_4200m_%,ps_datum_coating,ps_out_m,ps_scrap_%,ps_f_auftragswerk_median,ps_dicke_median,pa_datum,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwindigkeit_m/min,pa_bad-wechsel_m-pda,pa_bad-wechsel_hw1,pa_bad-wechsel_hw2,pa_bad-wechsel_chlor,pa_bad-wechsel_hw3,pa_raum_temperatur_start_c,pa_temperatur_m-pda-bad_c,pa_cm-pda_0m,pa_cm-pda_500m,pa_cm-pda_1000m,pa_cm-pda_1500m,pa_cm-pda_2000m,pa_cm-pda_2500m,pa_cm-pda_3000m,pa_cm-pda_3500m,pa_cm-pda_4000m,pa_ce-capro_lactam_%,pa_temperatur_n-decan-lsg_chem_vorbereitung_start_c,pa_ctmc_richtwert_%,pa_ctmc_%,pa_temperatur_alkali-lsg_chem_vorbereitung_start_c,pa_temperatur_chlorbad_start_c,pa_chlorkonzentration_0m_ppm,pa_chlorkonzentration_500m_ppm,pa_chlorkonzentration_1000m_ppm,pa_chlorkonzentration_1500m_ppm,pa_chlorkonzentration_2000m_ppm,pa_chlorkonzentration_2500m_ppm,pa_chlorkonzentration_3000m_ppm,pa_chlorkonzentration_3500m_ppm,pa_chlorkonzentration_4000m_ppm,pa_air-knife_vor_amin_bad_0m_%,pa_air-knife_vor_amin_bad_600m_%,pa_air-knife_vor_amin_bad_1200m_%,pa_air-knife_vor_amin_bad_1800m_%,pa_air-knife_vor_amin_bad_2400m_%,pa_air-knife_vor_amin_bad_3000m_%,pa_air-knife_vor_amin_bad_3600m_%,pa_air-knife_vor_amin_bad_4200m_%,pa_airkni
fe_vor_amin_trockner_0m_%,pa_airknife_vor_amin_trockner_600m_%,pa_airknife_vor_amin_trockner_1200m_%,pa_airknife_vor_amin_trockner_1800m_%,pa_airknife_vor_amin_trockner_2400m_%,pa_airknife_vor_amin_trockner_3000m_%,pa_airknife_vor_amin_trockner_3600m_%,pa_airknife_vor_amin_trockner_4200m_%,pa_raum_feuchte_start_%,pa_amin-trockner_temperatur_danfugt_c,pa_amin-trockner_feuchtigkeit_danfugt_bs_0m_%,pa_amin-trockner_feuchtigkeit_danfugt_bs_1000m_%,pa_amin-trockner_feuchtigkeit_danfugt_bs_2000m_%,pa_amin-trockner_feuchtigkeit_danfugt_bs_3000m_%,pa_amin-trockner_feuchtigkeit_danfugt_bs_4000m_%,pa_amin-trockner_feuchtigkeit_danfugt_as_0m,pa_amin-trockner_feuchtigkeit_danfugt_as_1000m,pa_amin-trockner_feuchtigkeit_danfugt_as_2000m,pa_amin-trockner_feuchtigkeit_danfugt_as_3000m,pa_amin-trockner_feuchtigkeit_danfugt_as_4000m,pa_amin-trockner_temperatur_cofely_c,pa_amin-trockner_feuchtigkeit_cofely_0m_%,pa_amin-trockner_feuchtigkeit_cofely_600m_%,pa_amin-trockner_feuchtigkeit_cofely_1200m_%,pa_amin-trockner_feuchtigkeit_cofely_1800m_%,pa_amin-trockner_feuchtigkeit_cofely_2400m_%,pa_amin-trockner_feuchtigkeit_cofely_3000m_%,pa_amin-trockner_feuchtigkeit_cofely_3600m_%,pa_amin-trockner_feuchtigkeit_cofely_4200m_%,pa_decan-trockner_geblase_start_%,pa_decan-trockner_geblase_ende_%,pa_vertikale_feuchte_oben_start_%,pa_vertikale_feuchte_oben_mitte_%,pa_vertikale_feuchte_oben_ende_%,pa_staub-sauger_1_vor_aminbad_0_aus_>0_an,pa_staub-sauger_2_nach_aminbad,pa_staub-sauger_3_zw5_vor_hw2,pa_staub-sauger_4_nach_hw2,pa_staub-sauger_5,pa_bahn-geschwindigkeit_ref_m/min,pa_coating,pa_cm-pda_gewichts-%_richtwert_ref_low,pa_cm-pda_gewichts-%_richtwert_ref_high,pa_temperatur_n-decan-lsg_chem_vorbereitung_start_ref_low_c,pa_temperatur_n-decan-lsg_chem_vorbereitung_start_ref_high_c,pa_temperatur_alkali-lsg_chem_vorbereitung_start_ref_low_c,pa_temperatur_alkali-lsg_chem_vorbereitung_start_ref_high_c,pa_airknife_vor_amin_bad_ref_%,pa_airknife_vor_amin_trockner_ref_low_%,pa_airknife_vor_amin_trockner
_ref_high_%,pa_amin-trockner_temperatur_danfugt_ref_c,pa_amin-trockner_feuchtigkeit_danfugt_bs_ref_low_%,pa_amin-trockner_feuchtigkeit_danfugt_bs_ref_high_%,pa_amin-trockner_feuchtigkeit_danfugt_as_ref_low,pa_amin-trockner_feuchtigkeit_danfugt_as_ref_high,pa_amin-trockner_feuchtigkeit_cofely_ref_low_%,pa_amin-trockner_feuchtigkeit_cofely_ref_high_%,pa_decan-trockner_geblase_ref_%,pa_vertikale_feuchte_oben_ref_low_%,pa_vertikale_feuchte_oben_ref_high_%,pa_staub-sauger_ref_high_0_aus_>0_an,pa_chlorkonzentration_richtwert_low_ppm,pa_chlorkonzentration_richtwert_high_ppm,pa_datum_coating,pa_mpda_lsg,pa_ndecan/tmc,pa_chlorlsg,pa_sbs_lsg,pa_alkalilsg,pa_bad-wechsel_mpda_nach,pa_bad-wechsel_chlor_nach,pa_out_m,pa_scrap_allgemein_m,pa_defects_semket_#,pa_defects_dr_schenk_#,pa_defects_hand_#,pa_defects_total_#,pa_scrap_%,ct1_anfang_0m_%,ct1_anfang_flux_m3/m2/d,ct1_mitte_1_1400m_rej_%,ct1_mitte_1_1400m_flux,ct1_mitte_2_2900m_rej_%,ct1_mitte_2_2900m_flux,ct1_ende_4300m_rej_%,ct1_ende_flux_m3/m2/d,ct_retest_anfang_0m_%,ct_retest_anfang_flux_m3/m2/d,ct_retest_mitte_1_1400m_rej_%,ct_retest_mitte_1_1400m_flux,ct_retest_mitte_2_2900m_rej_%,ct_retest_mitte_2_2900m_rej_%_2,ct_retest_ende_4300m_rej_%,ct_retest_ende_flux_m3/m2/d,qt_datum,qt_roll_after_m,qt_roll_minus_scrap_meters_m,qt_scrap_%,qt_coating_scrap_%,pp_product_short_name,pp_plan_actual_datum_coating,pp_plan_product,pp_plan_ausbeute_elemente,pp_plan_end_datum_winding,pp_actual_product_short_name,pp_actual_product,pp_actual_usage,pp_scrap_first_outer_dia_m,pp_scrap_last_core_m,pp_sequence_aw1_winding,pp_sequence_aw2_winding,pa_tmc_gehalt_%,ct1_test_ref_low,ct1_test_ref_high,pa_rollen_seit_letztem_bad-wechsel_mpda,pa_rollen_seit_letztem_bad-wechsel_chlor,pa_ref,pda_median,chlor_con_median,f_danfugt_bs_median,f_danfugt_as_median,f_codfely_median,temp_median,reaction_start,reaction_end,winding_pa_m,winding_scrap_first_m,winding_scrap_last_m,winding_product_short_name,winding_product_type,winding_product_size,winding_number_of_l
eaves,winding_product_line,winding_begin_datum,winding_end_datum,assembled_in_elements_#,assembled_in_elements_m,assembling_scrap_%,leak_test_ok_#,leak_test_ok_m,leak_test_scrap_%,winding_scrap_%,sc_d_datum,sc_datum_generate,sc_l_leak_auto,sc_l_datum_auto,sc_l_leak_hand,sc_l_datum_hand,sc_l_ergebnis_p_nio,sc_l_pressure_start,sc_l_pressure_prozent,sc_l_parameter_pressure_absolut,sc_l_parameter_pressure_prozent,sc_d_parameter_min,sc_d_parameter_max,sc_d_parameter_anz_werte,sc_d_parameter_1_min,sc_d_parameter_1_max,sc_d_parameter_deltamax,sc_d_ergebnis_min_ok,sc_d_ergebnis_max_ok,sc_d_ergebnis_anz_min_ok,sc_d_ergebnis_anz_max_ok,sc_d_ergebnis_deltamax_ok,sc_d_ergebnis_nio,sc_d_links_werte01,sc_d_links_werte02,sc_d_links_werte03,sc_d_links_werte04,sc_d_links_werte05,sc_d_links_werte06,sc_d_links_werte07,sc_d_links_werte08,sc_d_links_werte09,sc_d_links_werte10,sc_d_links_werte11,sc_d_links_werte12,sc_d_links_werte13,sc_d_links_werte14,sc_d_links_werte15,sc_d_links_werte16,sc_d_links_werte17,sc_d_links_werte18,sc_d_links_werte19,sc_d_links_werte20,sc_d_links_min,sc_d_links_max,sc_d_links_anz_min,sc_d_links_anz_max,sc_d_links_deltamax,sc_d_rechts_werte01,sc_d_rechts_werte02,sc_d_rechts_werte03,sc_d_rechts_werte04,sc_d_rechts_werte05,sc_d_rechts_werte06,sc_d_rechts_werte07,sc_d_rechts_werte08,sc_d_rechts_werte09,sc_d_rechts_werte10,sc_d_rechts_werte11,sc_d_rechts_werte12,sc_d_rechts_werte13,sc_d_rechts_werte14,sc_d_rechts_werte15,sc_d_rechts_werte16,sc_d_rechts_werte17,sc_d_rechts_werte18,sc_d_rechts_werte19,sc_d_rechts_werte20,sc_d_rechts_min,sc_d_rechts_max,sc_d_rechts_anz_min,sc_d_rechts_anz_max,sc_d_rechts_deltamax,nr,element_test_ok_#,element_test_ok_m,element_test_scrap_%,total_scrap_%,total_yield_%,qc_serien_nummer,qc_lasttest,qc_salzrckhalt,qc_durchfluss,qc_p_position,qc_erfassungs_datum,qc_verpackungs_datum,qc_einlager_datum,qc_barcode_leak_test_values,qc_datum_leak_test_values,qc_minimalersalzrueckhalt,qc_minimalerdurchfluss,qc_durchminimalersalzrueckhalt,qc_durch
minimalerdurchfluss,qc_minimalerdurchflussgpd,qc_maximalersalzrueckhalt,qc_maximalerdurchfluss,qc_konzentration,qc_faktorkonzentration,qc_druckpsi,qc_temperaturc,qc_rueckhalt,qc_ph,qc_nachkommadurchfluss,qc_nachkommasalzrueckhalt,qc_rundunggpd,qc_datum_product_properties,qc_konzentrationflow,qc_lagen,qc_mstaerke,qc_breite,p_product,p_product_full_name,p_product_group,p_product_type,p_product_size,p_effective_area_per_leaf_m2,p_effective_pressure_bar,qc_flux_l/m2/h,qc_a-value_l/m2/h/bar,qc_b-value_m/s_10-6,qc_flux_m3/m2/day,sum_of_scrap,roll_position
0,cb031d4b18ff,2018-05-13,4487.0,31.0,250.0,136.0,136.0,134.0,134.0,135.0,134.0,24.0,19.0,19.7,20.6,21.1,21.6,22.0,22.2,22.5,22.6,22.9,57.0,78.0,78.0,78.0,78.0,78.0,76.0,72.0,67.0,2018-05-13,4487.0,2.456522,78.0,134.5,2018-05-17,250.0,4480.0,9.0,False,False,True,False,False,24.0,19.0,2.34,2.36,2.44,2.4,2.48,2.58,2.64,2.46,1.29,0.0,29.75,,0.12,23.23,20.0,1640.0,1652.0,1666.0,1684.0,1668.0,1646.0,1588.0,1638.0,840.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,38.0,30.0,51.0,58.0,57.0,60.0,59.0,50.0,59.0,54.0,59.0,58.0,20.0,79.0,79.2,80.1,79.8,80.2,80.2,80.1,80.2,80.0,0.0,48.0,48.0,46.0,0.0,0.0,0.0,0.0,0.0,9.0,968.554352,2.3,2.5,24.0,28.0,21.0,22.5,80.0,0.0,70.0,30.0,57.0,58.0,57.0,58.0,79.0,81.0,80.0,40.0,42.0,100.0,1560.0,1700.0,2018-05-17,38.0,440.0,841786.0,833464.0,190.0,False,False,4480.0,0.0,47.0,184.0,0.0,47.0,0.156006,99.63,1.49,99.54,1.65,99.58,1.52,99.72,1.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-05-19,4487.0,4487.0,-0.15625,2.456522,6989995295da,2018-05-17,1c39db15c26b,0.0,2018-05-19,6989995295da,1c39db15c26b,use,0.0,0.0,0.0,136.0,,1.12,99.1,2,2,bc7e29194383,1.23,826.0,58.0,58.0,80.1,22.0,2018-05-17 00:00:00+00:00,2018-05-17 00:00:00+00:00,4480.0,0.0,0.0,6989995295da,6989995295da,400.0,26.0,9bec1f36ec0d,2018-05-18,2018-05-19,97.0,4413.5,97.838199,77.0,3503.5,20.618557,98.306218,2018-05-20 06:10:44+00:00,NaT,0.0,NaT,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2891.0,76.0,3458.0,1.298701,98.347826,1.652174,6f5dd5e75de0,1.0,99.455138,36.818009,7.0,2018-05-20 06:10:44+00:00,2018-05-21 05:58:53+00:00,2018-05-21 07:23:37+00:00,,NaT,99.2,28.9,99.5,36.2,7600.0,99.99,52.2,2000.0,1,150.0,25.0,15.0,7.0,1.0,1.0,10.0,2015-09-29 
00:00:00+00:00,4800.0,26.0,0.13,965.0,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,8,1.4646,8.71,40.286235,4.625285,3.678446,0.96687,0.0,58.181818
1,cb031d4b18ff,2018-05-13,4487.0,31.0,250.0,136.0,136.0,134.0,134.0,135.0,134.0,24.0,19.0,19.7,20.6,21.1,21.6,22.0,22.2,22.5,22.6,22.9,57.0,78.0,78.0,78.0,78.0,78.0,76.0,72.0,67.0,2018-05-13,4487.0,2.456522,78.0,134.5,2018-05-17,250.0,4480.0,9.0,False,False,True,False,False,24.0,19.0,2.34,2.36,2.44,2.4,2.48,2.58,2.64,2.46,1.29,0.0,29.75,,0.12,23.23,20.0,1640.0,1652.0,1666.0,1684.0,1668.0,1646.0,1588.0,1638.0,840.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,38.0,30.0,51.0,58.0,57.0,60.0,59.0,50.0,59.0,54.0,59.0,58.0,20.0,79.0,79.2,80.1,79.8,80.2,80.2,80.1,80.2,80.0,0.0,48.0,48.0,46.0,0.0,0.0,0.0,0.0,0.0,9.0,987.375093,2.3,2.5,24.0,28.0,21.0,22.5,80.0,0.0,70.0,30.0,57.0,58.0,57.0,58.0,79.0,81.0,80.0,40.0,42.0,100.0,1560.0,1700.0,2018-05-17,38.0,440.0,841786.0,833464.0,190.0,False,False,4480.0,0.0,47.0,184.0,0.0,47.0,0.156006,99.63,1.49,99.54,1.65,99.58,1.52,99.72,1.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-05-19,4487.0,4487.0,-0.15625,2.456522,6989995295da,2018-05-17,1c39db15c26b,0.0,2018-05-19,6989995295da,1c39db15c26b,use,0.0,0.0,0.0,136.0,,1.12,99.1,2,2,bc7e29194383,1.23,826.0,58.0,58.0,80.1,22.0,2018-05-17 00:00:00+00:00,2018-05-17 00:00:00+00:00,4480.0,0.0,0.0,6989995295da,6989995295da,400.0,26.0,9bec1f36ec0d,2018-05-18,2018-05-19,97.0,4413.5,97.838199,77.0,3503.5,20.618557,98.306218,2018-05-20 06:06:14+00:00,NaT,0.0,NaT,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2891.0,76.0,3458.0,1.298701,98.347826,1.652174,e83198853aa3,1.0,99.530406,36.787088,6.0,2018-05-20 06:06:14+00:00,2018-05-21 05:58:49+00:00,2018-05-21 07:23:37+00:00,,NaT,99.2,28.9,99.5,36.2,7600.0,99.99,52.2,2000.0,1,150.0,25.0,15.0,7.0,1.0,1.0,10.0,2015-09-29 
00:00:00+00:00,4800.0,26.0,0.13,965.0,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,8,1.4646,8.71,40.252401,4.621401,3.165247,0.966058,0.0,116.363636
2,cb031d4b18ff,2018-05-13,4487.0,31.0,250.0,136.0,136.0,134.0,134.0,135.0,134.0,24.0,19.0,19.7,20.6,21.1,21.6,22.0,22.2,22.5,22.6,22.9,57.0,78.0,78.0,78.0,78.0,78.0,76.0,72.0,67.0,2018-05-13,4487.0,2.456522,78.0,134.5,2018-05-17,250.0,4480.0,9.0,False,False,True,False,False,24.0,19.0,2.34,2.36,2.44,2.4,2.48,2.58,2.64,2.46,1.29,0.0,29.75,,0.12,23.23,20.0,1640.0,1652.0,1666.0,1684.0,1668.0,1646.0,1588.0,1638.0,840.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,38.0,30.0,51.0,58.0,57.0,60.0,59.0,50.0,59.0,54.0,59.0,58.0,20.0,79.0,79.2,80.1,79.8,80.2,80.2,80.1,80.2,80.0,0.0,48.0,48.0,46.0,0.0,0.0,0.0,0.0,0.0,9.0,1001.529227,2.3,2.5,24.0,28.0,21.0,22.5,80.0,0.0,70.0,30.0,57.0,58.0,57.0,58.0,79.0,81.0,80.0,40.0,42.0,100.0,1560.0,1700.0,2018-05-17,38.0,440.0,841786.0,833464.0,190.0,False,False,4480.0,0.0,47.0,184.0,0.0,47.0,0.156006,99.63,1.49,99.54,1.65,99.58,1.52,99.72,1.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-05-19,4487.0,4487.0,-0.15625,2.456522,6989995295da,2018-05-17,1c39db15c26b,0.0,2018-05-19,6989995295da,1c39db15c26b,use,0.0,0.0,0.0,136.0,,1.12,99.1,2,2,bc7e29194383,1.23,826.0,58.0,58.0,80.1,22.0,2018-05-17 00:00:00+00:00,2018-05-17 00:00:00+00:00,4480.0,0.0,0.0,6989995295da,6989995295da,400.0,26.0,9bec1f36ec0d,2018-05-18,2018-05-19,97.0,4413.5,97.838199,77.0,3503.5,20.618557,98.306218,2018-05-20 06:15:17+00:00,NaT,0.0,NaT,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2891.0,76.0,3458.0,1.298701,98.347826,1.652174,0c6c47811c04,1.0,99.52332,36.298863,9.0,2018-05-20 06:15:17+00:00,2018-05-21 05:58:58+00:00,2018-05-21 07:23:37+00:00,,NaT,99.2,28.9,99.5,36.2,7600.0,99.99,52.2,2000.0,1,150.0,25.0,15.0,7.0,1.0,1.0,10.0,2015-09-29 
00:00:00+00:00,4800.0,26.0,0.13,965.0,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,8,1.4646,8.71,39.718186,4.560067,3.170593,0.953236,0.0,174.545455
3,cb031d4b18ff,2018-05-13,4487.0,31.0,250.0,136.0,136.0,134.0,134.0,135.0,134.0,24.0,19.0,19.7,20.6,21.1,21.6,22.0,22.2,22.5,22.6,22.9,57.0,78.0,78.0,78.0,78.0,78.0,76.0,72.0,67.0,2018-05-13,4487.0,2.456522,78.0,134.5,2018-05-17,250.0,4480.0,9.0,False,False,True,False,False,24.0,19.0,2.34,2.36,2.44,2.4,2.48,2.58,2.64,2.46,1.29,0.0,29.75,,0.12,23.23,20.0,1640.0,1652.0,1666.0,1684.0,1668.0,1646.0,1588.0,1638.0,840.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,38.0,30.0,51.0,58.0,57.0,60.0,59.0,50.0,59.0,54.0,59.0,58.0,20.0,79.0,79.2,80.1,79.8,80.2,80.2,80.1,80.2,80.0,0.0,48.0,48.0,46.0,0.0,0.0,0.0,0.0,0.0,9.0,727.561883,2.3,2.5,24.0,28.0,21.0,22.5,80.0,0.0,70.0,30.0,57.0,58.0,57.0,58.0,79.0,81.0,80.0,40.0,42.0,100.0,1560.0,1700.0,2018-05-17,38.0,440.0,841786.0,833464.0,190.0,False,False,4480.0,0.0,47.0,184.0,0.0,47.0,0.156006,99.63,1.49,99.54,1.65,99.58,1.52,99.72,1.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-05-19,4487.0,4487.0,-0.15625,2.456522,6989995295da,2018-05-17,1c39db15c26b,0.0,2018-05-19,6989995295da,1c39db15c26b,use,0.0,0.0,0.0,136.0,,1.12,99.1,2,2,bc7e29194383,1.23,826.0,58.0,58.0,80.1,22.0,2018-05-17 00:00:00+00:00,2018-05-17 00:00:00+00:00,4480.0,0.0,0.0,6989995295da,6989995295da,400.0,26.0,9bec1f36ec0d,2018-05-18,2018-05-19,97.0,4413.5,97.838199,77.0,3503.5,20.618557,98.306218,2018-05-20 06:10:46+00:00,NaT,0.0,NaT,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2891.0,76.0,3458.0,1.298701,98.347826,1.652174,6b51542380df,1.0,99.567549,36.045929,12.0,2018-05-20 06:10:46+00:00,2018-05-21 05:59:06+00:00,2018-05-21 07:23:37+00:00,,NaT,99.2,28.9,99.5,36.2,7600.0,99.99,52.2,2000.0,1,150.0,25.0,15.0,7.0,1.0,1.0,10.0,2015-09-29 
00:00:00+00:00,4800.0,26.0,0.13,965.0,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,8,1.4646,8.71,39.441426,4.528292,2.855092,0.946594,0.0,232.727273
4,cb031d4b18ff,2018-05-13,4487.0,31.0,250.0,136.0,136.0,134.0,134.0,135.0,134.0,24.0,19.0,19.7,20.6,21.1,21.6,22.0,22.2,22.5,22.6,22.9,57.0,78.0,78.0,78.0,78.0,78.0,76.0,72.0,67.0,2018-05-13,4487.0,2.456522,78.0,134.5,2018-05-17,250.0,4480.0,9.0,False,False,True,False,False,24.0,19.0,2.34,2.36,2.44,2.4,2.48,2.58,2.64,2.46,1.29,0.0,29.75,,0.12,23.23,20.0,1640.0,1652.0,1666.0,1684.0,1668.0,1646.0,1588.0,1638.0,840.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,38.0,30.0,51.0,58.0,57.0,60.0,59.0,50.0,59.0,54.0,59.0,58.0,20.0,79.0,79.2,80.1,79.8,80.2,80.2,80.1,80.2,80.0,0.0,48.0,48.0,46.0,0.0,0.0,0.0,0.0,0.0,9.0,952.281167,2.3,2.5,24.0,28.0,21.0,22.5,80.0,0.0,70.0,30.0,57.0,58.0,57.0,58.0,79.0,81.0,80.0,40.0,42.0,100.0,1560.0,1700.0,2018-05-17,38.0,440.0,841786.0,833464.0,190.0,False,False,4480.0,0.0,47.0,184.0,0.0,47.0,0.156006,99.63,1.49,99.54,1.65,99.58,1.52,99.72,1.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-05-19,4487.0,4487.0,-0.15625,2.456522,6989995295da,2018-05-17,1c39db15c26b,0.0,2018-05-19,6989995295da,1c39db15c26b,use,0.0,0.0,0.0,136.0,,1.12,99.1,2,2,bc7e29194383,1.23,826.0,58.0,58.0,80.1,22.0,2018-05-17 00:00:00+00:00,2018-05-17 00:00:00+00:00,4480.0,0.0,0.0,6989995295da,6989995295da,400.0,26.0,9bec1f36ec0d,2018-05-18,2018-05-19,97.0,4413.5,97.838199,77.0,3503.5,20.618557,98.306218,2018-05-20 06:24:27+00:00,NaT,0.0,NaT,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2891.0,76.0,3458.0,1.298701,98.347826,1.652174,58df9ba0a603,1.0,99.577392,36.31132,15.0,2018-05-20 06:24:27+00:00,2018-05-21 10:04:52+00:00,2018-05-21 10:40:32+00:00,,NaT,99.2,28.9,99.5,36.2,7600.0,99.99,52.2,2000.0,1,150.0,25.0,15.0,7.0,1.0,1.0,10.0,2015-09-29 
00:00:00+00:00,4800.0,26.0,0.13,965.0,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,8,1.4646,8.71,39.731817,4.561632,2.810373,0.953564,0.0,290.909091


# End result

In [150]:
# Check the new column names.
# list() over df10 produces the labels shown in the output below; they appear
# as 2-tuples such as ('ps', 'ps_datum'), i.e. (prefix, full column name).
# NOTE(review): df10 is presumably a DataFrame with two-level column labels;
# confirm where it is built.
# Alternative kept for reference: filter the flat name list for one prefix.
# [s for s in newNames if 'winding' in s]
list(df10)

[('ps', 'ps'),
 ('ps', 'ps_datum'),
 ('ps', 'ps_beschichtete_rollenlange_m'),
 ('ps', 'ps_c_losung_wt_%'),
 ('ps', 'ps_gap_micro_m'),
 ('ps', 'ps_dicke_as_micro_m'),
 ('ps', 'ps_dicke_1'),
 ('ps', 'ps_dicke_2'),
 ('ps', 'ps_dicke_3'),
 ('ps', 'ps_dicke_4'),
 ('ps', 'ps_dicke_bs'),
 ('ps', 'ps_raum_temperatur_start_c'),
 ('ps', 'ps_auftragsbank_temperatur_start_c'),
 ('ps', 'ps_bad_temperatur_0m_c'),
 ('ps', 'ps_bad_temperatur_500m_c'),
 ('ps', 'ps_bad_temperatur_1000m_c'),
 ('ps', 'ps_bad_temperatur_1500m_c'),
 ('ps', 'ps_bad_temperatur_2000m_c'),
 ('ps', 'ps_bad_temperatur_2500m_c'),
 ('ps', 'ps_bad_temperatur_3000m_c'),
 ('ps', 'ps_bad_temperatur_3500m_c'),
 ('ps', 'ps_bad_temperatur_4000m_c'),
 ('ps', 'ps_raum-feuchtigkeit_start_%'),
 ('ps', 'ps_auftragswerk_feuchtigkeit_0m_%'),
 ('ps', 'ps_auftragswerk_feuchtigkeit_600m_%'),
 ('ps', 'ps_auftragswerk_feuchtigkeit_1200m_%'),
 ('ps', 'ps_auftragswerk_feuchtigkeit_1800m_%'),
 ('ps', 'ps_auftragswerk_feuchtigkeit_2400m_%'),
 ('ps', 'ps_

In [151]:
# Report how many entries `constants` holds, leave some vertical space, and
# then show the mapping itself as the cell's rich output.
# Per the output below, each entry maps a column name to a one-element list
# (presumably the single value that column takes; confirm where it is built).
n_constant_cols = len(constants)
print("{} constant columns".format(n_constant_cols), "\n", sep="\n")

constants

24 constant columns




{'pa_raum_temperatur_start_ref_low_c': [18.0],
 'pa_raum_temperatur_start_ref_high_c': [27.0],
 'pa_temperatur_m-pda-bad_ref_low': [17.0],
 'pa_temperatur_m-pda-bad_ref_high': [22.0],
 'pa_ce-capro_lactam_ref_%': [0.0],
 'pa_ctmc_richtwert_ref_%': [0.12],
 'pa_temperatur_chlorbad_start_ref_c': [20.0],
 'pa_amin-trockner_temperatur_cofely_ref_c': [20.0],
 'pa_staub-sauger_ref_low_0_aus_>0_an': [0.0],
 'ps_c_losung_ref_wt_%': [31.0],
 'ps_gap_ref_low_micro_m': [250.0],
 'ps_gap_ref_high_micro_m': [280.0],
 'ps_dicke_as_ref_low_micro_m': [140.0],
 'ps_dicke_as_ref_high_micro_m': [150.0],
 'ps_raum_temperatur_start_ref_c': [24.0],
 'ps_auftragsbank_temperatur_start_ref_c': [19.0],
 'ps_bad_temperatur_ref_low_c': [17.0],
 'ps_bad_temperatur_ref_high_c': [23.0],
 'ps_raum-feuchtigkeit_start_ref_low_%': [45.0],
 'ps_raum-feuchtigkeit_start_ref_high_%': [100.0],
 'ps_auftragswerk_feuchtigkeit_ref_low_%': [45.0],
 'ps_auftragswerk_feuchtigkeit_ref_high_%': [65.0],
 'component': ['MEB'],
 'p_net

In [152]:
# Print a structural summary (index range, per-column non-null counts, dtypes,
# memory usage) of trashColsdf.
# NOTE(review): presumably this frame holds the columns set aside for removal;
# confirm against the cell that creates it.
trashColsdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118687 entries, 0 to 118686
Data columns (total 3 columns):
qc_pa_beschichtungsjahr    118670 non-null object
ps_lsg                     118348 non-null object
derived_date               118670 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 2.7+ MB


In [153]:
# Display onlyNA. Per the output below it maps column names to [nan];
# presumably these are the columns that contain nothing but missing values
# (confirm against the cell that builds it).
onlyNA

{'qc_entnahme_datum': [nan],
 'quiver_url': [nan],
 'quiver_encoded_query': [nan]}