In [1]:
import math as mt

import numpy as np

import pandas as pd

# Enables interactive figures
# import mpld3

%matplotlib notebook
import matplotlib.pyplot as plt

#mpld3.enable_notebook()
plt.rcParams['figure.figsize'] = [9.5, 6]


## First things first: Read data

* We read the data in the .csv file using pd.read_csv()
* Then we saved it as a "picled" dataframe using the to_pickle() method. Since it is faster than always having to read the .csv file
* This was done with the following snippet of code, which is commented now.

In [7]:
# Read data from csv file
df = pd.read_csv("./Data/tu_berlin_data_analytics.csv",
                 parse_dates=True,
                 infer_datetime_format=True,
                )

# Warning on columns 159 161 234 2 and 56. 
# They should be checked!


# Save data as picled dataframe. Which is faster to read
df.to_pickle("./Data/processData.pkl")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df = pd.read_pickle("./Data/processData.pkl")

## Check general structure

In [8]:
# Check how many rows (observations) and columns (variables) the data frame has
df.shape
# 118687 rows and 374 columns

(118687, 374)

In [9]:
# Check more detailed information from df
df.info()
# It has 325 float variables, 5 integers, of wich some might be actually boolean and 44 strings

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118687 entries, 0 to 118686
Columns: 374 entries, pa_datum to derived_date
dtypes: float64(325), int64(5), object(44)
memory usage: 338.7+ MB


In [10]:
df.head()

Unnamed: 0,pa_datum,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwin-digkeit_m/min,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_raum_temperatur_start_c,...,f_danfugt_as_median,f_codfely_median,temp_median,ps_f_auftragswerk_median,ps_dicke_median,reaction_start,reaction_end,quiver_url,quiver_encoded_query,derived_date
0,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01
1,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01
2,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01
3,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01
4,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01


## Going a bit deeper

In [12]:
# Check variable (column) names in the df
variables = list(df.columns)
# variables

In [52]:
# Saves variable names in a excel file 

# pd.DataFrame(variables).to_excel("output.xlsx")


Get all columns with of constant values:

In [73]:
df_constant = df.loc[:, (df == df.iloc[0]).all()]
# https://stackoverflow.com/questions/20209600/pandas-dataframe-remove-constant-column

constants = df_constant.head(1).to_dict(orient = 'list')

In [89]:
constants

{'pa_raum_temperatur_start_ref_low_c': [18.0],
 'pa_raum_temperatur_start_ref_high_c': [27.0],
 'pa_temperatur_m-pda-bad_ref_low': [17.0],
 'pa_temperatur_m-pda-bad_ref_high': [22.0],
 'pa_ce-capro_lactam_ref_%': [0.0],
 'pa_ctmc_richtwert_ref_%': [0.12],
 'pa_temperatur_chlorbad_start_ref_c': [20.0],
 'pa_amin-trockner_temperatur_cofely_ref_c': [20.0],
 'pa_staub-sauger_ref_low_0_aus_>0_an': [0.0],
 'ps_c_losung_ref_wt_%': [31.0],
 'ps_gap_ref_low_micro_m': [250.0],
 'ps_gap_ref_high_micro_m': [280.0],
 'ps_dicke_as_ref_low_micro_m': [140.0],
 'ps_dicke_as_ref_high_micro_m': [150.0],
 'ps_raum_temperatur_start_ref_c': [24.0],
 'ps_auftragsbank_temperatur_start_ref_c': [19.0],
 'ps_bad_temperatur_ref_low_c': [17.0],
 'ps_bad_temperatur_ref_high_c': [23.0],
 'ps_raum-feuchtigkeit_start_ref_low_%': [45.0],
 'ps_raum-feuchtigkeit_start_ref_high_%': [100.0],
 'ps_auftragswerk_feuchtigkeit_ref_low_%': [45.0],
 'ps_auftragswerk_feuchtigkeit_ref_high_%': [65.0],
 'component': ['MEB'],
 'p_net

And let's get all the other variables and find which columns have only NA's:

In [84]:
df2 = df.loc[:, (df != df.iloc[0]).any()]

array([250., 280.])

In [105]:
mask = df2.isna().all()

type(mask)

df2.loc[:,mask].head(1)
# https://stackoverflow.com/questions/29281815/pandas-select-dataframe-columns-using-boolean
onlyNA = df2.loc[:,mask].head(1).to_dict(orient = 'list')
onlyNA

{'qc_entnahme_datum': [nan],
 'quiver_url': [nan],
 'quiver_encoded_query': [nan]}

In [106]:
df3 = df2.dropna(axis = 1, how = "all")
df3.head()

Unnamed: 0,pa_datum,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwin-digkeit_m/min,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_raum_temperatur_start_c,...,chlor_con_median,f_danfugt_bs_median,f_danfugt_as_median,f_codfely_median,temp_median,ps_f_auftragswerk_median,ps_dicke_median,reaction_start,reaction_end,derived_date
0,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
1,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
2,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
3,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
4,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01


Let's check the 44 variables with dtype object:

In [107]:
df3.select_dtypes(object)
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.select_dtypes.html

Unnamed: 0,pa_datum,ps,ps_datum,nr,ps_date_coating,ps_lsg,pa_date_coating,pa_bad-wechsel_mpda_nach,pa_bad-wechsel_chlor_nach,qt_datum,...,qc_datum_leak_test_values,qc_datum_product_properties,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr,reaction_start,reaction_end,derived_date
0,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
1,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
2,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
3,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
4,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
5,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
6,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
7,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
8,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
9,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01


Apparently columns "pa_bad-wechsel_mpda_nach", "pa_bad-wechsel_chlor_nach", "qc_datum_leak_test_values" are also full of NA's:

In [113]:
df3.loc[:,['pa_bad-wechsel_mpda_nach', "pa_bad-wechsel_chlor_nach", "qc_datum_leak_test_values"]].describe()

Unnamed: 0,pa_bad-wechsel_mpda_nach,pa_bad-wechsel_chlor_nach,qc_datum_leak_test_values
count,34075,34092,90978
unique,2,1,22521
top,x,x,2019-06-01T01:10:07.000Z
freq,34000,34092,663


We can also drop these columns:

In [121]:
print(df3.loc[:,'pa_bad-wechsel_mpda_nach'].unique())
print(df3.loc[:,'pa_bad-wechsel_chlor_nach'].unique())

li = ['pa_bad-wechsel_mpda_nach','pa_bad-wechsel_chlor_nach']

for l in li:
    onlyNA[l] = df3.loc[:,l].unique()
    
df4 = df3.drop(li, axis = 1)

[nan 'x' 'X']
[nan 'x']


In [122]:
onlyNA

{'qc_entnahme_datum': [nan],
 'quiver_url': [nan],
 'quiver_encoded_query': [nan],
 'pa_bad-wechsel_mpda_nach': array([nan, 'x', 'X'], dtype=object),
 'pa_bad-wechsel_chlor_nach': array([nan, 'x'], dtype=object)}

In [123]:
df4.head()

Unnamed: 0,pa_datum,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwin-digkeit_m/min,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_raum_temperatur_start_c,...,chlor_con_median,f_danfugt_bs_median,f_danfugt_as_median,f_codfely_median,temp_median,ps_f_auftragswerk_median,ps_dicke_median,reaction_start,reaction_end,derived_date
0,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
1,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
2,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
3,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
4,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01


In [124]:
df4.select_dtypes(object)

Unnamed: 0,pa_datum,ps,ps_datum,nr,ps_date_coating,ps_lsg,pa_date_coating,qt_datum,pp_product_short_name,pp_plan_actual_date_coating,...,qc_datum_leak_test_values,qc_datum_product_properties,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr,reaction_start,reaction_end,derived_date
0,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
1,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
2,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
3,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
4,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
5,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
6,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
7,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
8,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
9,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01


Select all columns containing "date", "datum", "start" or "end" in its label:


[Index(['pa_datum', 'pa_ps_beschichtete_rollenlange_m',
        'pa_beschichtete_rollenlange_m', 'pa_bahn-geschwin-digkeit_m/min',
        'pa_badwechsel_m-pda', 'pa_badwechsel_hw1', 'pa_badwechsel_hw2',
        'pa_badwechsel_chlor', 'pa_badwechsel_hw3',
        'pa_raum_temperatur_start_c',
        ...
        'chlor_con_median', 'f_danfugt_bs_median', 'f_danfugt_as_median',
        'f_codfely_median', 'temp_median', 'ps_f_auftragswerk_median',
        'ps_dicke_median', 'reaction_start', 'reaction_end', 'derived_date'],
       dtype='object', length=345)]

In [136]:
def searchInLabel(stringList, df):
    ''''''#Checks whter there is any "stringToCheck" among the variables (column names)
          # print(any(stringToCheck in s for s in variables)) 
        
    variables = list(df.columns)
        
    #Get all the items containing string
    matching = []
    
    for string in stringList:
        matching = matching + [s for s in variables if string in s]

    return matching

dateVars = searchInLabel(['date','datum'],df4) + ['reaction_start','reaction_end']

print(*dateVars,sep ='\n')

df4.loc[:,dateVars]



ps_date_coating
pa_date_coating
pp_plan_actual_date_coating
pp_plan_end_date_winding
windung_begin_date
winding_end_date
derived_date
pa_datum
ps_datum
qt_datum
qc_erfassungs_datum
qc_verpackungs_datum
qc_einlager_datum
sc_d_datum
sc_datum_generate
sc_l_datum_auto
sc_l_datum_hand
qc_datum_leak_test_values
qc_datum_product_properties
reaction_start
reaction_end


Unnamed: 0,ps_date_coating,pa_date_coating,pp_plan_actual_date_coating,pp_plan_end_date_winding,windung_begin_date,winding_end_date,derived_date,pa_datum,ps_datum,qt_datum,...,qc_verpackungs_datum,qc_einlager_datum,sc_d_datum,sc_datum_generate,sc_l_datum_auto,sc_l_datum_hand,qc_datum_leak_test_values,qc_datum_product_properties,reaction_start,reaction_end
0,2018-05-13,2018-05-17,2018-05-17,2018-05-19,2018-05-18,2018-05-19,2018-05-01,2018-05-17,2018-05-13,2018-05-19,...,2018-05-21T05:58:53.000Z,2018-05-21T07:23:37.000Z,2018-05-20T06:10:44.000Z,,,,,2015-09-29T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z
1,2018-05-13,2018-05-17,2018-05-17,2018-05-19,2018-05-18,2018-05-19,2018-05-01,2018-05-17,2018-05-13,2018-05-19,...,2018-05-21T05:58:49.000Z,2018-05-21T07:23:37.000Z,2018-05-20T06:06:14.000Z,,,,,2015-09-29T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z
2,2018-05-13,2018-05-17,2018-05-17,2018-05-19,2018-05-18,2018-05-19,2018-05-01,2018-05-17,2018-05-13,2018-05-19,...,2018-05-21T05:58:58.000Z,2018-05-21T07:23:37.000Z,2018-05-20T06:15:17.000Z,,,,,2015-09-29T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z
3,2018-05-13,2018-05-17,2018-05-17,2018-05-19,2018-05-18,2018-05-19,2018-05-01,2018-05-17,2018-05-13,2018-05-19,...,2018-05-21T05:59:06.000Z,2018-05-21T07:23:37.000Z,2018-05-20T06:10:46.000Z,,,,,2015-09-29T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z
4,2018-05-13,2018-05-17,2018-05-17,2018-05-19,2018-05-18,2018-05-19,2018-05-01,2018-05-17,2018-05-13,2018-05-19,...,2018-05-21T10:04:52.000Z,2018-05-21T10:40:32.000Z,2018-05-20T06:24:27.000Z,,,,,2015-09-29T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z
5,2018-05-13,2018-05-17,2018-05-17,2018-05-19,2018-05-18,2018-05-19,2018-05-01,2018-05-17,2018-05-13,2018-05-19,...,2018-05-21T05:59:11.000Z,2018-05-21T07:23:37.000Z,2018-05-20T06:15:18.000Z,,,,,2015-09-29T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z
6,2018-05-13,2018-05-17,2018-05-17,2018-05-19,2018-05-18,2018-05-19,2018-05-01,2018-05-17,2018-05-13,2018-05-19,...,2018-05-21T10:05:05.000Z,2018-05-21T10:40:32.000Z,2018-05-20T06:19:49.000Z,,,,,2015-09-29T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z
7,2018-05-13,2018-05-17,2018-05-17,2018-05-19,2018-05-18,2018-05-19,2018-05-01,2018-05-17,2018-05-13,2018-05-19,...,2018-05-21T10:05:06.000Z,2018-05-21T14:29:36.000Z,2018-05-20T06:24:26.000Z,,,,,2015-09-29T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z
8,2018-05-13,2018-05-17,2018-05-17,2018-05-19,2018-05-18,2018-05-19,2018-05-01,2018-05-17,2018-05-13,2018-05-19,...,2018-05-21T05:58:55.000Z,2018-05-21T07:23:37.000Z,2018-05-20T06:19:50.000Z,,,,,2015-09-29T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z
9,2018-05-13,2018-05-17,2018-05-17,2018-05-19,2018-05-18,2018-05-19,2018-05-01,2018-05-17,2018-05-13,2018-05-19,...,2018-05-21T10:04:55.000Z,2018-05-21T10:40:32.000Z,2018-05-20T06:33:18.000Z,,,,,2015-09-29T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z


sc_datum_generate, sc_l_datum_auto, sc_l_datum_hand, qc_datum_leak_test_values have a lot of NA's and sc_l_datum_hand seems to have only NA's, but it actually has some data:

In [138]:
df4.sc_l_datum_hand.describe()

count                           76
unique                          71
top       2019-09-24T19:11:07.000Z
freq                             2
Name: sc_l_datum_hand, dtype: object

Now we should transform the data into the appropriate type

# Check for errors in data 

Let's start from the beginning. Let's analyse the first column.
I analysed it with head(), tail() and info() methods.

Found: "pa_datum" seems to be fine. It is a string variable so we should change it to datatype "date". As well as all the other "datum" columns.


In [13]:
stringToCheck = 'date'


#Checks whter there is any "stringToCheck" among the variables (column names)
# print(any(stringToCheck in s for s in variables))

#Get all the items containing "stringToCheck"
matching = [s for s in variables if stringToCheck in s]

# print('\n')

print(*matching, sep = '\n')

print('\n')

print('There are {} variables containing the string "{}"'.format(len(matching),stringToCheck))

print(df[["pa_datum"]].dtypes)

ps_date_coating
pa_date_coating
pp_plan_actual_date_coating
pp_plan_end_date_winding
windung_begin_date
winding_end_date
derived_date


There are 7 variables containing the string "date"
pa_datum    object
dtype: object


In [188]:
df[matching].head()



Unnamed: 0,pa_datum,ps_datum,qt_datum,qc_erfassungs_datum,qc_verpackungs_datum,qc_einlager_datum,qc_entnahme_datum,sc_d_datum,sc_datum_generate,sc_l_datum_auto,sc_l_datum_hand,qc_datum_leak_test_values,qc_datum_product_properties
0,2018-05-17,2018-05-13,2018-05-19,2018-05-20T06:10:44.000Z,2018-05-21T05:58:53.000Z,2018-05-21T07:23:37.000Z,,2018-05-20T06:10:44.000Z,,,,,2015-09-29T00:00:00.000Z
1,2018-05-17,2018-05-13,2018-05-19,2018-05-20T06:06:14.000Z,2018-05-21T05:58:49.000Z,2018-05-21T07:23:37.000Z,,2018-05-20T06:06:14.000Z,,,,,2015-09-29T00:00:00.000Z
2,2018-05-17,2018-05-13,2018-05-19,2018-05-20T06:15:17.000Z,2018-05-21T05:58:58.000Z,2018-05-21T07:23:37.000Z,,2018-05-20T06:15:17.000Z,,,,,2015-09-29T00:00:00.000Z
3,2018-05-17,2018-05-13,2018-05-19,2018-05-20T06:10:46.000Z,2018-05-21T05:59:06.000Z,2018-05-21T07:23:37.000Z,,2018-05-20T06:10:46.000Z,,,,,2015-09-29T00:00:00.000Z
4,2018-05-17,2018-05-13,2018-05-19,2018-05-20T06:24:27.000Z,2018-05-21T10:04:52.000Z,2018-05-21T10:40:32.000Z,,2018-05-20T06:24:27.000Z,,,,,2015-09-29T00:00:00.000Z


In [144]:
df[matching].tail()

Unnamed: 0,pa_datum,ps_datum,qt_datum,qc_erfassungs_datum,qc_verpackungs_datum,qc_einlager_datum,qc_entnahme_datum,sc_d_datum,sc_datum_generate,sc_l_datum_auto,sc_l_datum_hand,qc_datum_leak_test_values,qc_datum_product_properties
118682,2018-05-18,2018-05-14,2018-05-19,2018-05-20T15:58:36.000Z,2018-05-21T11:12:34.000Z,2018-05-21T14:29:36.000Z,,2018-05-20T15:58:36.000Z,,,,,2015-09-29T00:00:00.000Z
118683,2018-05-18,2018-05-14,2018-05-19,2018-05-20T15:53:57.000Z,2018-05-21T11:12:31.000Z,2018-05-21T14:29:36.000Z,,2018-05-20T15:53:57.000Z,,,,,2015-09-29T00:00:00.000Z
118684,2018-05-18,2018-05-14,2018-05-19,2018-05-20T15:53:58.000Z,2018-05-21T11:12:28.000Z,2018-05-21T14:29:36.000Z,,2018-05-20T15:53:58.000Z,,,,,2015-09-29T00:00:00.000Z
118685,2018-05-18,2018-05-14,2018-05-19,2018-05-20T15:49:27.000Z,2018-05-21T13:57:17.000Z,2018-05-21T14:29:36.000Z,,2018-05-20T15:49:27.000Z,,,,,2015-09-29T00:00:00.000Z
118686,2018-05-18,2018-05-14,2018-05-19,2018-05-20T15:49:29.000Z,2018-05-21T13:57:15.000Z,2018-05-21T14:29:36.000Z,,2018-05-20T15:49:29.000Z,,,,,2015-09-29T00:00:00.000Z


In [172]:
df_Dates = df[matching]
df_Dates.qc_entnahme_datum.unique()

x = float('nan')
print(df_Dates.isna().any())
# Most columns contain NaN
print(df_Dates.isnull().all())
# qc_entnahme_datum contains only NaN => can be safely dumped 

pa_datum                       False
ps_datum                       False
qt_datum                        True
qc_erfassungs_datum             True
qc_verpackungs_datum            True
qc_einlager_datum               True
qc_entnahme_datum               True
sc_d_datum                      True
sc_datum_generate               True
sc_l_datum_auto                 True
sc_l_datum_hand                 True
qc_datum_leak_test_values       True
qc_datum_product_properties     True
dtype: bool
pa_datum                       False
ps_datum                       False
qt_datum                       False
qc_erfassungs_datum            False
qc_verpackungs_datum           False
qc_einlager_datum              False
qc_entnahme_datum               True
sc_d_datum                     False
sc_datum_generate              False
sc_l_datum_auto                False
sc_l_datum_hand                False
qc_datum_leak_test_values      False
qc_datum_product_properties    False
dtype: bool


# Check the data

In [112]:
df.pa_datum.tail()

118682    2018-05-18
118683    2018-05-18
118684    2018-05-18
118685    2018-05-18
118686    2018-05-18
Name: pa_datum, dtype: object

In [32]:
df.iloc[:,[0]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118687 entries, 0 to 118686
Data columns (total 1 columns):
pa_datum    118687 non-null object
dtypes: object(1)
memory usage: 927.3+ KB


In [173]:
print(df.columns)

Index(['pa_datum', 'pa_ps_beschichtete_rollenlange_m',
       'pa_beschichtete_rollenlange_m', 'pa_bahn-geschwin-digkeit_m/min',
       'pa_badwechsel_m-pda', 'pa_badwechsel_hw1', 'pa_badwechsel_hw2',
       'pa_badwechsel_chlor', 'pa_badwechsel_hw3',
       'pa_raum_temperatur_start_c',
       ...
       'f_danfugt_as_median', 'f_codfely_median', 'temp_median',
       'ps_f_auftragswerk_median', 'ps_dicke_median', 'reaction_start',
       'reaction_end', 'quiver_url', 'quiver_encoded_query', 'derived_date'],
      dtype='object', length=374)


In [187]:
df.qc_serien_nummer.duplicated()

0         False
1         False
2         False
3         False
4         False
5         False
6         False
7         False
8         False
9         False
10        False
11        False
12        False
13        False
14        False
15        False
16        False
17        False
18        False
19        False
20        False
21        False
22        False
23        False
24        False
25        False
26        False
27        False
28        False
29        False
          ...  
118657    False
118658    False
118659    False
118660    False
118661    False
118662    False
118663    False
118664    False
118665    False
118666    False
118667    False
118668    False
118669    False
118670    False
118671    False
118672    False
118673    False
118674    False
118675    False
118676    False
118677    False
118678    False
118679    False
118680    False
118681    False
118682    False
118683    False
118684    False
118685    False
118686    False
Name: qc_serien_nummer, 

In [193]:
df[df['qc_serien_nummer'].duplicated()].shape

(824, 374)

In [199]:
823/118685

0.006934321944643384

In [200]:
df.head(1)

Unnamed: 0,pa_datum,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwin-digkeit_m/min,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_raum_temperatur_start_c,...,f_danfugt_as_median,f_codfely_median,temp_median,ps_f_auftragswerk_median,ps_dicke_median,reaction_start,reaction_end,quiver_url,quiver_encoded_query,derived_date
0,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01
