# Explore and Clean Data

In [18]:
import math as mt

import numpy as np

import pandas as pd

# Enables interactive figures
# import mpld3

%matplotlib notebook
import matplotlib.pyplot as plt

#mpld3.enable_notebook()
plt.rcParams['figure.figsize'] = [9.5, 6]


## First things first: Read data <a name="first_things"></a>

* We read the data from the .csv file using pd.read_csv().
* Then we save it as a "pickled" dataframe using the to_pickle() method, since reading the pickle is faster than always having to parse the .csv file.
* This is done with the following snippet of code; once the pickle file exists, the snippet can be commented out and the data loaded with pd.read_pickle() instead.

In [19]:
# Read the raw data from the csv file.
#
# * `true_values`: some columns have 'x' and 'X', which we consider to be a
#   boolean indicator.
# * `infer_datetime_format` is intentionally not passed: without `parse_dates`
#   it has no effect, and the argument is deprecated since pandas 2.0. The date
#   columns are converted explicitly further down in this notebook.
df = pd.read_csv(
    "./Data/tu_berlin_data_analytics.csv",
    true_values=['x', 'X'],
)

# Warning on columns 159 161 234 256.
# They should be checked!
# NOTE(review): presumably this is pandas' DtypeWarning about mixed types in
# those columns - confirm, then either pass an explicit `dtype=` for them or
# read the file with `low_memory=False`.

# Save data as pickled dataframe, which is faster to read
df.to_pickle("./Data/processData.pkl")

In [20]:
# df = pd.read_pickle("./Data/processData.pkl")

## Check general structure

In [21]:
# Check how many rows (observations) and columns (variables) the data frame has
df.shape
# 118687 rows and 374 columns

(118687, 374)

In [22]:
# Check more detailed information from df
df.info()
# It has 325 float variables, 5 integer variables (some of which might actually be boolean) and 44 string (object) variables

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118687 entries, 0 to 118686
Columns: 374 entries, pa_datum to derived_date
dtypes: float64(325), int64(5), object(44)
memory usage: 338.7+ MB


In [23]:
df.dtypes

pa_datum                                                object
pa_ps_beschichtete_rollenlange_m                       float64
pa_beschichtete_rollenlange_m                          float64
pa_bahn-geschwin-digkeit_m/min                         float64
pa_badwechsel_m-pda                                    float64
pa_badwechsel_hw1                                      float64
pa_badwechsel_hw2                                      float64
pa_badwechsel_chlor                                    float64
pa_badwechsel_hw3                                      float64
pa_raum_temperatur_start_c                             float64
pa_temperatur_m-pda-bad_c                              float64
pa_cm-pda_0m                                           float64
pa_cm-pda_500m                                         float64
pa_cm-pda_1000m                                        float64
pa_cm-pda_1500m                                        float64
pa_cm-pda_2000m                                        

In [24]:
df.head()

Unnamed: 0,pa_datum,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwin-digkeit_m/min,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_raum_temperatur_start_c,...,f_danfugt_as_median,f_codfely_median,temp_median,ps_f_auftragswerk_median,ps_dicke_median,reaction_start,reaction_end,quiver_url,quiver_encoded_query,derived_date
0,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01
1,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01
2,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01
3,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01
4,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,,,2018-05-01


## Going a bit deeper

In [25]:
# Check variable (column) names in the df
variables = list(df.columns)
# variables

In [52]:
# Save the variable names to an Excel file

# pd.DataFrame(variables).to_excel("output.xlsx")


Get all columns with constant values:

In [26]:
# A column counts as constant when every entry equals the entry in the first row.
# NaN never compares equal to NaN, so columns containing NaN are not selected here.
# https://stackoverflow.com/questions/20209600/pandas-dataframe-remove-constant-column
first_row = df.iloc[0]
is_constant = (df == first_row).all()
df_constant = df.loc[:, is_constant]

# Map each constant column name to a one-element list holding its value
constants = df_constant.iloc[:1].to_dict(orient='list')

In [27]:
constants

{'pa_raum_temperatur_start_ref_low_c': [18.0],
 'pa_raum_temperatur_start_ref_high_c': [27.0],
 'pa_temperatur_m-pda-bad_ref_low': [17.0],
 'pa_temperatur_m-pda-bad_ref_high': [22.0],
 'pa_ce-capro_lactam_ref_%': [0.0],
 'pa_ctmc_richtwert_ref_%': [0.12],
 'pa_temperatur_chlorbad_start_ref_c': [20.0],
 'pa_amin-trockner_temperatur_cofely_ref_c': [20.0],
 'pa_staub-sauger_ref_low_0_aus_>0_an': [0.0],
 'ps_c_losung_ref_wt_%': [31.0],
 'ps_gap_ref_low_micro_m': [250.0],
 'ps_gap_ref_high_micro_m': [280.0],
 'ps_dicke_as_ref_low_micro_m': [140.0],
 'ps_dicke_as_ref_high_micro_m': [150.0],
 'ps_raum_temperatur_start_ref_c': [24.0],
 'ps_auftragsbank_temperatur_start_ref_c': [19.0],
 'ps_bad_temperatur_ref_low_c': [17.0],
 'ps_bad_temperatur_ref_high_c': [23.0],
 'ps_raum-feuchtigkeit_start_ref_low_%': [45.0],
 'ps_raum-feuchtigkeit_start_ref_high_%': [100.0],
 'ps_auftragswerk_feuchtigkeit_ref_low_%': [45.0],
 'ps_auftragswerk_feuchtigkeit_ref_high_%': [65.0],
 'component': ['MEB'],
 'p_net

And let's get all the remaining variables and find which columns have only NA's:

In [28]:
# Keep the columns in which at least one entry differs from the first row.
# NaN != NaN evaluates to True, so columns containing NaN are kept as well.
differs_from_first_row = (df != df.iloc[0]).any()
df2 = df.loc[:, differs_from_first_row]

In [29]:
# Boolean Series indexed by column name: True where every entry of the column is missing
mask = df2.isna().all()

# Select those columns with the boolean mask and record them as
# {column name: [value in the first row]}
# https://stackoverflow.com/questions/29281815/pandas-select-dataframe-columns-using-boolean
onlyNA = df2.loc[:, mask].head(1).to_dict(orient = 'list')
onlyNA

{'qc_entnahme_datum': [nan],
 'quiver_url': [nan],
 'quiver_encoded_query': [nan]}

In [30]:
# Remove every column that consists of nothing but NA's
df3 = df2.dropna(axis = "columns", how = "all")
df3.head()

Unnamed: 0,pa_datum,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwin-digkeit_m/min,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_raum_temperatur_start_c,...,chlor_con_median,f_danfugt_bs_median,f_danfugt_as_median,f_codfely_median,temp_median,ps_f_auftragswerk_median,ps_dicke_median,reaction_start,reaction_end,derived_date
0,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
1,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
2,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
3,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
4,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01


Let's check the 43 variables with dtype object:

In [93]:
df3.select_dtypes(object).head()
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.select_dtypes.html

Unnamed: 0,pa_datum,ps,ps_datum,nr,ps_date_coating,ps_lsg,pa_date_coating,pa_bad-wechsel_mpda_nach,pa_bad-wechsel_chlor_nach,qt_datum,...,qc_datum_leak_test_values,qc_datum_product_properties,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr,reaction_start,reaction_end,derived_date
0,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
1,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
2,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
3,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
4,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,,,2018-05-19,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01


Apparently columns "pa_bad-wechsel_mpda_nach", "pa_bad-wechsel_chlor_nach", "qc_datum_leak_test_values" are also full of NA's:

In [32]:
df3.loc[:,['pa_bad-wechsel_mpda_nach', "pa_bad-wechsel_chlor_nach", "qc_datum_leak_test_values"]].describe()

Unnamed: 0,pa_bad-wechsel_mpda_nach,pa_bad-wechsel_chlor_nach,qc_datum_leak_test_values
count,34075,34092,90978
unique,1,1,22521
top,True,True,2019-06-01T01:10:07.000Z
freq,34075,34092,663


We can't drop these columns!
Let's instead change the NA's to False:

In [45]:
# Flag-like columns: an entry is either set or missing
# (the describe() output above shows a single unique value, True, for each).
flag_cols = ['pa_bad-wechsel_mpda_nach', 'pa_bad-wechsel_chlor_nach']

# Treat a missing flag as False. The explicit astype(bool) makes the resulting
# dtype independent of fillna's implicit downcasting of object columns, which
# is deprecated in recent pandas versions.
df4 = (
    df3
    .fillna(value = {col: False for col in flag_cols})
    .astype({col: bool for col in flag_cols})
)

In [50]:
df4.loc[:,['pa_bad-wechsel_mpda_nach', 'pa_bad-wechsel_chlor_nach']].dtypes


pa_bad-wechsel_mpda_nach     bool
pa_bad-wechsel_chlor_nach    bool
dtype: object

In [122]:
onlyNA

{'qc_entnahme_datum': [nan],
 'quiver_url': [nan],
 'quiver_encoded_query': [nan],
 'pa_bad-wechsel_mpda_nach': array([nan, 'x', 'X'], dtype=object),
 'pa_bad-wechsel_chlor_nach': array([nan, 'x'], dtype=object)}

In [123]:
df4.head()

Unnamed: 0,pa_datum,pa_ps_beschichtete_rollenlange_m,pa_beschichtete_rollenlange_m,pa_bahn-geschwin-digkeit_m/min,pa_badwechsel_m-pda,pa_badwechsel_hw1,pa_badwechsel_hw2,pa_badwechsel_chlor,pa_badwechsel_hw3,pa_raum_temperatur_start_c,...,chlor_con_median,f_danfugt_bs_median,f_danfugt_as_median,f_codfely_median,temp_median,ps_f_auftragswerk_median,ps_dicke_median,reaction_start,reaction_end,derived_date
0,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
1,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
2,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
3,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
4,2018-05-17,250.0,4480.0,9.0,0.0,0.0,1.0,0.0,0.0,24.0,...,826.0,58.0,58.0,80.1,22.0,78.0,134.5,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01


In [51]:
df4.select_dtypes(object).head()

Unnamed: 0,pa_datum,ps,ps_datum,nr,ps_date_coating,ps_lsg,pa_date_coating,qt_datum,pp_product_short_name,pp_plan_actual_date_coating,...,qc_datum_leak_test_values,qc_datum_product_properties,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr,reaction_start,reaction_end,derived_date
0,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
1,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
2,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
3,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01
4,2018-05-17,cb031d4b18ff,2018-05-13,2891,2018-05-13,230,2018-05-17,2018-05-19,6989995295da,2018-05-17,...,,2015-09-29T00:00:00.000Z,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W,2018-05-17T00:00:00.000Z,2018-05-17T00:00:00.000Z,2018-05-01


In [55]:
pd.to_datetime(df4.qc_datum_product_properties).head()

0   2015-09-29 00:00:00+00:00
1   2015-09-29 00:00:00+00:00
2   2015-09-29 00:00:00+00:00
3   2015-09-29 00:00:00+00:00
4   2015-09-29 00:00:00+00:00
Name: qc_datum_product_properties, dtype: datetime64[ns, UTC]

## Change objects to datetime
Now we should transform the data into the appropriate type:

In [76]:
# Work on a copy: a plain `df5 = df4` only creates a second name for the same
# object, so the conversions below would silently rewrite df4 as well.
df5 = df4.copy()

for col in df4.select_dtypes(object).columns:
    try:
        converted = pd.to_datetime(df4[col])
    except (ValueError, TypeError, OverflowError):
        # Column cannot be parsed as dates (e.g. IDs or product codes):
        # leave it unchanged as object dtype.
        continue
    # Assign the whole column instead of `.loc[:, col] = ...`, so the column
    # takes the datetime dtype rather than being written into the existing
    # object-dtype column.
    df5[col] = converted

In [80]:
df5.select_dtypes(object).head()

Unnamed: 0,ps,nr,ps_lsg,pp_product_short_name,pp_plan_product,pp_actual_product_short_name,pp_actual_product,pp_actual_usage,winding_product_short_name,winding_product_type,winding_product_line,pa_tmc_gehalt_in_percentage,pa_ref,qc_serien_nummer,qc_barcode_leak_test_values,p_product,p_product_full_name,p_product_group,p_product_type,qc_pa_beschichtungsjahr
0,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,6f5dd5e75de0,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
1,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,e83198853aa3,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
2,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,0c6c47811c04,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
3,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,6b51542380df,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W
4,cb031d4b18ff,2891,230,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,6989995295da,6989995295da,9bec1f36ec0d,,bc7e29194383,58df9ba0a603,,1c39db15c26b,b3ccc5e5f9b5,4d6474389c69,6989995295da,W


In [87]:
df5.select_dtypes(['datetime', 'datetimetz']).iloc[50001:50006,:]

Unnamed: 0,pa_datum,ps_datum,ps_date_coating,pa_date_coating,qt_datum,pp_plan_actual_date_coating,pp_plan_end_date_winding,windung_begin_date,winding_end_date,qc_erfassungs_datum,...,qc_einlager_datum,sc_d_datum,sc_datum_generate,sc_l_datum_auto,sc_l_datum_hand,qc_datum_leak_test_values,qc_datum_product_properties,reaction_start,reaction_end,derived_date
50001,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:18:49+00:00,...,2018-11-16 13:07:59+00:00,2018-11-15 12:18:49+00:00,2018-11-15 06:24:36+00:00,2018-11-15 06:24:36+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50002,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:18:45+00:00,...,2018-11-16 15:34:10+00:00,2018-11-15 12:18:45+00:00,2018-11-15 06:29:04+00:00,2018-11-15 06:29:04+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50003,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:25:10+00:00,...,2018-11-16 13:07:59+00:00,2018-11-15 12:25:10+00:00,2018-11-15 06:26:50+00:00,2018-11-15 06:26:50+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50004,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:25:06+00:00,...,2018-11-16 15:34:10+00:00,2018-11-15 12:25:06+00:00,2018-11-15 06:33:32+00:00,2018-11-15 06:33:32+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50005,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:30:15+00:00,...,2018-11-16 19:51:32+00:00,2018-11-15 12:30:15+00:00,2018-11-15 06:31:18+00:00,2018-11-15 06:31:18+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01


In [90]:
df5.sc_l_datum_hand.describe() 
# Check whether this column has only missing data

count                            76
unique                           71
top       2019-09-24 19:08:53+00:00
freq                              2
first     2018-06-07 10:04:35+00:00
last      2019-12-05 22:18:26+00:00
Name: sc_l_datum_hand, dtype: object

The columns sc_datum_generate, sc_l_datum_auto, sc_l_datum_hand and qc_datum_leak_test_values have a lot of NA's. sc_l_datum_hand seems to have only NA's, but it actually has some data (76 non-missing values)!

In [94]:
# Save data for future work in pickled format
df5.to_pickle("./Data/process_data_df5.pkl")

Select all columns containing "date", "datum", "start" or "end" in their labels:


In [91]:
# def searchInLabel(stringList, df):
#     ''''''#Checks whether there is any "stringToCheck" among the variables (column names)
#           # print(any(stringToCheck in s for s in variables)) 
        
#     variables = list(df.columns)
        
#     #Get all the items containing string
#     matching = []
    
#     for string in stringList:
#         matching = matching + [s for s in variables if string in s]

#     return matching

# dateVars = searchInLabel(['date','datum'],df4) + ['reaction_start','reaction_end']

# print(*dateVars,sep ='\n')

# df4.loc[:,dateVars]

