# Explore and Clean Data 2

In [6]:
import math as mt

import numpy as np

import pandas as pd

import mpu.io

# Optional alternative for interactive figures via mpld3 (currently disabled)
# import mpld3

# Select matplotlib's interactive "notebook" backend for figures rendered in this notebook
%matplotlib notebook
import matplotlib.pyplot as plt

# Disabled together with the mpld3 import above
#mpld3.enable_notebook()
# Default size of every new figure, as [width, height] in inches
plt.rcParams['figure.figsize'] = [9.5, 6]

## Import previously saved data

The data was saved in a pickled format. Let's re-import it.

The dataframe "df5": 
- consists of all the Features (=columns, =variables) that are not constant and that are not all NA
- it has 347 columns (27 were dropped)
- the datetime and datetimetz variables were read with the correct dtype

The dictionary "constants":
- consists of all columns that are actually constant

In [16]:
# Load the previously saved DataFrame from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
df5 = pd.read_pickle('./Data/process_data_df5.pkl')
# Load the dictionary of constant columns.
# Presumably mpu.io.read selects the JSON parser from the file extension; TODO confirm.
constants = mpu.io.read('./Data/process_constants.json')

## Divide and conquer
Let's check the different dtypes: 

In [34]:
# List each distinct column dtype of df5 on its own line.
print('\n'.join(map(str, df5.dtypes.unique())))

datetime64[ns]
float64
object
bool
int64
datetime64[ns, UTC]


Issues:
- I believe we can trust that the 'datetime' dtypes were correctly recognized.
- Some 'float' variables might actually be 'booleans' or 'categorical'
- Some 'object' variables might actually also be better interpreted as 'categorical'
- We should check what the 'integer' variables are

Minor issues:
- Variable names (column names) should be consistent (either all in German or all in English?) (e.g. '_%' instead of '_in_percentage', '_datum' instead of '_date')
- Be careful with typos, e.g. 'windung' instead of 'winding'

### constants
'constants' provides important information about reference values:

In [35]:
# Bare last-line expression: the notebook displays the contents of the `constants` dict
constants

{'component': ['MEB'],
 'p_netting': [0],
 'pa_amin-trockner_temperatur_cofely_ref_c': [20.0],
 'pa_ce-capro_lactam_ref_%': [0.0],
 'pa_ctmc_richtwert_ref_%': [0.12],
 'pa_raum_temperatur_start_ref_high_c': [27.0],
 'pa_raum_temperatur_start_ref_low_c': [18.0],
 'pa_staub-sauger_ref_low_0_aus_>0_an': [0.0],
 'pa_temperatur_chlorbad_start_ref_c': [20.0],
 'pa_temperatur_m-pda-bad_ref_high': [22.0],
 'pa_temperatur_m-pda-bad_ref_low': [17.0],
 'ps_auftragsbank_temperatur_start_ref_c': [19.0],
 'ps_auftragswerk_feuchtigkeit_ref_high_%': [65.0],
 'ps_auftragswerk_feuchtigkeit_ref_low_%': [45.0],
 'ps_bad_temperatur_ref_high_c': [23.0],
 'ps_bad_temperatur_ref_low_c': [17.0],
 'ps_c_losung_ref_wt_%': [31.0],
 'ps_dicke_as_ref_high_micro_m': [150.0],
 'ps_dicke_as_ref_low_micro_m': [140.0],
 'ps_gap_ref_high_micro_m': [280.0],
 'ps_gap_ref_low_micro_m': [250.0],
 'ps_raum-feuchtigkeit_start_ref_high_%': [100.0],
 'ps_raum-feuchtigkeit_start_ref_low_%': [45.0],
 'ps_raum_temperatur_start_ref_

### datetime and datetimetz

In [27]:
# Preview six rows (positions 50000 through 50005) of the naive and
# timezone-aware datetime columns.
(
    df5
    .select_dtypes(include=['datetime', 'datetimetz'])
    .iloc[50000:50006]
)

Unnamed: 0,pa_datum,ps_datum,ps_date_coating,pa_date_coating,qt_datum,pp_plan_actual_date_coating,pp_plan_end_date_winding,windung_begin_date,winding_end_date,qc_erfassungs_datum,...,qc_einlager_datum,sc_d_datum,sc_datum_generate,sc_l_datum_auto,sc_l_datum_hand,qc_datum_leak_test_values,qc_datum_product_properties,reaction_start,reaction_end,derived_date
50000,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:10:47+00:00,...,2018-11-16 17:17:40+00:00,2018-11-15 12:10:47+00:00,2018-11-15 05:32:36+00:00,2018-11-15 05:32:36+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50001,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:18:49+00:00,...,2018-11-16 13:07:59+00:00,2018-11-15 12:18:49+00:00,2018-11-15 06:24:36+00:00,2018-11-15 06:24:36+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50002,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:18:45+00:00,...,2018-11-16 15:34:10+00:00,2018-11-15 12:18:45+00:00,2018-11-15 06:29:04+00:00,2018-11-15 06:29:04+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50003,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:25:10+00:00,...,2018-11-16 13:07:59+00:00,2018-11-15 12:25:10+00:00,2018-11-15 06:26:50+00:00,2018-11-15 06:26:50+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50004,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:25:06+00:00,...,2018-11-16 15:34:10+00:00,2018-11-15 12:25:06+00:00,2018-11-15 06:33:32+00:00,2018-11-15 06:33:32+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01
50005,2018-11-05,2018-11-03,2018-11-03,2018-11-04,2018-11-04,2018-11-04,2018-11-15,2018-11-14,2018-11-15,2018-11-15 12:30:15+00:00,...,2018-11-16 19:51:32+00:00,2018-11-15 12:30:15+00:00,2018-11-15 06:31:18+00:00,2018-11-15 06:31:18+00:00,NaT,2019-01-29 10:29:37+00:00,2013-01-08 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-05 00:00:00+00:00,2018-11-01


{'component': ['MEB'],
 'p_netting': [0],
 'pa_amin-trockner_temperatur_cofely_ref_c': [20.0],
 'pa_ce-capro_lactam_ref_%': [0.0],
 'pa_ctmc_richtwert_ref_%': [0.12],
 'pa_raum_temperatur_start_ref_high_c': [27.0],
 'pa_raum_temperatur_start_ref_low_c': [18.0],
 'pa_staub-sauger_ref_low_0_aus_>0_an': [0.0],
 'pa_temperatur_chlorbad_start_ref_c': [20.0],
 'pa_temperatur_m-pda-bad_ref_high': [22.0],
 'pa_temperatur_m-pda-bad_ref_low': [17.0],
 'ps_auftragsbank_temperatur_start_ref_c': [19.0],
 'ps_auftragswerk_feuchtigkeit_ref_high_%': [65.0],
 'ps_auftragswerk_feuchtigkeit_ref_low_%': [45.0],
 'ps_bad_temperatur_ref_high_c': [23.0],
 'ps_bad_temperatur_ref_low_c': [17.0],
 'ps_c_losung_ref_wt_%': [31.0],
 'ps_dicke_as_ref_high_micro_m': [150.0],
 'ps_dicke_as_ref_low_micro_m': [140.0],
 'ps_gap_ref_high_micro_m': [280.0],
 'ps_gap_ref_low_micro_m': [250.0],
 'ps_raum-feuchtigkeit_start_ref_high_%': [100.0],
 'ps_raum-feuchtigkeit_start_ref_low_%': [45.0],
 'ps_raum_temperatur_start_ref_