In [1]:
import numpy as np
import pandas as pd

import seaborn as sns

%matplotlib widget
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [6,5]
pd.options.display.max_columns = None
sns.set(style="ticks", color_codes=True)

In [3]:
df = pd.read_pickle('./Data/process_data_df10.pkl')

In [4]:
df.dtypes.value_counts()

float64                296
object                  15
datetime64[ns, UTC]     11
datetime64[ns]           9
bool                     7
int64                    4
category                 1
dtype: int64

In [5]:
df1 = df.select_dtypes(['object','bool','int64','category'])

In [6]:
df2 = df.copy()
df2 = df2.droplevel(level= 0, axis='columns')

XY  = df2[['nr','pp_actual_usage','p_product_group','p_product_type','p_product','qc_salzrckhalt','qc_durchfluss']]


To do:

- Check if `winding_product_line`, `p_product_size` are equal (3 unique values)

- Check if:

    - `pp_product_short_name`, `pa_rollen_seit_letztem_bad-wechsel_mpda`, `pa_rollen_seit_letztem_bad-wechsel_chlor`, `winding_product_type` (7)
    - `p_product_type` (8)
    - `pp_actual_product_short_name`, `winding_product_short_name` (9).
    
    are equal (7, 8 or 9 unique values)

- Check if `pp_plan_product`, `pp_actual_product`, `p_product`, `p_product_full_name` are equal (22 unique values)

## Drop Highly Correlated Features

Source: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/

### Identify Highly Correlated Features

In [9]:
# Create correlation matrix
corr_matrix = df2.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

### Drop Marked Features

In [11]:
# Drop features 
df2.drop(df2[to_drop], axis=1)

KeyError: "None of [Index([                      ('pa', 'pa_bad-wechsel_hw1'),\n                           ('pa', 'pa_bad-wechsel_chlor'),\n                             ('pa', 'pa_bad-wechsel_hw3'),\n                      ('pa', 'pa_bad-wechsel_chlor_nach'),\n       ('pa', 'pa_rollen_seit_letztem_bad-wechsel_chlor')],\n      dtype='object')] are in the [columns]"

### Identify Highly Correlated Features

In [12]:
# Create correlation matrix
corr_matrix = df1.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

### Drop Marked Features

In [13]:
# Drop features 
df1.drop(df1[to_drop], axis=1)

Unnamed: 0_level_0,ps,pa,pa,pa,pa,pa,pa,pa,pa,pa,pa,ass,ass,ass,qc,qc,qc,qc,qc,qc,qc,qc
Unnamed: 0_level_1,ps,pa_bad-wechsel_m-pda,pa_bad-wechsel_hw2,pa_bad-wechsel_mpda_nach,pp_product_short_name,pp_plan_product,pp_actual_product_short_name,pp_actual_product,pp_actual_usage,pa_rollen_seit_letztem_bad-wechsel_mpda,pa_ref,winding_product_short_name,winding_product_type,winding_product_line,qc_serien_nummer,qc_barcode_leak_test_values,qc_faktorkonzentration,p_product,p_product_full_name,p_product_group,p_product_type,p_product_size
0,cb031d4b18ff,False,True,False,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,2,bc7e29194383,6989995295da,6989995295da,9bec1f36ec0d,6f5dd5e75de0,,1,B400 LE ASD,b3ccc5e5f9b5,Brackwasserelemente,6989995295da,8
1,cb031d4b18ff,False,True,False,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,2,bc7e29194383,6989995295da,6989995295da,9bec1f36ec0d,e83198853aa3,,1,B400 LE ASD,b3ccc5e5f9b5,Brackwasserelemente,6989995295da,8
2,cb031d4b18ff,False,True,False,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,2,bc7e29194383,6989995295da,6989995295da,9bec1f36ec0d,0c6c47811c04,,1,B400 LE ASD,b3ccc5e5f9b5,Brackwasserelemente,6989995295da,8
3,cb031d4b18ff,False,True,False,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,2,bc7e29194383,6989995295da,6989995295da,9bec1f36ec0d,6b51542380df,,1,B400 LE ASD,b3ccc5e5f9b5,Brackwasserelemente,6989995295da,8
4,cb031d4b18ff,False,True,False,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,2,bc7e29194383,6989995295da,6989995295da,9bec1f36ec0d,58df9ba0a603,,1,B400 LE ASD,b3ccc5e5f9b5,Brackwasserelemente,6989995295da,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118682,3fa606fdd9e8,True,True,False,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,1,0a0d4ada494a,6989995295da,6989995295da,6aa9aee40c62,55315e14346a,,1,B400 LE ASD,b3ccc5e5f9b5,Brackwasserelemente,6989995295da,8
118683,3fa606fdd9e8,True,True,False,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,1,0a0d4ada494a,6989995295da,6989995295da,6aa9aee40c62,5e840146da5b,,1,B400 LE ASD,b3ccc5e5f9b5,Brackwasserelemente,6989995295da,8
118684,3fa606fdd9e8,True,True,False,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,1,0a0d4ada494a,6989995295da,6989995295da,6aa9aee40c62,05ac5c0533e3,,1,B400 LE ASD,b3ccc5e5f9b5,Brackwasserelemente,6989995295da,8
118685,3fa606fdd9e8,True,True,False,6989995295da,1c39db15c26b,6989995295da,1c39db15c26b,use,1,0a0d4ada494a,6989995295da,6989995295da,6aa9aee40c62,b629f239aa69,,1,B400 LE ASD,b3ccc5e5f9b5,Brackwasserelemente,6989995295da,8


In [None]:
df1

In [14]:
to_drop

[('pa', 'pa_bad-wechsel_hw1'),
 ('pa', 'pa_bad-wechsel_chlor'),
 ('pa', 'pa_bad-wechsel_hw3'),
 ('pa', 'pa_bad-wechsel_chlor_nach'),
 ('pa', 'pa_rollen_seit_letztem_bad-wechsel_chlor')]

In [15]:
upper

Unnamed: 0_level_0,Unnamed: 1_level_0,pa,pa,pa,pa,pa,pa,pa,pa,pa,qc,qc
Unnamed: 0_level_1,Unnamed: 1_level_1,pa_bad-wechsel_m-pda,pa_bad-wechsel_hw1,pa_bad-wechsel_hw2,pa_bad-wechsel_chlor,pa_bad-wechsel_hw3,pa_bad-wechsel_mpda_nach,pa_bad-wechsel_chlor_nach,pa_rollen_seit_letztem_bad-wechsel_mpda,pa_rollen_seit_letztem_bad-wechsel_chlor,qc_faktorkonzentration,p_product_size
pa,pa_bad-wechsel_m-pda,,0.975242,0.063358,0.979485,0.974001,0.212693,0.216095,0.242863,0.241723,0.072534,0.092494
pa,pa_bad-wechsel_hw1,,,0.057402,0.982753,0.997057,0.199511,0.202889,0.245711,0.244548,0.060755,0.074344
pa,pa_bad-wechsel_hw2,,,,0.062495,0.064271,0.014433,0.014466,0.01987,0.020266,0.006339,0.052627
pa,pa_bad-wechsel_chlor,,,,,0.985667,0.203413,0.206803,0.246831,0.245686,0.063407,0.071885
pa,pa_bad-wechsel_hw3,,,,,,0.198186,0.201568,0.248556,0.247406,0.061418,0.073494
pa,pa_bad-wechsel_mpda_nach,,,,,,,0.996563,0.743275,0.743198,0.031905,0.101631
pa,pa_bad-wechsel_chlor_nach,,,,,,,,0.737828,0.744573,0.03183,0.101497
pa,pa_rollen_seit_letztem_bad-wechsel_mpda,,,,,,,,,0.99385,0.068286,0.050377
pa,pa_rollen_seit_letztem_bad-wechsel_chlor,,,,,,,,,,0.067764,0.049341
qc,qc_faktorkonzentration,,,,,,,,,,,0.027549


##

Source:https://seaborn.pydata.org/examples/many_pairwise_correlations.html

In [17]:
# Compute the correlation matrix
corr = df1.corr()

In [18]:
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)

In [19]:
mask[np.triu_indices_from(mask)] = True

In [38]:
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
# cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap='coolwarm', vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

# plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x29d7a0c1448>

In [40]:
df1.select_dtypes('object').corr()

In [2]:
from pandas_profiling import ProfileReport

In [13]:
clean_column_names(df)
#profile = ProfileReport(df1)


NameError: name 'clean_column_names' is not defined

In [14]:
df = pd.DataFrame(
    np.random.rand(100, 5),
    columns=['a', 'b', 'c', 'd', 'e']
)

In [19]:
profile = ProfileReport(df, title='Pandas Profiling Report', html={'style':{'full_width':True}})

ValueError: Cannot remove 1 levels from an index with 1 levels: at least one level must be left.

In [16]:
profile

Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …



In [20]:
profile1 = ProfileReport(df1.droplevel(level= 0, axis='columns'), title='Profile', html={'style':{'full_width':True}})

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [21]:
profile1

Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …



In [22]:
list(df2.columns)

['ps',
 'ps_datum',
 'ps_beschichtete_rollenlange_m',
 'ps_c_losung_wt_%',
 'ps_gap_micro_m',
 'ps_dicke_as_micro_m',
 'ps_dicke_1',
 'ps_dicke_2',
 'ps_dicke_3',
 'ps_dicke_4',
 'ps_dicke_bs',
 'ps_raum_temperatur_start_c',
 'ps_auftragsbank_temperatur_start_c',
 'ps_bad_temperatur_0m_c',
 'ps_bad_temperatur_500m_c',
 'ps_bad_temperatur_1000m_c',
 'ps_bad_temperatur_1500m_c',
 'ps_bad_temperatur_2000m_c',
 'ps_bad_temperatur_2500m_c',
 'ps_bad_temperatur_3000m_c',
 'ps_bad_temperatur_3500m_c',
 'ps_bad_temperatur_4000m_c',
 'ps_raum-feuchtigkeit_start_%',
 'ps_auftragswerk_feuchtigkeit_0m_%',
 'ps_auftragswerk_feuchtigkeit_600m_%',
 'ps_auftragswerk_feuchtigkeit_1200m_%',
 'ps_auftragswerk_feuchtigkeit_1800m_%',
 'ps_auftragswerk_feuchtigkeit_2400m_%',
 'ps_auftragswerk_feuchtigkeit_3000m_%',
 'ps_auftragswerk_feuchtigkeit_3600m_%',
 'ps_auftragswerk_feuchtigkeit_4200m_%',
 'ps_datum_coating',
 'ps_out_m',
 'ps_scrap_%',
 'ps_f_auftragswerk_median',
 'ps_dicke_median',
 'pa_datum',
 '

In [23]:
df2.shape

(118687, 343)

In [27]:
from pandas_profiling.model import correlations as crt

In [28]:
who

ProfileReport	 XY	 crt	 df	 df1	 df2	 model	 np	 pd	 
plt	 profile	 profile1	 sns	 


In [48]:
crt.calculate_correlations(df1.droplevel(level=0,axis=1),variables=d)

{}

In [34]:
sgs = df1.droplevel(level=0,axis=1).dtypes

In [47]:
#{key:value for key,value in sgs.index value }
k = list(sgs.index)
v = list(sgs.values)
d = dict(zip(v,k))

print(d)

{dtype('O'): 'p_product_type', dtype('bool'): 'pa_bad-wechsel_chlor_nach', CategoricalDtype(categories=['trash', 'use'], ordered=False): 'pp_actual_usage', dtype('int64'): 'p_product_size'}


[dtype('O'),
 dtype('bool'),
 dtype('bool'),
 dtype('bool'),
 dtype('bool'),
 dtype('bool'),
 dtype('bool'),
 dtype('bool'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 CategoricalDtype(categories=['trash', 'use'], ordered=False),
 dtype('int64'),
 dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('int64')]