In [6]:
import numpy as np
import pandas as pd

import seaborn as sns

%matplotlib widget
import matplotlib.pyplot as plt

# Notebook-wide display defaults: figure size for matplotlib, no column
# truncation when pandas renders a frame, and the seaborn "ticks" style.
plt.rcParams.update({'figure.figsize': [10, 8]})
pd.set_option('display.max_columns', None)
sns.set(style="ticks", color_codes=True)


In [7]:
# Load the DataFrame stored in the pickle file below (the path is relative to
# the notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle('./Data/process_data_df10.pkl')

`pa_coating` is a dummy variable and doesn't belong to the original collected data. For further analysis it can be dropped.

### Removing unnecessary columns

In [3]:
# Labels (on column level 1) that should not take part in the analysis.
colsToRemove = ['pa_coating']

# `drop` without `inplace` returns a new frame and leaves `df` untouched, so
# no explicit copy is needed and re-running this cell always starts from `df`.
df1 = df.drop(columns=colsToRemove, level=1)

Let's check whether the variables were successfully removed:

In [4]:
# Column labels on level 1 of each frame, looked up once per frame.
level1_df = df.columns.get_level_values(level=1)
level1_df1 = df1.columns.get_level_values(level=1)

# Test membership directly: `any` over the booleans, not over the labels
# themselves (a falsy label such as 0 or '' would otherwise count as absent).
print('Are any of the `colsToRemove` in `df`?')
print('  ', any(col in level1_df for col in colsToRemove))
print('\n')

print('Are any of the `colsToRemove` in `df1`?')
print('  ', any(col in level1_df1 for col in colsToRemove))

Are any of the `colsToRemove` in `df`?
   True


Are any of the `colsToRemove` in `df1`?
   False


### Exploring `p_product`, `p_product_type` and `p_product_group` 

Nice! Now we could keep removing unnecessary columns, but let's focus on the features we want to keep/analyse.

My 1st question is: 
> Do the features `qc_salzrckhalt` and `qc_durchfluss` vary significantly among different `p_product`, `p_product_type` and `p_product_group`? 

In [5]:
# df1.head()
# df1.loc[:,(slice(None),['ps','nr','p_product','p_product_group','p_product_type','qc_salzrckhalt','qc_durchfluss'])].head()

In [6]:
# Work on a copy of `df1` whose columns have the outermost level (level 0)
# removed, so columns can be selected by their plain names.
df2 = df1.copy().droplevel(level=0, axis='columns')

# Identifier and product-hierarchy columns together with the two target
# measurements.
XY = df2[[
    'nr',
    'p_product_group',
    'p_product_type',
    'p_product',
    'qc_salzrckhalt',
    'qc_durchfluss',
]]

In [7]:
groupCols = ['p_product_group','p_product_type','p_product']
goalCols  = ['qc_salzrckhalt','qc_durchfluss']

# Group once and reuse the grouped selection for both statistics.
goalsByProduct = XY.groupby(groupCols)[goalCols]
XY_mean = goalsByProduct.mean()
XY_var  = goalsByProduct.var()

# Restore the default row limit for rendered frames.
pd.reset_option("display.max_rows")

# Side-by-side table of mean and variance per (group, type, product);
# the suffixes tell the two statistics apart. This is the cell's output.
XY_mean.join(XY_var,
             lsuffix='_mean',
             rsuffix='_var')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,qc_salzrckhalt_mean,qc_durchfluss_mean,qc_salzrckhalt_var,qc_durchfluss_var
p_product_group,p_product_type,p_product,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Brackwasserelemente,5200bdfc01a1,B400 HP,98.989131,47.087445,71.476345,40.327167
Brackwasserelemente,5200bdfc01a1,B440 HP,99.16351,54.066717,51.370364,51.775506
Brackwasserelemente,6989995295da,B085 LE 4040,99.144588,7.833652,44.251208,0.885173
Brackwasserelemente,6989995295da,B400 LE,98.935167,35.869479,66.230018,14.618364
Brackwasserelemente,6989995295da,B400 LE ASD,99.05554,38.267354,53.562097,18.428664
Brackwasserelemente,6989995295da,B440 LE,98.782107,40.682883,86.792965,33.080534
Brackwasserelemente,6d2830b1e76d,B085 HF 4040,99.113559,10.15249,63.53515,2.749589
Brackwasserelemente,6d2830b1e76d,B400 HF,99.012014,45.510025,70.983417,46.215917
Brackwasserelemente,6d2830b1e76d,B440 HF,98.694982,47.14502,100.284774,45.320901
Brackwasserelemente,c77cb1692e0e,B085 ULP 4040,99.27132,8.298521,20.872972,1.029492


In [8]:
groupCols = [
    'p_product_group',
    'p_product_type',
    'p_product',
]
goalCols = ['qc_salzrckhalt', 'qc_durchfluss']

# Descriptive statistics of both target columns for every
# (group, type, product) combination.
XY.groupby(groupCols)[goalCols].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,qc_salzrckhalt,qc_salzrckhalt,qc_salzrckhalt,qc_salzrckhalt,qc_salzrckhalt,qc_salzrckhalt,qc_salzrckhalt,qc_salzrckhalt,qc_durchfluss,qc_durchfluss,qc_durchfluss,qc_durchfluss,qc_durchfluss,qc_durchfluss,qc_durchfluss,qc_durchfluss
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
p_product_group,p_product_type,p_product,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
Brackwasserelemente,5200bdfc01a1,B400 HP,9948.0,98.989131,8.454368,0.0,99.675646,99.736654,99.78361,100.0,9948.0,47.087445,6.350367,0.0,44.535059,47.227508,49.992341,181.341813
Brackwasserelemente,5200bdfc01a1,B440 HP,1928.0,99.16351,7.167312,0.0,99.634706,99.715282,99.763482,99.853401,1928.0,54.066717,7.19552,0.0,49.69038,53.845347,59.209206,73.080644
Brackwasserelemente,6989995295da,B085 LE 4040,6068.0,99.144588,6.652158,0.0,99.525049,99.630998,99.701186,100.0,6068.0,7.833652,0.940836,0.0,7.311862,8.015189,8.407018,21.427962
Brackwasserelemente,6989995295da,B400 LE,2105.0,98.935167,8.138183,0.0,99.576432,99.645938,99.694566,99.812266,2105.0,35.869479,3.823397,0.0,34.446046,36.259536,37.887518,50.362399
Brackwasserelemente,6989995295da,B400 LE ASD,5359.0,99.05554,7.318613,0.0,99.529691,99.650936,99.714239,99.823783,5359.0,38.267354,4.292862,0.0,36.55681,39.073613,40.665634,48.690327
Brackwasserelemente,6989995295da,B440 LE,6503.0,98.782107,9.316274,0.0,99.636029,99.692065,99.743764,100.0,6503.0,40.682883,5.751568,0.0,37.865118,41.091,43.918736,129.762116
Brackwasserelemente,6d2830b1e76d,B085 HF 4040,4880.0,99.113559,7.970894,0.0,99.75195,99.800179,99.826633,100.0,4880.0,10.15249,1.658188,0.0,9.447252,10.321073,10.917885,51.105603
Brackwasserelemente,6d2830b1e76d,B400 HF,22093.0,99.012014,8.425166,0.0,99.710536,99.768654,99.806259,100.0,22093.0,45.510025,6.798229,0.0,41.701489,45.538887,49.899558,139.313074
Brackwasserelemente,6d2830b1e76d,B440 HF,24380.0,98.694982,10.014229,0.0,99.686608,99.738295,99.780531,100.0,24380.0,47.14502,6.73208,0.0,44.167822,47.45678,50.83307,96.870994
Brackwasserelemente,c77cb1692e0e,B085 ULP 4040,1895.0,99.27132,4.568695,0.0,99.40084,99.500566,99.572299,99.72264,1895.0,8.298521,1.014639,0.0,7.573814,8.050805,8.930207,10.84394


In [9]:
groupCols = [
    'p_product_group',
    'p_product_type',
    'p_product',
]
goalCols = ['qc_salzrckhalt', 'qc_durchfluss']

# Mean and standard deviation of the target columns per
# (group, type, product) combination.
goalsByProduct = XY.groupby(groupCols)[goalCols]
XY_mean = goalsByProduct.mean()
XY_std = goalsByProduct.std()

# Single summary table; the suffixes mark which statistic a column holds.
smry = XY_mean.join(XY_std, lsuffix='_mean', rsuffix='_std')

# Scatter of the mean `qc_salzrckhalt` of every row in `smry` (one row per
# group/type/product combination) against its product group.
# The plotting backend is already selected by `%matplotlib widget` in the
# setup cell, so it is not re-selected here.
fig1, ax1 = plt.subplots(figsize=[6,4])
ax1.scatter(smry.index.get_level_values(level='p_product_group'),
            smry['qc_salzrckhalt_mean'])
# Label the axes so the figure can be read on its own.
ax1.set(xlabel='p_product_group',
        ylabel='qc_salzrckhalt_mean',
        title='Mean qc_salzrckhalt by product group')
plt.show()





Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [15]:
# Alternative view over all product groups (kept for reference):
#   sns.catplot(y='p_product_type', x='qc_salzrckhalt', hue='p_product_group', data=XY)

# Salt rejection per product, coloured by product type, restricted to the
# rows whose product group is 'Brackwasserelemente'.
sns.catplot(
    data=XY.loc[XY['p_product_group'].eq('Brackwasserelemente')],
    x='qc_salzrckhalt',
    y='p_product',
    hue='p_product_type',
)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [16]:
# Strip plot of salt rejection per product, one facet per product type
# (facets wrap after four columns).
# Rows are kept only where `qc_salzrckhalt` is greater than zero -- the
# original author noted that zero values do not appear to make sense -- and
# rows with any missing value are dropped.
sns.catplot(
    data=XY.query("qc_salzrckhalt>0").dropna(),
    kind='strip',
    x='qc_salzrckhalt',
    y='p_product',
    hue='p_product_type',
    col='p_product_type',
    col_wrap=4,
)

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [22]:
# Flow (`qc_durchfluss`) per product, coloured by product group, after
# dropping rows with missing values.
# NOTE(review): the filter is on `qc_salzrckhalt`, not on the plotted
# `qc_durchfluss` -- confirm this is intended and not a copy-paste leftover.
sns.catplot(
    data=XY.query("qc_salzrckhalt>0").dropna(),
    x='qc_durchfluss',
    y='p_product',
    hue='p_product_group',
)

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [34]:
# Salt rejection against `nr`, coloured by product group. Only rows with
# `qc_salzrckhalt` greater than zero and without missing values are plotted;
# alpha=0.3 keeps overlapping points distinguishable.
fig, ax = plt.subplots()
sns.scatterplot(
    data=XY.query("qc_salzrckhalt>0").dropna(),
    x='qc_salzrckhalt',
    y='nr',
    hue='p_product_group',
    alpha=0.3,
    ax=ax,  # draw on the axes created above instead of the implicit current axes
)

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Categorical variables:
**Is there a difference in the distribution of `qc_salzrckhalt` or `qc_durchfluss`** across product type, product group, etc.?