In [2]:
import numpy as np
import pandas as pd

import seaborn as sns

%matplotlib widget
import matplotlib.pyplot as plt

# Notebook-wide display defaults: figure size, no cap on displayed DataFrame
# columns, and the seaborn "ticks" theme with colour codes enabled.
plt.rcParams.update({'figure.figsize': [10, 8]})
pd.set_option('display.max_columns', None)
sns.set(color_codes=True, style="ticks")


In [3]:
# Load the dataset from a pickle file addressed relative to the notebook's
# working directory. Later cells address its columns by MultiIndex level.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle('./Data/process_data_df10.pkl')

`pa_coating` is a dummy variable and doesn't belong to the original collected data. For further analysis it can be dropped.

### Removing unnecessary columns

In [4]:
# Labels to discard; they are matched against level 1 of the column MultiIndex.
colsToRemove = ['pa_coating']

# `drop` returns a new frame, so `df` stays untouched and this cell can be
# re-run safely (no explicit copy or in-place mutation needed).
df1 = df.drop(columns=colsToRemove, level=1)

Let's check whether the variables were successfully removed:

In [5]:
# Sanity check: `df` should still contain the removed labels, `df1` should not.
# The membership results themselves are fed to `any`; collecting the matching
# labels instead would make the answer depend on each label's truthiness.

# Check if any element of colsToRemove is in df
print('Are any of the `colsToRemove` in `df`?')
print('  ',any(col in df.columns.get_level_values(level=1) for col in colsToRemove))
print('\n')

# Check if any element of colsToRemove is in df1
print('Are any of the `colsToRemove` in `df1`?')
print('  ',any(col in df1.columns.get_level_values(level=1) for col in colsToRemove))

Are any of the `colsToRemove` in `df`?
   True


Are any of the `colsToRemove` in `df1`?
   False


### Exploring `p_product`, `p_product_type` and `p_product_group` 

Nice! Now we could keep removing unnecessary columns, but let's focus on the features we want to keep/analyse.

My 1st question is: 
> Do the features `qc_salzrckhalt` and `qc_durchfluss` vary significantly among different `p_product`, `p_product_type` and `p_product_group`? 

In [6]:
# df1.head()
# df1.loc[:,(slice(None),['ps','nr','p_product','p_product_group','p_product_type','qc_salzrckhalt','qc_durchfluss'])].head()

In [7]:
# Flatten the column MultiIndex by discarding its outer level (level 0),
# working on a copy so that `df1` keeps its original columns.
df2 = df1.copy().droplevel(level=0, axis='columns')

# Keep only the columns used by the cells that follow.
XY = df2.loc[:, ['nr','p_product_group','p_product_type','p_product','qc_salzrckhalt','qc_durchfluss']]

In [8]:
groupCols = ['p_product_group','p_product_type','p_product']
goalCols  = ['qc_salzrckhalt','qc_durchfluss']

# Mean and variance of the goal columns for every combination of the group columns.
XY_mean = XY.groupby(groupCols)[goalCols].mean()
XY_var  = XY.groupby(groupCols)[goalCols].var()

# Restore the default row limit before displaying the summary table.
pd.reset_option("display.max_rows")

# Side-by-side table; the suffixes disambiguate the identically named columns.
# (An equivalent `pd.merge` call was previously evaluated here, but its result
# was neither displayed nor stored, so it has been removed.)
XY_mean.join(XY_var,
      lsuffix='_mean',
      rsuffix='_var')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,qc_salzrckhalt_mean,qc_durchfluss_mean,qc_salzrckhalt_var,qc_durchfluss_var
p_product_group,p_product_type,p_product,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Brackwasserelemente,5200bdfc01a1,B400 HP,98.989131,47.087445,71.476345,40.327167
Brackwasserelemente,5200bdfc01a1,B440 HP,99.16351,54.066717,51.370364,51.775506
Brackwasserelemente,6989995295da,B085 LE 4040,99.144588,7.833652,44.251208,0.885173
Brackwasserelemente,6989995295da,B400 LE,98.935167,35.869479,66.230018,14.618364
Brackwasserelemente,6989995295da,B400 LE ASD,99.05554,38.267354,53.562097,18.428664
Brackwasserelemente,6989995295da,B440 LE,98.782107,40.682883,86.792965,33.080534
Brackwasserelemente,6d2830b1e76d,B085 HF 4040,99.113559,10.15249,63.53515,2.749589
Brackwasserelemente,6d2830b1e76d,B400 HF,99.012014,45.510025,70.983417,46.215917
Brackwasserelemente,6d2830b1e76d,B440 HF,98.694982,47.14502,100.284774,45.320901
Brackwasserelemente,c77cb1692e0e,B085 ULP 4040,99.27132,8.298521,20.872972,1.029492


In [9]:
groupCols = ['p_product_group','p_product_type','p_product']
goalCols  = ['qc_salzrckhalt','qc_durchfluss']

# Descriptive statistics of the goal columns per group, restricted to rows
# where both goal columns are strictly positive.
(XY
 .query("qc_salzrckhalt>0")
 .query("qc_durchfluss>0")   # was misspelled `.quary`, which raises AttributeError
 .groupby(groupCols)[goalCols]
 .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,qc_salzrckhalt,qc_salzrckhalt,qc_salzrckhalt,qc_salzrckhalt,qc_salzrckhalt,qc_salzrckhalt,qc_salzrckhalt,qc_salzrckhalt,qc_durchfluss,qc_durchfluss,qc_durchfluss,qc_durchfluss,qc_durchfluss,qc_durchfluss,qc_durchfluss,qc_durchfluss
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
p_product_group,p_product_type,p_product,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
Brackwasserelemente,5200bdfc01a1,B400 HP,9876.0,99.710801,0.178502,91.557525,99.67783,99.737492,99.783827,100.0,9876.0,47.430731,4.933277,28.5504,44.594856,47.2601,50.009889,181.341813
Brackwasserelemente,5200bdfc01a1,B440 HP,1918.0,99.680525,0.274059,89.335492,99.637241,99.715864,99.763822,99.853401,1918.0,54.348608,6.059473,36.527449,49.77783,53.896073,59.220259,73.080644
Brackwasserelemente,6989995295da,B085 LE 4040,6041.0,99.587711,0.558698,65.393205,99.527113,99.63171,99.701355,100.0,6041.0,7.868664,0.783317,4.868129,7.325624,8.018091,8.408966,21.427962
Brackwasserelemente,6989995295da,B400 LE,2091.0,99.597574,0.817313,62.738794,99.577679,99.647162,99.694783,99.812266,2091.0,36.109638,2.457657,26.176735,34.488017,36.274165,37.89583,50.362399
Brackwasserelemente,6989995295da,B400 LE ASD,5330.0,99.594492,0.408559,82.375931,99.532635,99.652235,99.714425,99.823783,5330.0,38.475563,3.242915,27.306449,36.614513,39.096125,40.674886,48.690327
Brackwasserelemente,6989995295da,B440 LE,6446.0,99.655607,0.705622,61.18925,99.63747,99.692894,99.744372,100.0,6446.0,41.04263,4.313453,29.413929,37.955369,41.133221,43.943729,129.762116
Brackwasserelemente,6d2830b1e76d,B085 HF 4040,4849.0,99.747199,0.851215,63.662601,99.753407,99.800617,99.826787,100.0,4849.0,10.217395,1.450472,6.574422,9.47127,10.330844,10.920083,51.105603
Brackwasserelemente,6d2830b1e76d,B400 HF,21935.0,99.725208,0.60674,57.28687,99.712373,99.769406,99.806455,100.0,21935.0,45.837838,5.614436,24.756848,41.784606,45.594092,49.92849,139.313074
Brackwasserelemente,6d2830b1e76d,B440 HF,24132.0,99.709252,0.423277,69.786252,99.688687,99.739226,99.780947,100.0,24132.0,47.629521,4.765416,32.375054,44.28347,47.524029,50.877484,96.870994
Brackwasserelemente,c77cb1692e0e,B085 ULP 4040,1891.0,99.481307,0.127754,98.782418,99.402499,99.500638,99.572352,99.72264,1891.0,8.316075,0.941071,4.617965,7.575635,8.056449,8.94055,10.84394


In [9]:
groupCols = ['p_product_group','p_product_type','p_product']
goalCols  = ['qc_salzrckhalt','qc_durchfluss']

# Mean and standard deviation of the goal columns per group, joined side by side.
XY_mean = XY.groupby(groupCols)[goalCols].mean()
XY_std  = XY.groupby(groupCols)[goalCols].std()

smry = XY_mean.join(XY_std,
      lsuffix='_mean',
      rsuffix='_std')

# Mean `qc_salzrckhalt` of every group row, plotted against the
# `p_product_group` level of the summary index. The axes are created
# explicitly so the labels are attached to a known object.
fig1, ax1 = plt.subplots(figsize=[6,4])
ax1.scatter(smry.index.get_level_values(level='p_product_group'), smry['qc_salzrckhalt_mean'])
ax1.set_xlabel('p_product_group')
ax1.set_ylabel('qc_salzrckhalt_mean')
plt.show()





Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [15]:
# Categorical plot of `qc_salzrckhalt` per product, coloured by product type,
# using only the rows whose product group is 'Brackwasserelemente'.
sns.catplot(
    data=XY.loc[XY['p_product_group'].eq('Brackwasserelemente')],
    x='qc_salzrckhalt',
    y='p_product',
    hue='p_product_type',
)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [16]:
# Keep rows with a strictly positive `qc_salzrckhalt` (zero values appear not
# to make sense) and without missing values, then draw one strip-plot panel
# per product type, wrapping to a new row after four panels.
sns.catplot(
    data=XY.query("qc_salzrckhalt>0").dropna(),
    x='qc_salzrckhalt',
    y='p_product',
    hue='p_product_type',
    col='p_product_type',
    col_wrap=4,
    kind='strip',
)

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [22]:
# Categorical plot of `qc_durchfluss` per product, coloured by product group,
# after dropping rows with non-positive `qc_salzrckhalt` or missing values.
# NOTE(review): the filter is on `qc_salzrckhalt` although `qc_durchfluss` is
# what gets plotted -- confirm whether `qc_durchfluss>0` was intended as well.
sns.catplot(
    data=XY.query("qc_salzrckhalt>0").dropna(),
    x='qc_durchfluss',
    y='p_product',
    hue='p_product_group',
)

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [34]:
# Scatter of `nr` against `qc_salzrckhalt`, coloured by product group, using
# rows with a strictly positive `qc_salzrckhalt` and no missing values.
# The target axes is passed explicitly instead of relying on matplotlib's
# implicit "current axes" and rebinding `ax` to the return value.
fig, ax = plt.subplots()
sns.scatterplot(
    data=XY.query("qc_salzrckhalt>0").dropna(),
    x='qc_salzrckhalt',
    y='nr',
    hue='p_product_group',
    alpha=0.3,
    ax=ax,
)

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Categorical variables:
**Is there a difference in the distribution of `qc_salzrckhalt` or `qc_durchfluss`**?
 
Candidate grouping variables: product type, product group, etc.