In [1]:
%matplotlib inline


### Missing Data Imputation

Welcome to the "Handling Missing Data" practical session.  

Programming Language : Python 
Editor : Jupyter Notebook. 


Pandas : Data loading, processing, transformation and manipulation.
Scikit-learn : Example data source, ML and statistical analysis


This example illustrates how to apply different preprocessing and feature
imputation pipelines to different subsets of features, using
SimpleImputer, KNNImputer. This is particularly handy for the
case of datasets that contain heterogeneous data types, since we may want to
impute the numeric as well as categorical features


In this example, the numeric data is standard-scaled after mean-imputation,
while the categorical data is one-hot encoded after imputing missing values
with a new category (``'missing'``).

In addition, we show two different ways to dispatch the columns to the
particular pre-processor: by column names and by column data types.

Finally you'll be tasked with apply new transformeation on a new data set.

In [12]:
# Author:  Mamun Rashid <m.rrashid.1@gmail.com>

# License: BSD 3 clause

import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

from sklearn import set_config
set_config(display='diagram')

import pandas as pd

np.random.seed(0)

## Load the LongIsland_Heart_Data Set
heart_df = pd.read_csv('LongIsland_Heart_Data.csv')

heart_df.describe()
print(heart_df)

      age  sex   cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0    63.0  1.0  4.0      10.0  60.0  2.0      1.0     12.0    3.0     11.0   
1    44.0  1.0  4.0       3.0   NaN  2.0      1.0      8.0    NaN     14.0   
2    60.0  1.0  4.0       5.0  27.0  2.0      1.0     19.0    3.0      6.0   
3    55.0  1.0  4.0      11.0  39.0  2.0      1.0     25.0    3.0     10.0   
4    66.0  1.0  3.0      33.0  22.0  3.0      2.0     53.0    3.0      5.0   
..    ...  ...  ...       ...   ...  ...      ...      ...    ...      ...   
195  54.0  0.0  4.0      41.0  95.0  3.0      1.0      NaN    NaN     14.0   
196  62.0  1.0  1.0       1.0  30.0  2.0      1.0      1.0    1.0      1.0   
197  55.0  1.0  4.0      37.0  33.0  3.0      1.0      4.0    2.0      NaN   
198  58.0  1.0  NaN       1.0   3.0  NaN      NaN      1.0    1.0      1.0   
199  62.0  1.0  2.0      34.0   NaN  2.0      NaN     47.0    3.0     14.0   

     slope   ca  thal  diagnosis of heart disease  
0      3.0 

In [23]:
# Cell 3: Check for missing data

# Description: This cell is to check if there is any missing data in the DataFrame.


heart_df.loc[ heart_df.isnull().sum(axis=1) >= 4,  :]


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,diagnosis of heart disease
69,63.0,1.0,2.0,1.0,26.0,3.0,1.0,,1.0,,,1.0,,2
120,62.0,1.0,3.0,1.0,,,1.0,1.0,,,1.0,1.0,1.0,3
130,,,3.0,,29.0,3.0,1.0,,,,1.0,1.0,3.0,2
156,64.0,1.0,,15.0,7.0,2.0,1.0,,3.0,,3.0,,1.0,3
175,58.0,1.0,4.0,,22.0,2.0,1.0,55.0,,14.0,,,1.0,1
186,61.0,,3.0,,,2.0,0.0,51.0,3.0,14.0,1.0,1.0,,4


In [3]:
heart_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,diagnosis of heart disease
0,63.0,1.0,4.0,10.0,60.0,2.0,1.0,12.0,3.0,11.0,3.0,1.0,1.0,3
1,44.0,1.0,4.0,3.0,,2.0,1.0,8.0,,14.0,1.0,,1.0,1
2,60.0,1.0,4.0,5.0,27.0,2.0,1.0,19.0,3.0,6.0,4.0,1.0,,3
3,55.0,1.0,4.0,11.0,39.0,2.0,1.0,25.0,3.0,10.0,2.0,1.0,1.0,2
4,66.0,1.0,3.0,33.0,22.0,3.0,2.0,53.0,3.0,5.0,3.0,1.0,1.0,1


Our data have the following useful features, used to test basic ML prototypes.

 Numeric Features:

 * ``chol``: float;
 -- serum cholestoral in mg/dl
 * ``thalach``: float.
 -- maximum heart rate achieved

 
 Categorical Features:

 * ``sex``: categories encoded as numeric ``{'1 = male', '2=female'}``;
 * ``cp``: ordinal integers ``{1, 2, 3, 4}``.
        -- Value 1: typical angina
        -- Value 2: atypical angina
        -- Value 3: non-anginal pain
        -- Value 4: asymptomatic

In [24]:
X_reduced = heart_df.loc[:, ['chol', 'thalach','sex','cp']]
X_reduced.head()

Unnamed: 0,chol,thalach,sex,cp
0,60.0,12.0,1.0,4.0
1,,8.0,1.0,4.0
2,27.0,19.0,1.0,4.0
3,39.0,25.0,1.0,4.0
4,22.0,53.0,1.0,3.0


In [25]:
# What's going on here? Can we break this down?

# 1. We describe features as numeric or categorical

## We separate the numeric features
## For categorical variables we will use a simple imputer to replace the missing values with median of the remaining values"
numeric_features = ['chol', 'thalach']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))])

## We separate the categorical features
## For categorical variables we will use a simple imputer with a constant = "missing"
categorical_features = ['sex','cp']

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1))])


## We combine both "categorical" and "numerical" imputer in a preprocessor. 
## We use a column transformer to apply the transformation.

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_imputer', numeric_transformer, numeric_features),
        ('categorical_imputer', categorical_transformer, categorical_features)])


In [27]:
## Apply the pre-processor pipeline on the reduced data frame. 

print(X_reduced)

clf = Pipeline(steps=[('preprocessor', preprocessor)])
new_X_reduced = clf.fit_transform(X_reduced)

new_X_reduced_df = pd.DataFrame( new_X_reduced )
new_X_reduced_df.columns = [ numeric_features + categorical_features ]

print(new_X_reduced_df)


     chol  thalach  sex   cp
0    60.0     12.0  1.0  4.0
1     NaN      8.0  1.0  4.0
2    27.0     19.0  1.0  4.0
3    39.0     25.0  1.0  4.0
4    22.0     53.0  1.0  3.0
..    ...      ...  ...  ...
195  95.0      NaN  0.0  4.0
196  30.0      1.0  1.0  1.0
197  33.0      4.0  1.0  4.0
198   3.0      1.0  1.0  NaN
199   NaN     47.0  1.0  2.0

[200 rows x 4 columns]
     chol thalach  sex   cp
0    60.0    12.0  1.0  4.0
1    29.0     8.0  1.0  4.0
2    27.0    19.0  1.0  4.0
3    39.0    25.0  1.0  4.0
4    22.0    53.0  1.0  3.0
..    ...     ...  ...  ...
195  95.0    19.0  0.0  4.0
196  30.0     1.0  1.0  1.0
197  33.0     4.0  1.0  4.0
198   3.0     1.0  1.0 -1.0
199  29.0    47.0  1.0  2.0

[200 rows x 4 columns]


#### Your Task
1. What other types of imputations options are available in SimpleImputer ?
   Hint : Look at the SimpleImputer Documentation at scikit-learn

2. Instead of "median" use "mean" to impute values for the following two numeric features. 
    - oldpeak
    - trestbps

    2.1 What is the average value of 'oldpeak' before and after imputation. 


3. Instead of "constant" use "most_frequent" to impute values for the following two catetorical features. 
    - ca             [ ca : number of major vessels (0-3) colored by flourosopy ]
    - restecg           : resting electrocardiographic results
                            -- Value 0: normal
                            -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
                            -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria


In [None]:
#### Your Task
# 1.1 What other types of imputations options are available  ?
#    Hint : Look at the SimpleImputer Documentation at scikit-learn

## Answer
    # impute.IterativeImputer([estimator, ...]) : Multivariate imputer that estimates each feature from all the others.
    # impute.KNNImputer(*[, missing_values, ...])  : Imputation for completing missing values using k-Nearest Neighbors.


# 1.2 What other types of imputations options are available in SimpleImputer ?
#    Hint : Look at the SimpleImputer Documentation at scikit-learn


## Answer : 
# If “mean”, then replace missing values using the mean along each column. Can only be used with numeric data.
# If “median”, then replace missing values using the median along each column. Can only be used with numeric data.
# If “most_frequent”, then replace missing using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is returned.
# If “constant”, then replace missing values with fill_value. Can be used with strings or numeric data.



In [None]:
# 2. Instead of "median" use "mean" to impute values for the following two numeric features. 
#     - oldpeak
#     - trestbps


# 3. Instead of "constant" use "most_frequent" to impute values for the following two catetorical features. 
#     - ca             [ ca : number of major vessels (0-3) colored by flourosopy ]
#     - restecg           : resting electrocardiographic results
#                             -- Value 0: normal
#                             -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
#                             -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria


# 4. What is the average value of 'oldpeak' before and after imputation ?



# 1. We describe features as numeric or categorical

## ----------- Complete ------------ ## 
# numeric_features = [' ... ', '... ']
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy=' ... '))])

# categorical_features = ['...','...']
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='...'))])


## We combine both "categorical" and "numerical" imputer in a preprocessor. 
## We use a column transformer to apply the transformation.

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

## Apply the pre-processor pipeline on the reduced data frame. 

X_reduced = heart_df.loc[:, numeric_features + categorical_features ]

print( " Old Mean : " +  str( X_reduced['oldpeak'].mean() ) )

clf = Pipeline(steps=[('preprocessor', preprocessor)])
new_X_reduced = clf.fit_transform(X_reduced)

new_X_reduced_df = pd.DataFrame( new_X_reduced )
new_X_reduced_df.columns = [ numeric_features + categorical_features ]

print( " New Mean : " +  str( new_X_reduced_df['oldpeak'].mean() ) )

### 5. Instead of "SimpleImputer" use Use KNN Imputer for the following two numeric features. 
   - oldpeak     
   - trestbps

### 6. What is the average value of 'oldpeak' before and after imputation ?

In [None]:
# 5. Instead of "SimpleImputer" use Use KNN Imputer for the following two numeric features. 
#     - oldpeak
#     - trestbps

# 6. What is the average value of 'oldpeak' before and after imputation ?


## ----------- Complete ------------ ##
# numeric_features = ['...', '...']
# numeric_transformer = Pipeline(steps=[
#     ('imputer', KNNImputer( ... ))])


## We separate the categorical features
## For categorical variables we will use a simple imputer with a constant = "missing"
categorical_features = ['ca','restecg']

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', fill_value=-1))])


## We combine both "categorical" and "numerical" imputer in a preprocessor. 
## We use a column transformer to apply the transformation.

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

## Apply the pre-processor pipeline on the reduced data frame. 

X_reduced = heart_df.loc[:, numeric_features + categorical_features ]

print( " Old Mean : " +  str( X_reduced['oldpeak'].mean() ) )

clf = Pipeline(steps=[('preprocessor', preprocessor)])
new_X_reduced = clf.fit_transform(X_reduced)

new_X_reduced_df = pd.DataFrame( new_X_reduced )
new_X_reduced_df.columns = [ numeric_features + categorical_features ]

print( " New Mean : " +  str( new_X_reduced_df['oldpeak'].mean() ) )