# MedianImputer
This notebook demonstrates the functionality of the MedianImputer class. This transformer fills the null values in each specified column with that column's median value.

In [1]:
import pandas as pd
import numpy as np

In [2]:
import tubular
from tubular.imputers import MedianImputer

In [3]:
# Display the installed tubular version so readers can tell which release produced the outputs below.
tubular.__version__

'0.2.8'

## Load Boston house price dataset from sklearn
Note: the `prepare_boston_df` function used below modifies the original Boston dataset to include null values and pandas categorical dtypes.

In [4]:
# Build the example DataFrame with tubular's test-data helper, then display its (rows, columns) shape.
boston_df = tubular.testing.test_data.prepare_boston_df()
boston_df.shape

(506, 17)

In [5]:
# Preview the first rows; the missing entries are the nulls that the imputer will fill.
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target,ZN_cat,CHAS_cat,RAD_cat
0,0.00632,18.0,2.31,0.0,0.538,6.575,,4.09,,296.0,15.3,396.9,4.98,24.0,18.0,0.0,
1,0.02731,,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6,,0.0,2.0
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,,17.8,392.83,4.03,34.7,0.0,0.0,2.0
3,,,2.18,0.0,0.458,,45.8,6.0622,3.0,222.0,18.7,,,33.4,,0.0,3.0
4,0.06905,0.0,2.18,0.0,0.458,,,6.0622,3.0,222.0,18.7,396.9,5.33,36.2,0.0,0.0,3.0


In [6]:
# Count the missing values in every column before any imputation is applied.
boston_df.isna().sum()

CRIM        55
ZN          62
INDUS        0
CHAS         0
NOX         44
RM          56
AGE         42
DIS         51
RAD         62
TAX         52
PTRATIO     56
B           50
LSTAT       49
target       0
ZN_cat      62
CHAS_cat     0
RAD_cat     62
dtype: int64

## Simple usage

### Initialising MedianImputer

In [7]:
# Set up an imputer for three numeric columns. verbose=True appears to make the
# transformer print a message when its methods are called (see the cell outputs).
imp_1 = MedianImputer(columns=['CRIM', 'NOX', 'RM'], copy=True, verbose=True)

BaseTransformer.__init__() called


### MedianImputer fit
The fit method must be run before the transform method. It computes the median value from each of the specified columns. <br>
The median values are stored in an attribute called 'impute_values_'.

In [8]:
# Learn the per-column medians from boston_df; the MedianImputer repr shown below is the value returned by fit.
imp_1.fit(boston_df)

BaseTransformer.fit() called


MedianImputer(columns=['CRIM', 'NOX', 'RM'])

In [9]:
# Inspect the fitted medians: a dict mapping each column name to the value used for imputation.
imp_1.impute_values_

{'CRIM': 0.25199, 'NOX': 0.538, 'RM': 6.1835}

In [10]:
# Cross-check with pandas: these medians should match the values stored in imp_1.impute_values_.
boston_df[['CRIM', 'NOX', 'RM']].median()

CRIM    0.25199
NOX     0.53800
RM      6.18350
dtype: float64

### MedianImputer transform
Multiple columns were specified when creating imp_1, so all of these columns will be imputed when the transform method is run.

In [11]:
# Fill the nulls using the fitted medians and keep the result in a new variable.
# NOTE(review): copy=True was passed at init, so boston_df is presumably left unmodified — confirm.
boston_df_2 = imp_1.transform(boston_df)

BaseTransformer.transform() called


In [12]:
# Confirm that no nulls remain in the imputed columns.
boston_df_2[['CRIM', 'NOX', 'RM']].isna().sum()

CRIM    0
NOX     0
RM      0
dtype: int64

In [13]:
# For each imputed column, count how many values equal the fitted median.
# The count can exceed the column's original null count, because values that
# already equalled the median before imputation are counted as well.
for col in ['CRIM', 'NOX', 'RM']:
    equals_median = boston_df_2[[col]].eq(imp_1.impute_values_[col])
    print(col, equals_median.sum(), '\n')

CRIM CRIM    56
dtype: int64 

NOX NOX    66
dtype: int64 

RM RM    56
dtype: int64 

