# ModeImputer
This notebook demonstrates the `ModeImputer` class, a transformer that fills null values in each specified column with that column's mode (its most frequent value).

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

In [2]:
import tubular
from tubular.imputers import ModeImputer

In [3]:
# Display the installed tubular version that produced the outputs in this notebook.
tubular.__version__

'0.3.0'

## Load California housing dataset from sklearn

In [4]:
cali = fetch_california_housing()

# Build the frame, then blank out a small share of three columns so there is
# something to impute: each column is replaced by a seeded sample of itself,
# and index alignment on assignment leaves the unsampled rows null.
cali_df = pd.DataFrame(cali["data"], columns=cali["feature_names"]).assign(
    AveOccup=lambda df: df["AveOccup"].sample(frac=0.99, random_state=1),
    HouseAge=lambda df: df["HouseAge"].sample(frac=0.95, random_state=2),
    Population=lambda df: df["Population"].sample(frac=0.995, random_state=3),
)

In [5]:
# Preview the first few rows of the prepared frame.
cali_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [6]:
# Null count per column; only the three columns sampled above should be non-zero.
cali_df.isna().sum()

MedInc           0
HouseAge      1032
AveRooms         0
AveBedrms        0
Population     103
AveOccup       206
Latitude         0
Longitude        0
dtype: int64

## Simple usage

### Initialising ModeImputer

In [7]:
# With verbose=True the transformer prints a message as its methods run
# (the "BaseTransformer...called" lines in the cell outputs).
# NOTE(review): copy=True presumably makes transform operate on a copy of the
# input frame - confirm against the tubular documentation.
imp_1 = ModeImputer(
    columns=["HouseAge", "AveOccup", "Population"],
    copy=True,
    verbose=True,
)

BaseTransformer.__init__() called


### ModeImputer fit
The `fit` method must be run before the `transform` method. It computes the mode of each of the specified columns. <br>
The mode values are stored in an attribute called `impute_values_`.

In [8]:
# Learn the mode of each configured column from cali_df. The call is the last
# expression in the cell, so the fitted transformer's repr is displayed.
imp_1.fit(cali_df)

BaseTransformer.fit() called


ModeImputer(columns=['HouseAge', 'AveOccup', 'Population'])

In [9]:
# Modes learned by fit, keyed by column name.
imp_1.impute_values_

{'HouseAge': 52.0, 'AveOccup': 3.0, 'Population': 891.0}

In [10]:
# Cross-check with pandas: the first row of DataFrame.mode() holds each column's
# mode and should agree with imp_1.impute_values_.
cali_df.loc[:, ["HouseAge", "AveOccup", "Population"]].mode().iloc[0]

HouseAge       52.0
AveOccup        3.0
Population    891.0
Name: 0, dtype: float64

### ModeImputer transform
Multiple columns were specified when creating `imp_1`, so all of these columns will have their null values imputed when the `transform` method is run.

In [11]:
# Fill the nulls in the configured columns with the modes learned during fit.
cali_df_2 = imp_1.transform(cali_df)

BaseTransformer.transform() called


In [12]:
# After transform, none of the imputed columns should contain nulls.
cali_df_2[["HouseAge", "AveOccup", "Population"]].isna().sum()

HouseAge      0
AveOccup      0
Population    0
dtype: int64

In [13]:
# For each imputed column, count the rows now equal to the imputation value
# (rows that already held the mode plus the nulls that were filled).
for col in ["HouseAge", "AveOccup", "Population"]:
    equals_mode = cali_df_2[[col]] == imp_1.impute_values_[col]
    print(col, equals_mode.sum(), "\n")

HouseAge HouseAge    2247
dtype: int64 

AveOccup AveOccup    241
dtype: int64 

Population Population    128
dtype: int64 

