In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Toy frame mixing nominal ('city'), binary ('boolean'), ordinal and numeric
# columns, with one missing value in each of 'city', 'boolean' and
# 'quantitative_column'.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
X = pd.DataFrame({'city':['tokyo', np.nan, 'london', 'seattle', 'san francisco', 'tokyo'],
                  'boolean':['yes', 'no', np.nan, 'no', 'no', 'yes'],
                  'ordinal_column':['somewhat like', 'like', 'somewhat like', 'like', 'somewhat like', 'dislike'],
                  'quantitative_column':[1, 11, -.5, 10, np.nan, 20]})

In [3]:
X

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,,no,like,11.0
2,london,,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,
5,tokyo,yes,dislike,20.0


In [4]:
X.isnull().sum()

city                   1
boolean                1
ordinal_column         0
quantitative_column    1
dtype: int64

In [5]:
X['city'].value_counts()

tokyo            2
seattle          1
london           1
san francisco    1
Name: city, dtype: int64

In [6]:
type(X['city'].value_counts())

pandas.core.series.Series

In [7]:
X['city'].value_counts().index[0]

'tokyo'

In [8]:
X['city'].fillna(X['city'].value_counts().index[0])

0            tokyo
1            tokyo
2           london
3          seattle
4    san francisco
5            tokyo
Name: city, dtype: object

# Build our own custom category imputer
Please note: I have written a custom imputer because the built-in imputer's mode (most-frequent) strategy can be used only with quantitative features, not with qualitative ones.
Alternatively, `SimpleImputer(strategy="most_frequent")` from `sklearn.impute` can be used for the same purpose.

In [9]:
#Build own Custom category imputer
from sklearn.base import TransformerMixin
#We inherit from scikit-learn's TransformerMixin class, which provides a .fit_transform method that calls
#the .fit and .transform methods we define below. This lets our transformer follow the same structure as scikit-learn's own.
class CustomCategoryImputer(TransformerMixin):
    """Fill missing values in the chosen columns with each column's most frequent value.

    Parameters
    ----------
    cols : list of str, optional
        Columns to impute. ``None`` (the default) means every column of the
        frame passed to ``transform``.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def transform(self, df):
        """Return a copy of ``df`` with missing values in ``cols`` replaced by the mode.

        The most frequent value is read from ``df`` itself, as the first index
        entry of ``value_counts()``. A column with no non-missing values has
        no mode and is left unchanged. ``df`` is never modified.
        """
        X = df.copy()
        # Resolve the default per call so the transformer is not tied to one frame.
        cols = list(X.columns) if self.cols is None else self.cols
        for col in cols:
            counts = X[col].value_counts()
            if counts.empty:
                # Nothing observed in this column, so there is nothing to fill with.
                continue
            # Assign the result back instead of calling fillna(inplace=True) on
            # X[col]: the chained in-place form acts on a temporary and does not
            # update X when pandas Copy-on-Write is active.
            X[col] = X[col].fillna(counts.index[0])
        return X

    def fit(self, *_):
        """Do nothing and return ``self``, as is standard for scikit-learn ``fit`` methods."""
        return self

In [10]:
list(X.columns)

['city', 'boolean', 'ordinal_column', 'quantitative_column']

In [11]:
#cci = CustomCategoryImputer(cols=list(X.columns))
cci = CustomCategoryImputer(cols=['city', 'boolean'])

In [12]:
cci.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,tokyo,no,like,11.0
2,london,no,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,
5,tokyo,yes,dislike,20.0


# Custom quantitative imputer

In [13]:
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# sklearn.impute.SimpleImputer is its replacement.
from sklearn.impute import SimpleImputer
#impute = SimpleImputer(strategy='mean')
#X['quantitative_column'] = impute.fit_transform(X[['quantitative_column']]).ravel()

In [14]:
X

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,,no,like,11.0
2,london,,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,
5,tokyo,yes,dislike,20.0


In [15]:
import copy
xx = copy.deepcopy(X)

In [16]:
xx['quant'] = [10,10,10,12,None,12]

In [17]:
xx

Unnamed: 0,city,boolean,ordinal_column,quantitative_column,quant
0,tokyo,yes,somewhat like,1.0,10.0
1,,no,like,11.0,10.0
2,london,,somewhat like,-0.5,10.0
3,seattle,no,like,10.0,12.0
4,san francisco,no,somewhat like,,
5,tokyo,yes,dislike,20.0,12.0


In [18]:
# Mean-impute each numeric column separately. SimpleImputer replaces
# sklearn.preprocessing.Imputer, which was removed in scikit-learn 0.22.
from sklearn.impute import SimpleImputer
impute = SimpleImputer(strategy='mean')
for cols in ['quantitative_column', 'quant']:
    # fit_transform returns an (n_rows, 1) array; flatten it to a 1-D column.
    xx[cols] = impute.fit_transform(xx[[cols]]).ravel()

In [19]:
xx

Unnamed: 0,city,boolean,ordinal_column,quantitative_column,quant
0,tokyo,yes,somewhat like,1.0,10.0
1,,no,like,11.0,10.0
2,london,,somewhat like,-0.5,10.0
3,seattle,no,like,10.0,12.0
4,san francisco,no,somewhat like,8.3,10.8
5,tokyo,yes,dislike,20.0,12.0


In [20]:
X

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,,no,like,11.0
2,london,,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,
5,tokyo,yes,dislike,20.0


In [21]:
# SimpleImputer replaces sklearn.preprocessing.Imputer (removed in scikit-learn 0.22).
from sklearn.impute import SimpleImputer


class CustomQuantitativeImputer(TransformerMixin):
    """Impute missing values in numeric columns with a per-column statistic.

    Parameters
    ----------
    cols : list of str, optional
        Columns to impute. ``None`` (the default) means every numeric column
        of the frame passed to ``transform``.
    strategy : str, default 'mean'
        Strategy name forwarded to ``SimpleImputer``.
    """

    def __init__(self, cols=None, strategy='mean'):
        self.cols = cols
        self.strategy = strategy

    def transform(self, df):
        """Return a copy of ``df`` with missing values in ``cols`` imputed.

        The statistic is computed from ``df`` itself, one column at a time.
        Columns with no observed values are left unchanged.
        """
        X = df.copy()
        if self.cols is None:
            cols = list(X.select_dtypes(include='number').columns)
        else:
            cols = self.cols
        for col in cols:
            if X[col].isnull().all():
                # SimpleImputer drops all-missing features by default, which
                # would yield an empty array that cannot be assigned back.
                continue
            impute = SimpleImputer(strategy=self.strategy)
            # fit_transform returns an (n_rows, 1) array; flatten it to 1-D.
            X[col] = impute.fit_transform(X[[col]]).ravel()
        return X

    def fit(self, *_):
        """Do nothing and return ``self``; all work happens in ``transform``."""
        return self

# Mix of qualitative and quantitative
Please note: I have written a custom imputer because the built-in imputer's mode (most-frequent) strategy can be used only with quantitative features, not with qualitative ones.
Alternatively, `SimpleImputer(strategy="most_frequent")` from `sklearn.impute` can be used for the same purpose.
This custom imputer can be used for both qualitative and quantitative columns.
Also, the scikit-learn imputer can either be applied to the whole DataFrame (if all columns are quantitative) or inside a for loop over a list of columns of the same type.
The custom imputer, by contrast, can be used with any combination of columns.

In [22]:
# SimpleImputer replaces sklearn.preprocessing.Imputer (removed in scikit-learn 0.22).
from sklearn.impute import SimpleImputer


class CustomImputer(TransformerMixin):
    """Impute missing values in a mix of qualitative and quantitative columns.

    Numeric columns are filled by ``SimpleImputer`` using ``strategy``; every
    other column is filled with its most frequent value.

    Parameters
    ----------
    cols : list of str, optional
        Columns to impute. ``None`` (the default) means every column of the
        frame passed to ``transform``.
    strategy : str, default 'mean'
        Strategy name forwarded to ``SimpleImputer`` for numeric columns.
    """

    def __init__(self, cols=None, strategy='mean'):
        self.cols = cols
        self.strategy = strategy

    def transform(self, df):
        """Return an imputed copy of ``df``; ``df`` itself is not modified.

        Fill values are computed from ``df`` itself, one column at a time.
        Columns with no observed values are left unchanged.
        """
        X = df.copy()
        # Resolve the default per call instead of writing it back to self.cols,
        # which would pin the transformer to the first frame's columns.
        cols = list(X.columns) if self.cols is None else self.cols
        for col in cols:
            if X[col].isnull().all():
                # No observed value: neither a mode nor a statistic exists.
                continue
            if pd.api.types.is_numeric_dtype(X[col]):
                impute = SimpleImputer(strategy=self.strategy)
                # fit_transform returns an (n_rows, 1) array; flatten it to 1-D.
                X[col] = impute.fit_transform(X[[col]]).ravel()
            else:
                # Assign back rather than fillna(inplace=True) on X[col], which
                # acts on a temporary when pandas Copy-on-Write is active.
                X[col] = X[col].fillna(X[col].value_counts().index[0])
        return X

    def fit(self, *_):
        """Do nothing and return ``self``; all work happens in ``transform``."""
        return self

In [23]:
X['city'].dtype

dtype('O')

In [24]:
X['quantitative_column'].dtype

dtype('float64')

In [25]:
sd = CustomImputer(['quantitative_column'],'median')

In [26]:
sd.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,,no,like,11.0
2,london,,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,10.0
5,tokyo,yes,dislike,20.0


In [27]:
X

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,,no,like,11.0
2,london,,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,
5,tokyo,yes,dislike,20.0


In [28]:
# import Pipeline from sklearn
from sklearn.pipeline import Pipeline
# First step: impute only 'quantitative_column', using the mean strategy.
cqi = CustomImputer(cols=['quantitative_column'], strategy='mean')
# Second step: impute only 'city' and 'boolean'; strategy is left at its default.
cci = CustomImputer(cols=['city', 'boolean'])
# Chain the two steps under the names 'quant' and 'category'.
imputer = Pipeline([('quant', cqi), ('category', cci)]) 
imputer.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,tokyo,no,like,11.0
2,london,no,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,8.3
5,tokyo,yes,dislike,20.0


In [29]:
call = CustomImputer(list(X.columns))
X_filled = call.fit_transform(X)
X_filled

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,tokyo,no,like,11.0
2,london,no,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,8.3
5,tokyo,yes,dislike,20.0


In [30]:
call = CustomImputer()
call.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,tokyo,no,like,11.0
2,london,no,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,8.3
5,tokyo,yes,dislike,20.0


In [31]:
call = CustomImputer(['city'])
call.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,tokyo,no,like,11.0
2,london,,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,
5,tokyo,yes,dislike,20.0


In [32]:
# Square every element of the list, using a comprehension.
rr = [1, 2, 3, 4]
rr = [value ** 2 for value in rr]
rr

[1, 4, 9, 16]

# Encoding at the ordinal level

In [33]:
class CustomEncoder(TransformerMixin):
    """Encode an ordinal column as the integer position of each category.

    Parameters
    ----------
    col : str
        Name of the column to encode.
    ordering : list, optional
        Categories from lowest to highest rank; a category's code is its
        position in this list. Must be set before ``transform`` is called.
    """

    def __init__(self, col, ordering=None):
        self.ordering = ordering
        self.col = col

    def transform(self, df):
        """Return a copy of ``df`` with ``col`` replaced by integer codes.

        Missing values stay missing. A non-missing value that is absent from
        ``ordering`` raises ``ValueError``.
        """
        if self.ordering is None:
            raise ValueError(
                "CustomEncoder needs an 'ordering' list to encode column %r" % (self.col,)
            )
        X = df.copy()
        # Build the lookup once; setdefault keeps the first position of a
        # repeated category, matching list.index.
        codes = {}
        for position, category in enumerate(self.ordering):
            codes.setdefault(category, position)
        values = X[self.col]
        unknown = values[values.notna() & ~values.isin(list(codes))]
        if not unknown.empty:
            raise ValueError(
                "%r is not in ordering %r" % (unknown.iloc[0], list(self.ordering))
            )
        X[self.col] = values.map(codes)
        return X

    def fit(self, *_):
        """Do nothing and return ``self``; the ordering is supplied up front."""
        return self

In [34]:
ce = CustomEncoder(col='ordinal_column', ordering = ['dislike', 'somewhat like', 'like'])
ce.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,1,1.0
1,,no,2,11.0
2,london,,1,-0.5
3,seattle,no,2,10.0
4,san francisco,no,1,
5,tokyo,yes,0,20.0


# Encoding at the nominal level

In [35]:
# create our custom dummifier
class CustomDummifier(TransformerMixin):
    """One-hot encode selected columns with ``pd.get_dummies``, dropping the first level."""

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, *_):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the columns in ``self.cols`` replaced by dummy columns."""
        target_columns = self.cols
        dummified = pd.get_dummies(X, drop_first=True, columns=target_columns)
        return dummified

In [36]:
# Ideally there is no need to create a custom dummifier; get_dummies accepts multiple columns directly.
# pd.get_dummies(X,columns = ['city', 'boolean'],drop_first=True)
nl = CustomDummifier(['boolean'])
nl.fit_transform(X)

Unnamed: 0,city,ordinal_column,quantitative_column,boolean_yes
0,tokyo,somewhat like,1.0,1
1,,like,11.0,0
2,london,somewhat like,-0.5,0
3,seattle,like,10.0,0
4,san francisco,somewhat like,,0
5,tokyo,dislike,20.0,1
