In [1]:
import pandas as pd

In [2]:
# Small Titanic-style sample: two numeric columns (Fare, Age) and two
# categorical columns (Embarked, Gender). The last passenger has no Age.
df = pd.DataFrame(
    dict(
        Fare=[25, 48, 71, 85, 90, 120],
        Embarked=['S', 'C', 'S', 'S', 'C', 'Q'],
        Gender=['M', 'F', 'F', 'F', 'M', 'M'],
        Age=[22, 34, 54, 29, 55, None],
    )
)
df

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,S,M,22.0
1,48,C,F,34.0
2,71,S,F,54.0
3,85,S,F,29.0
4,90,C,M,55.0
5,120,Q,M,


## Encoders and Imputers

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Encoder that replaces each distinct label with an integer code.
lab_enc = LabelEncoder()

In [5]:
# Learn the distinct Embarked labels and encode them in one call. The result
# is a plain array of integer codes (in the output below: C -> 0, Q -> 1,
# S -> 2); it is wrapped in a Series only to get a readable display.
df2 = lab_enc.fit_transform(df['Embarked'])
pd.Series(df2)

0    2
1    0
2    2
3    2
4    0
5    1
dtype: int32

In [7]:
# Build a new frame that carries the encoded column instead of assigning into
# `df`. The raw string labels stay available, the table displayed earlier is
# still an accurate view of `df`, and the label-encoding cell above can be
# re-run without being fed already-encoded integers.
df_encoded = df.assign(Embarked=df2)
df_encoded

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,2,M,22.0
1,48,0,F,34.0
2,71,2,F,54.0
3,85,2,F,29.0
4,90,0,M,55.0
5,120,1,M,


In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

In [9]:
# Transformer instances with default settings; they are wired to specific
# columns by the column transformer built further down.
ohe = OneHotEncoder()  # one indicator column per category value
si = SimpleImputer()  # in the output further down the missing Age becomes 38.8, the mean of the known ages

In [10]:
# Start again from the raw sample, with string-valued categorical columns
# and one missing Age, as input for the column transformer.
df = pd.DataFrame(
    dict(
        Fare=[25, 48, 71, 85, 90, 120],
        Embarked=['S', 'C', 'S', 'S', 'C', 'Q'],
        Gender=['M', 'F', 'F', 'F', 'M', 'M'],
        Age=[22, 34, 54, 29, 55, None],
    )
)
df

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,S,M,22.0
1,48,C,F,34.0
2,71,S,F,54.0
3,85,S,F,29.0
4,90,C,M,55.0
5,120,Q,M,


In [11]:
# Route each column to its transformer: the categorical columns go to the
# one-hot encoder, Age goes to the imputer, and every column that is not
# listed (here Fare) is passed through unchanged.
ct = make_column_transformer(
    (ohe, ['Embarked', 'Gender']),
    (si, ['Age']),
    remainder='passthrough',
)

In [13]:
# Fit all transformers and return a single combined array. Column order in
# the output below: one-hot Embarked (C, Q, S), one-hot Gender (F, M), the
# imputed Age, then the passed-through Fare.
ct.fit_transform(df)

array([[  0. ,   0. ,   1. ,   0. ,   1. ,  22. ,  25. ],
       [  1. ,   0. ,   0. ,   1. ,   0. ,  34. ,  48. ],
       [  0. ,   0. ,   1. ,   1. ,   0. ,  54. ,  71. ],
       [  0. ,   0. ,   1. ,   1. ,   0. ,  29. ,  85. ],
       [  1. ,   0. ,   0. ,   0. ,   1. ,  55. ,  90. ],
       [  0. ,   1. ,   0. ,   0. ,   1. ,  38.8, 120. ]])

### Ordinal Encoder

In [14]:
from sklearn.preprocessing import OrdinalEncoder

In [15]:
# Toy frame for ordinal encoding: Class and Size have a natural order,
# Shape does not.
df = pd.DataFrame(
    dict(
        Shape=['square', 'oval', 'square', 'circle'],
        Class=['third', 'first', 'second', 'first'],
        Size=['M', 'S', 'XL', 'M'],
    )
)
df

Unnamed: 0,Shape,Class,Size
0,square,third,M
1,oval,first,S
2,square,second,XL
3,circle,first,M


In [17]:
# Spell out the category order per column so the integer codes follow the
# intended ranking rather than whatever order the encoder would infer.
ord_enc = OrdinalEncoder(
    categories=[
        ['first', 'second', 'third'],  # ranking used for 'Class'
        ['S', 'M', 'XL'],              # ranking used for 'Size'
    ]
)
df1 = ord_enc.fit_transform(df[['Class', 'Size']])

In [18]:
# Encoded result as a 2-column array: column 0 is Class (first=0, second=1,
# third=2) and column 1 is Size (S=0, M=1, XL=2).
df1

array([[2., 1.],
       [0., 0.],
       [1., 2.],
       [0., 1.]])

In [19]:
# The source frame still holds the original strings: the encoder returned a
# separate array and did not modify `df`.
df

Unnamed: 0,Shape,Class,Size
0,square,third,M
1,oval,first,S
2,square,second,XL
3,circle,first,M


### Binary Encoder

In [22]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.0-py2.py3-none-any.whl (81 kB)
Installing collected packages: category-encoders
Successfully installed category-encoders-2.6.0


In [23]:
from category_encoders import BinaryEncoder

In [24]:
# Single categorical column with five distinct values (A-E), some repeated.
df = pd.DataFrame(dict(Cat_data=['A', 'B', 'C', 'D', 'E', 'A', 'A', 'D']))
df

Unnamed: 0,Cat_data
0,A
1,B
2,C
3,D
4,E
5,A
6,A
7,D


In [25]:
# Binary encoder with default settings.
bi_enc = BinaryEncoder()

In [26]:
# Encode Cat_data as binary digits: the five distinct values fit into three
# 0/1 columns (Cat_data_0 .. Cat_data_2), and a repeated value always gets
# the same bit pattern (compare the rows for 'A' in the output below).
df_bi = bi_enc.fit_transform(df)
df_bi

Unnamed: 0,Cat_data_0,Cat_data_1,Cat_data_2
0,0,0,1
1,0,1,0
2,0,1,1
3,1,0,0
4,1,0,1
5,0,0,1
6,0,0,1
7,1,0,0


### Comparing with OHE

In [27]:
# Dense one-hot output for comparison: five categories need five columns,
# versus the three columns produced by the binary encoder above.
# `sparse_output` is the current name of this option (scikit-learn >= 1.2);
# the older `sparse` keyword was deprecated and then removed, so passing it
# fails on recent releases.
ohe = OneHotEncoder(sparse_output=False)
ohe.fit_transform(df[['Cat_data']])

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

### KNN Imputer

In [28]:
# Fresh copy of the passenger sample for the KNN imputation example; the
# last row's Age is the value to be filled in.
df = pd.DataFrame(
    dict(
        Fare=[25, 48, 71, 85, 90, 120],
        Embarked=['S', 'C', 'S', 'S', 'C', 'Q'],
        Gender=['M', 'F', 'F', 'F', 'M', 'M'],
        Age=[22, 34, 54, 29, 55, None],
    )
)
df

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,S,M,22.0
1,48,C,F,34.0
2,71,S,F,54.0
3,85,S,F,29.0
4,90,C,M,55.0
5,120,Q,M,


In [29]:
from sklearn.impute import KNNImputer

In [30]:
# Fill the missing Age from the 2 nearest rows, using only the numeric
# columns Fare and Age. In the output below the gap becomes 42, the average
# of 55 and 29 -- the ages of the two rows whose Fare (90 and 85) is closest
# to the incomplete row's Fare of 120.
knn_ipm = KNNImputer(n_neighbors = 2)
knn_ipm.fit_transform(df[['Fare', 'Age']])

array([[ 25.,  22.],
       [ 48.,  34.],
       [ 71.,  54.],
       [ 85.,  29.],
       [ 90.,  55.],
       [120.,  42.]])

### Iterative Imputer

In [31]:
# IterativeImputer is experimental: the enabling import below must run before it can be imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import IterativeImputer

In [32]:
# Fresh copy of the passenger sample for the iterative imputation example;
# the last row's Age is the value to be filled in.
df = pd.DataFrame(
    dict(
        Fare=[25, 48, 71, 85, 90, 120],
        Embarked=['S', 'C', 'S', 'S', 'C', 'Q'],
        Gender=['M', 'F', 'F', 'F', 'M', 'M'],
        Age=[22, 34, 54, 29, 55, None],
    )
)
df

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,S,M,22.0
1,48,C,F,34.0
2,71,S,F,54.0
3,85,S,F,29.0
4,90,C,M,55.0
5,120,Q,M,


In [33]:
# IterativeImputer with default settings on the two numeric columns: the
# missing Age is estimated from the other column supplied (Fare). In the
# output below the gap is filled with roughly 52.04.
iter_impute = IterativeImputer()
iter_impute.fit_transform(df[['Fare', 'Age']])

array([[ 25.        ,  22.        ],
       [ 48.        ,  34.        ],
       [ 71.        ,  54.        ],
       [ 85.        ,  29.        ],
       [ 90.        ,  55.        ],
       [120.        ,  52.03920049]])