In [5]:
import pandas as pd

In [6]:
# Tiny Titanic-style sample: two categorical columns, a numeric fare,
# and an Age column whose last entry is missing (None becomes NaN).
df = pd.DataFrame(dict(
    Fare=[25, 48, 71, 85, 90, 120],
    Embarked=['S', 'C', 'S', 'S', 'C', 'Q'],
    Gender=['Male', 'Female', 'Female', 'Female', 'Male', 'Male'],
    Age=[22.0, 34.0, 54, 29, 55, None],
))
df

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,S,Male,22.0
1,48,C,Female,34.0
2,71,S,Female,54.0
3,85,S,Female,29.0
4,90,C,Male,55.0
5,120,Q,Male,


### Encoders & imputers

In [7]:
from sklearn.preprocessing import LabelEncoder

# Label encoder instance; it is fitted on the Embarked column in the next cell
le=LabelEncoder()

In [9]:
# Learn the distinct Embarked values and replace each one with its integer code,
# then wrap the resulting array in a Series for display.
df_emb = le.fit_transform(df['Embarked'])
pd.Series(df_emb)

0    2
1    0
2    2
3    2
4    0
5    1
dtype: int32

In [10]:
# Work on an independent copy. A plain `df2 = df` only creates a second name for
# the same frame, so assigning the encoded column would also overwrite Embarked
# in `df`, which later cells still use.
df2 = df.copy()
# Bracket assignment is the explicit way to replace a column's values.
df2['Embarked'] = df_emb
df2

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,2,Male,22.0
1,48,0,Female,34.0
2,71,2,Female,54.0
3,85,2,Female,29.0
4,90,0,Male,55.0
5,120,1,Male,


In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

In [12]:
# Building blocks: one-hot encoding for the categorical columns and a
# SimpleImputer (default settings) to fill the missing Age value.
ohe = OneHotEncoder()
si = SimpleImputer()

# Route each column group to its transformer; any column not listed here
# (Fare) is passed through unchanged.
ct = make_column_transformer(
    (ohe, ['Embarked', 'Gender']),
    (si, ['Age']),
    remainder='passthrough',
)

In [14]:
# Fit and apply in one step. Output columns: the one-hot columns for Embarked and
# Gender, then the imputed Age, then the passthrough Fare.
ct.fit_transform(df)

array([[  0. ,   0. ,   1. ,   0. ,   1. ,  22. ,  25. ],
       [  1. ,   0. ,   0. ,   1. ,   0. ,  34. ,  48. ],
       [  0. ,   0. ,   1. ,   1. ,   0. ,  54. ,  71. ],
       [  0. ,   0. ,   1. ,   1. ,   0. ,  29. ,  85. ],
       [  1. ,   0. ,   0. ,   0. ,   1. ,  55. ,  90. ],
       [  0. ,   1. ,   0. ,   0. ,   1. ,  38.8, 120. ]])

### Ordinal Encoder

In [15]:
from sklearn.preprocessing import OrdinalEncoder

In [16]:
# Toy frame for ordinal encoding: Class and Size have a natural order,
# Shape does not.
df = pd.DataFrame(dict(
    Shape=['square', 'oval', 'square', 'circle'],
    Class=['third', 'first', 'second', 'first'],
    Size=["M", "S", "XL", "M"],
))
df

Unnamed: 0,Shape,Class,Size
0,square,third,M
1,oval,first,S
2,square,second,XL
3,circle,first,M


In [17]:
# Spell out the category order for each encoded column so the integer codes
# follow the intended ranking; the lists are matched to columns by position.
Oe = OrdinalEncoder(
    categories=[
        ['first', 'second', 'third'],  # order for the first column passed to fit
        ['S', 'M', 'XL'],              # order for the second column passed to fit
    ]
)

In [18]:
# Encode only the two ordered columns (Class, then Size); Shape is not passed in
df1=Oe.fit_transform(df[['Class','Size']])

In [19]:
# Encoded result: column 0 is Class (first=0, second=1, third=2),
# column 1 is Size (S=0, M=1, XL=2)
df1

array([[2., 1.],
       [0., 0.],
       [1., 2.],
       [0., 1.]])

In [20]:
# The source frame still holds the original strings; the encoded values live only in df1
df

Unnamed: 0,Shape,Class,Size
0,square,third,M
1,oval,first,S
2,square,second,XL
3,circle,first,M


### Binary Encoder

In [21]:
!pip install category_encoders



In [22]:
from category_encoders import BinaryEncoder

In [23]:
# Binary encoder created with its default settings
bin_enc=BinaryEncoder()

In [24]:
# Twelve single-letter categories (nine distinct values, A and D repeated)
# placed in a one-column frame.
col = list("ABCDEFGHIADA")
df = pd.DataFrame({'Col': col})
df

Unnamed: 0,Col
0,A
1,B
2,C
3,D
4,E
5,F
6,G
7,H
8,I
9,A


In [25]:
# Fit the binary encoder on the frame and keep the encoded result for display;
# Col is replaced by a handful of 0/1 columns (Col_0 ... Col_3).
df_bin = bin_enc.fit_transform(df)
df_bin

Unnamed: 0,Col_0,Col_1,Col_2,Col_3
0,0,0,0,1
1,0,0,1,0
2,0,0,1,1
3,0,1,0,0
4,0,1,0,1
5,0,1,1,0
6,0,1,1,1
7,1,0,0,0
8,1,0,0,1
9,0,0,0,1


### Comparing OneHotEncoder with BinaryEncoder

In [26]:
ohe=OneHotEncoder(sparse=False)
ohe.fit_transform(df[['Col']])

array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.]])

### KNN Imputer

In [27]:
!pip install -U scikit-learn


Requirement already up-to-date: scikit-learn in c:\users\p.c\appdata\local\continuum\anaconda3\lib\site-packages (1.0.1)


In [28]:
# Nothing to install here: KNNImputer is a class inside scikit-learn
# (sklearn.impute), not a standalone PyPI package, so
# `pip install KNNImputer` can only fail with "No matching distribution found".

Collecting KNNImputer


  Could not find a version that satisfies the requirement KNNImputer (from versions: )
No matching distribution found for KNNImputer


In [29]:
from sklearn.impute import KNNImputer

In [30]:
# Rebuild the passenger sample with its original string categories and the
# missing Age in the last row, ready for the imputers below.
df = pd.DataFrame(dict(
    Fare=[25, 48, 71, 85, 90, 120],
    Embarked=['S', 'C', 'S', 'S', 'C', 'Q'],
    Gender=['Male', 'Female', 'Female', 'Female', 'Male', 'Male'],
    Age=[22.0, 34.0, 54, 29, 55, None],
))
df

Unnamed: 0,Fare,Embarked,Gender,Age
0,25,S,Male,22.0
1,48,C,Female,34.0
2,71,S,Female,54.0
3,85,S,Female,29.0
4,90,C,Male,55.0
5,120,Q,Male,


In [33]:
# Impute the missing Age from the 3 nearest rows, using only the two numeric
# columns. The incomplete row has just Fare observed, so its neighbours are the
# rows with the closest fares.
knn_imp = KNNImputer(n_neighbors=3)
knn_imp.fit_transform(df[['Fare', 'Age']])

array([[ 25.,  22.],
       [ 48.,  34.],
       [ 71.,  54.],
       [ 85.,  29.],
       [ 90.,  55.],
       [120.,  46.]])

### Iterative Imputer

In [34]:
# Nothing to install here: sklearn.experimental is a submodule that ships with
# scikit-learn, not a PyPI package, so `pip install sklearn.experimental` can
# only fail with "No matching distribution found".

Collecting sklearn.experimental


  Could not find a version that satisfies the requirement sklearn.experimental (from versions: )
No matching distribution found for sklearn.experimental


In [35]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [37]:
# Iterative imputer with default settings, applied to the same two numeric
# columns; the missing Age is filled with a value estimated from Fare.
itr_imp = IterativeImputer()
itr_imp.fit_transform(df[['Fare', 'Age']])

array([[ 25.        ,  22.        ],
       [ 48.        ,  34.        ],
       [ 71.        ,  54.        ],
       [ 85.        ,  29.        ],
       [ 90.        ,  55.        ],
       [120.        ,  52.03920049]])