In [17]:
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder,LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config
set_config(display='diagram')

In [18]:
df = sns.load_dataset('titanic').drop(['pclass','sibsp','parch','adult_male','deck','embark_town','survived'],axis=1)
df.head()

Unnamed: 0,sex,age,fare,embarked,class,who,alive,alone
0,male,22.0,7.25,S,Third,man,no,False
1,female,38.0,71.2833,C,First,woman,yes,False
2,female,26.0,7.925,S,Third,woman,yes,True
3,female,35.0,53.1,S,First,woman,yes,False
4,male,35.0,8.05,S,Third,man,no,True


In [19]:
df.isna().sum()

sex           0
age         177
fare          0
embarked      2
class         0
who           0
alive         0
alone         0
dtype: int64

In [20]:
# Imputation
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[1]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[3]),
],remainder='passthrough')

trf1.fit_transform(df)

array([[22.0, 'S', 'male', ..., 'man', 'no', False],
       [38.0, 'C', 'female', ..., 'woman', 'yes', False],
       [26.0, 'S', 'female', ..., 'woman', 'yes', True],
       ...,
       [29.69911764705882, 'S', 'female', ..., 'woman', 'no', False],
       [26.0, 'C', 'male', ..., 'man', 'yes', True],
       [32.0, 'Q', 'male', ..., 'man', 'no', True]], dtype=object)

In [21]:
#Encoding
trf2 = ColumnTransformer([
  ("class_who_ordinal",OrdinalEncoder(categories=[["First","Second","Third"],["man","woman","child"]]),[4,5]),
  ("sex_alive_alone_onehot",OneHotEncoder(sparse_output=False,drop='first'),[0,6,7])
],remainder='passthrough')

trf2.fit_transform(df)

array([[2.0, 0.0, 1.0, ..., 22.0, 7.25, 'S'],
       [0.0, 1.0, 0.0, ..., 38.0, 71.2833, 'C'],
       [2.0, 1.0, 0.0, ..., 26.0, 7.925, 'S'],
       ...,
       [2.0, 1.0, 0.0, ..., nan, 23.45, 'S'],
       [0.0, 0.0, 1.0, ..., 26.0, 30.0, 'C'],
       [2.0, 0.0, 1.0, ..., 32.0, 7.75, 'Q']], dtype=object)

In [22]:
#Scaling
trf3 = ColumnTransformer([
    ('scale',StandardScaler(),[1,2])
],remainder='passthrough')

trf3.fit_transform(df)

array([[-0.5303766406838785, -0.5024451714361923, 'male', ..., 'man',
        'no', False],
       [0.571830994003175, 0.7868452935884461, 'female', ..., 'woman',
        'yes', False],
       [-0.25482473201211503, -0.4888542575852486, 'female', ...,
        'woman', 'yes', True],
       ...,
       [nan, -0.17626323901354432, 'female', ..., 'woman', 'no', False],
       [-0.25482473201211503, -0.04438103794142432, 'male', ..., 'man',
        'yes', True],
       [0.15850313099553, -0.49237782784290063, 'male', ..., 'man', 'no',
        True]], dtype=object)

In [23]:
# Pipeline
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3)
])
pipe.fit(df)

In [24]:
pipe.fit_transform(df)

array([[-0.7425961324256612, -0.03352007615769955, 2.0, ..., 'S', 'male',
        7.25],
       [0.7714843114902751, -0.03352007615769955, 0.0, ..., 'C',
        'female', 71.2833],
       [0.7714843114902751, -0.03352007615769955, 2.0, ..., 'S',
        'female', 7.925],
       ...,
       [0.7714843114902751, -0.03352007615769955, 2.0, ..., 'S',
        'female', 23.45],
       [-0.7425961324256612, -0.03352007615769955, 0.0, ..., 'C', 'male',
        30.0],
       [-0.7425961324256612, -0.03352007615769955, 2.0, ..., 'Q', 'male',
        7.75]], dtype=object)