## ColumnTransformer in scikit-learn

In [None]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [5]:
# Load the toy COVID dataset used throughout this notebook.
df = pd.read_csv(filepath_or_buffer="./DATA/covid_toy.csv")

In [6]:
# Peek at the first rows (head() shows 5 rows by default).
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; "has_covid" is the target column.
# `random_state` is fixed so the split -- and every row-level output shown
# later in the notebook -- is reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["has_covid"]),
    df["has_covid"],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Sanity-check the sizes of the two splits.
(X_train.shape, X_test.shape)

((80, 5), (20, 5))

In [15]:
# Distinct cough levels; these define the category order passed to OrdinalEncoder.
cough_levels = df["cough"].unique()
cough_levels

array(['Mild', 'Strong'], dtype=object)

In [9]:
# How many rows fall in each city.
city_counts = df["city"].value_counts()
city_counts

city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

In [10]:
# Count missing values per column to see which columns need imputing.
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

## Transforming each column manually, one by one

In [13]:
# Fill missing fever readings with the column mean (SimpleImputer's default strategy).
# The imputer is fitted on the training split only and then reused on the test split.
fever_col = ["fever"]
simple_imputer = SimpleImputer(strategy="mean")

X_train_fever = simple_imputer.fit_transform(X_train[fever_col])
X_test_fever = simple_imputer.transform(X_test[fever_col])

X_train_fever.shape

(80, 1)

In [16]:
# Encode cough severity as an ordered integer, with "Mild" ranked before "Strong".
cough_order = [["Mild", "Strong"]]
simple_ordinal_encoder = OrdinalEncoder(categories=cough_order)

X_train_cough = simple_ordinal_encoder.fit_transform(X_train[["cough"]])
X_test_cough = simple_ordinal_encoder.transform(X_test[["cough"]])

In [19]:
# One-hot encode the nominal columns; drop="first" omits the first category
# of each feature from the encoded output.
nominal_cols = ["gender", "city"]
simple_ohe = OneHotEncoder(drop="first")

X_train_gender_city = simple_ohe.fit_transform(X_train[nominal_cols])
X_test_gender_city = simple_ohe.transform(X_test[nominal_cols])

X_train_gender_city.shape

(80, 4)

In [21]:
# Age is used as-is; extract it as a plain NumPy array.
X_train_age = X_train["age"].to_numpy()
X_train_age.shape

(80,)

In [23]:
# Same extraction for the test split.
X_test_age = X_test["age"].to_numpy()

In [26]:
# Check the dtype of the imputed fever array.
X_train_fever.dtype

dtype('float64')

In [32]:
# Viewing the 1-D age array as a single column gives it a 2-D shape.
X_train_age[:, np.newaxis].shape

(80, 1)

In [33]:
# Shape of the imputed fever column.
X_train_fever.shape

(80, 1)

In [34]:
# Shape of the ordinal-encoded cough column.
X_train_cough.shape

(80, 1)

In [36]:
# Shape of the one-hot encoded gender/city block after converting it to a dense array.
X_train_gender_city.toarray().shape

(80, 4)

In [37]:
# Stitch the individually transformed pieces back together column-wise:
# age | imputed fever | encoded cough | one-hot gender/city
feature_blocks = [
    X_train_age.reshape(-1, 1),     # 1-D array -> single column
    X_train_fever,
    X_train_cough,
    X_train_gender_city.toarray(),  # dense copy of the encoder output
]
X_train_after_transformation = np.concatenate(feature_blocks, axis=1)

In [38]:
# Shape of the manually assembled feature matrix.
X_train_after_transformation.shape

(80, 7)

## Using scikit-learn's ColumnTransformer

In [44]:
# Shape of the raw training features before any transformation.
X_train.shape

(80, 5)

In [39]:
from sklearn.compose import ColumnTransformer

In [46]:
# Declare each (name, transformer, columns) step on its own line, then hand
# them all to a single ColumnTransformer.
fever_step = ("tnf1", SimpleImputer(), ["fever"])
cough_step = ("tnf2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"])
gender_city_step = ("tnf3", OneHotEncoder(drop="first"), ["gender", "city"])

col_transformation = ColumnTransformer(
    transformers=[fever_step, cough_step, gender_city_step],
    # Columns not named in any step are kept unchanged; use "drop" to discard them.
    remainder="passthrough",
)

In [48]:
# Fit on the training split, then apply the already-fitted transformer to the
# test split. The tuple is evaluated left to right, so fitting happens first.
X_train_new, X_test_new = (
    col_transformation.fit_transform(X_train),
    col_transformation.transform(X_test),
)

In [54]:
# First row of the manually assembled feature matrix.
X_train_after_transformation[0]

array([ 75., 104.,   1.,   0.,   1.,   0.,   0.])

In [55]:
# First row of the ColumnTransformer output.
X_train_new[0]

array([104.,   1.,   0.,   1.,   0.,   0.,  75.])