### **Column transformer**

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Inline CSV dataset used throughout the notebook: one header row
# (age, gender, fever, cough, city, has_covid) followed by 100 records.
# Some records leave the fever field empty (e.g. "84,Female,,Mild,...");
# those are the missing values handled later by the imputer.
data = '''age,gender,fever,cough,city,has_covid
60,Male,103.0,Mild,Kolkata,No
27,Male,100.0,Mild,Delhi,Yes
42,Male,101.0,Mild,Delhi,No
31,Female,98.0,Mild,Kolkata,No
65,Female,101.0,Mild,Mumbai,No
84,Female,,Mild,Bangalore,Yes
14,Male,101.0,Strong,Bangalore,No
20,Female,,Strong,Mumbai,Yes
19,Female,100.0,Strong,Bangalore,No
64,Female,101.0,Mild,Delhi,No
75,Female,,Mild,Delhi,No
65,Female,98.0,Mild,Mumbai,Yes
25,Female,99.0,Strong,Kolkata,No
64,Male,102.0,Mild,Bangalore,Yes
51,Male,104.0,Mild,Bangalore,No
70,Male,103.0,Strong,Kolkata,Yes
69,Female,103.0,Mild,Kolkata,Yes
40,Female,98.0,Strong,Delhi,No
64,Female,98.0,Mild,Bangalore,Yes
42,Female,,Strong,Bangalore,Yes
12,Male,98.0,Strong,Bangalore,No
73,Male,98.0,Mild,Bangalore,Yes
71,Female,98.0,Strong,Kolkata,Yes
80,Female,98.0,Mild,Delhi,Yes
13,Female,100.0,Strong,Kolkata,No
23,Male,,Mild,Mumbai,No
19,Female,100.0,Mild,Kolkata,Yes
33,Female,102.0,Strong,Delhi,No
16,Male,104.0,Mild,Kolkata,No
34,Female,,Strong,Mumbai,Yes
15,Male,101.0,Mild,Delhi,Yes
83,Male,103.0,Mild,Kolkata,No
34,Female,101.0,Strong,Delhi,Yes
26,Female,98.0,Mild,Kolkata,No
74,Male,102.0,Mild,Mumbai,Yes
82,Female,102.0,Strong,Bangalore,No
38,Female,101.0,Mild,Bangalore,No
55,Male,100.0,Mild,Kolkata,No
49,Female,101.0,Mild,Delhi,Yes
50,Female,103.0,Mild,Kolkata,No
49,Female,102.0,Mild,Delhi,No
82,Male,,Mild,Kolkata,Yes
27,Male,100.0,Mild,Delhi,Yes
22,Female,99.0,Mild,Bangalore,Yes
20,Male,102.0,Strong,Delhi,No
72,Male,99.0,Mild,Bangalore,No
19,Female,101.0,Mild,Mumbai,No
18,Female,104.0,Mild,Bangalore,No
66,Male,99.0,Strong,Bangalore,No
44,Male,104.0,Mild,Mumbai,No
19,Male,101.0,Mild,Delhi,Yes
11,Female,100.0,Strong,Kolkata,Yes
47,Female,100.0,Strong,Bangalore,Yes
83,Male,98.0,Mild,Delhi,Yes
60,Female,99.0,Mild,Mumbai,Yes
81,Female,101.0,Mild,Mumbai,Yes
71,Male,,Strong,Kolkata,No
49,Female,99.0,Strong,Bangalore,No
23,Male,98.0,Strong,Mumbai,Yes
6,Female,104.0,Mild,Kolkata,Yes
24,Female,102.0,Strong,Bangalore,Yes
81,Female,98.0,Strong,Mumbai,No
56,Female,104.0,Strong,Bangalore,Yes
10,Male,100.0,Mild,Bangalore,No
42,Male,104.0,Mild,Mumbai,No
69,Female,102.0,Mild,Bangalore,No
51,Male,104.0,Mild,Kolkata,No
65,Male,99.0,Mild,Bangalore,No
54,Female,104.0,Strong,Kolkata,No
73,Female,103.0,Mild,Delhi,No
68,Female,101.0,Strong,Delhi,No
75,Female,104.0,Strong,Delhi,No
83,Female,101.0,Mild,Kolkata,No
34,Male,98.0,Strong,Kolkata,Yes
34,Female,104.0,Strong,Delhi,No
5,Male,102.0,Mild,Kolkata,Yes
80,Male,100.0,Mild,Bangalore,Yes
8,Female,101.0,Mild,Kolkata,No
11,Male,100.0,Mild,Bangalore,Yes
48,Female,103.0,Mild,Kolkata,Yes
14,Female,99.0,Mild,Mumbai,Yes
65,Male,99.0,Mild,Delhi,No
24,Male,98.0,Mild,Kolkata,Yes
17,Female,104.0,Mild,Kolkata,No
69,Female,98.0,Strong,Mumbai,No
16,Female,103.0,Mild,Bangalore,Yes
25,Male,104.0,Mild,Bangalore,Yes
47,Male,101.0,Strong,Bangalore,No
5,Female,100.0,Mild,Kolkata,No
46,Male,103.0,Strong,Bangalore,No
59,Female,99.0,Strong,Delhi,No
38,Male,,Mild,Delhi,Yes
82,Female,102.0,Strong,Kolkata,No
27,Male,100.0,Mild,Kolkata,Yes
79,Male,,Strong,Kolkata,Yes
12,Female,104.0,Mild,Bangalore,No
51,Female,101.0,Strong,Kolkata,Yes
20,Female,101.0,Mild,Bangalore,No
5,Female,98.0,Strong,Mumbai,No
10,Female,98.0,Strong,Kolkata,Yes'''

In [4]:
import io

# Wrap the inline CSV text in an in-memory text buffer so pandas can parse it
# exactly as it would a file on disk.
file = io.StringIO(data)
df = pd.read_csv(file)

# Preview ten random rows. The sample is seeded so the preview is the same on
# every Restart & Run All instead of changing with each execution.
df.sample(10, random_state=42)

Unnamed: 0,age,gender,fever,cough,city,has_covid
3,31,Female,98.0,Mild,Kolkata,No
95,12,Female,104.0,Mild,Bangalore,No
6,14,Male,101.0,Strong,Bangalore,No
27,33,Female,102.0,Strong,Delhi,No
68,54,Female,104.0,Strong,Kolkata,No
14,51,Male,104.0,Mild,Bangalore,No
26,19,Female,100.0,Mild,Kolkata,Yes
78,11,Male,100.0,Mild,Bangalore,Yes
21,73,Male,98.0,Mild,Bangalore,Yes
59,6,Female,104.0,Mild,Kolkata,Yes


In [5]:
# Number of missing entries in each column of the raw frame
df.isna().sum()

Unnamed: 0,0
age,0
gender,0
fever,10
cough,0
city,0
has_covid,0


### **Transformations**

- gender, city - one hot encoding

- fever - simple imputer

- cough - ordinal encoding

- has_covid - label encoding

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is seeded so that the
# train/test membership -- and every statistic later fitted on the training
# rows -- is identical on each Restart & Run All.
xtrain, xtest, ytrain, ytest = train_test_split(
    df.drop(columns=['has_covid']),  # features: every column except the target
    df['has_covid'],                 # target
    test_size=0.2,
    random_state=42,
)

In [8]:
from sklearn.compose import ColumnTransformer

In [12]:
# One preprocessing step per column group. The transformed matrix lists the
# step outputs in the order given here, followed by the untouched columns:
#   fever | cough | gender (1 dummy) | city (3 dummies) | age
transformer = ColumnTransformer(
    [
        # t1: fill the empty fever readings (imputer left at its defaults)
        ("t1", SimpleImputer(), ["fever"]),
        # t2: cough is ordered, so map it to Mild -> 0, Strong -> 1
        ("t2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
        # t3: nominal columns become dense dummy columns, with the first
        #     level of each feature dropped
        (
            "t3",
            OneHotEncoder(drop="first", sparse_output=False),
            ["gender", "city"],
        ),
    ],
    # columns not named above (here: age) are appended unchanged
    remainder="passthrough",
)

In [13]:
# Fit every step of the column transformer on the training features and
# display the resulting transformed training matrix.
transformer.fit_transform(xtrain)

array([[104.  ,   0.  ,   1.  ,   0.  ,   1.  ,   0.  ,  51.  ],
       [100.  ,   1.  ,   0.  ,   0.  ,   1.  ,   0.  ,  11.  ],
       [101.  ,   0.  ,   0.  ,   0.  ,   0.  ,   1.  ,  19.  ],
       [ 98.  ,   1.  ,   0.  ,   0.  ,   0.  ,   1.  ,   5.  ],
       [103.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ,  73.  ],
       [102.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ,  49.  ],
       [101.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,  20.  ],
       [101.  ,   0.  ,   1.  ,   1.  ,   0.  ,   0.  ,  15.  ],
       [104.  ,   0.  ,   1.  ,   0.  ,   1.  ,   0.  ,  16.  ],
       [ 98.  ,   1.  ,   0.  ,   0.  ,   0.  ,   1.  ,  69.  ],
       [100.  ,   1.  ,   0.  ,   0.  ,   0.  ,   0.  ,  19.  ],
       [103.  ,   0.  ,   0.  ,   0.  ,   1.  ,   0.  ,  69.  ],
       [ 99.  ,   0.  ,   1.  ,   0.  ,   0.  ,   0.  ,  72.  ],
       [101.  ,   0.  ,   0.  ,   0.  ,   0.  ,   1.  ,  81.  ],
       [ 98.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,  64.  ],
       [100.75,   0.  ,  

In [14]:
# Apply the transformer to the test features using transform() only, so the
# parameters learned from xtrain are reused rather than re-fitted on xtest.
transformer.transform(xtest)

array([[102.  ,   1.  ,   0.  ,   1.  ,   0.  ,   0.  ,  33.  ],
       [103.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,  16.  ],
       [104.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,  12.  ],
       [100.75,   0.  ,   1.  ,   0.  ,   0.  ,   1.  ,  23.  ],
       [ 98.  ,   0.  ,   1.  ,   0.  ,   1.  ,   0.  ,  24.  ],
       [ 98.  ,   1.  ,   0.  ,   0.  ,   0.  ,   1.  ,  81.  ],
       [ 99.  ,   1.  ,   0.  ,   0.  ,   1.  ,   0.  ,  25.  ],
       [101.  ,   1.  ,   0.  ,   1.  ,   0.  ,   0.  ,  68.  ],
       [104.  ,   0.  ,   1.  ,   0.  ,   0.  ,   0.  ,  51.  ],
       [101.  ,   0.  ,   0.  ,   0.  ,   1.  ,   0.  ,   8.  ],
       [104.  ,   0.  ,   0.  ,   0.  ,   1.  ,   0.  ,  17.  ],
       [100.  ,   1.  ,   0.  ,   0.  ,   1.  ,   0.  ,  13.  ],
       [101.  ,   0.  ,   1.  ,   1.  ,   0.  ,   0.  ,  19.  ],
       [104.  ,   1.  ,   0.  ,   0.  ,   1.  ,   0.  ,  54.  ],
       [ 99.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,  22.  ],
       [101.  ,   0.  ,  