In [46]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer


In [5]:
df = pd.read_csv("../data/covid_toy.csv")

In [6]:
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [25]:
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [16]:
X = df.drop("has_covid", axis = 1)
y = df["has_covid"]

print(X.shape,y.shape)

(100, 5) (100,)


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42, shuffle=True)
print(X_train.shape)

(80, 5)


In [31]:
# Simple Imputation: Fever
si = SimpleImputer()
X_train_fever = si.fit_transform(X_train[["fever"]])
X_test_fever = si.transform(X_test[["fever"]])

print(X_train_fever.shape)

(80, 1)


In [32]:
# Ordinal Imputation: Cough
or_i = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = or_i.fit_transform(X_train[["cough"]])
X_test_cough = or_i.transform(X_test[["cough"]])

print(X_train_cough.shape)

(80, 1)


In [50]:
# OHE Imputation: gender, city

ohe = OneHotEncoder(drop='first',sparse_output=False)
X_train_gender_city = ohe.fit_transform(X_train[["gender", "city"]])
X_test_gender_city = ohe.transform(X_test[["gender","city"]])

print(X_train_gender_city.shape)

(80, 4)


In [37]:
X_train["age"].values

array([81,  5, 19, 27, 73, 70, 49, 51, 64, 83, 65, 18, 16, 16, 27, 84, 51,
       69, 82, 69, 44, 74, 20, 12, 33, 42, 65, 23, 56, 64, 13, 31, 40, 49,
       19, 11, 14, 42, 38, 46, 71, 10, 60, 22, 19, 65, 19, 54, 81, 20, 48,
       82, 23, 66,  5, 49,  5, 34, 79,  6, 10, 69, 55, 34, 27, 47, 73, 42,
       80, 47, 38, 34, 25, 24, 12, 24, 75, 51, 82, 11])

In [None]:
# Concat all:

X_train_new = np.concatenate((X_train_fever, X_train_cough, X_train_gender_city), axis = 1)
X_test_new = np.concatenate((X_test_fever, X_test_cough, X_test_gender_city), axis = 1)

print(X_train_new.shape)

(80, 6)


In [61]:
trsf = [
    ("impute_fever", SimpleImputer(), ["fever"]),
    ("ordinal_cough", OrdinalEncoder(categories=[['Mild','Strong']]), ["cough"]),
    ("cat_city_gender", OneHotEncoder(sparse_output=False, drop="first"), ["gender", "city"])
]

In [62]:
transformer = ColumnTransformer(
    transformers= trsf,
    remainder="passthrough"
)

In [63]:
transformer.fit_transform(X_train).shape
transformer.transform(X_test).shape

(20, 7)