In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Planned encoding for each categorical column:
#   gender, city -> one-hot encoding
#   cough        -> ordinal encoding
#   has_covid    -> label encoding
df = pd.read_csv("covid_toy.csv")
df.sample(5)


Unnamed: 0,age,gender,fever,cough,city,has_covid
26,19,Female,100.0,Mild,Kolkata,Yes
40,49,Female,102.0,Mild,Delhi,No
64,42,Male,104.0,Mild,Mumbai,No
96,51,Female,101.0,Strong,Kolkata,Yes
61,81,Female,98.0,Strong,Mumbai,No


In [4]:
# Count missing values per column to see which features need imputation.
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. A fixed random_state makes the split
# (and every result derived from it) reproducible across kernel restarts;
# without it each run shuffles the rows differently.
X_train,X_test=train_test_split(df,test_size=0.25,random_state=42)

X_train.shape,X_test.shape

((75, 6), (25, 6))

In [8]:
# Handle the missing values in the fever column.
# The imputer is fitted on the training split only; the test split is then
# filled with the statistic learned from training. Calling fit_transform on
# the test split would refit the imputer on test data (data leakage) and
# fill train and test with different values.
from sklearn.impute import SimpleImputer
si=SimpleImputer()
X_train_fever=si.fit_transform(X_train[["fever"]])
X_test_fever=si.transform(X_test[["fever"]])

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Each step is a (name, estimator, columns) triple. The step names are looked
# up later through named_transformers_, so they must stay as they are.
fever_step = ("tnf1", SimpleImputer(), ["fever"])
cough_step = ("tnf2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"])
nominal_step = ("tnf3", OneHotEncoder(), ["city", "gender"])

# Columns not named in any step are appended unchanged after the transformed ones.
transformer = ColumnTransformer(
    transformers=[fever_step, cough_step, nominal_step],
    remainder="passthrough",
)

In [22]:
# Fit the preprocessing on the training split only, then apply the
# already-fitted transformer to the test split. Refitting on X_test would
# leak test statistics into the preprocessing, could produce a different
# one-hot column layout than the training matrix, and would leave
# `transformer` fitted on the test data for any later inspection.
X_train_trans=transformer.fit_transform(X_train)
X_test_trans=transformer.transform(X_test)
X_train_trans

array([[100.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 11, 'Yes'],
       [100.77611940298507, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 42, 'Yes'],
       [104.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 34, 'No'],
       [99.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 22, 'Yes'],
       [100.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 11, 'Yes'],
       [98.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 81, 'No'],
       [100.77611940298507, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 82, 'Yes'],
       [104.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 17, 'No'],
       [98.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 12, 'No'],
       [101.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 65, 'No'],
       [99.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 72, 'No'],
       [101.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 14, 'No'],
       [104.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 12, 'No'],
       [102.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 69, 'No'],
       [100.77611940298507, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 71, 'No'],
       [100.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 

In [30]:
# Rebuild column labels in the order the ColumnTransformer emits them:
# imputed column, ordinal column, one-hot columns, then the passthrough rest.
nominal_inputs = ["city", "gender"]
imputed_cols = ["fever"]
ordinal_cols = ["cough"]

# Ask the fitted one-hot encoder for the names of the columns it generated.
one_hot_encoder = transformer.named_transformers_["tnf3"]
ohe_cols = one_hot_encoder.get_feature_names_out(nominal_inputs).tolist()

# Everything that no transformer step consumed is passed through in its
# original column order.
passthrough_cols = X_train.columns.drop(
    imputed_cols + ordinal_cols + nominal_inputs
).tolist()

new_columns = [*imputed_cols, *ordinal_cols, *ohe_cols, *passthrough_cols]

# Wrap the transformed array in a labelled DataFrame.
X_train_trans_df = pd.DataFrame(data=X_train_trans, columns=new_columns)

In [32]:
# Display the transformed training data with its reconstructed column names.
X_train_trans_df

Unnamed: 0,fever,cough,city_Bangalore,city_Delhi,city_Kolkata,city_Mumbai,gender_Female,gender_Male,age,has_covid
0,100.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,11,Yes
1,100.776119,1.0,1.0,0.0,0.0,0.0,1.0,0.0,42,Yes
2,104.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,34,No
3,99.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,22,Yes
4,100.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,11,Yes
...,...,...,...,...,...,...,...,...,...,...
70,103.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,60,No
71,99.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,66,No
72,98.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,31,No
73,102.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,49,No
