In [1]:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer, PowerTransformer, PolynomialFeatures
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import sklearn

sklearn.set_config(display='diagram')

ModuleNotFoundError: No module named 'sklearn'

In [3]:
df = pd.read_csv("../data/covid_toy.csv")
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [4]:
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [8]:
X = df.drop("has_covid", axis = 1)
y = df["has_covid"]

print(X.shape,y.shape)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42, shuffle=True)
print(X_train.shape)

(100, 5) (100,)
(80, 5)


# Feature Transformation

1. Missing data imputation
2. Handling categorical values
3. Outlier detection
4. Transformations - Fuctional/Power
5. Feature Scaling

In [42]:
# Function for winsorization

def winsorization(X,percentile = 2):
    lower_bound = np.percentile(X, percentile)
    upper_bound = np.percentile(X, 100 - percentile)
    return np.clip(X, lower_bound, upper_bound)

In [43]:
feature_transformation_ct = ColumnTransformer(transformers=[
    ("impute_fever", SimpleImputer(), ["fever"]),
    ("ohe_city_gender", OneHotEncoder(sparse_output=False, drop = "first"), ["city", "gender"]),
    ("ordinal_en", OrdinalEncoder(categories=[["Mild","Strong"]]),["cough"]),
    ("winsorization", FunctionTransformer(func = winsorization), ["fever", "age"]),
    ("standardScaler", StandardScaler(), ["fever", "age"]),
],
remainder="passthrough")


# With other log, sqrt, and power transformers


feature_transformation_ct = ColumnTransformer(transformers=[
    ("impute_fever", SimpleImputer(), ["fever"]),
    ("ohe_city_gender", OneHotEncoder(sparse_output=False, drop = "first"), ["city", "gender"]),
    ("ordinal_en", OrdinalEncoder(categories=[["Mild","Strong"]]),["cough"]),
    ("log_transform", FunctionTransformer(func=np.log1p), ["fever", "age"]),
    ("sq_transf", FunctionTransformer(func=np.square), ["fever", "age"]),
    ("yeo_johnson", PowerTransformer(), ["fever", "age"]),
    ("standardScaler", StandardScaler(), ["fever", "age"]),
],
remainder="passthrough")

In [44]:
X_train_new = feature_transformation_ct.fit_transform(X_train)
X_test_new = feature_transformation_ct.transform(X_test)

In [34]:
X_train_new.shape, X_test_new.shape

((80, 14), (20, 14))

In [35]:
X_train_new[1]

array([ 1.00000000e+02,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  4.61512052e+00,  1.79175947e+00,
        1.00000000e+04,  2.50000000e+01, -4.80451390e-01, -1.88106831e+00,
       -4.91423034e-01, -1.55894504e+00])

# Feature Creation:

In [None]:
numerical_pipeline = Pipeline([
    ("impute", SimpleImputer()),  # Step 1: Impute missing values
    ("polynomial", PolynomialFeatures(degree=2, interaction_only=True)),  # Step 2: Create polynomial features
    ("scaler", StandardScaler())  # Step 3: Scale features
])


feature_creation_ct = ColumnTransformer(
    transformers= [
        ("numerical", numerical_pipeline, ["fever", "age"])
    ]
)

feature_creation_ct.fit_transform(X_train)

array([[ 0.        ,  0.        ,  1.56614097,  1.56616067],
       [ 0.        , -0.52164053, -1.55894504, -1.56215914],
       [ 0.        , -0.52164053, -0.98327131, -0.99196675],
       [ 0.        , -0.52164053, -0.65431488, -0.66614253],
       [ 0.        ,  1.04328106,  1.23718454,  1.29654113],
       [ 0.        ,  1.04328106,  1.11382589,  1.17069152],
       [ 0.        ,  0.52164053,  0.25031528,  0.26978755],
       [ 0.        ,  0.        ,  0.33255438,  0.33210143],
       [ 0.        ,  0.        ,  0.86710857,  0.86686043],
       [ 0.        ,  0.        ,  1.64838007,  1.64843129],
       [ 0.        , -1.56492159,  0.90822812,  0.82857609],
       [ 0.        ,  1.56492159, -1.02439086, -1.0033706 ],
       [ 0.        ,  1.04328106, -1.10662996, -1.09460138],
       [ 0.        ,  1.56492159, -1.10662996, -1.0880849 ],
       [ 0.        , -0.52164053, -0.65431488, -0.66614253],
       [ 0.        ,  0.        ,  1.68949962,  1.6895666 ],
       [ 0.        ,  1.

In [48]:
feature_creation_ct

In [49]:
feature_transformation_ct

In [64]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (OneHotEncoder, OrdinalEncoder, StandardScaler, 
    PolynomialFeatures, PowerTransformer, FunctionTransformer
)
import numpy as np

# Custom Winsorization Function
def winsorization(X,percentile = 2):
    lower_bound = np.percentile(X, percentile)
    upper_bound = np.percentile(X, 100 - percentile)
    return np.clip(X, lower_bound, upper_bound)


# ==== 🔹 FUNCTION TO CREATE GENERALIZED PIPELINE 🔹 ====
def create_feature_pipeline(numerical_cols, categorical_cols, ordinal_cols, ordinal_categories):
    """Creates a reusable feature engineering pipeline for ML projects."""

    # Numerical Pipeline
    numerical_pipeline = Pipeline([
        ("impute", KNNImputer()),
        ("polynomial", PolynomialFeatures(degree=2, interaction_only=True)),
        ("winsorization", FunctionTransformer(winsorization)),
        ("log_transform", FunctionTransformer(func = np.log1p)),
        ("square_transform", FunctionTransformer(func = np.square)),
        ("power_transform", PowerTransformer(method="yeo-johnson")),
        ("scaler", StandardScaler())
    ])

    # Categorical Pipeline
    categorical_pipeline = ColumnTransformer(transformers=[
        ("ordinal_en", OrdinalEncoder(categories=ordinal_categories), ordinal_cols),
        ("ohe", OneHotEncoder(sparse_output=False, drop="first"), categorical_cols)
    ])

    # Final Column Transformer
    feature_transformation_ct = ColumnTransformer(transformers=[
        ("num_pipeline", numerical_pipeline, numerical_cols),
        ("cat_pipeline", categorical_pipeline, categorical_cols + ordinal_cols)
    ], remainder="passthrough")

    return feature_transformation_ct



# # Save pipeline for future use
# joblib.dump(feature_pipeline, "feature_pipeline.pkl")

# # Load pipeline later
# loaded_pipeline = joblib.load("feature_pipeline.pkl")


In [65]:
# ==== 🔹 EXAMPLE USAGE 🔹 ====
# Define column names dynamically
numerical_features = ["fever", "age"]
categorical_features = ["city", "gender"]
ordinal_features = ["cough"]
ordinal_categories = [["Mild", "Strong"]]  # Order for ordinal encoding

# Create a generalized pipeline
feature_pipeline = create_feature_pipeline(numerical_features, categorical_features, ordinal_features, ordinal_categories)

In [66]:
feature_pipeline.fit_transform(X_train)
feature_pipeline.transform(X_test)


array([[ 1.11022302e-16,  1.53894868e+00, -1.06875660e+00,
        -1.04016764e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.11022302e-16, -1.57775849e+00,  1.42717556e+00,
         1.39220963e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [ 1.11022302e-16,  2.02196018e-02,  1.00638054e+00,
         1.03148346e+00,  1.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.11022302e-16, -1.03588162e+00,  1.12359704e+00,
         1.11282136e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [ 1.11022302e-16,  5.34947714e-01, -8.95014811e-01,
        -8.90238344e-01,  1.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [ 1.11022302e-16,  1.04111437e+00,  4.21953858e-01,
         4.52715698e-01,  0.00000000e+00,  0.000000