In [7]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Seed the legacy global NumPy RNG so the synthetic sample is reproducible.
np.random.seed(42)

n = 1000  # number of synthetic rows

# Vocabularies for the three categorical columns.
importance_levels = ['low', 'medium', 'high', 'very_high', 'ultra']
city_names = ['New York', 'London', 'Paris', 'Tokyo', 'Chennai', 'Delhi']
city_names = city_names + [f'City_{i}' for i in range(40)]
shipping_modes = ['Air', 'Road', 'Ship', 'Rail', 'Drone']

# Every draw advances the shared global RNG, so the columns are sampled in
# exactly this order to keep the generated values stable.
product_importance = np.random.choice(
    importance_levels, n, p=[0.4, 0.3, 0.2, 0.05, 0.05]
)
destination_city = np.random.choice(city_names, n)  # uniform over 46 cities
shipping_mode = np.random.choice(
    shipping_modes, n, p=[0.35, 0.3, 0.2, 0.1, 0.05]
)
feature_x = np.random.normal(0, 1, n)
feature_y = np.random.normal(5, 2, n)
delivery_status = np.random.choice([0, 1], n)  # binary target

df = pd.DataFrame({
    'Product_importance': product_importance,
    'Destination_City': destination_city,
    'Shipping_Mode': shipping_mode,
    'Feature_x': feature_x,
    'Feature_y': feature_y,
    'Delivery_Status': delivery_status,
})

print("Synthetic Data Created ")
df.head()

Synthetic Data Created 


Unnamed: 0,Product_importance,Destination_City,Shipping_Mode,Feature_x,Feature_y,Delivery_Status
0,low,City_5,Air,1.002085,4.379267,0
1,ultra,City_9,Air,0.517197,3.062141,1
2,high,City_17,Air,0.229872,3.3324,1
3,medium,City_12,Air,1.479886,9.364713,1
4,low,City_1,Drone,0.298377,4.643561,0


In [8]:
# Target vector: the Delivery_Status column.
y = df['Delivery_Status']

# Feature matrix: every remaining column (df itself is left untouched).
X = df.drop(columns=['Delivery_Status'])

In [9]:
class CategoryReducer(BaseEstimator, TransformerMixin):
    """Collapse infrequent category values into a single catch-all label.

    For every column seen during ``fit`` the relative frequency of each value
    is computed; values whose share is at least ``min_freq`` are remembered.
    ``transform`` keeps remembered values and replaces everything else
    (rare values, values never seen during ``fit``, and missing values, which
    ``value_counts`` does not count) with ``new_label``.

    Parameters
    ----------
    min_freq : float, default=0.05
        Minimum relative frequency (0-1) a value needs in order to be kept.
    new_label : str, default="Other"
        Replacement used for every value that is not kept.

    Attributes
    ----------
    frequent_categories_ : dict
        Maps each fitted column name to the list of values that are kept.
        Only available after ``fit``.
    """

    def __init__(self, min_freq=0.05, new_label="Other"):
        # __init__ stores hyper-parameters only; learned state is created in
        # fit() so that an unfitted estimator does not look fitted.
        self.min_freq = min_freq
        self.new_label = new_label

    def fit(self, X, y=None):
        """Learn, per column, which values are frequent enough to keep.

        Parameters
        ----------
        X : DataFrame or array-like
            Categorical data; wrapped in a ``pandas.DataFrame``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        X = pd.DataFrame(X)

        # Start from an empty mapping on every fit so that columns from a
        # previous fit cannot linger in the learned state.
        frequent_categories = {}
        for col in X.columns:
            freq = X[col].value_counts(normalize=True)
            frequent_categories[col] = freq[freq >= self.min_freq].index.tolist()
        self.frequent_categories_ = frequent_categories

        return self

    def transform(self, X):
        """Replace every value that was not kept during ``fit`` by ``new_label``.

        Parameters
        ----------
        X : DataFrame or array-like
            Data with the same column labels that were seen during ``fit``.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X`` with non-kept values replaced.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        KeyError
            If ``X`` contains a column that was not present during ``fit``.
        """
        X = pd.DataFrame(X).copy()

        for col in X.columns:
            keep_mask = X[col].isin(self.frequent_categories_[col])
            # Cast to object first so the replacement label can always be
            # written, whatever the column's original dtype.
            X[col] = X[col].astype(object).where(keep_mask, self.new_label)

        return X

In [10]:
# Labels of the object-dtype columns; these are routed to the categorical branch.
cat_cols = X.select_dtypes(include='object').columns

In [11]:
# Categorical branch: fold rare levels into one bucket, then one-hot encode.
# NOTE(review): with drop='first' the dropped baseline level is encoded as an
# all-zero row, and handle_unknown='ignore' also yields all zeros for unseen
# levels -- confirm that this ambiguity is acceptable for downstream models.
categorical_pipeline = Pipeline([
    ('reducer', CategoryReducer(min_freq=0.05)),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

# Columns not listed in cat_cols are forwarded unchanged after the encoded ones.
preprocessor = ColumnTransformer(
    [('cat', categorical_pipeline, cat_cols)],
    remainder='passthrough',
)

In [20]:
# Fit the preprocessing graph on X and encode X in the same pass.
X_encoded = preprocessor.fit_transform(X)

print("Encoding Completed", f"New Shape: {X_encoded.shape}", sep="\n")

Encoding Completed
New Shape: (1000, 10)


In [23]:
# Rebuild readable column labels for the encoded matrix.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
categorical_feature_names = onehot_encoder.get_feature_names_out(cat_cols)

# remainder='passthrough' forwards every column that was NOT sent to the 'cat'
# branch (not only numeric ones), in its original order, so the labels are
# derived as the complement of cat_cols rather than by dtype.
encoded_source_cols = set(cat_cols)
numerical_feature_names = [col for col in X.columns if col not in encoded_source_cols]

encoded_features = list(categorical_feature_names) + numerical_feature_names

# The encoder may return a sparse matrix; densify it before building the frame.
encoded_values = X_encoded.toarray() if hasattr(X_encoded, "toarray") else X_encoded
assert encoded_values.shape[1] == len(encoded_features), (
    "feature-name count does not match the number of encoded columns"
)

# Keep X's row index so the encoded frame stays aligned with X and y.
X_encoded_df = pd.DataFrame(encoded_values, columns=encoded_features, index=X.index)
X_encoded_df.head()

Unnamed: 0,Product_importance_high,Product_importance_low,Product_importance_medium,Product_importance_very_high,Shipping_Mode_Drone,Shipping_Mode_Rail,Shipping_Mode_Road,Shipping_Mode_Ship,Feature_x,Feature_y
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.002085,4.379267
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.517197,3.062141
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.229872,3.3324
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.479886,9.364713
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.298377,4.643561
