In [19]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin

# Notebook-wide presentation settings: seaborn "whitegrid" theme for plots and
# no cap on the number of DataFrame columns pandas renders.
sns.set_theme(style="whitegrid")
pd.options.display.max_columns = None

In [20]:
# Load the training CSV into its own name. The following cell builds a
# synthetic frame and binds it to `df`, so binding the CSV to `df` as well
# would make the loaded data unreachable as soon as that cell runs.
shipments_raw = pd.read_csv("../Data/Train.csv")

print("Dataset Shape:", shipments_raw.shape)
shipments_raw.head()

Dataset Shape: (10999, 12)


Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [21]:
# Reproducible synthetic frame: a 3-level categorical column, a
# high-cardinality categorical column (5 named cities + 50 generated ones)
# and two standard-normal numeric features.
np.random.seed(42)
n = 1000

# The four draws share NumPy's global random stream, so they must stay in
# this order for the generated values to be reproducible.
df = pd.DataFrame(
    dict(
        Product_importance=np.random.choice(['low', 'medium', 'high'], size=n),
        Destination_City=np.random.choice(
            [
                'New York', 'London', 'Paris', 'Tokyo', 'Unknown',
                *(f'City_{idx}' for idx in range(50)),
            ],
            size=n,
        ),
        Feature_x=np.random.normal(loc=0, scale=1, size=n),
        Feature_y=np.random.normal(loc=0, scale=1, size=n),
    )
)

print("Synthetic Data Created ✅")
df.head()

Synthetic Data Created ✅


Unnamed: 0,Product_importance,Destination_City,Feature_x,Feature_y
0,high,City_49,-0.089736,0.555604
1,low,Paris,1.440117,0.089581
2,high,City_45,-0.676392,-0.197338
3,high,City_25,1.80094,-0.151273
4,low,City_34,-0.040158,-0.194908


In [22]:
# Number of distinct values in each categorical column, followed by a
# preview of just those two columns.
importance_levels = df['Product_importance'].nunique()
city_levels = df['Destination_City'].nunique()
print("Unique Product Importance:", importance_levels)
print("Unique Destination City:", city_levels)

df[['Product_importance', 'Destination_City']].head()

Unique Product Importance: 3
Unique Destination City: 55


Unnamed: 0,Product_importance,Destination_City
0,high,City_49
1,low,Paris
2,high,City_45
3,high,City_25
4,low,City_34


In [23]:
class CategoryReducer(BaseEstimator, TransformerMixin):
    """Collapse infrequent categories of every column into one replacement label.

    Parameters
    ----------
    min_freq : float, default=0.02
        Minimum relative frequency (as returned by
        ``value_counts(normalize=True)`` on the fitted column) a category
        needs in order to be kept.
    new_label : str, default="Other"
        Value written in place of every entry that is not a kept category.

    Attributes
    ----------
    valid_labels_ : dict
        Maps each fitted column name to the list of categories that met
        ``min_freq``. Replaced wholesale on every call to ``fit``.
    """

    def __init__(self, min_freq=0.02, new_label="Other"):
        self.min_freq = min_freq
        self.new_label = new_label
        # Kept so the attribute exists before fit(), as it did originally;
        # fit() always replaces it with a freshly built mapping.
        self.valid_labels_ = {}

    def fit(self, X, y=None):
        """Learn, per column, which categories reach ``min_freq``.

        Parameters
        ----------
        X : DataFrame or anything ``pd.DataFrame`` accepts
            Training data; every column is treated as categorical.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        X = pd.DataFrame(X)

        # Build into a new dict rather than mutating the existing one, so a
        # refit on different columns cannot keep entries from an earlier fit.
        valid_labels = {}
        for col in X.columns:
            freq_ratio = X[col].value_counts(normalize=True)
            valid_labels[col] = freq_ratio[freq_ratio >= self.min_freq].index.tolist()
        self.valid_labels_ = valid_labels

        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-kept categories replaced by ``new_label``.

        Columns that were not seen during ``fit`` have no kept categories, so
        every value in them is replaced. Missing values are never kept
        categories and are therefore replaced as well.

        Parameters
        ----------
        X : DataFrame or anything ``pd.DataFrame`` accepts

        Returns
        -------
        DataFrame
            A new frame; the input is not modified.
        """
        X = pd.DataFrame(X).copy()

        for col in X.columns:
            allowed = self.valid_labels_.get(col, [])
            # Vectorised membership test instead of a per-element Python
            # `in list` lookup.
            is_rare = ~X[col].isin(allowed)
            if is_rare.any():
                # Cast to object first so the replacement label can be stored
                # regardless of the column's original dtype.
                X[col] = X[col].astype(object).mask(is_rare, self.new_label)

        return X

In [24]:
class RareLabelEncoder(BaseEstimator, TransformerMixin):
    """Replace categories whose relative frequency is below ``tol``.

    Parameters
    ----------
    tol : float, default=0.05
        Minimum relative frequency (as returned by
        ``value_counts(normalize=True)`` on the fitted column) a category
        needs in order to be kept.
    replace_with : str, default='Other'
        Value written in place of every entry that is not a kept category.

    Attributes
    ----------
    frequent_labels_ : dict
        Maps each fitted column name to the list of categories that met
        ``tol``. Replaced wholesale on every call to ``fit``.
    """

    def __init__(self, tol=0.05, replace_with='Other'):
        self.tol = tol
        self.replace_with = replace_with
        # Kept so the attribute exists before fit(), as it did originally;
        # fit() always replaces it with a freshly built mapping.
        self.frequent_labels_ = {}

    def fit(self, X, y=None):
        """Learn, per column, which categories reach ``tol``.

        Parameters
        ----------
        X : DataFrame or anything ``pd.DataFrame`` accepts
            Training data; every column is treated as categorical.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        X = pd.DataFrame(X)

        # Build into a new dict: transform() iterates over the fitted keys,
        # so entries left over from an earlier fit on different columns
        # would make it raise KeyError.
        frequent_labels = {}
        for col in X.columns:
            freqs = X[col].value_counts(normalize=True)
            frequent_labels[col] = freqs[freqs >= self.tol].index.tolist()
        self.frequent_labels_ = frequent_labels
        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-kept categories replaced.

        Only the columns seen during ``fit`` are processed; each of them must
        be present in ``X``.

        Parameters
        ----------
        X : DataFrame or anything ``pd.DataFrame`` accepts

        Returns
        -------
        DataFrame
            A new frame; the input is not modified.
        """
        X_copy = pd.DataFrame(X).copy()
        for col, frequent in self.frequent_labels_.items():
            X_copy[col] = np.where(
                X_copy[col].isin(frequent),
                X_copy[col],
                self.replace_with
            )
        return X_copy

In [25]:
# Fit the reducer on the two categorical columns and collapse every category
# whose share is below 2% into the reducer's replacement label.
rare_encoder = CategoryReducer(min_freq=0.02)

df_reduced = rare_encoder.fit_transform(df[['Product_importance', 'Destination_City']])

# Compare the city cardinality before and after the reduction.
cities_before = df['Destination_City'].nunique()
cities_after = df_reduced['Destination_City'].nunique()
print("Cities BEFORE:", cities_before)
print("Cities AFTER :", cities_after)

df_reduced.head()

Cities BEFORE: 55
Cities AFTER : 16


Unnamed: 0,Product_importance,Destination_City
0,high,Other
1,low,Other
2,high,City_45
3,high,Other
4,low,City_34


In [26]:
# Overwrite the two categorical columns of `df` in place with their reduced
# versions from `df_reduced`. This mutates `df`, so every later cell that
# reads `df` sees the reduced categories rather than the original ones, and
# re-running the earlier fit/transform cell after this one would operate on
# already-reduced data.
df[['Product_importance','Destination_City']] = df_reduced
df.head()

Unnamed: 0,Product_importance,Destination_City,Feature_x,Feature_y
0,high,Other,-0.089736,0.555604
1,low,Other,1.440117,0.089581
2,high,City_45,-0.676392,-0.197338
3,high,Other,1.80094,-0.151273
4,low,City_34,-0.040158,-0.194908


In [27]:
# Apply the second encoder with a 10% threshold to the categorical columns
# currently stored in `df`.
rare_encoder = RareLabelEncoder(tol=0.10)

categorical_frame = df[['Product_importance', 'Destination_City']]
df_rare = rare_encoder.fit_transform(categorical_frame)

# City cardinality of the encoder's input versus its output.
print(
    f"Unique cities before: {df['Destination_City'].nunique()} | "
    f"After: {df_rare['Destination_City'].nunique()}"
)

df_rare.head()

Unique cities before: 16 | After: 1


Unnamed: 0,Product_importance,Destination_City
0,high,Other
1,low,Other
2,high,Other
3,high,Other
4,low,Other


In [29]:
# Five most common city labels in the encoder's input (`df`) and in its
# output (`df_rare`).
city_counts_before = df['Destination_City'].value_counts()
city_counts_after = df_rare['Destination_City'].value_counts()

print("Before Encoding:")
print(city_counts_before.head())

print("\nAfter Encoding:")
print(city_counts_after.head())

Before Encoding:
Destination_City
Other      655
City_23     31
City_34     27
City_45     25
City_28     24
Name: count, dtype: int64

After Encoding:
Destination_City
Other    1000
Name: count, dtype: int64
