In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin

# Plot styling: apply seaborn's "whitegrid" theme to subsequent figures.
sns.set_theme(style="whitegrid")
# Do not truncate wide frames: None removes the displayed-column limit.
pd.options.display.max_columns = None

In [2]:
# Read the training data from Train.csv (path is relative to the working directory).
df = pd.read_csv("Train.csv")

# Report the (rows, columns) shape, then preview the first rows.
print(f"Dataset Shape: {df.shape}")
df.head()

Dataset Shape: (10999, 12)


Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [3]:

# Fixed seed so the synthetic sample is identical on every run.
np.random.seed(42)
n = 1000

# Candidate destinations: five named entries followed by City_0 .. City_49.
city_pool = ['New York', 'London', 'Paris', 'Tokyo', 'Unknown']
city_pool += [f'City_{i}' for i in range(50)]

# NOTE: this rebinds `df`, so the frame read from Train.csv is no longer
# reachable under that name.
# The columns are drawn in dict order, which fixes the sequence of RNG calls.
df = pd.DataFrame({
    'Product_importance': np.random.choice(['low', 'medium', 'high'], size=n),
    'Destination_City': np.random.choice(city_pool, size=n),
    'Feature_x': np.random.normal(loc=0, scale=1, size=n),
    'Feature_y': np.random.normal(loc=0, scale=1, size=n),
})

print("Synthetic Data Created ✅")
df.head()

Synthetic Data Created ✅


Unnamed: 0,Product_importance,Destination_City,Feature_x,Feature_y
0,high,City_49,-0.089736,0.555604
1,low,Paris,1.440117,0.089581
2,high,City_45,-0.676392,-0.197338
3,high,City_25,1.80094,-0.151273
4,low,City_34,-0.040158,-0.194908


In [6]:
# Fit the reducer on the two categorical columns and transform them in one step.
# CategoryReducer is not defined in the visible cells; min_freq=0.02 is passed
# straight through to it.
rare_encoder = CategoryReducer(min_freq=0.02)
df_reduced = rare_encoder.fit_transform(df[['Product_importance', 'Destination_City']])

# Compare the number of distinct cities before and after the transform.
print(f"Cities BEFORE: {df['Destination_City'].nunique()}")
print(f"Cities AFTER : {df_reduced['Destination_City'].nunique()}")

df_reduced.head()

Cities BEFORE: 55
Cities AFTER : 16


Unnamed: 0,Product_importance,Destination_City
0,high,Other
1,low,Other
2,high,City_45
3,high,Other
4,low,City_34


In [7]:

# Write the reduced categorical columns back into df; this overwrites the
# previous values of both columns.
reduced_cols = ['Product_importance', 'Destination_City']
df[reduced_cols] = df_reduced
df.head()

Unnamed: 0,Product_importance,Destination_City,Feature_x,Feature_y
0,high,Other,-0.089736,0.555604
1,low,Other,1.440117,0.089581
2,high,City_45,-0.676392,-0.197338
3,high,Other,1.80094,-0.151273
4,low,City_34,-0.040158,-0.194908


In [8]:
# NOTE: rebinds `rare_encoder`, which an earlier cell bound to a CategoryReducer.
# RareLabelEncoder is not defined in the visible cells; tol=0.10 is passed
# straight through to it.
rare_encoder = RareLabelEncoder(tol=0.10)
df_rare = rare_encoder.fit_transform(df[['Product_importance', 'Destination_City']])

# Distinct city counts on the input frame and on the encoded result.
cities_before = df['Destination_City'].nunique()
cities_after = df_rare['Destination_City'].nunique()
print(f"Unique cities before: {cities_before} | After: {cities_after}")

df_rare.head()

Unique cities before: 16 | After: 1


Unnamed: 0,Product_importance,Destination_City
0,high,Other
1,low,Other
2,high,Other
3,high,Other
4,low,Other


In [9]:

# Top five city frequencies in df and in the rare-label-encoded frame.
print("Before Encoding:", df['Destination_City'].value_counts().head(), sep="\n")
print("\nAfter Encoding:", df_rare['Destination_City'].value_counts().head(), sep="\n")

Before Encoding:
Destination_City
Other      655
City_23     31
City_34     27
City_45     25
City_31     24
Name: count, dtype: int64

After Encoding:
Destination_City
Other    1000
Name: count, dtype: int64


In [10]:
# Collect the names of all object-dtype columns; these are treated as categorical.
cat_cols = list(df.select_dtypes(include="object").columns)
print(f"Categorical Columns: {cat_cols}")
df[cat_cols].head()

Categorical Columns: ['Product_importance', 'Destination_City']


Unnamed: 0,Product_importance,Destination_City
0,high,Other
1,low,Other
2,high,City_45
3,high,Other
4,low,City_34


In [13]:
# One-hot encode the categorical columns; drop_first=True removes the first
# level of each column so the indicators are not perfectly collinear.
df_encoded = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)

print(f"New Shape: {df_encoded.shape}")
df_encoded.head()

New Shape: (1000, 19)


Unnamed: 0,Feature_x,Feature_y,Product_importance_2,Product_importance_3,Destination_City_City_17,Destination_City_City_19,Destination_City_City_20,Destination_City_City_23,Destination_City_City_28,Destination_City_City_29,Destination_City_City_31,Destination_City_City_34,Destination_City_City_39,Destination_City_City_41,Destination_City_City_43,Destination_City_City_45,Destination_City_City_5,Destination_City_City_9,Destination_City_Other
0,-0.089736,0.555604,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
1,1.440117,0.089581,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2,-0.676392,-0.197338,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,1.80094,-0.151273,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
4,-0.040158,-0.194908,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
