In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import PolynomialFeatures

sns.set_theme(style="whitegrid")

In [3]:
df = pd.read_csv("../Data/Train.csv")

print("Dataset Shape:", df.shape)
df.head()

Dataset Shape: (10999, 12)


Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [4]:
print("Column\n", df.columns)
print("Missing Value\n", df.isnull().sum())

Column
 Index(['ID', 'Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms',
       'Reached.on.Time_Y.N'],
      dtype='object')
Missing Value
 ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
Reached.on.Time_Y.N    0
dtype: int64


In [6]:
mode_list = df['Mode_of_Shipment'].astype(str).apply(lambda x: [x])

hasher = FeatureHasher(n_features=4, input_type='string')

hashed_features = hasher.transform(mode_list)

hashed_df = pd.DataFrame(
    hashed_features.toarray(),
    columns=[f"Mode_hash_{i}" for i in range(4)]
)

print("Hashed Features Shape:", hashed_df.shape)
hashed_df.head()

Hashed Features Shape: (10999, 4)


Unnamed: 0,Mode_hash_0,Mode_hash_1,Mode_hash_2,Mode_hash_3
0,0.0,0.0,-1.0,0.0
1,0.0,0.0,-1.0,0.0
2,0.0,0.0,-1.0,0.0
3,0.0,0.0,-1.0,0.0
4,0.0,0.0,-1.0,0.0


In [8]:
cat_cols = ['Mode_of_Shipment', 'Warehouse_block']
combined = df[cat_cols].astype(str).agg(' '.join, axis=1)
combined_tokens = combined.apply(lambda x: x.split())

hasher_multi = FeatureHasher(n_features=6, input_type='string')

hashed_multi = hasher_multi.transform(combined_tokens)
hashed_multi_df = pd.DataFrame(
    hashed_multi.toarray(),
    columns=[f"Hash_{i}" for i in range(6)]
)
print("Hashed Multi Shape:", hashed_multi_df.shape)
hashed_multi_df.head()

Hashed Multi Shape: (10999, 6)


Unnamed: 0,Hash_0,Hash_1,Hash_2,Hash_3,Hash_4,Hash_5
0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,-2.0,0.0,0.0,0.0
2,0.0,0.0,-1.0,0.0,1.0,0.0
3,0.0,0.0,-2.0,0.0,0.0,0.0
4,0.0,0.0,-1.0,-1.0,0.0,0.0


In [9]:
df['Cost_per_Gram'] = df['Cost_of_the_Product'] / (df['Weight_in_gms'] + 1)

df[['Cost_of_the_Product','Weight_in_gms','Cost_per_Gram']].head()

Unnamed: 0,Cost_of_the_Product,Weight_in_gms,Cost_per_Gram
0,177,1233,0.143436
1,216,3088,0.069926
2,183,3374,0.054222
3,176,1177,0.149406
4,184,2484,0.074044


In [10]:
df['Discount_Ratio'] = df['Discount_offered'] / (df['Cost_of_the_Product'] + 1)

df[['Discount_offered','Cost_of_the_Product','Discount_Ratio']].head()

Unnamed: 0,Discount_offered,Cost_of_the_Product,Discount_Ratio
0,44,177,0.247191
1,59,216,0.271889
2,48,183,0.26087
3,10,176,0.056497
4,46,184,0.248649


In [11]:
df['Customer_Engagement'] = df['Customer_care_calls'] * df['Customer_rating']

df[['Customer_care_calls','Customer_rating','Customer_Engagement']].head()

Unnamed: 0,Customer_care_calls,Customer_rating,Customer_Engagement
0,4,2,8
1,4,5,20
2,2,2,4
3,3,3,9
4,2,2,4


In [12]:
poly = PolynomialFeatures(degree=2, interaction_only=True)

cols = ['Cost_of_the_Product','Discount_offered']

poly_features = poly.fit_transform(df[cols])

poly_feature_names = poly.get_feature_names_out(cols)

poly_df = pd.DataFrame(poly_features, columns=poly_feature_names)

poly_df.head()

Unnamed: 0,1,Cost_of_the_Product,Discount_offered,Cost_of_the_Product Discount_offered
0,1.0,177.0,44.0,7788.0
1,1.0,216.0,59.0,12744.0
2,1.0,183.0,48.0,8784.0
3,1.0,176.0,10.0,1760.0
4,1.0,184.0,46.0,8464.0


In [13]:
df_final = pd.concat([df, hashed_df, hashed_multi_df, poly_df], axis=1)

print("Final Dataset Shape:", df_final.shape)
df_final.head()

Final Dataset Shape: (10999, 29)


Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,...,Hash_0,Hash_1,Hash_2,Hash_3,Hash_4,Hash_5,1,Cost_of_the_Product.1,Discount_offered.1,Cost_of_the_Product Discount_offered
0,1,D,Flight,4,2,177,3,low,F,44,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,177.0,44.0,7788.0
1,2,F,Flight,4,5,216,2,low,M,59,...,0.0,0.0,-2.0,0.0,0.0,0.0,1.0,216.0,59.0,12744.0
2,3,A,Flight,2,2,183,4,low,M,48,...,0.0,0.0,-1.0,0.0,1.0,0.0,1.0,183.0,48.0,8784.0
3,4,B,Flight,3,3,176,4,medium,M,10,...,0.0,0.0,-2.0,0.0,0.0,0.0,1.0,176.0,10.0,1760.0
4,5,C,Flight,2,2,184,3,medium,F,46,...,0.0,0.0,-1.0,-1.0,0.0,0.0,1.0,184.0,46.0,8464.0
