In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split


In [24]:
# Load the training data from the CSV file in the working directory.
df = pd.read_csv("Train.csv")

# Report the (rows, columns) dimensions, then preview the first five rows.
n_rows, n_cols = df.shape
print("Dataset Shape:", (n_rows, n_cols))
display(df.head(5))

Dataset Shape: (1000, 12)


Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,C,Flight,2,5,306,6,high,M,45,3838,1
1,2,F,Ship,7,2,114,3,high,M,35,2710,1
2,3,B,Ship,7,2,215,7,medium,F,44,4152,0
3,4,A,Road,5,3,126,5,medium,M,54,2245,0
4,5,F,Ship,3,5,113,3,medium,M,43,1806,1


In [25]:
# Remove the 'ID' column from the frame.
# The result is rebound (instead of dropping in place) and errors='ignore'
# is passed so the cell is safe to re-run: a second execution finds no 'ID'
# column and simply leaves the frame unchanged rather than raising KeyError.
df = df.drop(columns=['ID'], errors='ignore')

print("After Dropping ID:", df.shape)
display(df.head())

After Dropping ID: (1000, 11)


Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,C,Flight,2,5,306,6,high,M,45,3838,1
1,F,Ship,7,2,114,3,high,M,35,2710,1
2,B,Ship,7,2,215,7,medium,F,44,4152,0
3,A,Road,5,3,126,5,medium,M,54,2245,0
4,F,Ship,3,5,113,3,medium,M,43,1806,1


In [26]:
# Separate the label column from the predictors.
target_col = 'Reached.on.Time_Y.N'

y = df[target_col]                  # target series
X = df.drop(columns=[target_col])   # every remaining column is a feature

print("Feature Shape:", X.shape)
print("Target Shape:", y.shape)

Feature Shape: (1000, 10)
Target Shape: (1000,)


In [27]:
# Partition the feature columns by dtype: object-typed columns are treated
# as categorical, everything else as numerical.
object_frame = X.select_dtypes(include=['object'])
non_object_frame = X.select_dtypes(exclude=['object'])

cat_cols = object_frame.columns
num_cols = non_object_frame.columns

print("Categorical Columns:", cat_cols.tolist())
print("Numerical Columns:", num_cols.tolist())

Categorical Columns: ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']
Numerical Columns: ['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases', 'Discount_offered', 'Weight_in_gms']


In [28]:
# Build one replacement value per column: each numerical column gets its own
# median, each categorical column gets the literal placeholder "Missing".
fill_values = {name: X[name].median() for name in num_cols}
fill_values.update({name: "Missing" for name in cat_cols})

# Apply the replacements column by column (numerical first, then categorical).
for name, value in fill_values.items():
    X[name] = X[name].fillna(value)

# Per-column count of remaining nulls.
print("Missing Values After Handling:")
print(X.isnull().sum())

Missing Values After Handling:
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
dtype: int64


In [29]:
# Cast the categorical columns to str, then turn every row into a
# {column_name: value} dict -- the input shape used for hashing below.
X_cat = X.loc[:, cat_cols].astype(str)
cat_dict = X_cat.to_dict(orient="records")

print("Sample Dictionary Format:")
print(cat_dict[0:3])

Sample Dictionary Format:
[{'Warehouse_block': 'C', 'Mode_of_Shipment': 'Flight', 'Product_importance': 'high', 'Gender': 'M'}, {'Warehouse_block': 'F', 'Mode_of_Shipment': 'Ship', 'Product_importance': 'high', 'Gender': 'M'}, {'Warehouse_block': 'B', 'Mode_of_Shipment': 'Ship', 'Product_importance': 'medium', 'Gender': 'F'}]


In [30]:
# Width of the hashed representation produced for the categorical columns.
N_HASH_FEATURES = 8

# Hash each row dict into a fixed-width sparse vector.
hasher = FeatureHasher(input_type='dict', n_features=N_HASH_FEATURES)
hashed_features = hasher.transform(cat_dict)

# Densify the sparse result so it can be handled as a regular DataFrame.
dense_hashed = hashed_features.toarray()
hashed_df = pd.DataFrame(dense_hashed)

print("Hashed Feature Shape:", hashed_df.shape)
display(hashed_df.head())

Hashed Feature Shape: (1000, 8)


Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,-2.0,0.0,0.0,1.0,0.0,0.0,-1.0
1,0.0,-1.0,1.0,0.0,1.0,0.0,0.0,-1.0
2,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,-1.0
3,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,1.0,0.0,-1.0


In [31]:
# Join the numerical columns and the hashed categorical features side by side.
#
# pd.concat(axis=1) aligns rows on the index, so both pieces are given a fresh
# 0..n-1 index first. The reset frame must be bound to a name: calling
# reset_index(..., inplace=True) on X[num_cols] only mutates a temporary copy
# and leaves the data that is actually concatenated untouched.
hashed_df = hashed_df.reset_index(drop=True)
X_num = X[num_cols].reset_index(drop=True)

# Both pieces are derived from the same rows of X; a length mismatch would
# otherwise surface only as NaN-padded rows in the concatenated frame.
assert len(X_num) == len(hashed_df), "numeric and hashed features must have the same number of rows"

X_final = pd.concat([X_num, hashed_df], axis=1)

# NOTE(review): X_final mixes string column labels (numerical features) with
# integer labels (hashed features); confirm downstream consumers accept that,
# since scikit-learn estimators expect feature names to be all strings.

print("Final Feature Shape:", X_final.shape)
display(X_final.head())

Final Feature Shape: (1000, 14)


Unnamed: 0,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms,0,1,2,3,4,5,6,7
0,2,5,306,6,45,3838,0.0,-2.0,0.0,0.0,1.0,0.0,0.0,-1.0
1,7,2,114,3,35,2710,0.0,-1.0,1.0,0.0,1.0,0.0,0.0,-1.0
2,7,2,215,7,44,4152,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,-1.0
3,5,3,126,5,54,2245,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,3,5,113,3,43,1806,0.0,0.0,1.0,0.0,1.0,1.0,0.0,-1.0
