In [55]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import (
    StandardScaler, RobustScaler, MinMaxScaler, PowerTransformer, QuantileTransformer,
    OneHotEncoder, OrdinalEncoder, PolynomialFeatures, FunctionTransformer, KBinsDiscretizer
)
from sklearn.feature_extraction import FeatureHasher

In [56]:
df = pd.read_csv('train.csv')
print(df)

          ID Warehouse_block Mode_of_Shipment  Customer_care_calls  \
0          1               D           Flight                    4   
1          2               F           Flight                    4   
2          3               A           Flight                    2   
3          4               B           Flight                    3   
4          5               C           Flight                    2   
...      ...             ...              ...                  ...   
10994  10995               A             Ship                    4   
10995  10996               B             Ship                    4   
10996  10997               C             Ship                    5   
10997  10998               F             Ship                    5   
10998  10999               D             Ship                    2   

       Customer_rating  Cost_of_the_Product  Prior_purchases  \
0                    2                  177                3   
1                    5         

In [57]:
df.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   10999 non-null  int64 
 1   Warehouse_block      10999 non-null  object
 2   Mode_of_Shipment     10999 non-null  object
 3   Customer_care_calls  10999 non-null  int64 
 4   Customer_rating      10999 non-null  int64 
 5   Cost_of_the_Product  10999 non-null  int64 
 6   Prior_purchases      10999 non-null  int64 
 7   Product_importance   10999 non-null  object
 8   Gender               10999 non-null  object
 9   Discount_offered     10999 non-null  int64 
 10  Weight_in_gms        10999 non-null  int64 
 11  Reached.on.Time_Y.N  10999 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 1.0+ MB


In [59]:
#dropping unnecessary columms
if 'ID' in df.columns:
    df = df.drop('ID',axis=1)
targer_col = 'Reached.on.Time_Y.N'
x = df.drop(targer_col, axis =1)
y = df[targer_col]

print(f"Features Shape: {x.head()}")
print(f"Target Shape: {y.tail()}")

Features Shape:   Warehouse_block Mode_of_Shipment  Customer_care_calls  Customer_rating  \
0               D           Flight                    4                2   
1               F           Flight                    4                5   
2               A           Flight                    2                2   
3               B           Flight                    3                3   
4               C           Flight                    2                2   

   Cost_of_the_Product  Prior_purchases Product_importance Gender  \
0                  177                3                low      F   
1                  216                2                low      M   
2                  183                4                low      M   
3                  176                4             medium      M   
4                  184                3             medium      F   

   Discount_offered  Weight_in_gms  
0                44           1233  
1                59           3088  
2

In [60]:
numeric_features = x.select_dtypes(include=['int64','float64']).columns
categorical_features = x.select_dtypes(include=['object']).columns

print(categorical_features)

print(numeric_features)

Index(['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender'], dtype='object')
Index(['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product',
       'Prior_purchases', 'Discount_offered', 'Weight_in_gms'],
      dtype='object')


In [61]:
x.head()

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
0,D,Flight,4,2,177,3,low,F,44,1233
1,F,Flight,4,5,216,2,low,M,59,3088
2,A,Flight,2,2,183,4,low,M,48,3374
3,B,Flight,3,3,176,4,medium,M,10,1177
4,C,Flight,2,2,184,3,medium,F,46,2484


In [62]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Warehouse_block      10999 non-null  object
 1   Mode_of_Shipment     10999 non-null  object
 2   Customer_care_calls  10999 non-null  int64 
 3   Customer_rating      10999 non-null  int64 
 4   Cost_of_the_Product  10999 non-null  int64 
 5   Prior_purchases      10999 non-null  int64 
 6   Product_importance   10999 non-null  object
 7   Gender               10999 non-null  object
 8   Discount_offered     10999 non-null  int64 
 9   Weight_in_gms        10999 non-null  int64 
dtypes: int64(6), object(4)
memory usage: 859.4+ KB


In [63]:
df['Product_importance'].unique()

array(['low', 'medium', 'high'], dtype=object)

In [64]:
#ordinal feature encoding
importance_mapping={
    'low':1,
    'medium':2,
    'high':3
}

In [65]:
x['Product_importance']=x['Product_importance'].map(importance_mapping)

In [66]:
x['Product_importance'].head(10)

0    1
1    1
2    1
3    2
4    2
5    2
6    1
7    1
8    1
9    2
Name: Product_importance, dtype: int64

In [67]:
x[['Product_importance']].head(20)

Unnamed: 0,Product_importance
0,1
1,1
2,1
3,2
4,2
5,2
6,1
7,1
8,1
9,2


In [64]:
inv_mapping = {v: k for k, v in importance_mapping.items()}
x['Product_importance']=x['Product_importance'].map(inv_mapping)

In [66]:
custom_order = [['low', 'medium', 'high']]
encoder = OrdinalEncoder(categories=custom_order)

x['Product_importance'] = encoder.fit_transform(
    x[['Product_importance']])


In [68]:
x['Gender'].unique()

array(['F', 'M'], dtype=object)

In [69]:
x['Gender'].head(10)

0    F
1    M
2    M
3    M
4    F
5    F
6    F
7    F
8    F
9    F
Name: Gender, dtype: object

In [70]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
x['Gender'] = encoder.fit_transform(
    x['Gender'])

In [71]:
x['Gender'].head()

0    0
1    1
2    1
3    1
4    0
Name: Gender, dtype: int32

In [72]:
x[['Gender']].head(10)

Unnamed: 0,Gender
0,0
1,1
2,1
3,1
4,0
5,0
6,0
7,0
8,0
9,0


In [73]:
f_hash = x['Mode_of_Shipment'].astype(str).apply(lambda val: [val])
hasher = FeatureHasher(n_features=4, input_type='string')
hashed_f = hasher.transform(f_hash)
hashed_df = pd.DataFrame(
    hashed_f.toarray(),
    columns=[f"Mode_of_Shipment_hash_{i}" for i in range(4)]
)

In [74]:
print(hashed_df.shape)

(10999, 4)


In [75]:
hashed_df.head()

Unnamed: 0,Mode_of_Shipment_hash_0,Mode_of_Shipment_hash_1,Mode_of_Shipment_hash_2,Mode_of_Shipment_hash_3
0,0.0,0.0,-1.0,0.0
1,0.0,0.0,-1.0,0.0
2,0.0,0.0,-1.0,0.0
3,0.0,0.0,-1.0,0.0
4,0.0,0.0,-1.0,0.0


In [76]:
f_hash_warehouse = x['Warehouse_block'].astype(str).apply(lambda val: [val])
hasher_warehouse = FeatureHasher(n_features=6, input_type='string')
hashed_f_warehouse = hasher_warehouse.transform(f_hash_warehouse)
hashed_df_warehouse = pd.DataFrame(
    hashed_f_warehouse.toarray(),
    columns=[f"Warehouse_block_hash_{i}" for i in range(6)]
)

In [77]:
print(hashed_df_warehouse.shape)

(10999, 6)


In [78]:
hashed_df_warehouse.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Warehouse_block_hash_0  10999 non-null  float64
 1   Warehouse_block_hash_1  10999 non-null  float64
 2   Warehouse_block_hash_2  10999 non-null  float64
 3   Warehouse_block_hash_3  10999 non-null  float64
 4   Warehouse_block_hash_4  10999 non-null  float64
 5   Warehouse_block_hash_5  10999 non-null  float64
dtypes: float64(6)
memory usage: 515.7 KB


In [79]:
hashed_df_warehouse.head()

Unnamed: 0,Warehouse_block_hash_0,Warehouse_block_hash_1,Warehouse_block_hash_2,Warehouse_block_hash_3,Warehouse_block_hash_4,Warehouse_block_hash_5
0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,-1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,-1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,-1.0,0.0,0.0


In [80]:
cols = ['Mode_of_Shipment', 'Warehouse_block']
combined = df[cols].astype(str).agg(' '.join, axis=1)
tokens = combined.apply(lambda x: x.split())
hasher = FeatureHasher(n_features=6, input_type='string')
hashed = hasher.transform(tokens)
hashed_df = pd.DataFrame(
    hashed.toarray(),
    columns=[f"hash_{i}" for i in range(6)]
)

In [81]:
x= pd.concat([x, hashed_df], axis=1)
print("New shape:", x.shape)
x.head()

New shape: (10999, 16)


Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,hash_0,hash_1,hash_2,hash_3,hash_4,hash_5
0,D,Flight,4,2,177,3,1,0,44,1233,0.0,0.0,0.0,0.0,0.0,0.0
1,F,Flight,4,5,216,2,1,1,59,3088,0.0,0.0,-2.0,0.0,0.0,0.0
2,A,Flight,2,2,183,4,1,1,48,3374,0.0,0.0,-1.0,0.0,1.0,0.0
3,B,Flight,3,3,176,4,2,1,10,1177,0.0,0.0,-2.0,0.0,0.0,0.0
4,C,Flight,2,2,184,3,2,0,46,2484,0.0,0.0,-1.0,-1.0,0.0,0.0


In [82]:
#polynomialfeature generation
num_cols = ['Weight_in_gms', 'Discount_offered']
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_data = poly.fit_transform(df[num_cols])
poly_df = pd.DataFrame(
    poly_data,
    columns=poly.get_feature_names_out(num_cols)
)

In [83]:
poly_df.head()

Unnamed: 0,Weight_in_gms,Discount_offered,Weight_in_gms^2,Weight_in_gms Discount_offered,Discount_offered^2
0,1233.0,44.0,1520289.0,54252.0,1936.0
1,3088.0,59.0,9535744.0,182192.0,3481.0
2,3374.0,48.0,11383876.0,161952.0,2304.0
3,1177.0,10.0,1385329.0,11770.0,100.0
4,2484.0,46.0,6170256.0,114264.0,2116.0


In [84]:
num_cols = ['Cost_of_the_Product', 'Discount_offered']
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_data = poly.fit_transform(df[num_cols])
poly_df = pd.DataFrame(
    poly_data,
    columns=poly.get_feature_names_out(num_cols)
)

In [85]:
poly_df.head()

Unnamed: 0,Cost_of_the_Product,Discount_offered,Cost_of_the_Product^2,Cost_of_the_Product Discount_offered,Discount_offered^2
0,177.0,44.0,31329.0,7788.0,1936.0
1,216.0,59.0,46656.0,12744.0,3481.0
2,183.0,48.0,33489.0,8784.0,2304.0
3,176.0,10.0,30976.0,1760.0,100.0
4,184.0,46.0,33856.0,8464.0,2116.0


In [86]:
x.head()

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,hash_0,hash_1,hash_2,hash_3,hash_4,hash_5
0,D,Flight,4,2,177,3,1,0,44,1233,0.0,0.0,0.0,0.0,0.0,0.0
1,F,Flight,4,5,216,2,1,1,59,3088,0.0,0.0,-2.0,0.0,0.0,0.0
2,A,Flight,2,2,183,4,1,1,48,3374,0.0,0.0,-1.0,0.0,1.0,0.0
3,B,Flight,3,3,176,4,2,1,10,1177,0.0,0.0,-2.0,0.0,0.0,0.0
4,C,Flight,2,2,184,3,2,0,46,2484,0.0,0.0,-1.0,-1.0,0.0,0.0


In [87]:
#final categorical encoded data
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      10999 non-null  object 
 1   Mode_of_Shipment     10999 non-null  object 
 2   Customer_care_calls  10999 non-null  int64  
 3   Customer_rating      10999 non-null  int64  
 4   Cost_of_the_Product  10999 non-null  int64  
 5   Prior_purchases      10999 non-null  int64  
 6   Product_importance   10999 non-null  int64  
 7   Gender               10999 non-null  int32  
 8   Discount_offered     10999 non-null  int64  
 9   Weight_in_gms        10999 non-null  int64  
 10  hash_0               10999 non-null  float64
 11  hash_1               10999 non-null  float64
 12  hash_2               10999 non-null  float64
 13  hash_3               10999 non-null  float64
 14  hash_4               10999 non-null  float64
 15  hash_5               10999 non-null 