In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import PolynomialFeatures

In [2]:
from google.colab import files
uploaded = files.upload()

Saving Train.csv to Train.csv


In [8]:
df = pd.read_csv("Train.csv")
print("Dataset Size:", df.shape)
df.head()

Dataset Size: (10999, 12)


Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [9]:
importance_tokens = df['Product_importance'].astype(str).apply(lambda x: [x])

hasher_imp = FeatureHasher(n_features=5, input_type='string')

hashed_importance = hasher_imp.transform(importance_tokens)

hashed_imp_df = pd.DataFrame(
    hashed_importance.toarray(),
    columns=[f"Imp_hash_{i}" for i in range(5)]
)

print("Hashed Importance Shape:", hashed_imp_df.shape)
hashed_imp_df.head()

Hashed Importance Shape: (10999, 5)


Unnamed: 0,Imp_hash_0,Imp_hash_1,Imp_hash_2,Imp_hash_3,Imp_hash_4
0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0


In [10]:
combined_cols = df[['Warehouse_block', 'Gender']].astype(str)

combined_tokens = combined_cols.apply(lambda row: row.tolist(), axis=1)

hasher_combo = FeatureHasher(n_features=8, input_type='string')

hashed_combo = hasher_combo.transform(combined_tokens)

hashed_combo_df = pd.DataFrame(
    hashed_combo.toarray(),
    columns=[f"Combo_hash_{i}" for i in range(8)]
)

print("Combined Hash Shape:", hashed_combo_df.shape)
hashed_combo_df.head()

Combined Hash Shape: (10999, 8)


Unnamed: 0,Combo_hash_0,Combo_hash_1,Combo_hash_2,Combo_hash_3,Combo_hash_4,Combo_hash_5,Combo_hash_6,Combo_hash_7
0,0.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,0.0
4,0.0,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0


In [11]:
df['Value_Density'] = df['Cost_of_the_Product'] / (df['Weight_in_gms'] + 10)

In [12]:
df['Discount_Intensity'] = df['Discount_offered'] / (df['Cost_of_the_Product'] + df['Weight_in_gms'])

In [13]:
df['Service_Load'] = df['Customer_care_calls'] + df['Prior_purchases']

In [14]:
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

poly_cols = ['Weight_in_gms', 'Discount_offered']

poly_features = poly.fit_transform(df[poly_cols])

poly_names = poly.get_feature_names_out(poly_cols)

poly_df = pd.DataFrame(poly_features, columns=poly_names)

poly_df.head()

Unnamed: 0,Weight_in_gms,Discount_offered,Weight_in_gms Discount_offered
0,1233.0,44.0,54252.0
1,3088.0,59.0,182192.0
2,3374.0,48.0,161952.0
3,1177.0,10.0,11770.0
4,2484.0,46.0,114264.0


In [16]:
df_final = pd.concat([df, hashed_imp_df, hashed_combo_df, poly_df], axis=1)

print("Final Shape:", df_final.shape)
df_final.head()

Final Shape: (10999, 31)


Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,...,Combo_hash_1,Combo_hash_2,Combo_hash_3,Combo_hash_4,Combo_hash_5,Combo_hash_6,Combo_hash_7,Weight_in_gms,Discount_offered.1,Weight_in_gms Discount_offered
0,1,D,Flight,4,2,177,3,low,F,44,...,0.0,-1.0,0.0,0.0,0.0,1.0,0.0,1233.0,44.0,54252.0
1,2,F,Flight,4,5,216,2,low,M,59,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3088.0,59.0,182192.0
2,3,A,Flight,2,2,183,4,low,M,48,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,3374.0,48.0,161952.0
3,4,B,Flight,3,3,176,4,medium,M,10,...,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,1177.0,10.0,11770.0
4,5,C,Flight,2,2,184,3,medium,F,46,...,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0,2484.0,46.0,114264.0
