In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer  # Explicitly enable experimental features
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import (
    StandardScaler, RobustScaler, MinMaxScaler, PowerTransformer, QuantileTransformer,
    OneHotEncoder, OrdinalEncoder, PolynomialFeatures, FunctionTransformer, KBinsDiscretizer
)
from sklearn.feature_extraction import FeatureHasher
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, SelectFromModel
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans


sns.set_theme(style="whitegrid")


In [8]:
pip install KNNImputer


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement KNNImputer (from versions: none)
ERROR: No matching distribution found for KNNImputer


In [9]:
df = pd.read_csv('Train.csv')
print(df)
df.head()

          ID Warehouse_block Mode_of_Shipment  Customer_care_calls  \
0          1               D           Flight                    4   
1          2               F           Flight                    4   
2          3               A           Flight                    2   
3          4               B           Flight                    3   
4          5               C           Flight                    2   
...      ...             ...              ...                  ...   
10994  10995               A             Ship                    4   
10995  10996               B             Ship                    4   
10996  10997               C             Ship                    5   
10997  10998               F             Ship                    5   
10998  10999               D             Ship                    2   

       Customer_rating  Cost_of_the_Product  Prior_purchases  \
0                    2                  177                3   
1                    5         

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [10]:
df.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [11]:
if 'ID' in df.columns:
    df = df.drop('ID',axis=1)
targer_col = 'Reached.on.Time_Y.N'
x = df.drop(targer_col, axis =1)
y = df[targer_col]

print(f"Features Shape: {x.head()}")
print(f"Target Shape: {y.tail()}")


Features Shape:   Warehouse_block Mode_of_Shipment  Customer_care_calls  Customer_rating  \
0               D           Flight                    4                2   
1               F           Flight                    4                5   
2               A           Flight                    2                2   
3               B           Flight                    3                3   
4               C           Flight                    2                2   

   Cost_of_the_Product  Prior_purchases Product_importance Gender  \
0                  177                3                low      F   
1                  216                2                low      M   
2                  183                4                low      M   
3                  176                4             medium      M   
4                  184                3             medium      F   

   Discount_offered  Weight_in_gms  
0                44           1233  
1                59           3088  
2

In [12]:
numeric_features = x.select_dtypes(include=['int64','float64']).columns
categorical_features = x.select_dtypes(include=['object']).columns

print(categorical_features)

print(numeric_features)

Index(['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender'], dtype='object')
Index(['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product',
       'Prior_purchases', 'Discount_offered', 'Weight_in_gms'],
      dtype='object')


In [13]:
numerical_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scalar', StandardScaler())
])

numerical_transformer

In [14]:
categorical_transformer = Pipeline(steps=[
        ('imputer',SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
categorical_transformer
                        
                            


In [15]:
proprocessor = ColumnTransformer(
    transformers = [
        ('num',numerical_transformer,numeric_features),
        ('cat',categorical_transformer, categorical_features)
    ])
proprocessor

In [16]:
pd.set_option('display.max_columns',None)
import warnings

warnings.filterwarnings('ignore')

np.random.seed(42)
n = 1000
df = pd.DataFrame({
    'Age': np.random.normal(35 ,10,n),
    'Income': np.random.lognormal(10,1,n),
    'City': np.random.choice(['New York','London','Paris','Tokyo','Unknown']+[f'City_{i}' for i in range(50)],n),
    'Score': np.random.uniform(1,100,n),
    "Category_A" : np.random.choice(['A','B','C',np.nan],n, p =[0.3,0.3,0.3,0.1]),
    "Feature_x" : np.random.normal(0,1,n),
    "Feature_y" : np.random.normal(0,1,n),
    "Target" : np.random.randint(0,2,n)
})
df.loc[1:10, 'Age']=150 
df.to_csv("Synthetic Data")
df.to_excel("Synthetic-Data.xlsx",index=False)


df

Unnamed: 0,Age,Income,City,Score,Category_A,Feature_x,Feature_y,Target
0,39.967142,89264.168397,City_24,73.540614,C,0.204759,0.499096,1
1,150.000000,55527.500577,City_25,61.066441,C,0.765878,-1.692271,0
2,150.000000,23379.862874,City_32,67.614295,C,-1.615887,-0.170386,0
3,150.000000,11534.100901,City_5,9.033769,,0.299648,-0.812824,0
4,150.000000,44277.118652,City_29,95.197580,A,0.932435,0.828059,0
...,...,...,...,...,...,...,...,...
995,32.188997,64225.155190,City_42,71.757165,A,2.394362,0.069337,1
996,52.976865,21449.974612,City_28,30.041989,B,2.185095,0.862543,1
997,41.408429,9119.090469,City_34,53.021560,,-0.686715,-1.694973,0
998,29.288210,18712.238164,City_5,70.085446,A,1.623885,0.820232,1


In [19]:
from sklearn.impute import KNNImputer


knn_imputer = KNNImputer(n_neighbors=5)
df_knn = knn_imputer.fit_transform(df[['Age','Score','Feature_x']])
print("KNN Imputation output shape:", df_knn.shape)
df_knn

KNN Imputation output shape: (1000, 3)


array([[ 39.96714153,  73.540614  ,   0.20475888],
       [150.        ,  61.06644075,   0.76587828],
       [150.        ,  67.61429519,  -1.61588679],
       ...,
       [ 41.40842861,  53.02156043,  -0.68671534],
       [ 29.2882101 ,  70.08544577,   1.62388499],
       [ 40.72582781,  90.12614701,   0.38316819]])

In [2]:
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

# --- GENERATE SYNTHETIC SHIPMENT DATA ---
# Creating a dataset that mimics the ShipmentSure problem (delays, costs, weights, etc.)
np.random.seed(42)
n = 1000
df = pd.DataFrame({
    'Warehouse_block': np.random.choice(['A', 'B', 'C', 'D', 'F'], n),
    'Mode_of_Shipment': np.random.choice(['Flight', 'Ship', 'Road', np.nan], n, p=[0.3, 0.3, 0.3, 0.1]), # Missing values example
    'Customer_care_calls': np.random.randint(2, 7, n),
    'Customer_rating': np.random.randint(1, 6, n),
    'Cost_of_the_Product': np.random.gamma(shape=2, scale=100, size=n), # Skewed distribution
    'Prior_purchases': np.random.randint(2, 10, n),
    'Product_importance': np.random.choice(['low', 'medium', 'high', 'critical'], n, p=[0.4, 0.4, 0.15, 0.05]),
    'Gender': np.random.choice(['M', 'F'], n),
    'Discount_offered': np.random.lognormal(2, 0.5, n).astype(int), # Highly skewed, potential outliers
    'Weight_in_gms': np.random.normal(4000, 1000, n),
    'Destination_City': np.random.choice(['New York', 'London', 'Paris', 'Tokyo', 'Unknown'] + [f'City_{i}' for i in range(50)], n), # High cardinality
    'Order_Hour': np.random.randint(0, 24, n),
    'Reached_on_Time': np.random.randint(0, 2, n)
})

# Introduce Missing Values in Weight (simulating data entry errors)
df.loc[0:20, 'Weight_in_gms'] = np.nan

# Introduce Extreme Outliers in Cost (e.g. VIP shipments)
df.loc[900:910, 'Cost_of_the_Product'] = 5000 

X = df.drop('Reached_on_Time', axis=1)
y = df['Reached_on_Time']

print("✅ Synthetic Shipment Data Created.")
print(df.head(3))

✅ Synthetic Shipment Data Created.
  Warehouse_block Mode_of_Shipment  Customer_care_calls  Customer_rating  \
0               D              nan                    4                2   
1               F             Ship                    2                2   
2               C             Road                    6                2   

   Cost_of_the_Product  Prior_purchases Product_importance Gender  \
0           311.610821                6                low      F   
1            49.650596                3             medium      M   
2           259.168778                3                low      M   

   Discount_offered  Weight_in_gms Destination_City  Order_Hour  \
0                 3            NaN           City_3          18   
1                 7            NaN          City_29          12   
2                 7            NaN          City_46          23   

   Reached_on_Time  
0                1  
1                0  
2                0  


In [13]:
# distance = sqrt({x2-x1)2 + (y2-y1)2})
knn_imputer = KNNImputer(n_neighbors= 5)
df_knn = knn_imputer.fit_transform(df[['Weight_in_gms','Cost_of_the_Product','Discount_offered']])
print("Knn Imputation Output shape:", df_knn.shape)
type(df_knn)

Knn Imputation Output shape: (1000, 3)


numpy.ndarray

In [24]:
df_knn = pd.DataFrame(knn_imputer.fit_transform(df[['Weight_in_gms','Cost_of_the_Product','Discount_offered']]),
                      columns = ['Weight_in_gms','Cost_of_the_Product','Discount_offered'])
df_knn.describe()
df_knn
    

Unnamed: 0,Weight_in_gms,Cost_of_the_Product,Discount_offered
0,3473.738849,311.610821,3.0
1,3986.615693,49.650596,7.0
2,4040.379338,259.168778,7.0
3,3508.767069,244.611753,10.0
4,4204.831921,268.205824,9.0
...,...,...,...
995,574.204647,328.756075,12.0
996,2161.851553,61.328146,8.0
997,2334.039534,92.280223,4.0
998,4266.537931,193.313501,6.0


In [23]:
# Multivariant Imputation by chained Equation (MICE)

iter_imputer = IterativeImputer(max_iter=10,random_state=0)
df_mice = iter_imputer.fit_transform(df[['Weight_in_gms','Cost_of_the_Product','Discount_offered']])
print(" MICE Iputation output shape:", df_mice.shape)
df_mice

 MICE Iputation output shape: (1000, 3)


array([[4.01998672e+03, 3.11610821e+02, 3.00000000e+00],
       [4.01999232e+03, 4.96505956e+01, 7.00000000e+00],
       [4.01998784e+03, 2.59168778e+02, 7.00000000e+00],
       ...,
       [2.33403953e+03, 9.22802231e+01, 4.00000000e+00],
       [4.26653793e+03, 1.93313501e+02, 6.00000000e+00],
       [3.02866731e+03, 3.28258748e+02, 1.60000000e+01]])

In [30]:
# 3. Custom Transformer for Winsorization
class Winsorizer(BaseEstimator, TransformerMixin):
    def __init__(self, lower_quantile=0.01, upper_quantile=0.99):
        self.lower = lower_quantile
        self.upper = upper_quantile
        self.limits_ = {}

    def fit(self, X, y=None):
        for col in X.columns:
            self.limits_[col] = (X[col].quantile(self.lower), X[col].quantile(self.upper))
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col in X.columns:
            low, high = self.limits_[col]
            X_copy[col] = X_copy[col].clip(lower=low, upper=high)
        return X_copy

winsorizer = Winsorizer(upper_quantile=0.95) # Cap at 95th percentile
df_capped = winsorizer.fit_transform(df[['Cost_of_the_Product']])
print(f"Max Cost Before: {df['Cost_of_the_Product'].max():.2f}, After: {df_capped['Cost_of_the_Product'].max():.2f}")

Max Cost Before: 5000.00, After: 509.34
