In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

In [16]:
df = pd.read_csv("Train.csv")

print("First 5 Rows:")
print(df.head())

print("\nShape of Dataset:", df.shape)

First 5 Rows:
   ID Warehouse_block Mode_of_Shipment  Customer_care_calls  Customer_rating  \
0   1               C           Flight                    2                5   
1   2               F             Ship                    7                2   
2   3               B             Ship                    7                2   
3   4               A             Road                    5                3   
4   5               F             Ship                    3                5   

   Cost_of_the_Product  Prior_purchases Product_importance Gender  \
0                  306                6               high      M   
1                  114                3               high      M   
2                  215                7             medium      F   
3                  126                5             medium      M   
4                  113                3             medium      M   

   Discount_offered  Weight_in_gms  Reached.on.Time_Y.N  
0                45           38

In [17]:
if 'ID' in df.columns:
    df.drop('ID', axis=1, inplace=True)
    print("\nID column removed")


ID column removed


In [30]:
df.head()

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,C,Flight,2,5,306,6,high,M,45,3838,1
1,F,Ship,7,2,114,3,high,M,35,2710,1
2,B,Ship,7,2,215,7,medium,F,44,4152,0
3,A,Road,5,3,126,5,medium,M,54,2245,0
4,F,Ship,3,5,113,3,medium,M,43,1806,1


In [18]:
target = 'Reached.on.Time_Y.N'

X = df.drop(target, axis=1)
y = df[target]

print("\nInput Features Shape:", X.shape)
print("Target Shape:", y.shape)


Input Features Shape: (1000, 10)
Target Shape: (1000,)


In [19]:
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

print("\nNumerical Columns:", list(num_cols))
print("Categorical Columns:", list(cat_cols))


Numerical Columns: ['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases', 'Discount_offered', 'Weight_in_gms']
Categorical Columns: ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']


In [20]:
print("\nMissing Values Before Cleaning:")
print(X.isnull().sum())


Missing Values Before Cleaning:
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
dtype: int64


In [21]:
for col in num_cols:
    X[col].fillna(X[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].median(), inplace=True)


In [22]:
for col in cat_cols:
    X[col].fillna("Missing", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna("Missing", inplace=True)


In [23]:
print("\nMissing Values After Cleaning:")
print(X.isnull().sum())


Missing Values After Cleaning:
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
dtype: int64


In [24]:
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

print("\nShape After Encoding:", X.shape)


Shape After Encoding: (1000, 15)


In [25]:
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

print("\nData After Scaling (First 5 Rows):")
print(X.head())


Data After Scaling (First 5 Rows):
   Customer_care_calls  Customer_rating  Cost_of_the_Product  Prior_purchases  \
0            -1.458242         1.418235             1.685582         0.006219   
1             1.431655        -0.689098            -1.376677        -1.159787   
2             1.431655        -0.689098             0.234199         0.394887   
3             0.275696         0.013346            -1.185286        -0.382450   
4            -0.880263         1.418235            -1.392626        -1.159787   

   Discount_offered  Weight_in_gms  Warehouse_block_B  Warehouse_block_C  \
0          0.703501       0.239186              False               True   
1          0.162887      -0.562394              False              False   
2          0.649440       0.462321               True              False   
3          1.190054      -0.892833              False              False   
4          0.595378      -1.204796              False              False   

   Warehouse_block_D

In [26]:
syn_df = pd.read_excel("Synthetic-Data.xlsx")

print("\nSynthetic Data Preview:")
print(syn_df.head())

print("\nMissing Values Before KNN:")
print(syn_df.isnull().sum())


Synthetic Data Preview:
          Age        Income     City      Score Category_A  Feature_x  \
0   39.967142  89264.168397  City_24  73.540614          C   0.204759   
1  150.000000  55527.500577  City_25  61.066441          C   0.765878   
2  150.000000  23379.862874  City_32  67.614295          C  -1.615887   
3  150.000000  11534.100901   City_5   9.033769        NaN   0.299648   
4  150.000000  44277.118652  City_29  95.197580          A   0.932435   

   Feature_y  Target  
0   0.499096       1  
1  -1.692271       0  
2  -0.170386       0  
3  -0.812824       0  
4   0.828059       0  

Missing Values Before KNN:
Age            0
Income         0
City           0
Score          0
Category_A    94
Feature_x      0
Feature_y      0
Target         0
dtype: int64


In [27]:
imputer = KNNImputer(n_neighbors=5)

numeric_cols = syn_df.select_dtypes(include=['int64', 'float64']).columns

syn_imputed = imputer.fit_transform(syn_df[numeric_cols])

syn_imputed_df = pd.DataFrame(syn_imputed, columns=numeric_cols)

In [28]:
syn_df[numeric_cols] = syn_imputed_df

In [29]:
print("\nMissing Values After KNN:")
print(syn_df.isnull().sum())

print("\nFinal Synthetic Data:")
print(syn_df.head())


Missing Values After KNN:
Age            0
Income         0
City           0
Score          0
Category_A    94
Feature_x      0
Feature_y      0
Target         0
dtype: int64

Final Synthetic Data:
          Age        Income     City      Score Category_A  Feature_x  \
0   39.967142  89264.168397  City_24  73.540614          C   0.204759   
1  150.000000  55527.500577  City_25  61.066441          C   0.765878   
2  150.000000  23379.862874  City_32  67.614295          C  -1.615887   
3  150.000000  11534.100901   City_5   9.033769        NaN   0.299648   
4  150.000000  44277.118652  City_29  95.197580          A   0.932435   

   Feature_y  Target  
0   0.499096     1.0  
1  -1.692271     0.0  
2  -0.170386     0.0  
3  -0.812824     0.0  
4   0.828059     0.0  
