# Advanced Imputation with Scikit-learn

### Objective: Use Scikit-learn’s imputation techniques (SimpleImputer & KNNImputer) to handle missing values more intelligently

# 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer


# 2. Create a Sample Dataset

In [2]:
data = {
    'age': [25, np.nan, 32, 40, np.nan, 28],
    'salary': [50000, 60000, np.nan, 72000, 68000, np.nan],
    'city': ['Delhi', np.nan, 'Mumbai', 'Chennai', np.nan, 'Delhi']
}

df = pd.DataFrame(data)
print("Original Data:")
print(df)


Original Data:
    age   salary     city
0  25.0  50000.0    Delhi
1   NaN  60000.0      NaN
2  32.0      NaN   Mumbai
3  40.0  72000.0  Chennai
4   NaN  68000.0      NaN
5  28.0      NaN    Delhi


# 3. Imputation Using Scikit-learn 
# A. Imputing Numerical Columns with SimpleImputer (Mean / Median)

In [3]:
num_imputer = SimpleImputer(strategy='median')

df[['age', 'salary']] = num_imputer.fit_transform(df[['age', 'salary']])


# B. Imputing Categorical Columns with SimpleImputer (Most Frequent)

In [4]:
cat_imputer = SimpleImputer(strategy='most_frequent')

df[['city']] = cat_imputer.fit_transform(df[['city']])


# 4. STEP:Advanced Numerical Imputation → KNNImputer

In [5]:
knn = KNNImputer(n_neighbors=3)

df[['age', 'salary']] = knn.fit_transform(df[['age', 'salary']])


# STEP:5 Final Dataset

In [6]:
print("\nAfter Advanced Imputation:")
print(df)



After Advanced Imputation:
    age   salary     city
0  25.0  50000.0    Delhi
1  30.0  60000.0    Delhi
2  32.0  64000.0   Mumbai
3  40.0  72000.0  Chennai
4  30.0  68000.0    Delhi
5  28.0  64000.0    Delhi
