# Categorical and Numerical Imputation

# Objective: Clean a dataset that contains both numerical and categorical missing values, using median imputation for the numerical columns and mode (most frequent value) imputation for the categorical columns.

# 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np


# 2. Create a Mixed Dataset with Missing Values

In [2]:
# Small mixed-type example: two numeric columns and two categorical columns,
# each with exactly two missing entries (np.nan) to impute later on.
data = dict(
    age=[25, np.nan, 35, 40, np.nan, 28],
    salary=[50000, np.nan, 56000, 72000, np.nan, 59000],
    city=['Delhi', 'Mumbai', np.nan, 'Chennai', np.nan, 'Delhi'],
    gender=['M', np.nan, 'F', 'F', 'M', np.nan],
)
df = pd.DataFrame(data)

# Show the raw frame before any cleaning is applied.
print("Original Dataset:", df, sep="\n")


Original Dataset:
    age   salary     city gender
0  25.0  50000.0    Delhi      M
1   NaN      NaN   Mumbai    NaN
2  35.0  56000.0      NaN      F
3  40.0  72000.0  Chennai      F
4   NaN      NaN      NaN      M
5  28.0  59000.0    Delhi    NaN


# 3. Identify Missing Values

In [3]:
# Tally the NaN entries in each column before any imputation is applied.
missing_per_column = df.isna().sum()
print("\nMissing Values Per Column:")
print(missing_per_column)



Missing Values Per Column:
age       2
salary    2
city      2
gender    2
dtype: int64


# 4. Handle Numerical Missing Values

In [6]:
# Fill the gaps in each numeric column with that column's own median
# (Series.median() ignores NaN entries by default).
for numeric_col in ('age', 'salary'):
    column_median = df[numeric_col].median()
    df[numeric_col] = df[numeric_col].fillna(column_median)


# 5. Handle Categorical Missing Values

In [11]:
# Fill the gaps in each categorical column with its most frequent value.
# mode() returns every tied value in sorted order, so taking element 0 picks
# the alphabetically first one on a tie (e.g. 'F' over 'M' for gender here).
for categorical_col in ('city', 'gender'):
    most_frequent = df[categorical_col].mode()[0]
    df[categorical_col] = df[categorical_col].fillna(most_frequent)


# 6. Final Check

In [12]:
# Re-count the NaN entries per column after imputation and show the tally.
remaining_missing = df.isnull().sum()
print("\nMissing Values After Imputation:")
print(remaining_missing)

# Fail loudly on a fresh "Restart & Run All" if any column still has gaps,
# rather than relying on someone reading the printed counts.
assert remaining_missing.sum() == 0, "imputation left missing values behind"



Missing Values After Imputation:
age       0
salary    0
city      0
gender    0
dtype: int64


# 7. Final Cleaned Dataset

In [13]:
# Display the frame after both numerical and categorical imputation.
print("\nCleaned Dataset:", df, sep="\n")



Cleaned Dataset:
    age   salary     city gender
0  25.0  50000.0    Delhi      M
1  31.5  57500.0   Mumbai      F
2  35.0  56000.0    Delhi      F
3  40.0  72000.0  Chennai      F
4  31.5  57500.0    Delhi      M
5  28.0  59000.0    Delhi      F
