### Imputing Missing Data
**Description**: Impute missing data using various strategies like mean, median, or mode.

In [3]:
import pandas as pd
from sklearn.impute import SimpleImputer

def check_null_percentage(df, column, threshold=0.5):
    """Check if the null percentage in a column exceeds threshold."""
    null_ratio = df[column].isnull().mean()
    if null_ratio > threshold:
        raise ValueError(f"Too many missing values in '{column}' ({null_ratio:.2%}) to reliably impute.")
    return null_ratio

def impute_mean(df, column):
    try:
        check_null_percentage(df, column)
        imputer = SimpleImputer(strategy='mean')
        df[column] = imputer.fit_transform(df[[column]])
        print(f"Imputed missing values in '{column}' with mean.")
    except Exception as e:
        print(f"Mean imputation failed for '{column}': {e}")

def impute_median(df, column):
    try:
        check_null_percentage(df, column)
        imputer = SimpleImputer(strategy='median')
        df[column] = imputer.fit_transform(df[[column]])
        print(f"Imputed missing values in '{column}' with median.")
    except Exception as e:
        print(f"Median imputation failed for '{column}': {e}")

def impute_mode(df, column):
    try:
        check_null_percentage(df, column)
        imputer = SimpleImputer(strategy='most_frequent')
        df[column] = imputer.fit_transform(df[[column]])
        print(f"Imputed missing values in '{column}' with mode.")
    except Exception as e:
        print(f"Mode imputation failed for '{column}': {e}")

# Sample dataframe
data = {
    'Age': [25, 30, None, 22, 28, None],
    'Income': [50000, 60000, 55000, None, 62000, 58000],
    'Gender': ['M', None, 'F', 'F', 'M', None]
}
df = pd.DataFrame(data)
print("Original Data:\n", df)

# Impute columns
impute_mean(df, 'Age')
impute_median(df, 'Income')
impute_mode(df, 'Gender')

print("\nData After Imputation:\n", df)

Original Data:
     Age   Income Gender
0  25.0  50000.0      M
1  30.0  60000.0   None
2   NaN  55000.0      F
3  22.0      NaN      F
4  28.0  62000.0      M
5   NaN  58000.0   None
Imputed missing values in 'Age' with mean.
Imputed missing values in 'Income' with median.
Mode imputation failed for 'Gender': '<' not supported between instances of 'NoneType' and 'str'

Data After Imputation:
      Age   Income Gender
0  25.00  50000.0      M
1  30.00  60000.0   None
2  26.25  55000.0      F
3  22.00  58000.0      F
4  28.00  62000.0      M
5  26.25  58000.0   None


In [2]:
# Write your code from here

import pandas as pd
from sklearn.impute import SimpleImputer

# Sample dataframe with missing values
data = {
    'Age': [25, 30, None, 22, 28, None],
    'Income': [50000, 60000, 55000, None, 62000, 58000],
    'Gender': ['M', None, 'F', 'F', 'M', None]
}
df = pd.DataFrame(data)
print("Original Data:\n", df)

# Impute numeric columns with mean
mean_imputer = SimpleImputer(strategy='mean')
df['Age'] = mean_imputer.fit_transform(df[['Age']])

# Impute numeric columns with median
median_imputer = SimpleImputer(strategy='median')
df['Income'] = median_imputer.fit_transform(df[['Income']])

# Impute categorical columns with mode (most frequent)
mode_imputer = SimpleImputer(strategy='most_frequent')
df['Gender'] = mode_imputer.fit_transform(df[['Gender']])

print("\nData After Imputation:\n", df)

Original Data:
     Age   Income Gender
0  25.0  50000.0      M
1  30.0  60000.0   None
2   NaN  55000.0      F
3  22.0      NaN      F
4  28.0  62000.0      M
5   NaN  58000.0   None


TypeError: '<' not supported between instances of 'NoneType' and 'str'