# Import necessary libraries

In [2]:
import pandas as pd
import numpy as np

# Sample data creation


In [3]:
# Seed NumPy's global RNG so any later random draws are repeatable.
np.random.seed(0)

# Toy dataset: 'A' and 'B' are numeric with one gap each, 'C' is categorical
# with one gap, and 'D' is fully observed.
data = dict(
    A=[1, 2, np.nan, 4, 5],
    B=[np.nan, 2, 3, 4, 5],
    C=['a', 'b', 'c', np.nan, 'e'],
    D=[1, 2, 3, 4, 5],
)
df = pd.DataFrame(data)
print("Original Data:", df, sep="\n")


Original Data:
     A    B    C  D
0  1.0  NaN    a  1
1  2.0  2.0    b  2
2  NaN  3.0    c  3
3  4.0  4.0  NaN  4
4  5.0  5.0    e  5


# Numerical Variables


## 1. Mean/Median Imputation


In [4]:
# Mean imputation: replace missing values in 'A' with the mean of the observed values.
mean_imputed_df = df.copy()
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: that is chained assignment, which pandas warns about and
# which does not update the parent frame under copy-on-write.
mean_imputed_df['A'] = mean_imputed_df['A'].fillna(mean_imputed_df['A'].mean())
print("\nMean Imputed Data:")
print(mean_imputed_df)


Mean Imputed Data:
     A    B    C  D
0  1.0  NaN    a  1
1  2.0  2.0    b  2
2  3.0  3.0    c  3
3  4.0  4.0  NaN  4
4  5.0  5.0    e  5


## 2. Arbitrary Value Imputation


In [5]:
# Arbitrary value imputation: replace missing values in 'A' with a fixed sentinel.
arbitrary_imputed_df = df.copy()
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: that is chained assignment, which pandas warns about and
# which does not update the parent frame under copy-on-write.
arbitrary_imputed_df['A'] = arbitrary_imputed_df['A'].fillna(9999)  # Choosing an arbitrary value of 9999
print("\nArbitrary Value Imputed Data:")
print(arbitrary_imputed_df)


Arbitrary Value Imputed Data:
        A    B    C  D
0     1.0  NaN    a  1
1     2.0  2.0    b  2
2  9999.0  3.0    c  3
3     4.0  4.0  NaN  4
4     5.0  5.0    e  5


## 3. End-of-Tail Imputation

In [6]:
# End-of-tail imputation: replace missing values in 'A' with a value at the far
# right of its distribution, here mean + 3 standard deviations.
end_of_tail_imputed_df = df.copy()
end_of_tail_value = end_of_tail_imputed_df['A'].mean() + 3 * end_of_tail_imputed_df['A'].std()
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: that is chained assignment, which pandas warns about and
# which does not update the parent frame under copy-on-write.
end_of_tail_imputed_df['A'] = end_of_tail_imputed_df['A'].fillna(end_of_tail_value)
print("\nEnd of Tail Imputed Data:")
print(end_of_tail_imputed_df)


End of Tail Imputed Data:
          A    B    C  D
0  1.000000  NaN    a  1
1  2.000000  2.0    b  2
2  8.477226  3.0    c  3
3  4.000000  4.0  NaN  4
4  5.000000  5.0    e  5


## 4. Mode Imputation

In [7]:
# Mode imputation: replace missing values in 'B' with its most frequent value.
# mode() returns a Series of all modes; [0] takes the first one.
mode_imputed_df = df.copy()
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: that is chained assignment, which pandas warns about and
# which does not update the parent frame under copy-on-write.
mode_imputed_df['B'] = mode_imputed_df['B'].fillna(mode_imputed_df['B'].mode()[0])
print("\nMode Imputed Data:")
print(mode_imputed_df)


Mode Imputed Data:
     A    B    C  D
0  1.0  2.0    a  1
1  2.0  2.0    b  2
2  NaN  3.0    c  3
3  4.0  4.0  NaN  4
4  5.0  5.0    e  5


# Categorical Variables

## 1. Frequent category Imputation


In [8]:
# Frequent category imputation: replace missing values in the categorical
# column 'C' with its most frequent category (first entry of mode()).
frequent_category_imputed_df = df.copy()
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: that is chained assignment, which pandas warns about and
# which does not update the parent frame under copy-on-write.
frequent_category_imputed_df['C'] = frequent_category_imputed_df['C'].fillna(
    frequent_category_imputed_df['C'].mode()[0]
)
print("\nFrequent Category Imputed Data:")
print(frequent_category_imputed_df)


Frequent Category Imputed Data:
     A    B  C  D
0  1.0  NaN  a  1
1  2.0  2.0  b  2
2  NaN  3.0  c  3
3  4.0  4.0  a  4
4  5.0  5.0  e  5


## 2. Adding a “Missing” category

In [9]:
# "Missing" category: treat missingness in 'C' as its own explicit category label.
missing_category_df = df.copy()
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: that is chained assignment, which pandas warns about and
# which does not update the parent frame under copy-on-write.
missing_category_df['C'] = missing_category_df['C'].fillna("Missing")
print("\nAdded 'Missing' Category Data:")
print(missing_category_df)


Added 'Missing' Category Data:
     A    B        C  D
0  1.0  NaN        a  1
1  2.0  2.0        b  2
2  NaN  3.0        c  3
3  4.0  4.0  Missing  4
4  5.0  5.0        e  5


# Both Numerical and Categorical Variables

## 1. Frequent category Imputation

In [10]:
# Frequent category imputation: replace missing values in 'C' with its most
# frequent value (first entry of mode()).
frequent_category_imputed_df = df.copy()
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: that is chained assignment, which pandas warns about and
# which does not update the parent frame under copy-on-write.
frequent_category_imputed_df['C'] = frequent_category_imputed_df['C'].fillna(
    frequent_category_imputed_df['C'].mode()[0]
)
print("\nFrequent Category Imputed Data:")
print(frequent_category_imputed_df)


Frequent Category Imputed Data:
     A    B  C  D
0  1.0  NaN  a  1
1  2.0  2.0  b  2
2  NaN  3.0  c  3
3  4.0  4.0  a  4
4  5.0  5.0  e  5


## 2. Adding a “Missing” Indicator

In [11]:
# Missing indicator: for each of 'A', 'B' and 'C', append a 0/1 column named
# '<col>_missing' that flags the rows where the original value is missing.
# assign() returns a new frame, so df itself is left untouched.
missing_indicator_df = df.assign(
    **{f"{name}_missing": df[name].isna().astype(int) for name in ("A", "B", "C")}
)
print("\nAdded Missing Indicator Data:", missing_indicator_df, sep="\n")


Added Missing Indicator Data:
     A    B    C  D  A_missing  B_missing  C_missing
0  1.0  NaN    a  1          0          1          0
1  2.0  2.0    b  2          0          0          0
2  NaN  3.0    c  3          1          0          0
3  4.0  4.0  NaN  4          0          0          1
4  5.0  5.0    e  5          0          0          0


## 3. Random Sample Imputation

In [12]:
# Random sample imputation: fill each missing value in 'A' with a value drawn
# at random from the observed (non-missing) values of the same column.
random_sample_df = df.copy()
missing_mask = random_sample_df['A'].isna()
observed_values = random_sample_df['A'].dropna().to_numpy()
# Draw one sample per missing entry (size=...) so that several gaps do not all
# receive the same value, and write through .loc on the frame itself rather
# than fillna(inplace=True) on a column selection, which is chained assignment
# and does not update the parent frame under copy-on-write.
random_sample_df.loc[missing_mask, 'A'] = np.random.choice(
    observed_values, size=missing_mask.sum()
)
print("\nRandom Sample Imputed Data:")
print(random_sample_df)


Random Sample Imputed Data:
     A    B    C  D
0  1.0  NaN    a  1
1  2.0  2.0    b  2
2  1.0  3.0    c  3
3  4.0  4.0  NaN  4
4  5.0  5.0    e  5
