# Handling Missing Values
* Missing values can cause problems with many machine learning algorithms. Dropping rows containing missing values is rarely a good idea so figuring out the best way to handle missing data is very important. This notebook will only go over simple techniques including Mean/Median/Mode, KNN, and Group-wise imputation
- Mean/Median/Mode Imputation: Replace missing values with the mean/median/mode of the column
- KNN Imputation: Replace missing values using KNN for a given data point
- Group-wise imputation: Replace missing values with the mean/median/mode within a group/category

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

In [2]:
# Generate data
original_data = pd.DataFrame({
    'Age': [25, np.nan, 32, 74, 33, np.nan, 20, 44, 52, np.nan, 47, 68, 55, 37, np.nan],
    'Income': np.random.choice([np.nan, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000], 15),
    'Education': np.random.choice([None, 'High School', 'Bachelors', 'Masters', 'PhD'], 15),
    'Score': np.random.choice([None, 80, 90, 95], 15)
})
imputed_data = original_data.copy()

In [3]:
print("Original Data")
original_data

Original Data


Unnamed: 0,Age,Income,Education,Score
0,25.0,30000.0,Bachelors,90.0
1,,80000.0,Masters,90.0
2,32.0,70000.0,PhD,95.0
3,74.0,80000.0,PhD,
4,33.0,80000.0,PhD,80.0
5,,,Bachelors,95.0
6,20.0,80000.0,,80.0
7,44.0,90000.0,,95.0
8,52.0,90000.0,,90.0
9,,100000.0,Masters,95.0


### Mean Imputation

In [4]:
# Replace the missing values of the Age column with the mean
mean_age = int(imputed_data['Age'].mean())
imputed_data['Age'].fillna(mean_age, inplace = True)
print(mean_age)

44


In [5]:
imputed_data

Unnamed: 0,Age,Income,Education,Score
0,25.0,30000.0,Bachelors,90.0
1,44.0,80000.0,Masters,90.0
2,32.0,70000.0,PhD,95.0
3,74.0,80000.0,PhD,
4,33.0,80000.0,PhD,80.0
5,44.0,,Bachelors,95.0
6,20.0,80000.0,,80.0
7,44.0,90000.0,,95.0
8,52.0,90000.0,,90.0
9,44.0,100000.0,Masters,95.0


### Median Imputation

In [6]:
# Replace the missing valuees of the Income column with the median
median_income = int(imputed_data['Income'].median())
imputed_data['Income'].fillna(median_income, inplace = True)
print(median_income)

80000


In [7]:
imputed_data

Unnamed: 0,Age,Income,Education,Score
0,25.0,30000.0,Bachelors,90.0
1,44.0,80000.0,Masters,90.0
2,32.0,70000.0,PhD,95.0
3,74.0,80000.0,PhD,
4,33.0,80000.0,PhD,80.0
5,44.0,80000.0,Bachelors,95.0
6,20.0,80000.0,,80.0
7,44.0,90000.0,,95.0
8,52.0,90000.0,,90.0
9,44.0,100000.0,Masters,95.0


### Mode Imputation

In [8]:
# Replace the missing values of the Education column with the mode
mode_education = imputed_data['Education'].mode()[0]
imputed_data['Education'].fillna(mode_education, inplace = True)
print(mode_education)

PhD


In [9]:
imputed_data

Unnamed: 0,Age,Income,Education,Score
0,25.0,30000.0,Bachelors,90.0
1,44.0,80000.0,Masters,90.0
2,32.0,70000.0,PhD,95.0
3,74.0,80000.0,PhD,
4,33.0,80000.0,PhD,80.0
5,44.0,80000.0,Bachelors,95.0
6,20.0,80000.0,PhD,80.0
7,44.0,90000.0,PhD,95.0
8,52.0,90000.0,PhD,90.0
9,44.0,100000.0,Masters,95.0


### KNN Imputation

In [10]:
# Replace the missing values of the Score column using KNN based off of Age and Income
knn_imputer = KNNImputer(n_neighbors = 2)
data_imputed_knn = knn_imputer.fit_transform(imputed_data[['Age', 'Income', 'Score']])
imputed_data[['Age', 'Income', 'Score']] = data_imputed_knn

In [11]:
imputed_data

Unnamed: 0,Age,Income,Education,Score
0,25.0,30000.0,Bachelors,90.0
1,44.0,80000.0,Masters,90.0
2,32.0,70000.0,PhD,95.0
3,74.0,80000.0,PhD,92.5
4,33.0,80000.0,PhD,80.0
5,44.0,80000.0,Bachelors,95.0
6,20.0,80000.0,PhD,80.0
7,44.0,90000.0,PhD,95.0
8,52.0,90000.0,PhD,90.0
9,44.0,100000.0,Masters,95.0


### Group-wise Imputation

In [12]:
# For this example, we'll replace the Education in the original df with the imputed data
original_data['Education'] = imputed_data['Education']

In [13]:
# Group by the Education column and calculate the mean of the score column for each education category
group_means = imputed_data.groupby('Education')['Score'].transform('mean')
# Replace the missing values with the group means
original_data['Score'].fillna(group_means, inplace = True)

In [14]:
original_data

Unnamed: 0,Age,Income,Education,Score
0,25.0,30000.0,Bachelors,90.0
1,,80000.0,Masters,90.0
2,32.0,70000.0,PhD,95.0
3,74.0,80000.0,PhD,89.444444
4,33.0,80000.0,PhD,80.0
5,,,Bachelors,95.0
6,20.0,80000.0,PhD,80.0
7,44.0,90000.0,PhD,95.0
8,52.0,90000.0,PhD,90.0
9,,100000.0,Masters,95.0
