# How to Handle Missing Data

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(3)

In [None]:
# Read in example dataset (fabricated for this demo)
df = 
df.head()

In [None]:
# Explore the data 


In [None]:
# Function for permutation test to determine if difference in means is significant
def permutation_test(data, group_col, value_col, n_permutations=5000):
    """Two-sided permutation test for a difference in group means.

    The test statistic is
    ``data.groupby(group_col)[value_col].mean().diff().iloc[-1]``, i.e. the
    mean of the last group minus the mean of the group before it, with
    groups in the order ``groupby`` returns them. With exactly two groups
    this is "second group mean minus first group mean".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``group_col`` and ``value_col``. It is not
        modified. Its index may be arbitrary (for example, gapped after a
        ``dropna``).
    group_col : label
        Column that defines the groups.
    value_col : label
        Column whose group means are compared.
    n_permutations : int, default 5000
        Number of random shuffles used to build the null distribution.

    Returns
    -------
    observed_diff : float
        The statistic computed on the unshuffled data.
    p_val : float
        Fraction of permuted statistics whose absolute value is at least
        the absolute observed statistic.
    diffs : list
        The ``n_permutations`` permuted statistics.

    Raises
    ------
    ValueError
        If ``n_permutations`` is smaller than 1.

    Notes
    -----
    Shuffling uses ``Series.sample`` without an explicit ``random_state``,
    so it draws from NumPy's global random state; seed it with
    ``np.random.seed`` for reproducible results.
    """
    if n_permutations < 1:
        raise ValueError("n_permutations must be at least 1")

    observed_diff = data.groupby(group_col)[value_col].mean().diff().iloc[-1]

    values = data[value_col]
    # One working copy is enough: only value_col is overwritten each round.
    shuffled_df = data.copy()
    diffs = []
    for _ in range(n_permutations):
        # Assign a plain array so the shuffled values are placed by position.
        # Assigning a Series would align on index labels and inject NaN
        # wherever `data` has a non-default index (e.g. after dropna).
        shuffled_df[value_col] = values.sample(frac=1, replace=False).to_numpy()
        diff = shuffled_df.groupby(group_col)[value_col].mean().diff().iloc[-1]
        diffs.append(diff)

    p_val = np.mean(np.abs(diffs) >= np.abs(observed_diff))
    return observed_diff, p_val, diffs

In [None]:
# Do the permutation test


In [None]:
# Visualize results


What if the dataset had missing values?

In [None]:
# Randomly remove 15% of salaries (missing completely at random, MCAR)
n = 
data_random_missing = 
mask = 
data_random_missing...
data_random_missing.head(10)

In [None]:
# Drop the rows with missing salary 
data_random_missing_orig = 
data_random_missing...
data_random_missing.shape

In [None]:
# Do the permutation test
obs_diff_random, p_val_random, diffs_random = permutation_test(data_random_missing, "gender", "salary")

In [None]:
# Visualize results
sns.histplot(diffs_random)
plt.scatter(x=obs_diff_random, y = 5, marker='o',color='r');
p_val_random

What if the missingness is not completely random? Suppose it depends on other observed variables, such as gender and age (missing at random, MAR).

In [None]:
# Generate missing data based on gender/age 
# (young females less likely to have data)
missing_based_on_gender_age = df.copy()
mask = 
missing_based_on_gender_age.loc[mask, 'salary'] = np.nan

In [None]:
# Drop the rows with missing salary 
missing_based_on_gender_age_orig = missing_based_on_gender_age.copy()
missing_based_on_gender_age.dropna(subset='salary',inplace=True)
missing_based_on_gender_age.shape

In [None]:
# Do the permutation test
obs_diff_missing, p_val_missing, diffs_missing = permutation_test(missing_based_on_gender_age, "gender", "salary")

In [None]:
# Visualize results
sns.histplot(diffs_missing)
plt.scatter(x=obs_diff_missing, y = 5, marker='o',color='r');
p_val_missing

What if the missingness depends on the missing value itself (missing not at random, MNAR)?

In [None]:
# Generate missing data based on the salary itself: lower salaries are more likely to be missing
missing_based_on_salary = df.copy()
mask = 
missing_based_on_salary.loc[mask, 'salary'] = np.nan

In [None]:
# Drop the rows with missing salary 
missing_based_on_salary_orig = missing_based_on_salary.copy()
missing_based_on_salary.dropna(subset='salary',inplace=True)
missing_based_on_salary.shape

In [None]:
# Do the permutation test
obs_diff_missing2, p_val_missing2, diffs_missing2 = permutation_test(missing_based_on_salary, "gender", "salary")

In [None]:
# Visualize results
sns.histplot(diffs_missing2)
plt.scatter(x=obs_diff_missing2, y = 5, marker='o',color='r');
p_val_missing2

### Checking for patterns in missingness

In [None]:
# Create a "missing salary" column
data_random_missing_orig['missing_salary'] = 
missing_based_on_gender_age_orig['missing_salary'] = 
missing_based_on_salary_orig['missing_salary'] = 

In [None]:
# check relationship between missingness and gender, MCAR


In [None]:
# check relationship between missingness and age, MCAR


In [None]:
# check relationship between missingness and gender, MAR


In [None]:
# check relationship between missingness and age, MAR


In [None]:
# check relationship between missingness and gender, MNAR


In [None]:
# check relationship between missingness and age, MNAR


### Imputing the data

In [None]:
# Fill with the mean
random_filled = 

In [None]:
# Check analysis
obs_diff, p_val, diffs = permutation_test(random_filled, "gender", "salary")

sns.histplot(diffs)
plt.scatter(x=obs_diff, y = 5, marker='o',color='r');
p_val

In [None]:
# Fill with the median
random_filled = 

In [None]:
# Check analysis
obs_diff, p_val, diffs = permutation_test(random_filled, "gender", "salary")
p_val

In [None]:
# Fill MAR with the mean
MAR_filled = 

In [None]:
# Check analysis
obs_diff, p_val, diffs = permutation_test(MAR_filled, "gender", "salary")
p_val

In [None]:
# Fill MAR with the median
MAR_filled = 

In [None]:
# Check analysis
obs_diff, p_val, diffs = permutation_test(MAR_filled, "gender", "salary")
p_val

In [None]:
# Fill MNAR with the mean
MNAR_filled = 

In [None]:
# Check analysis
obs_diff, p_val, diffs = permutation_test(MNAR_filled, "gender", "salary")
p_val

In [None]:
# Fill MNAR with the median
MNAR_filled = 

In [None]:
# Check analysis
obs_diff, p_val, diffs = permutation_test(MNAR_filled, "gender", "salary")
p_val