In [1]:
import numpy as np
import pandas as pd
from scipy.stats import bernoulli

## Missing Data Management - Probability Question
Construct a database with $100,000$ rows where each record has $100$ fields. Assume further that for each record, each of the $100$ fields has a $1$ per cent chance of being empty, i.e., its value is missing.

Remove all records with **two** or more empty fields and report the fraction of records that are removed.

In [2]:
# Set seed for reproducibility
np.random.seed(42)

In [3]:
# define the size of the database
num_rows = 100000
num_fields = 100

In [4]:
# Generate random data with a 1% chance of being empty
data = np.random.choice([np.random.rand() if np.random.rand() > 0.01 else np.nan for _ in range(num_rows * num_fields)],
                        size=(num_rows, num_fields))

In [5]:
# Create a DataFrame
df = pd.DataFrame(data, columns=[f"Field_{i}" for i in range(num_fields)])

In [6]:
# Count the number of missing values in each row
missing_values_per_row = df.isnull().sum(axis=1)

In [7]:
# Remove records with two or more empty fields
df_filtered = df[missing_values_per_row < 2]

In [8]:
# Calculate the fraction of records removed
fraction_removed = 1 - len(df_filtered) / len(df)

In [9]:
print(f"Fraction of records removed: {fraction_removed:.2%}")

Fraction of records removed: 26.57%
