In [1]:
import pandas as pd

def remove_columns_below_threshold(df: pd.DataFrame, compliance_threshold: float) -> tuple:
    """Drop every column whose share of missing values exceeds a threshold.

    Note on naming: a column is removed when its missing-value percentage is
    strictly greater than ``compliance_threshold``; a column sitting exactly
    on the threshold is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    compliance_threshold : float
        Maximum tolerated percentage of missing values per column, on a
        0-100 scale (e.g. ``20`` means 20%).

    Returns
    -------
    tuple
        ``(df_clean, columns_to_remove, missing_data_stats)`` where

        * ``df_clean`` is ``df`` without the removed columns,
        * ``columns_to_remove`` is the list of removed column labels, in
          column order,
        * ``missing_data_stats`` maps each removed label to a dict with the
          keys ``'missing_count'`` (int) and ``'missing_percentage'`` (float).
    """
    total_rows = len(df)
    columns_to_remove = []
    missing_data_stats = {}

    # A single pass over the frame yields the per-column count of missing cells.
    for column, raw_count in df.isna().sum().items():
        # Plain Python numbers keep the stats dict readable when printed,
        # independent of how the installed numpy formats its scalars.
        missing_count = int(raw_count)

        if total_rows:
            # Multiply before dividing: (count / rows) * 100 rounds twice and
            # can land just above an exact boundary (7 of 100 rows becomes
            # 7.000000000000001), which would wrongly drop the column.
            missing_percentage = missing_count * 100 / total_rows
        else:
            # A frame without rows has nothing missing; avoids a 0/0 division.
            missing_percentage = 0.0

        if missing_percentage > compliance_threshold:
            columns_to_remove.append(column)
            missing_data_stats[column] = {
                'missing_count': missing_count,
                'missing_percentage': missing_percentage,
            }

    df_clean = df.drop(columns=columns_to_remove)

    return df_clean, columns_to_remove, missing_data_stats

# Usage: three columns with one, three and zero missing cells respectively
compliance_threshold = 20  # percentage

data = dict(
    A=[1, 2, None, 4, 5],
    B=[None, None, None, 4, 5],
    C=[1, 2, 3, 4, 5],
)
df = pd.DataFrame(data)

df_clean, removed_columns, missing_data = remove_columns_below_threshold(
    df, compliance_threshold
)

# Cleaned frame, then the removed labels, then their missing-data stats
print(df_clean, removed_columns, missing_data, sep='\n')

     A  C
0  1.0  1
1  2.0  2
2  NaN  3
3  4.0  4
4  5.0  5
['B']
{'B': {'missing_count': 3, 'missing_percentage': 60.0}}


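This example demonstrates how to use the `remove_columns_below_threshold` function with a pandas DataFrame and a compliance threshold expressed as a percentage. A column is removed when its percentage of missing values is strictly greater than the threshold, so with a threshold of 20 column `A` (exactly 20% missing) is kept while column `B` (60% missing) is dropped. The function returns the cleaned DataFrame, a list of the removed columns, and a dictionary containing the missing-data statistics for each removed column.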

In [None]:
from missing_data_cleaner import remove_columns_below_threshold

# Maximum tolerated share of missing values per column (percentage)
compliance_threshold = 20

# Input frame built from a column -> values mapping
data = {
    'A': [1, 2, None, 4, 5],
    'B': [None, None, None, 4, 5],
    'C': [1, 2, 3, 4, 5],
}
df = pd.DataFrame(data)

# The call hands back three results: the cleaned frame, the removed column
# labels, and the missing-data statistics for those columns
results = remove_columns_below_threshold(df, compliance_threshold)
df_clean, removed_columns, missing_data = results
del results

print(df_clean)
print(removed_columns)
print(missing_data)