In [6]:
import pandas as pd  # Import Pandas library for data manipulation
import numpy as np   # Import NumPy library for numerical operations

# Define a function to remove columns whose missing-data percentage exceeds a threshold, as well as non-numeric columns
def remove_columns_below_threshold(df, compliance_threshold):
    """Drop columns with too much missing data or with a non-numeric dtype.

    Each column is checked in order:

    1. If its percentage of missing values is strictly greater than
       ``compliance_threshold`` it is removed with the reason
       ``'Exceeds compliance threshold'``.
    2. Otherwise, if its dtype is not numeric (object, string, categorical,
       datetime, ...) it is removed with the reason
       ``'Contains non-numeric values'``. Boolean columns count as numeric.

    Args:
        df: The DataFrame to filter. It is not modified.
        compliance_threshold: Maximum tolerated percentage (0-100) of missing
            values in a column.

    Returns:
        A tuple ``(df_clean, columns_to_remove, missing_data_stats)`` where
        ``df_clean`` is a copy of ``df`` without the removed columns,
        ``columns_to_remove`` lists the removed column labels in column
        order, and ``missing_data_stats`` maps each removed label to a dict
        with the keys ``'missing_count'``, ``'missing_percentage'`` and
        ``'reason'``.
    """
    total_rows = len(df)
    columns_to_remove = []  # Labels of the columns that will be dropped
    missing_data_stats = {}  # Per-removed-column statistics and removal reason

    for column in df.columns:
        series = df[column]

        # Built-in int/float keep the statistics free of NumPy scalar types,
        # so they print and serialise like ordinary Python numbers.
        missing_count = int(series.isna().sum())
        # A frame without rows has nothing missing; guard the division so the
        # percentage is 0.0 instead of NaN.
        missing_percentage = (missing_count / total_rows) * 100 if total_rows else 0.0

        if missing_percentage > compliance_threshold:
            reason = 'Exceeds compliance threshold'
        elif not pd.api.types.is_numeric_dtype(series.dtype):
            # Checking "not numeric" rather than "== object" also catches the
            # dedicated pandas string dtype and other non-numeric dtypes.
            reason = 'Contains non-numeric values'
        else:
            continue  # Column is compliant and numeric: keep it

        columns_to_remove.append(column)
        missing_data_stats[column] = {
            'missing_count': missing_count,
            'missing_percentage': missing_percentage,
            'reason': reason,
        }

    # Build the cleaned frame without touching the caller's DataFrame
    df_clean = df.drop(columns=columns_to_remove)

    return df_clean, columns_to_remove, missing_data_stats

# Usage: build a small sample frame and run it through the filter
data = dict(
    A=[1, 2, None, 4, 5],
    B=[None, None, None, 4, 5],
    C=[1, 2, 3, 4, 5],
    D=['a', 1, 2, 4, 3],
)
df = pd.DataFrame(data)  # Sample DataFrame built from the dictionary above

# Threshold for missing values, expressed as a percentage
compliance_threshold = 20

# Run the filter; it returns the cleaned frame, the dropped labels and the per-column statistics
df_clean, removed_columns, missing_data = remove_columns_below_threshold(
    df, compliance_threshold
)

# Report each result under its own heading (heading and value on separate lines)
print("Clean DataFrame:", df_clean, sep="\n")
print("\nRemoved columns:", removed_columns, sep="\n")
print("\nMissing data statistics:", missing_data, sep="\n")



Clean DataFrame:
     A  C
0  1.0  1
1  2.0  2
2  NaN  3
3  4.0  4
4  5.0  5

Removed columns:
['B', 'D']

Missing data statistics:
{'B': {'missing_count': 3, 'missing_percentage': 60.0, 'reason': 'Exceeds compliance threshold'}, 'D': {'missing_count': 0, 'missing_percentage': 0.0, 'reason': 'Contains non-numeric values'}}
