Data cleaning: remove empty strings/values and replace them with None/NaN.

In [12]:
import pandas as pd
import numpy as np

In [13]:
def replace_empty_with_none(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new DataFrame with blank strings replaced by NaN.

    A cell counts as blank when it is a string that is empty or made up
    solely of whitespace (it matches the regex ``^\\s*$``). All other cells
    are carried over unchanged, and the input DataFrame is not modified.

    Args:
        df: The DataFrame to clean.

    Returns:
        A new DataFrame with the same shape as ``df`` in which every blank
        string cell holds ``np.nan``.
    """
    # DataFrame.replace returns a new object when inplace is False (the
    # default), so copying df beforehand would only duplicate the data.
    return df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

def test_replace_empty_with_none():
    """Smoke-test replace_empty_with_none on a single-column DataFrame.

    The fixture column holds an empty string, a whitespace-only string,
    ordinary text and None. The test checks that the result is a DataFrame
    of unchanged shape, that the two blank strings became missing values,
    that the text survived, and that three cells in total are missing.
    Prints a success message once every assertion has passed.
    """
    # arrange: one column covering each kind of cell we care about
    source = pd.DataFrame({"A": ["", " ", "Hello", None]})

    # act
    result = replace_empty_with_none(source)

    # assert: container type and dimensions are preserved
    assert isinstance(result, pd.DataFrame), "Output should be a pandas DataFrame"
    assert result.shape == source.shape, "Shape of DataFrame should not change"

    # pd.isna is true for NaN, None and NaT alike
    first_cell = result.loc[0, "A"]
    second_cell = result.loc[1, "A"]
    assert pd.isna(first_cell), "Row 0, Col A should be NaN"
    assert pd.isna(second_cell), "Row 1, Col A should be NaN"

    # real text must pass through untouched
    text_cell = result.loc[2, "A"]
    assert text_cell == "Hello", "Non-empty text should remain the same"

    # two replaced blanks plus the None that was already missing
    missing_total = result.isna().sum().sum()
    assert missing_total == 3, f"Expected 3 NaN values, got {missing_total}"

    print("✅ All assertions passed — function works correctly!")

# Run the check right away; an AssertionError is raised if any expectation
# fails, otherwise the success message is printed.
test_replace_empty_with_none()


✅ All assertions passed — function works correctly!
