### Detect Data Drift in ML Models
**Objective**: Monitor and detect changes in data distributions that impact ML model performance.

**Task**: Categorical Feature Drift

**Steps**:
1. Load the baseline distribution for a categorical feature (e.g., `gender`) from your training dataset.
2. Load the same feature from your current production data.
3. Use chi-squared tests to compare the distributions of the categorical feature.
4. If significant drift is detected, investigate the cause and update the model as needed.

In [3]:
import pandas as pd
from scipy.stats import chi2_contingency
import pytest

def get_feature_distribution(df: pd.DataFrame, feature: str) -> pd.Series:
    """Return the absolute category counts of ``feature``, ordered by category.

    Args:
        df: Frame holding the feature column.
        feature: Name of the categorical column to summarise.

    Returns:
        Raw (non-normalised) counts per category, sorted by index.

    Raises:
        ValueError: If ``df`` is ``None`` or empty, if ``feature`` is not a
            column of ``df``, or if the column holds nothing but nulls.
    """
    frame_unusable = df is None or df.empty
    if frame_unusable:
        raise ValueError("Input DataFrame is empty or None.")

    if feature not in df.columns:
        raise ValueError(f"Feature '{feature}' not found in DataFrame.")

    column = df[feature]
    if column.isna().all():
        raise ValueError(f"Feature '{feature}' contains only null values.")

    counts = column.value_counts(normalize=False)
    return counts.sort_index()

def align_distributions(dist1: pd.Series, dist2: pd.Series) -> pd.DataFrame:
    """Put two count series on one shared category index.

    Categories present in only one series get a count of 0 in the other.

    Args:
        dist1: Category counts stored under the ``baseline`` column.
        dist2: Category counts stored under the ``production`` column.

    Returns:
        A frame indexed by the union of both category sets, with the columns
        ``baseline`` and ``production``.
    """
    categories = dist1.index.union(dist2.index)
    labelled = (('baseline', dist1), ('production', dist2))
    return pd.DataFrame({
        label: counts.reindex(categories, fill_value=0)
        for label, counts in labelled
    })

def detect_categorical_drift(baseline_df: pd.DataFrame, production_df: pd.DataFrame,
                             feature: str, alpha: float = 0.05) -> bool:
    """Run a chi-squared test on one categorical feature of two datasets.

    Prints the test statistic, the p-value and a one-line verdict.

    Args:
        baseline_df: Reference data.
        production_df: Data to compare against the reference.
        feature: Name of the categorical column to test.
        alpha: Significance level; a p-value strictly below it counts as drift.

    Returns:
        ``True`` when the p-value is below ``alpha``, otherwise ``False``.

    Raises:
        ValueError: Propagated from ``get_feature_distribution`` when either
            frame fails its input checks.
    """
    counts = align_distributions(
        get_feature_distribution(baseline_df, feature),
        get_feature_distribution(production_df, feature),
    )
    # Rows of the transposed table are the two datasets, columns the categories.
    statistic, p_val, _dof, _expected = chi2_contingency(counts.T)

    print(f"Chi-squared test statistic: {statistic:.4f}, p-value: {p_val:.4f}")

    if not (p_val < alpha):
        print(f"No significant drift detected for feature '{feature}' (p >= {alpha}).")
        return False

    print(f"Significant drift detected for feature '{feature}' (p < {alpha}).")
    return True

# -------------------------
# Unit tests
# -------------------------
def test_normal_case():
    """Samples with identical category counts must not be reported as drift."""
    baseline = pd.DataFrame({'gender': ['M', 'F', 'F', 'M', 'F']})
    production = pd.DataFrame({'gender': ['M', 'M', 'F', 'F', 'F']})
    # Both samples hold 2 'M' and 3 'F'; asserting the concrete outcome (rather
    # than "True or False") lets the test fail on a wrong verdict.
    assert detect_categorical_drift(baseline, production, 'gender') is False

def test_missing_feature():
    """A feature absent from the production frame must raise ``ValueError``."""
    baseline = pd.DataFrame({'gender': ['M', 'F']})
    production = pd.DataFrame({'sex': ['M', 'F']})
    try:
        detect_categorical_drift(baseline, production, 'gender')
    except ValueError as e:
        assert "not found" in str(e)
    else:
        # Without this branch the test would pass even if nothing was raised.
        raise AssertionError("Expected ValueError for a missing feature.")

def test_empty_dataframe():
    """An empty baseline frame must raise ``ValueError``."""
    baseline = pd.DataFrame({'gender': []})
    production = pd.DataFrame({'gender': ['M', 'F']})
    try:
        detect_categorical_drift(baseline, production, 'gender')
    except ValueError as e:
        assert "empty or None" in str(e)
    else:
        # Without this branch the test would pass even if nothing was raised.
        raise AssertionError("Expected ValueError for an empty DataFrame.")

def test_all_null_values():
    """A baseline feature holding only nulls must raise ``ValueError``."""
    baseline = pd.DataFrame({'gender': [None, None, None]})
    production = pd.DataFrame({'gender': ['M', 'F']})
    try:
        detect_categorical_drift(baseline, production, 'gender')
    except ValueError as e:
        assert "only null values" in str(e)
    else:
        # Without this branch the test would pass even if nothing was raised.
        raise AssertionError("Expected ValueError for an all-null feature.")

if __name__ == "__main__":
    # Demo run on two small hand-built samples of the 'gender' feature, 8 rows
    # each: the baseline has 4 Male / 4 Female, production has 5 Male / 3 Female.
    baseline_df = pd.DataFrame({'gender': ['Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Male', 'Female']})
    production_df = pd.DataFrame({'gender': ['Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female', 'Female']})

    # detect_categorical_drift prints the statistic, p-value and verdict itself;
    # the boolean result is echoed afterwards.
    drift = detect_categorical_drift(baseline_df, production_df, 'gender')
    print("Drift detected:", drift)

    # Run the test_* functions above by pointing pytest at this file.
    # NOTE(review): relies on `__file__` being defined, which may not hold in an
    # interactive/notebook session -- confirm how this cell is executed.
    print("\nRunning unit tests...")
    pytest.main([__file__])

ModuleNotFoundError: No module named 'pytest'

In [2]:
# write your code from here

import pandas as pd
from scipy.stats import chi2_contingency

def get_feature_distribution(df: pd.DataFrame, feature: str) -> pd.Series:
    """
    Returns the value counts (distribution) of a categorical feature.

    The counts are absolute (not normalised) and sorted by category label.

    Raises:
        ValueError: If ``df`` is ``None`` or empty, if ``feature`` is not a
            column of ``df``, or if the column contains only null values.
    """
    # Fail early with a clear message: ``None`` would otherwise surface as an
    # AttributeError, and empty or all-null input would yield an empty
    # distribution for the caller to run a test on.
    if df is None or df.empty:
        raise ValueError("Input DataFrame is empty or None.")
    if feature not in df.columns:
        raise ValueError(f"Feature '{feature}' not found in DataFrame.")
    if df[feature].isnull().all():
        raise ValueError(f"Feature '{feature}' contains only null values.")
    return df[feature].value_counts(normalize=False).sort_index()

def align_distributions(dist1: pd.Series, dist2: pd.Series) -> pd.DataFrame:
    """
    Builds a two-column count table over the union of both category sets.

    A category missing from one distribution is given a count of zero there.
    The columns are named 'baseline' (from dist1) and 'production' (from dist2).
    """
    union_index = dist1.index.union(dist2.index)

    def _on_union(counts: pd.Series) -> pd.Series:
        # Expand to the shared index, zero-filling categories never observed.
        return counts.reindex(union_index, fill_value=0)

    return pd.DataFrame({'baseline': _on_union(dist1), 'production': _on_union(dist2)})

def detect_categorical_drift(baseline_df: pd.DataFrame, production_df: pd.DataFrame,
                             feature: str, alpha: float = 0.05) -> bool:
    """
    Compares a categorical feature between two datasets with a chi-squared test.

    Prints the test statistic, the p-value and a verdict line.
    Returns True if the p-value is below alpha (drift), False otherwise.
    """
    reference_counts = get_feature_distribution(baseline_df, feature)
    current_counts = get_feature_distribution(production_df, feature)
    table = align_distributions(reference_counts, current_counts)

    # Transpose so that each dataset is a row and each category a column.
    chi2, p_value, _, _ = chi2_contingency(table.T)
    print(f"Chi-squared test statistic: {chi2:.4f}, p-value: {p_value:.4f}")

    if p_value < alpha:
        drifted = True
        verdict = f"Significant drift detected for feature '{feature}' (p < {alpha})."
    else:
        drifted = False
        verdict = f"No significant drift detected for feature '{feature}' (p >= {alpha})."

    print(verdict)
    return drifted

# Example usage
def main():
    """Runs the drift check on two small example samples and prints the result."""
    # Baseline (training) sample: 4 Male / 4 Female.
    training_sample = pd.DataFrame({
        'gender': ['Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Male', 'Female']
    })

    # Production sample with a slightly shifted mix: 5 Male / 3 Female.
    live_sample = pd.DataFrame({
        'gender': ['Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female', 'Female']
    })

    has_drift = detect_categorical_drift(training_sample, live_sample, 'gender')
    print("Drift detected:", has_drift)


if __name__ == "__main__":
    main()

Chi-squared test statistic: 0.0000, p-value: 1.0000
No significant drift detected for feature 'gender' (p >= 0.05).
Drift detected: False
