### Using SHAP for Feature Drift Analysis
**Description**: Use SHapley Additive exPlanations (SHAP) values to track how feature
importance changes over time; a significant change in importance indicates feature drift.

In [1]:
# write your code from here
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

def validate_dataframes(df_base, df_new):
    """Check that two feature DataFrames can be compared for drift.

    Args:
        df_base: Baseline feature DataFrame.
        df_new: New feature DataFrame to compare against the baseline.

    Raises:
        TypeError: If either input is not a pandas DataFrame.
        ValueError: If the column counts differ, the column names differ,
            either frame contains missing values, or a same-named column
            has a different dtype in the two frames.
    """
    if not isinstance(df_base, pd.DataFrame) or not isinstance(df_new, pd.DataFrame):
        raise TypeError("Inputs must be pandas DataFrames.")
    if df_base.shape[1] != df_new.shape[1]:
        raise ValueError("Both dataframes must have the same number of features.")
    # An equal column count is not enough: differently named features would
    # otherwise be compared as if they were the same feature.
    if set(df_base.columns) != set(df_new.columns):
        raise ValueError("Feature name mismatch between datasets.")
    if df_base.isnull().any().any() or df_new.isnull().any().any():
        raise ValueError("Missing values found. Please clean the data before running.")
    # Compare dtypes by column name so a mere reordering of columns is not
    # misreported as a dtype mismatch.
    new_dtypes = dict(zip(df_new.columns, df_new.dtypes))
    for name, dt_base in zip(df_base.columns, df_base.dtypes):
        if dt_base != new_dtypes[name]:
            raise ValueError("Feature dtype mismatch between datasets.")

def _mean_abs_shap_per_feature(shap_values):
    """Collapse raw SHAP output into one mean-|SHAP| value per feature."""
    if isinstance(shap_values, list):
        if len(shap_values) == 2:
            # Legacy binary layout [class0, class1]: explain the positive class.
            values = np.asarray(shap_values[1])
        else:
            # Legacy multi-output layout: one (n_samples, n_features) array
            # per output; stack the outputs on a trailing axis.
            values = np.stack([np.asarray(v) for v in shap_values], axis=-1)
    else:
        values = np.asarray(shap_values)

    abs_values = np.abs(values)
    if abs_values.ndim == 3:
        # Array layout (n_samples, n_features, n_outputs), which newer SHAP
        # releases appear to use for classifiers -- see SHAP release notes.
        if abs_values.shape[-1] == 2:
            abs_values = abs_values[..., 1]
        else:
            # No single "positive" class: average importance over outputs.
            abs_values = abs_values.mean(axis=-1)
    return abs_values.mean(axis=0)


def train_model_and_get_shap(df, target_col, model=None):
    """Fit a tree-based classifier on ``df`` and summarise its SHAP values.

    Args:
        df: DataFrame holding the feature columns plus ``target_col``.
        target_col: Name of the label column in ``df``.
        model: Optional estimator to use. It is always (re)fitted in place
            on ``df``. When omitted, a ``RandomForestClassifier`` with 100
            trees and ``random_state=42`` is created.

    Returns:
        tuple: ``(model, feature_importance)`` where ``feature_importance``
        is a pandas Series of mean absolute SHAP values indexed by feature
        name. For binary problems the positive class (index 1) is explained;
        for other multi-output shapes the values are averaged over outputs.
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]

    if model is None:
        model = RandomForestClassifier(n_estimators=100, random_state=42)

    model.fit(X, y)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)

    # The raw output may be a list of per-class arrays, a 2-D array, or a
    # 3-D array depending on the SHAP version and the number of classes;
    # reduce all of them to a 1-D vector so the Series below lines up with
    # X.columns.
    mean_abs_shap = _mean_abs_shap_per_feature(shap_values)
    feature_importance = pd.Series(mean_abs_shap, index=X.columns)
    return model, feature_importance

def analyze_feature_drift(df_base, df_new, target_col):
    """Compare per-feature SHAP importance between a baseline and a new dataset.

    The feature columns of both frames are validated first. SHAP importance
    is then computed for ``df_base`` and for ``df_new``; the estimator
    returned by the first call is handed to the second call, which fits it
    again on ``df_new``.

    Args:
        df_base: Baseline DataFrame including ``target_col``.
        df_new: New DataFrame including ``target_col``.
        target_col: Name of the label column present in both frames.

    Returns:
        DataFrame indexed by feature with columns ``baseline_importance``,
        ``new_importance`` and ``importance_diff`` (absolute difference),
        sorted by ``importance_diff`` in descending order.
    """
    features_base = df_base.drop(columns=[target_col])
    features_new = df_new.drop(columns=[target_col])
    validate_dataframes(features_base, features_new)

    fitted, importance_before = train_model_and_get_shap(df_base, target_col)
    importance_after = train_model_and_get_shap(df_new, target_col, model=fitted)[1]

    # Magnitude of the change in importance, regardless of direction.
    shift = (importance_after - importance_before).abs()

    summary = pd.DataFrame({
        'baseline_importance': importance_before,
        'new_importance': importance_after,
        'importance_diff': shift,
    })
    return summary.sort_values(by='importance_diff', ascending=False)

# ------------------ Unit Tests ------------------

def test_validate_dataframes():
    """Exercise the accept path and two reject paths of validate_dataframes."""
    reference = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    compatible = pd.DataFrame({'a': [1, 2, 3], 'b': [7, 8, 9]})

    # Matching frames must pass without raising.
    validate_dataframes(reference, compatible)

    # A plain list in place of a DataFrame must be rejected with TypeError.
    saw_type_error = False
    try:
        validate_dataframes(reference, [1, 2, 3])
    except TypeError:
        saw_type_error = True
    if not saw_type_error:
        raise AssertionError("TypeError expected for non-DataFrame input")

    # A frame with an extra column must be rejected with ValueError.
    extra_column = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
    saw_value_error = False
    try:
        validate_dataframes(reference, extra_column)
    except ValueError:
        saw_value_error = True
    if not saw_value_error:
        raise AssertionError("ValueError expected for shape mismatch")

def test_analyze_feature_drift():
    """Smoke-test analyze_feature_drift on synthetic data with a shifted feature."""
    np.random.seed(42)
    n_rows = 100

    # Draw order (feat1, feat2, target) is kept fixed so the seeded data is
    # reproducible.
    feat1 = np.random.normal(0, 1, n_rows)
    feat2 = np.random.normal(5, 2, n_rows)
    labels = np.random.choice([0, 1], n_rows)
    base_df = pd.DataFrame({'feat1': feat1, 'feat2': feat2, 'target': labels})

    # Same rows, but feat1 is redrawn around a mean of 1 instead of 0.
    new_df = base_df.copy()
    new_df['feat1'] = np.random.normal(1, 1, n_rows)

    drift_results = analyze_feature_drift(base_df, new_df, 'target')

    # One output row per feature column (every column except the target).
    expected_rows = base_df.shape[1] - 1
    assert 'importance_diff' in drift_results.columns, "Output missing importance_diff"
    assert drift_results.shape[0] == expected_rows, "Output rows mismatch"

if __name__ == "__main__":
    # Run the self-checks before the demonstration.
    for check in (test_validate_dataframes, test_analyze_feature_drift):
        check()
    print("All tests passed!")

    # Demonstration on synthetic data: 500 rows, two features, binary target.
    np.random.seed(0)
    baseline_data = pd.DataFrame(
        {
            'feat1': np.random.normal(loc=0, scale=1, size=500),
            'feat2': np.random.normal(loc=5, scale=2, size=500),
            'target': np.random.choice([0, 1], size=500),
        }
    )

    # feat1 is drawn around 0.8 instead of 0 to simulate drift; feat2 keeps
    # the baseline distribution.
    new_data = pd.DataFrame(
        {
            'feat1': np.random.normal(loc=0.8, scale=1, size=500),
            'feat2': np.random.normal(loc=5, scale=2, size=500),
            'target': np.random.choice([0, 1], size=500),
        }
    )

    drift_df = analyze_feature_drift(baseline_data, new_data, target_col='target')
    print("\nFeature importance drift (sorted):\n", drift_df)


ModuleNotFoundError: No module named 'shap'