Model Failure Diagnosis Without Retraining


```python
import pandas as pd
import numpy as np

# Generate Synthetic Historical Dataset
np.random.seed(42)

num_samples_historical = 1000
numerical_features = {
    'numerical_feature_1': np.random.normal(loc=50, scale=10, size=num_samples_historical),
    'numerical_feature_2': np.random.normal(loc=100, scale=20, size=num_samples_historical),
    'numerical_feature_3': np.random.uniform(low=0, high=1, size=num_samples_historical)
}
categorical_features = {
    'categorical_feature_1': np.random.choice(['A', 'B', 'C'], size=num_samples_historical, p=[0.5, 0.3, 0.2]),
    'categorical_feature_2': np.random.choice(['X', 'Y'], size=num_samples_historical, p=[0.7, 0.3])
}

historical_df = pd.DataFrame({**numerical_features, **categorical_features})

# Generate Synthetic Future Dataset with Drift
num_samples_future = 1000

# Introduce drift in numerical_feature_1 (mean shift)
future_numerical_feature_1 = np.random.normal(loc=55, scale=10, size=num_samples_future) # Shifted mean

# Introduce drift in numerical_feature_2 (variance change)
future_numerical_feature_2 = np.random.normal(loc=100, scale=25, size=num_samples_future) # Increased variance

# numerical_feature_3 has no drift
future_numerical_feature_3 = np.random.uniform(low=0, high=1, size=num_samples_future)

# Introduce drift in categorical_feature_1 (proportion change)
future_categorical_feature_1 = np.random.choice(['A', 'B', 'C'], size=num_samples_future, p=[0.3, 0.4, 0.3]) # Changed proportions

# categorical_feature_2 has no drift
future_categorical_feature_2 = np.random.choice(['X', 'Y'], size=num_samples_future, p=[0.7, 0.3])


future_df = pd.DataFrame({
    'numerical_feature_1': future_numerical_feature_1,
    'numerical_feature_2': future_numerical_feature_2,
    'numerical_feature_3': future_numerical_feature_3,
    'categorical_feature_1': future_categorical_feature_1,
    'categorical_feature_2': future_categorical_feature_2
})

print("Historical Data Sample:")
print(historical_df.head())
print("\nFuture Data Sample:")
print(future_df.head())

print(f"\nHistorical DataFrame shape: {historical_df.shape}")
print(f"Future DataFrame shape: {future_df.shape}")
```

In [55]:
import numpy as np
import pandas as pd

def generate_synthetic_data(num_samples, random_seed=42):
    np.random.seed(random_seed)

    data = {
        'numerical_feature_1': np.random.normal(loc=100, scale=20, size=num_samples),
        'numerical_feature_2': np.random.uniform(low=0, high=1000, size=num_samples),
        'categorical_feature_1': np.random.choice(['A', 'B', 'C', 'D'], size=num_samples, p=[0.25, 0.25, 0.25, 0.25]),
        'categorical_feature_2': np.random.choice(['X', 'Y'], size=num_samples, p=[0.5, 0.5])
    }
    return pd.DataFrame(data)

# Generate historical data
num_historical_samples = 1000
historical_df = generate_synthetic_data(num_historical_samples, random_seed=0)

# Generate future data with drift
num_future_samples = 300
future_df = generate_synthetic_data(num_future_samples, random_seed=1)

# Introduce drift in 'future_df'
# Numerical feature drift: shift mean and increase variance
future_df['numerical_feature_1'] = future_df['numerical_feature_1'] * 1.1 + 10 # 10% increase in values, +10 offset
future_df['numerical_feature_2'] = future_df['numerical_feature_2'] * 0.9 - 50 # 10% decrease in values, -50 offset

# Categorical feature drift: change distribution
future_df['categorical_feature_1'] = np.random.choice(['A', 'B', 'C', 'D'], size=num_future_samples, p=[0.1, 0.1, 0.4, 0.4])
future_df['categorical_feature_2'] = np.random.choice(['X', 'Y'], size=num_future_samples, p=[0.8, 0.2])

print("### Historical Data Sample:")
print(historical_df.head())
print("\nShape of Historical Data:", historical_df.shape)

print("\n### Future Data Sample (with drift):")
print(future_df.head())
print("\nShape of Future Data:", future_df.shape)


### Historical Data Sample:
   numerical_feature_1  numerical_feature_2 categorical_feature_1  \
0           135.281047           821.903908                     D   
1           108.003144           700.528623                     A   
2           119.574760           883.077597                     C   
3           144.817864           966.575107                     C   
4           137.351160           774.747614                     B   

  categorical_feature_2  
0                     Y  
1                     X  
2                     X  
3                     X  
4                     X  

Shape of Historical Data: (1000, 4)

### Future Data Sample (with drift):
   numerical_feature_1  numerical_feature_2 categorical_feature_1  \
0           155.735598           347.713853                     D   
1           106.541359           473.419667                     D   
2           108.380221           840.776537                     D   
3            96.394690           133.515603       

In [57]:
import numpy as np
import pandas as pd

def calculate_psi(historical_series, future_series, n_bins=10):



    all_data = pd.concat([historical_series, future_series])
    min_val = all_data.min()
    max_val = all_data.max()


    if min_val == max_val:

        return 0.0

        bins = np.linspace(min_val, max_val + 1e-6, n_bins + 1)

    # 3. Calculate the frequency distribution for historical_series and future_series
    hist_counts = pd.cut(historical_series, bins=bins, include_lowest=True, right=False).value_counts(normalize=True)
    future_counts = pd.cut(future_series, bins=bins, include_lowest=True, right=False).value_counts(normalize=True)


    all_bins = pd.Categorical(bins[:-1], categories=bins[:-1], ordered=True)

    hist_proportions = hist_counts.reindex(pd.IntervalIndex.from_breaks(bins, closed='left'), fill_value=0.0)
    future_proportions = future_counts.reindex(pd.IntervalIndex.from_breaks(bins, closed='left'), fill_value=0.0)


    hist_proportions = hist_proportions.sort_index()
    future_proportions = future_proportions.sort_index()


    epsilon = 1e-6
    hist_proportions = hist_proportions.apply(lambda x: max(x, epsilon))
    future_proportions = future_proportions.apply(lambda x: max(x, epsilon))


    psi_contributions = (future_proportions - hist_proportions) * np.log(future_proportions / hist_proportions)


    psi_value = psi_contributions.sum()

    return psi_value

print("Function `calculate_psi` defined successfully.")

Function `calculate_psi` defined successfully.


In [54]:
import pandas as pd
import numpy as np

def calculate_categorical_drift(historical_series, future_series):


    # 1. Calculate the frequency distribution (proportions) for each category
    historical_proportions = historical_series.value_counts(normalize=True)
    future_proportions = future_series.value_counts(normalize=True)

    # 2. Identify all unique categories present across both datasets
    all_categories = pd.Series(list(historical_proportions.index) + list(future_proportions.index)).unique()

    # 3. Align the proportions for consistent comparison and fill missing with 0
    historical_aligned = historical_proportions.reindex(all_categories, fill_value=0)
    future_aligned = future_proportions.reindex(all_categories, fill_value=0)

    # 4. Calculate the absolute difference between historical and future proportions
    abs_diffs = (future_aligned - historical_aligned).abs()

    # 5. Sum these absolute differences to obtain the drift score
    drift_score = abs_diffs.sum()

    return drift_score

print("Function `calculate_categorical_drift` defined successfully.")

Function `calculate_categorical_drift` defined successfully.


In [61]:
import pandas as pd
import numpy as np

def detect_feature_drift(historical_df, future_df, numerical_cols, categorical_cols):

    drift_results = []


    for col in numerical_cols:
        if col in historical_df.columns and col in future_df.columns:
            psi_score = calculate_psi(historical_df[col], future_df[col])
            drift_results.append({
                'feature_name': col,
                'feature_type': 'numerical',
                'drift_score': psi_score
            })
        else:
            print(f"Warning: Numerical column '{col}' not found in one or both DataFrames. Skipping.")


    for col in categorical_cols:
        if col in historical_df.columns and col in future_df.columns:
            categorical_drift_score = calculate_categorical_drift(historical_df[col], future_df[col])
            drift_results.append({
                'feature_name': col,
                'feature_type': 'categorical',
                'drift_score': categorical_drift_score
            })
        else:
            print(f"Warning: Categorical column '{col}' not found in one or both DataFrames. Skipping.")


    drift_df = pd.DataFrame(drift_results)
    if not drift_df.empty:
        drift_df = drift_df.sort_values(by='drift_score', ascending=False).reset_index(drop=True)

    return drift_df

print("Function `detect_feature_drift` defined successfully.")


Function `detect_feature_drift` defined successfully.


In [62]:
numerical_features = ['numerical_feature_1', 'numerical_feature_2']
categorical_features = ['categorical_feature_1', 'categorical_feature_2']

drift_report = detect_feature_drift(historical_df, future_df, numerical_features, categorical_features)

print("\n### Feature Drift Report (Ranked by Drift Score):\n")
print(drift_report)


### Feature Drift Report (Ranked by Drift Score):

            feature_name feature_type  drift_score
0  categorical_feature_1  categorical     0.596000
1  categorical_feature_2  categorical     0.453333
2    numerical_feature_2    numerical     0.000000
3    numerical_feature_1    numerical     0.000000


In [59]:
import numpy as np
import pandas as pd

def calculate_psi(historical_series, future_series, n_bins=10):

    all_data = pd.concat([historical_series, future_series])
    min_val = all_data.min()
    max_val = all_data.max()


    if min_val == max_val:

        return 0.0

    # Fix: Move bins definition outside the conditional return block
    bins = np.linspace(min_val, max_val + 1e-6, n_bins + 1)

    # 3. Calculate the frequency distribution for historical_series and future_series
    hist_counts = pd.cut(historical_series, bins=bins, include_lowest=True, right=False).value_counts(normalize=True)
    future_counts = pd.cut(future_series, bins=bins, include_lowest=True, right=False).value_counts(normalize=True)


    all_bins = pd.Categorical(bins[:-1], categories=bins[:-1], ordered=True)

    hist_proportions = hist_counts.reindex(pd.IntervalIndex.from_breaks(bins, closed='left'), fill_value=0.0)
    future_proportions = future_counts.reindex(pd.IntervalIndex.from_breaks(bins, closed='left'), fill_value=0.0)


    hist_proportions = hist_proportions.sort_index()
    future_proportions = future_proportions.sort_index()


    epsilon = 1e-6
    hist_proportions = hist_proportions.apply(lambda x: max(x, epsilon))
    future_proportions = future_proportions.apply(lambda x: max(x, epsilon))


    psi_contributions = (future_proportions - hist_proportions) * np.log(future_proportions / hist_proportions)

    # 6. Sum up the PSI contributions from all bins to get the final PSI value
    psi_value = psi_contributions.sum()

    return psi_value

print("Function `calculate_psi` defined successfully.")

Function `calculate_psi` defined successfully.


**Reasoning**:
With the `calculate_psi` function now correctly defined, I will re-execute the code to generate the feature drift report. This step will call the `detect_feature_drift` function with the `historical_df`, `future_df`, and the specified numerical and categorical features, expecting a successful calculation and display of the drift report.



In [63]:
numerical_features = ['numerical_feature_1', 'numerical_feature_2']
categorical_features = ['categorical_feature_1', 'categorical_feature_2']

drift_report = detect_feature_drift(historical_df, future_df, numerical_features, categorical_features)

print("\n### Feature Drift Report (Ranked by Drift Score):\n")
print(drift_report)


### Feature Drift Report (Ranked by Drift Score):

            feature_name feature_type  drift_score
0  categorical_feature_1  categorical     0.596000
1  categorical_feature_2  categorical     0.453333
2    numerical_feature_2    numerical     0.000000
3    numerical_feature_1    numerical     0.000000
