### Detect Data Drift in ML Models
**Objective**: Monitor and detect changes in data distributions that impact ML model performance.

**Task**: Feature Correlation Drift

**Steps**:
1. Compute the correlation matrix of features in your training dataset.
2. Compute the correlation matrix of the same features in your production data.
3. Assess changes in the correlation matrix over time to identify any significant deviations.
4. Investigate any significant changes in correlation as they may indicate issues in the data collection process or model assumptions.

In [1]:
# write your code from here


import pandas as pd
import numpy as np

def compute_correlation_matrix(df: pd.DataFrame, features: list) -> pd.DataFrame:
    """
    Compute correlation matrix for selected features.
    """
    if not set(features).issubset(df.columns):
        raise ValueError("Some specified features are missing from the DataFrame.")
    return df[features].corr()

def correlation_drift_matrix(train_corr: pd.DataFrame, prod_corr: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate absolute difference between two correlation matrices.
    """
    # Align matrices to ensure same order
    train_corr = train_corr.loc[prod_corr.index, prod_corr.columns]
    diff_matrix = (train_corr - prod_corr).abs()
    return diff_matrix

def detect_significant_drift(diff_matrix: pd.DataFrame, threshold: float = 0.2) -> pd.DataFrame:
    """
    Return a DataFrame showing feature pairs with correlation drift above threshold.
    """
    drift_pairs = []
    for row in diff_matrix.index:
        for col in diff_matrix.columns:
            if row != col and diff_matrix.loc[row, col] > threshold:
                drift_pairs.append((row, col, diff_matrix.loc[row, col]))
    return pd.DataFrame(drift_pairs, columns=['Feature 1', 'Feature 2', 'Drift Amount'])

# Example Usage
def main():
    # Sample training dataset
    train_data = {
        'feature1': np.random.normal(0, 1, 100),
        'feature2': np.random.normal(0, 1, 100),
        'feature3': np.random.normal(0, 1, 100)
    }
    train_df = pd.DataFrame(train_data)

    # Create artificial production data with changed relationships
    prod_data = {
        'feature1': np.random.normal(0, 1, 100),
        'feature2': np.random.normal(0, 1, 100) + np.random.normal(0, 1, 100),  # added noise
        'feature3': train_df['feature1'] * 0.5 + np.random.normal(0, 1, 100)     # stronger correlation
    }
    prod_df = pd.DataFrame(prod_data)

    features = ['feature1', 'feature2', 'feature3']
    train_corr = compute_correlation_matrix(train_df, features)
    prod_corr = compute_correlation_matrix(prod_df, features)

    diff_matrix = correlation_drift_matrix(train_corr, prod_corr)
    print("Correlation drift matrix (absolute differences):")
    print(diff_matrix.round(2))

    drift_report = detect_significant_drift(diff_matrix, threshold=0.2)
    if not drift_report.empty:
        print("\n⚠️ Significant correlation drift detected between feature pairs:")
        print(drift_report)
    else:
        print("\n✅ No significant correlation drift detected.")

# Basic test
def test_correlation_drift_detection():
    df1 = pd.DataFrame({
        'a': np.random.rand(50),
        'b': np.random.rand(50)
    })
    df2 = df1.copy()
    df2['b'] = df2['a'] * 0.9 + np.random.normal(0, 0.05, 50)  # create correlation
    corr1 = compute_correlation_matrix(df1, ['a', 'b'])
    corr2 = compute_correlation_matrix(df2, ['a', 'b'])
    diff = correlation_drift_matrix(corr1, corr2)
    result = detect_significant_drift(diff, threshold=0.2)
    assert not result.empty, "Expected drift to be detected but got empty result."

if __name__ == "__main__":
    test_correlation_drift_detection()
    main()

Correlation drift matrix (absolute differences):
          feature1  feature2  feature3
feature1      0.00      0.06      0.06
feature2      0.06      0.00      0.11
feature3      0.06      0.11      0.00

✅ No significant correlation drift detected.
