### Implementing Adversarial Validation for Data Drift
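Description: Create and train a classifier that distinguishes between train and test datasets, then use its ROC-AUC on held-out rows to infer data drift: a score near 0.5 means the two datasets are indistinguishable (no drift), while a score approaching 1.0 means they are easy to tell apart (drift).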

In [1]:
# write your code from here

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def validate_dataframes(df_train, df_test):
    """
    Check that two feature tables can be stacked for adversarial validation.

    Parameters:
    - df_train (pd.DataFrame): Training dataset features
    - df_test (pd.DataFrame): Test dataset features

    Returns:
    - None; the function only raises when a check fails.

    Raises:
    - TypeError: if either input is not a pandas DataFrame
    - ValueError: if either frame is empty, the column counts differ,
      the column names differ (order is ignored), or any value is missing
    """
    if not isinstance(df_train, pd.DataFrame) or not isinstance(df_test, pd.DataFrame):
        raise TypeError("Both inputs must be pandas DataFrames.")
    # An empty side leaves the classifier with a single class to learn from,
    # so reject it here with a clear message.
    if df_train.empty or df_test.empty:
        raise ValueError("Train and test data must not be empty.")
    if df_train.shape[1] != df_test.shape[1]:
        raise ValueError("Train and test data must have the same number of columns.")
    # Equal width is not enough: pd.concat aligns by column *name*, so differing
    # names would be NaN-filled and trivially reveal which dataset a row came from.
    if set(df_train.columns) != set(df_test.columns):
        raise ValueError("Train and test data must have the same column names.")
    if df_train.isnull().any().any() or df_test.isnull().any().any():
        raise ValueError("Input data contains missing values. Please handle them before.")

def adversarial_validation(df_train, df_test, classifier=None, test_size=0.3, random_state=42):
    """
    Performs adversarial validation by training a classifier to distinguish train vs test samples.

    Rows from df_train are labelled 0 and rows from df_test are labelled 1. The
    stacked data is split into a fitting part and a held-out part; the classifier
    is fitted on the former and scored with ROC-AUC on the latter.

    Parameters:
    - df_train (pd.DataFrame): Training dataset features
    - df_test (pd.DataFrame): Test dataset features
    - classifier: sklearn classifier instance exposing fit and predict_proba
      (default RandomForestClassifier). A supplied instance is fitted in place.
    - test_size (float): Fraction of data to use for validation split
    - random_state (int): Random seed for reproducibility

    Returns:
    - auc_score (float): ROC-AUC score on validation set

    Raises:
    - TypeError / ValueError: propagated from validate_dataframes when the
      inputs fail its checks
    """

    validate_dataframes(df_train, df_test)

    # Stack the feature rows; pd.concat aligns the two frames by column name.
    features = pd.concat([df_train, df_test], axis=0, ignore_index=True)

    # Origin labels (0 = train, 1 = test) are kept in a separate array instead of
    # a temporary 'origin' column, so an input feature that happens to be called
    # 'origin' is neither overwritten nor dropped.
    labels = np.concatenate([
        np.zeros(len(df_train), dtype=int),
        np.ones(len(df_test), dtype=int),
    ])

    # Train-validation split for adversarial classifier
    X_train, X_val, y_train, y_val = train_test_split(
        features, labels, test_size=test_size, random_state=random_state, stratify=labels
    )

    # Fit the scaler on the fitting split only, then apply it to the held-out
    # split, so no validation statistics leak into preprocessing.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Use default classifier if none provided
    if classifier is None:
        classifier = RandomForestClassifier(n_estimators=100, random_state=random_state)

    # Train classifier
    classifier.fit(X_train, y_train)

    # Probability of the positive class (label 1 = test origin) on the held-out rows
    y_pred_proba = classifier.predict_proba(X_val)[:, 1]

    # Calculate ROC-AUC score
    auc_score = roc_auc_score(y_val, y_pred_proba)

    return auc_score

def main():
    """Run the adversarial-validation demo on simulated data and print both scores."""
    np.random.seed(0)

    def draw_frame(mean_1, mean_2, rows):
        # Draw feature1 before feature2 so the global RNG stream is consumed
        # in a fixed order for every frame.
        first = np.random.normal(mean_1, 1, rows)
        second = np.random.normal(mean_2, 2, rows)
        return pd.DataFrame({'feature1': first, 'feature2': second})

    # Reference (train) sample, a test sample from the same distribution,
    # and a test sample whose means are shifted by +1.
    reference = draw_frame(0, 5, 1000)
    same_distribution = draw_frame(0, 5, 500)
    shifted = draw_frame(1, 6, 500)

    score = adversarial_validation(reference, same_distribution)
    print(f"ROC-AUC (No Drift): {score:.4f} (Close to 0.5 means no drift)")

    score = adversarial_validation(reference, shifted)
    print(f"ROC-AUC (With Drift): {score:.4f} (Closer to 1 means drift detected)")

if __name__ == "__main__":
    main()

ROC-AUC (No Drift): 0.4932 (Close to 0.5 means no drift)
ROC-AUC (With Drift): 0.7384 (Closer to 1 means drift detected)
