### Data Drift: Detection Using Statistical Tests
**Question**: Simulate two datasets with a shift in distribution, and apply the two-sample
Kolmogorov-Smirnov test to determine whether data drift has occurred.

In [1]:
# write your code from here

import numpy as np
from scipy.stats import ks_2samp

def simulate_data(n_samples=1000, drift=False, seed=42):
    """
    Simulates normally distributed data samples.

    Parameters:
    - n_samples: Number of values to draw
    - drift: If False, draw from Normal(0, 1); if True, draw from Normal(0.5, 1.5)
      (shifted mean and larger standard deviation) to simulate drift
    - seed: Seed for the random generator; the same seed reproduces the same sample

    Returns:
    - data: NumPy array containing n_samples draws
    """
    # A private RandomState produces the same stream that np.random.seed(seed)
    # followed by np.random.normal would, but leaves the process-wide NumPy
    # RNG untouched, so calling this helper does not reset randomness for
    # unrelated code.
    rng = np.random.RandomState(seed)
    if drift:
        # Drifted distribution: shifted mean and higher spread
        loc, scale = 0.5, 1.5
    else:
        # Original distribution: standard normal
        loc, scale = 0, 1
    return rng.normal(loc=loc, scale=scale, size=n_samples)

def detect_data_drift(data_ref, data_new, alpha=0.05):
    """
    Uses the two-sample Kolmogorov-Smirnov test to detect if the data_new
    distribution differs significantly from data_ref.

    Parameters:
    - data_ref: Reference data (e.g., training set)
    - data_new: New data to test (e.g., production)
    - alpha: Significance level for rejecting the null hypothesis; must lie
      strictly between 0 and 1

    Returns:
    - ks_statistic: KS test statistic
    - p_value: p-value from KS test
    - drift_detected: True if drift detected, else False (a plain Python bool)

    Raises:
    - ValueError: if either dataset is empty or contains NaN, or if alpha is
      not strictly between 0 and 1
    """
    # The chained comparison is also False for NaN, so a NaN alpha is rejected.
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1.")

    ref = np.asarray(data_ref)
    new = np.asarray(data_new)
    if ref.size == 0 or new.size == 0:
        raise ValueError("Input datasets must be non-empty arrays.")

    # NaN values make the KS result NaN or meaningless, and a NaN p-value
    # compares as "not < alpha", which would silently report "no drift".
    for label, values in (("data_ref", ref), ("data_new", new)):
        if np.issubdtype(values.dtype, np.floating) and np.isnan(values).any():
            raise ValueError(f"{label} contains NaN values; clean the data before testing.")

    ks_statistic, p_value = ks_2samp(ref, new)
    # Convert from numpy.bool_ so callers get a real bool (identity checks,
    # JSON serialisation).
    drift_detected = bool(p_value < alpha)

    return ks_statistic, p_value, drift_detected

def main():
    """
    Demonstrates the drift check: compares a reference sample against one
    sample from the same distribution and one from a drifted distribution,
    printing the KS statistic, p-value and verdict for each.
    """
    # Draw every sample up front: the reference set plus the two candidates.
    reference = simulate_data(n_samples=1000, drift=False)
    same_distribution = simulate_data(n_samples=1000, drift=False, seed=100)
    shifted_distribution = simulate_data(n_samples=1000, drift=True, seed=100)

    # (heading, candidate sample, text appended after the verdict line);
    # the first report ends with an extra newline to separate the two reports.
    scenarios = (
        ("Testing data with NO drift:", same_distribution, "\n"),
        ("Testing data WITH drift:", shifted_distribution, ""),
    )

    for heading, candidate, suffix in scenarios:
        print(heading)
        statistic, p_value, flagged = detect_data_drift(reference, candidate)
        print(f"KS Statistic: {statistic:.4f}, p-value: {p_value:.4f}")
        verdict = "Yes" if flagged else "No"
        print(f"Drift detected? {verdict}{suffix}")

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Testing data with NO drift:
KS Statistic: 0.0370, p-value: 0.5006
Drift detected? No

Testing data WITH drift:
KS Statistic: 0.2120, p-value: 0.0000
Drift detected? Yes
