In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import ks_2samp

# --- 1. Train a model on the original data ---
# Seed NumPy's global RNG so every draw in this cell is reproducible.
np.random.seed(42)

# Baseline feature: 100 samples from N(mean=50, sd=10), shaped (100, 1) for sklearn.
X_orig = np.random.normal(50, 10, (100, 1))
# Target follows y = 2x plus Gaussian noise with sd=5.
y_orig = X_orig.ravel() * 2 + np.random.normal(0.0, 5, 100)

model = LinearRegression()
model.fit(X_orig, y_orig)

# Scored on the same rows the model was fitted on, so this is a training-error baseline.
y_pred_orig = model.predict(X_orig)
mse_orig = mean_squared_error(y_true=y_orig, y_pred=y_pred_orig)

# --- 2. Evaluate on the drifted data ---
# Drifted feature: N(mean=60, sd=15), i.e. a shifted mean and a wider spread.
X_drift = np.random.normal(60, 15, (100, 1))
# Same y = 2x + noise relationship as the original data; only the inputs have moved.
y_drift = X_drift.ravel() * 2 + np.random.normal(0.0, 5, 100)

# The already-fitted model is reused as-is (no refit) and scored on the drifted sample.
y_pred_drift = model.predict(X_drift)
mse_drift = mean_squared_error(y_true=y_drift, y_pred=y_pred_drift)

# --- 3. Monitor feature statistics ---
# Illustrative cut-off: absolute difference in feature means that counts as drift.
MEAN_SHIFT_THRESHOLD = 5

# Summarise each sample with its mean and standard deviation (same keys for both).
orig_stats, drift_stats = (
    {"mean": sample.mean(), "std": sample.std()} for sample in (X_orig, X_drift)
)

# Only the mean takes part in the rule; the std is recorded for inspection.
mean_shift = abs(drift_stats["mean"] - orig_stats["mean"])
drift_detected = mean_shift > MEAN_SHIFT_THRESHOLD

# --- 4. Automating basic data quality checks ---
# Small example frame; note that id 2 appears twice.
df = pd.DataFrame(
    [(1, "Alice"), (2, "Bob"), (2, "Bob")],
    columns=["id", "name"],
)

# Primary-key check: the key column must not contain any repeated value.
primary_key_unique = not df["id"].duplicated().any()

# Completeness check: every cell in the frame must be non-null.
no_missing_values = df.notna().to_numpy().all()

# --- 5. Kolmogorov-Smirnov test for drift detection ---
# Two-sample KS test on the flattened feature columns; a small p-value
# indicates the two samples are unlikely to share one distribution.
ks_result = ks_2samp(X_orig.ravel(), X_drift.ravel())
ks_stat, ks_pvalue = ks_result.statistic, ks_result.pvalue

# Last expression of the cell: the notebook displays this tuple as the cell output.
(
    mse_orig,
    mse_drift,
    orig_stats,
    drift_stats,
    drift_detected,
    primary_key_unique,
    no_missing_values,
    (ks_stat, ks_pvalue),
)

(22.0908120572822,
 22.332727843625243,
 {'mean': 48.96153482605907, 'std': 9.036161766446297},
 {'mean': 60.97344379650673, 'std': 16.182717693681113},
 True,
 False,
 True,
 (0.46, 6.422179651064002e-10))

In [None]:
# Monitoring Data Distribution Changes
# Question: Use Python to monitor distribution changes in features to detect potential data drift.

# 1. Calculate feature statistics (mean and standard deviation) for both original and drifted data:
# 2. Compare statistics:
# 3. Set thresholds to detect significant drift:




In [None]:
# Automating Data Quality Checks with Python
# Question: Automate a basic data validation process using Python to ensure the dataset's
# structural integrity.

# 1. Define validation checks:
# 2. Apply validation:




In [None]:
# Introducing Great Expectations for Data Validation
# Question: Use Great Expectations to set up data validation checks for a dataset.

# 1. Install Great Expectations:
# 2. Create a new expectations suite:
# 3. Load data and generate expectations:




In [None]:
# Automating Constraint Checks with Python
# Question: Automate primary key and foreign key constraint checks using Python to ensure dataset compliance.


# 1. Assume two pandas DataFrames, employees_df and departments_df, exist and are related by primary and foreign keys:




In [None]:
# Advanced Data Drift Detection using Statistical Tests
# Question: Implement the Kolmogorov-Smirnov (KS) test in Python to detect data drift with a statistical test rather than fixed thresholds.

# 1. Use SciPy to perform KS test:


