## Architecture to Monitor Data Quality Over Time

**Description**: Design a monitoring system in Python that checks and logs data quality metrics (accuracy, completeness) for a dataset over time.

**Steps to follow:**
1. Implement a Scheduled Script:
    - Use the `schedule` library to run the script periodically.
2. Script to Calculate Metrics:
    - For simplicity, use a function calculate_quality_metrics() that calculates and logs metrics such as missing rate or mismatch rate.
3. Store Logs:
    - Use Python's logging library to save these metrics over time.

In [None]:
# Write your code from here

import pandas as pd
import numpy as np
import schedule
import time
import logging
from datetime import datetime
from sklearn.datasets import load_diabetes

# ----------------------------
# Setup Logging
# ----------------------------
# Route root-logger records at INFO level and above to 'quality_metrics.log',
# each line prefixed with a timestamp and the record's level name.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(
    level=logging.INFO,
    format=_LOG_FORMAT,
    filename='quality_metrics.log',
)

# ----------------------------
# Define Quality Metrics Function
# ----------------------------
def _dtype_matches(actual, expected) -> bool:
    """Return True when the column dtype *actual* satisfies the *expected* dtype spec."""
    try:
        return bool(np.issubdtype(actual, np.dtype(expected)))
    except TypeError:
        # NumPy cannot interpret pandas extension dtypes (e.g. category) or
        # dtype names it does not know; without this fallback a single such
        # column would abort the whole metric computation. pandas' own
        # comparison returns False instead of raising on invalid specs.
        return bool(pd.api.types.is_dtype_equal(actual, expected))


def calculate_quality_metrics(df: pd.DataFrame, expected_schema: dict = None) -> dict:
    """Compute completeness and schema-conformance metrics for *df*.

    Args:
        df: The dataset snapshot to inspect.
        expected_schema: Optional mapping of column name to expected dtype
            (anything ``np.dtype`` accepts, or a pandas dtype name). When
            omitted or empty, no schema check is performed.

    Returns:
        A dict with three keys:
          * ``"missing_rate"``: mapping of column name to the fraction of
            null values in that column.
          * ``"row_missing_fraction"``: fraction of rows containing at least
            one null value (``0.0`` for an empty frame).
          * ``"schema_mismatches"``: mapping of column name to a description
            of the problem (``"missing column"`` or a type-mismatch message)
            for every expected column that does not conform.
    """
    # Compute the null mask once; both completeness metrics derive from it.
    null_mask = df.isnull()
    n_rows = len(df)

    metrics = {"missing_rate": null_mask.mean().to_dict()}

    # Guard against division by zero on an empty frame.
    if n_rows > 0:
        metrics["row_missing_fraction"] = float(null_mask.any(axis=1).sum() / n_rows)
    else:
        metrics["row_missing_fraction"] = 0.0

    mismatches = {}
    for col, dtype in (expected_schema or {}).items():
        if col not in df.columns:
            mismatches[col] = "missing column"
            continue
        found = df[col].dtype
        if not _dtype_matches(found, dtype):
            mismatches[col] = f"type mismatch (expected {dtype}, found {found})"
    metrics["schema_mismatches"] = mismatches

    return metrics

# ----------------------------
# Load or Simulate Dataset
# ----------------------------
def load_data():
    """Return the diabetes dataset as a DataFrame with injected quality defects.

    The frame is built from ``load_diabetes()`` and then deliberately
    corrupted so the monitor has something to report: nulls in 'age'
    (index labels 0 through 5) and 'bmi' (label 10), plus an extra
    string column named 'fake_column'.
    """
    bunch = load_diabetes()
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)

    # Punch holes in two feature columns to simulate missing values.
    frame.loc[10, 'bmi'] = None
    frame.loc[:5, 'age'] = None

    # Add a column outside the expected schema to simulate an anomaly.
    frame['fake_column'] = 'oops'

    return frame

# ----------------------------
# Monitoring Job
# ----------------------------
def job():
    """Run one monitoring pass: load a snapshot, compute metrics, log them.

    Any exception raised during the pass is logged together with its
    traceback and then suppressed, so a caller invoking this repeatedly
    is not interrupted by a single failed run.
    """
    logging.info("Running data quality monitoring job.")

    try:
        # Load data snapshot
        df = load_data()

        # Treat infinite values as missing so they count toward the rates.
        df = df.replace([np.inf, -np.inf], np.nan)

        # Columns with no data at all are dropped before measuring, which
        # would otherwise hide a total completeness failure; record them
        # first so the log still shows what was removed.
        all_missing = df.columns[df.isnull().all().to_numpy()].tolist()
        if all_missing:
            logging.warning("Dropping columns with no data: %s", all_missing)
        df = df.dropna(axis=1, how='all')

        # Define expected schema
        expected_schema = {
            'age': 'float64',
            'bmi': 'float64',
            'bp': 'float64',
            's1': 'float64'
        }

        # Compute metrics
        metrics = calculate_quality_metrics(df, expected_schema)

        # Log each metric
        logging.info("Missing Rate per column: %s", metrics["missing_rate"])
        logging.info("Row Missing Fraction: %.4f", metrics["row_missing_fraction"])
        logging.info("Schema Mismatches: %s", metrics["schema_mismatches"])
        logging.info("Job completed.\n")

    except Exception as exc:
        # logging.exception records the traceback as well as the message,
        # which a plain error() call with str(exc) would lose.
        logging.exception("Monitoring job failed: %s", exc)

# ----------------------------
# Schedule the Job
# ----------------------------
schedule.every(10).seconds.do(job)  # Run every 10 seconds

# ----------------------------
# Run Loop
# ----------------------------
if __name__ == "__main__":
    print("📊 Starting Data Quality Monitor... Press Ctrl+C to stop.")
    try:
        while True:
            schedule.run_pending()
            # Pause between polls: sleep(0) returns immediately and would
            # spin this loop at full CPU. One second is far finer than the
            # 10-second job interval.
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl+C is the advertised way to stop; exit without a traceback.
        logging.info("Data quality monitor stopped by user.")
        print("Data Quality Monitor stopped.")

📊 Starting Data Quality Monitor... Press Ctrl+C to stop.
