### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
1. Load metadata
2. Load data
3. Use metadata to validate data quality
4. Show valid data


In [1]:
# write your code from here
import pandas as pd
import numpy as np
import logging

# ------------------- Logging -------------------
# Root logging is configured at INFO; all validators in this file report
# through the single named logger created here.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("MetadataValidator")

# ------------------- Column Metadata -------------------
# One entry per expected column. "dtype" names the value type the validators
# check for ("int" or "str"); "nullable" says whether nulls are acceptable.
metadata = {
    "CustomerID": dict(dtype="int", nullable=False),
    "Name": dict(dtype="str", nullable=False),
    "Email": dict(dtype="str", nullable=True),
    "Age": dict(dtype="int", nullable=True),
}

# ------------------- Sample Records -------------------
# The sample contains None entries and a non-numeric Age ("Thirty") so that
# the validators have something to flag.
sample_data = dict(
    CustomerID=[1, 2, 3, None],
    Name=["Alice", "Bob", None, "David"],
    Email=["a@example.com", "b@example.com", None, "d@example.com"],
    Age=[25, "Thirty", 30, None],
)

df = pd.DataFrame(data=sample_data)

# ------------------- Validation Functions -------------------

def validate_column_presence(df, metadata):
    """Check that every column named in ``metadata`` exists in ``df``.

    Args:
        df: DataFrame under validation.
        metadata: Mapping whose keys are the required column names.

    Raises:
        ValueError: If one or more required columns are absent. The message
            lists them in the order they appear in ``metadata``.
    """
    available = df.columns
    missing_cols = [name for name in metadata if name not in available]

    # Success path first; anything left over is a hard failure.
    if not missing_cols:
        logger.info("‚úÖ All required columns are present.")
        return

    raise ValueError(f"‚ùå Missing columns: {missing_cols}")

def validate_data_types(df, metadata):
    """Log every non-null value whose type disagrees with the metadata.

    Null values are skipped. A column declared ``"int"`` accepts any value
    that ``int()`` can convert; a column declared ``"str"`` requires ``str``
    instances. Other dtype names are not checked.

    Args:
        df: DataFrame under validation; must contain every metadata column.
        metadata: Mapping of column name to a rules dict with a ``"dtype"``
            entry.

    Returns:
        A list of ``(column, row_position, value)`` tuples, one per
        mismatch; empty when every value conforms. ``row_position`` is the
        0-based position within the column, not the index label.
    """
    issues = []
    for col, rules in metadata.items():
        expected_type = rules["dtype"]
        for i, val in enumerate(df[col]):
            if pd.isnull(val):
                continue
            if expected_type == "int":
                try:
                    int(val)
                # Only conversion failures count as mismatches; a bare
                # except would also swallow KeyboardInterrupt/SystemExit.
                except (TypeError, ValueError, OverflowError):
                    issues.append((col, i, val))
            elif expected_type == "str":
                if not isinstance(val, str):
                    issues.append((col, i, val))
    if issues:
        logger.warning("‚ö†Ô∏è Data type mismatches found:")
        for issue in issues:
            logger.warning(f" - Column: {issue[0]}, Row: {issue[1]}, Value: {issue[2]}")
    else:
        logger.info("‚úÖ All data types are valid.")
    # Hand the findings back so callers can act on them, not just read logs.
    return issues

def validate_nullability(df, metadata):
    """Warn about null values in columns that metadata marks non-nullable.

    Args:
        df: DataFrame under validation.
        metadata: Mapping of column name to a rules dict with a
            ``"nullable"`` entry.

    Emits one warning per offending column, stating how many nulls it
    holds. Columns marked nullable are not inspected.
    """
    for col, rules in metadata.items():
        if rules["nullable"]:
            continue

        # Count once; a non-zero count is exactly "has at least one null".
        null_count = df[col].isnull().sum()
        if null_count:
            logger.warning(f"‚ö†Ô∏è Column '{col}' has {null_count} null values but is marked non-nullable.")

# ------------------- Filter Valid Rows -------------------

def _value_matches_dtype(val, expected_type):
    """Return True when ``val`` is null or conforms to ``expected_type``."""
    if pd.isnull(val):
        # Nulls are judged by the nullability rule, not the type rule.
        return True
    if expected_type == "int":
        try:
            int(val)
        except (TypeError, ValueError, OverflowError):
            return False
        return True
    if expected_type == "str":
        return isinstance(val, str)
    # Unknown dtype names are not checked.
    return True


def get_valid_rows(df, metadata):
    """Return a copy of the rows of ``df`` that satisfy every metadata rule.

    A row is kept when, for every column in ``metadata``, the value is
    non-null if the column is marked non-nullable, and every non-null value
    conforms to the declared dtype (``"int"``: convertible with ``int()``;
    ``"str"``: a ``str`` instance).

    Args:
        df: DataFrame to filter; must contain every metadata column. Any
            index is supported, and an empty frame yields an empty result.
        metadata: Mapping of column name to a rules dict with ``"dtype"``
            and ``"nullable"`` entries.

    Returns:
        A new DataFrame holding only the valid rows, with their original
        index labels.
    """
    # Build the mask on df's own index: a default RangeIndex mask would
    # misalign with any frame whose index is not 0..n-1.
    valid_mask = pd.Series(True, index=df.index, dtype=bool)
    for col, rules in metadata.items():
        if not rules["nullable"]:
            valid_mask &= df[col].notnull()

        expected_type = rules["dtype"]
        type_ok = [_value_matches_dtype(val, expected_type) for val in df[col]]
        # Explicit bool dtype keeps the mask boolean even for empty frames.
        valid_mask &= pd.Series(type_ok, index=df.index, dtype=bool)

    return df[valid_mask].copy()

# ------------------- Execution -------------------

def run_metadata_validation():
    """Validate the module-level sample frame and print its valid rows.

    Runs the column-presence, data-type and nullability checks against the
    module-level ``df`` and ``metadata``, then filters ``df`` down to the
    rows that pass and prints them. Any exception raised along the way is
    logged as an error instead of propagating to the caller.
    """
    logger.info("üîç Starting metadata-based validation...")

    try:
        # The checks run in this order; the first one raises when a column
        # is missing, which stops the remaining steps.
        for check in (
            validate_column_presence,
            validate_data_types,
            validate_nullability,
        ):
            check(df, metadata)

        valid_df = get_valid_rows(df, metadata)
        logger.info("‚úÖ Final Valid Data:")
        print(valid_df)
    except Exception as e:
        logger.error(f"‚ùå Validation failed: {e}")

# ------------------- Run -------------------
# Run the validation only when this file is executed as a script, so that
# importing it does not trigger the run.
if __name__ == "__main__":
    run_metadata_validation()

INFO:MetadataValidator:üîç Starting metadata-based validation...
INFO:MetadataValidator:‚úÖ All required columns are present.
INFO:MetadataValidator:‚úÖ Final Valid Data:


   CustomerID   Name          Email Age
0         1.0  Alice  a@example.com  25
