# Data Validation Pipeline Development
- Author: Marcellinus Aditya Witarsah
- Date: 05 June 2024

In [1]:
# Imports
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import logging
import time
import pickle
import os
from pathlib import Path
from abc import ABC
from abc import abstractmethod
from scipy import stats
from typing import Tuple
from typing import Union
from dataclasses import dataclass
from src.utils.common import logger
from src.utils.common import read_yaml, create_directories
from src.constants import CONFIG_FILE_PATH, SCHEMA_FILE_PATH, PARAMS_FILE_PATH



In [2]:
# Run this cell only once per kernel session: each execution moves the
# working directory up one more level.
os.chdir("..")

# Configuration

In [3]:
# src/entities/config_entity.py
@dataclass(frozen=True)
class DataValidationConfig:
    """
    Data class for storing data validation configuration.

    Instances are frozen, so the fields cannot be reassigned after creation.

    Attributes:
        root_dir (Path): Root directory for data validation.
        source_path (Path): Source path of the data to be validated.
        STATUS_FILE (Path): Path to the status file.
        schema (dict): Mapping of column name to the dtype expected for that
            column; it is queried by column name during validation.
    """
    root_dir: Path
    source_path: Path
    STATUS_FILE: Path
    # Looked up by column name (`.keys()` / `schema[col]`), so a mapping, not a list.
    schema: dict

# src/config/configuration.py
class ConfigurationManager:
    """
    Load the pipeline's YAML files and hand out stage configurations.

    The configuration, parameters and schema files are read when the object
    is created and kept on the instance for the stage-specific getters.

    Attributes:
        config (dict): Parsed configuration file content.
        params (dict): Parsed parameters file content.
        schema (dict): Parsed schema file content.
    """

    def __init__(
        self,
        config_filepath: str = CONFIG_FILE_PATH,
        params_filepath: str = PARAMS_FILE_PATH,
        schema_filepath: str = SCHEMA_FILE_PATH
    ):
        """
        Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath (str): File path to the configuration YAML file.
            params_filepath (str): File path to the parameters YAML file.
            schema_filepath (str): File path to the schema YAML file.
        """
        # Files are loaded in a fixed order: config, then params, then schema.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(Path(filepath)))

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Build the configuration object for the data validation stage.

        The stage's root directory is created as part of this call.

        Returns:
            DataValidationConfig: Configuration for data validation.
        """
        validation_section = self.config.data_validation
        columns = self.schema.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            source_path=validation_section.source_path,
            STATUS_FILE=validation_section.STATUS_FILE,
            schema=columns,
        )


# Data Validation

In [13]:
# src/data/data_validation.py
class DataValidation:
    """
    Class to handle the data validation process.

    Compares the columns of the CSV file at `config.source_path` with the
    column schema held in `config.schema`.
    """

    def __init__(self, config: DataValidationConfig):
        """
        Instantiate `DataValidation` class.

        Args:
            config (DataValidationConfig): Configuration for data validation.
        """
        self.config = config

    def validate_data(self):
        """
        Validate the data based on the provided schema.

        This method reads a CSV file from the source path and checks every
        column: the column must be listed in the schema and its dtype must
        equal the dtype the schema gives for it. Validation succeeds only
        when all columns pass; a single failing column fails the whole check.

        The outcome is reported through the logger (including the names of
        the failing columns). Any exception raised while reading or checking
        the data is logged as an error and not re-raised.
        """
        try:
            logger.info("Validate data")

            df = pd.read_csv(self.config.source_path)
            all_schema = self.config.schema

            # Collect every failing column instead of keeping a single flag
            # that each loop iteration overwrites; otherwise only the last
            # column would decide the result.
            mismatched_cols = [
                col
                for col in df.columns
                if col not in all_schema
                or not (df[col].dtype == all_schema[col])
            ]

            # With no columns at all there is nothing to confirm, so that
            # case is still reported as a mismatch.
            validation_status = len(df.columns) > 0 and not mismatched_cols

            if validation_status:
                logger.info("All data types match")
            else:
                logger.info("There's a data types mismatch")
                if mismatched_cols:
                    logger.info(f"Columns failing validation: {mismatched_cols}")

        except Exception as e:
            logger.error(e)

In [14]:
try:
    # Build the stage configuration first, then run the validation with it.
    manager = ConfigurationManager()
    validation_config = manager.get_data_validation_config()
    data_validation = DataValidation(config=validation_config)
    data_validation.validate_data()
except Exception as err:
    logger.error(err)

2024-06-05 14:34:05,724 - credit-scorecard-logger - INFO - yaml file: config.yaml loaded successfully
2024-06-05 14:34:05,727 - credit-scorecard-logger - INFO - yaml file: params.yaml loaded successfully
2024-06-05 14:34:05,731 - credit-scorecard-logger - INFO - yaml file: schema.yaml loaded successfully
2024-06-05 14:34:05,733 - credit-scorecard-logger - INFO - Created directory at: artifacts
2024-06-05 14:34:05,734 - credit-scorecard-logger - INFO - Created directory at: artifacts/data_validation
2024-06-05 14:34:05,735 - credit-scorecard-logger - INFO - Validate data
2024-06-05 14:34:05,779 - credit-scorecard-logger - INFO - All data types match


# Testing
Restart the kernel, then run the cells below.

In [1]:
import os
# Run this cell only once per kernel session: each execution moves the
# working directory up one more level.
os.chdir("..")

In [12]:
from src.utils.common import logger
from src.config.configuration import ConfigurationManager
from src.data.data_validation import DataValidation


class DataValidationPipeline:
    """
    Pipeline stage that runs the data validation step.
    """

    def __init__(self):
        """
        Instantiate `DataValidationPipeline` class with its own
        `ConfigurationManager`.
        """
        self.configuration_manager = ConfigurationManager()

    def run(self):
        """
        Fetch the data validation configuration and validate the data.
        """
        validation_config = self.configuration_manager.get_data_validation_config()
        validator = DataValidation(config=validation_config)
        validator.validate_data()


if __name__ == "__main__":
    STAGE_NAME = "Data Validation Stage"
    try:
        # Bracket the stage with start/completion log lines; any failure is
        # logged as an error instead of propagating.
        logger.info(f">>>>>> {STAGE_NAME} Started <<<<<<")
        DataValidationPipeline().run()
        logger.info(f">>>>>> {STAGE_NAME} Completed <<<<<<")
    except Exception as err:
        logger.error(err)

2024-06-05 14:31:03,263 - credit-scorecard-logger - INFO - >>>>>> Data Validation Stage Started <<<<<<
2024-06-05 14:31:03,268 - credit-scorecard-logger - INFO - yaml file: config.yaml loaded successfully
2024-06-05 14:31:03,272 - credit-scorecard-logger - INFO - yaml file: params.yaml loaded successfully
2024-06-05 14:31:03,276 - credit-scorecard-logger - INFO - yaml file: schema.yaml loaded successfully
2024-06-05 14:31:03,278 - credit-scorecard-logger - INFO - Created directory at: artifacts
2024-06-05 14:31:03,280 - credit-scorecard-logger - INFO - Created directory at: artifacts/data_validation
2024-06-05 14:31:03,281 - credit-scorecard-logger - INFO - Validate data
2024-06-05 14:31:03,329 - credit-scorecard-logger - INFO - All data types are match
2024-06-05 14:31:03,330 - credit-scorecard-logger - INFO - >>>>>> Data Validation Stage Completed <<<<<<
