In [19]:
import os

In [20]:
%pwd

'e:\\Neoron\\Programming_Practice\\Machine_Learning_Project\\winequality_prediction'

In [4]:
# Move the process working directory one level up, then display the resulting
# directory with the IPython `%pwd` magic.
# NOTE(review): presumably this makes the project root the working directory so
# the `src.WineQualityPrediction` imports and relative paths below resolve;
# confirm. Re-running this cell moves up one more level each time.
os.chdir('../')
%pwd

'e:\\Neoron\\Programming_Practice\\Machine_Learning_Project\\winequality_prediction'

In [21]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataValidationConfig:
    """
    Settings bundle consumed by the data validation step.

    Attributes:
        root_dir: Directory of the data validation step; it is created by
            ``ConfigurationManager.get_data_validation_config``.
        data_file: Location of the CSV dataset that ``DataValidation`` loads
            with ``pd.read_csv``.
        status_file: File that receives the ``Validation status: ...`` line.
        all_schema: Mapping whose keys are the column names accepted by the
            validation.
    """

    root_dir: Path
    data_file: str
    status_file: Path
    all_schema: dict

In [22]:
from src.WineQualityPrediction.constants import *
from src.WineQualityPrediction.utils.common import read_yaml, create_directories
from src.WineQualityPrediction.utils.my_logging import logger
from src.WineQualityPrediction.utils.my_exception import CustomException

In [23]:
class ConfigurationManager:
    """
    Single access point for the project's YAML-driven settings.

    Construction parses the config, params and schema YAML files with
    ``read_yaml`` and creates the artifacts root directory.

    Attributes:
        config: Parsed main configuration file (attribute-accessible object
            as returned by ``read_yaml``).
        params: Parsed parameters file.
        schema: Parsed schema file.
    """

    def __init__(self,
                 config_filepath: Path = CONFIG_FILE_PATH,
                 params_filepath: Path = PARAMS_FILE_PATH,
                 schema_filepath: Path = SCHEMA_FILE_PATH):
        """
        Load the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath (Path): Path to the main configuration file.
            params_filepath (Path): Path to the parameters configuration file.
            schema_filepath (Path): Path to the schema configuration file.
        """
        # Files are read in the order config -> params -> schema.
        yaml_paths = (config_filepath, params_filepath, schema_filepath)
        self.config, self.params, self.schema = [read_yaml(p) for p in yaml_paths]

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Build the settings object for the data validation component.

        The ``data_validation`` section of the main config supplies the paths,
        and the ``COLUMNS`` section of the schema file supplies the expected
        columns. The component's root directory is created as a side effect.

        Returns:
            DataValidationConfig: Configuration object for data validation.
        """
        validation_section = self.config.data_validation
        column_schema = self.schema.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            data_file=validation_section.data_file,
            status_file=validation_section.STATUS_FILE,
            all_schema=column_schema,
        )

In [24]:
import os
import sys
import logging
import pandas as pd
from typing import Optional
from src.WineQualityPrediction.utils.my_logging import logger
from src.WineQualityPrediction.utils.my_exception import CustomException

In [25]:
# Log file path handed as the first argument to every `logger(...)` call below.
log_path = "log/log_file.log"

In [26]:
class DataValidation:
    """
    Validate the columns of a CSV dataset against the configured schema.

    Attributes:
        config (DataValidationConfig): Paths (dataset, status file) and the
            schema mapping used for validation.

    Methods:
        validate_all_columns() -> bool:
            Check that every dataset column is a key of the schema and record
            the outcome in the status file.
    """

    def __init__(self, config: DataValidationConfig):
        """
        Store the validation configuration.

        Args:
            config (DataValidationConfig): Configuration object with paths and schema.
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """
        Check that every column of the dataset is declared in the schema.

        The dataset is read from ``config.data_file``; each of its column
        names must be a key of ``config.all_schema``. The outcome is written
        to ``config.status_file`` as ``Validation status: <True|False>`` on
        every run, and is also returned.

        The check is one-directional: schema columns that are absent from the
        dataset are not reported. A dataset without any columns is treated as
        invalid, so the method always returns a real ``bool`` and always
        writes the status file.

        Returns:
            bool: True if the dataset has at least one column and all of its
            columns are schema keys, False otherwise.

        Raises:
            CustomException: Wraps any error raised while reading the data,
                validating it or writing the status file; the original error
                is attached as ``__cause__``.
        """
        try:
            logger(log_path, logging.INFO, "Data validation started...")

            # Read data from the provided CSV file
            data = pd.read_csv(self.config.data_file)
            logger(log_path, logging.INFO, "Data loaded successfully...")
            all_cols = list(data.columns)

            # Get all schema keys
            all_schema = self.config.all_schema.keys()
            logger(log_path, logging.INFO, "Schema keys loaded successfully...")

            # Collect every offending column (not just the first) so the log
            # tells the user exactly what has to be fixed.
            logger(log_path, logging.INFO, "Column validation started...")
            unexpected_cols = [col for col in all_cols if col not in all_schema]
            if unexpected_cols:
                logger(log_path, logging.WARNING,
                       f"Columns not present in schema: {unexpected_cols}")

            # An empty column list must not pass vacuously (and must not
            # leave the status undefined).
            validation_status = bool(all_cols) and not unexpected_cols

            # Single write site: the status file always reflects this run.
            with open(self.config.status_file, 'w', encoding='utf-8') as f:
                f.write(f"Validation status: {validation_status}")

            logger(log_path, logging.INFO, "Data validation completed successfully...")

            return validation_status

        except Exception as e:
            # Build the wrapper once, log it, and chain the original error so
            # the root cause stays visible in the traceback.
            wrapped = CustomException(e, sys)
            logger(log_path, logging.ERROR, f"Data validation failed: {wrapped}")
            raise wrapped from e

In [28]:
# Notebook driver: build the configuration, run the column validation and log
# the outcome. Any failure is logged and surfaces as a CustomException.
try:
    logger(log_path, logging.INFO, "Data validation script started...")

    # Step 1: Configuration setup
    logger(log_path, logging.INFO, "Initializing ConfigurationManager.")
    config_manager = ConfigurationManager()

    # Step 2: Fetching data validation configuration
    logger(log_path, logging.INFO, "Fetching data validation configuration.")
    data_validation_config = config_manager.get_data_validation_config()

    # Step 3: Performing data validation
    logger(log_path, logging.INFO, "Initializing DataValidation.")
    data_validation = DataValidation(config=data_validation_config)

    # Step 4: Running data validation
    logger(log_path, logging.INFO, "Running data validation.")
    validation_status = data_validation.validate_all_columns()

    if validation_status:
        logger(log_path, logging.INFO, "Data validation successful.")
    else:
        logger(log_path, logging.ERROR, "Data validation failed.")

except CustomException as custom_ex:
    # Already a CustomException (e.g. raised by validate_all_columns):
    # re-raise it unchanged instead of wrapping it in a second
    # CustomException, which would nest the message.
    logger(log_path, logging.ERROR, f"An unexpected error occurred: {custom_ex}")
    raise
except Exception as ex:
    # Anything else is wrapped exactly once; the original error is chained
    # as the cause.
    wrapped_ex = CustomException(ex, sys)
    logger(log_path, logging.ERROR, f"An unexpected error occurred: {wrapped_ex}")
    raise wrapped_ex from ex

[2025-01-15 00:20:50] - INFO - Data validation script started...
[2025-01-15 00:20:50] - INFO - Initializing ConfigurationManager.
[2025-01-15 00:20:50] - INFO - Yaml file read successfully
[2025-01-15 00:20:50] - INFO - Yaml file read successfully
[2025-01-15 00:20:50] - INFO - Yaml file read successfully
[2025-01-15 00:20:50] - INFO - Directory created at artifacts
[2025-01-15 00:20:50] - INFO - Fetching data validation configuration.
[2025-01-15 00:20:50] - INFO - Directory created at artifacts/data_validation
[2025-01-15 00:20:50] - INFO - Initializing DataValidation.
[2025-01-15 00:20:50] - INFO - Running data validation.
[2025-01-15 00:20:50] - INFO - Data validation started...
[2025-01-15 00:20:51] - INFO - Data loaded successfully...
[2025-01-15 00:20:51] - INFO - Schema keys loaded successfully...
[2025-01-15 00:20:51] - INFO - Column validation started...
[2025-01-15 00:20:51] - INFO - Data validation completed successfully...
[2025-01-15 00:20:51] - INFO - Data validation su