In [1]:
import os

In [2]:
%pwd


'e:\\2025\\Project_Learning\\NLP_Text_Summarizer\\research'

In [3]:
# Move from the notebook folder ("research") up to the project root so the
# relative paths used below (config files, artifacts/) resolve correctly.
# Guarded on the folder name so re-running this cell does not climb one more
# level each time it is executed.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'e:\\2025\\Project_Learning\\NLP_Text_Summarizer'

## Creating entity 


In [23]:
from dataclasses import dataclass
from pathlib import Path
from typing import List # Import List for type hint

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings consumed by the data validation stage.

    Instances are frozen: fields cannot be reassigned after construction.
    Field order is part of the constructor signature and must not change.
    """
    # Directory in which the validation stage keeps its artifacts.
    root_dir: Path
    # Path of the text file the validation result is written to.
    STATUS_FILE: str
    # Entry names the dataset directory listing is compared against.
    ALL_REQUIRED_FILES: List[str]

In [6]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml,create_directories


## Updating Config


In [7]:
class ConfigurationManager:
    """Loads the project YAML files and hands out per-stage config entities."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        # NOTE(review): read_yaml presumably returns an attribute-accessible
        # mapping (the code below uses dotted access) - confirm in utils.common.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the data validation settings from the loaded configuration.

        Reads the ``data_validation`` section, creates its root directory,
        and wraps the section's values in a ``DataValidationConfig``.

        Returns:
            DataValidationConfig with ``root_dir`` converted to a ``Path`` and
            ``STATUS_FILE`` / ``ALL_REQUIRED_FILES`` passed through as read.
        """
        section = self.config.data_validation
        validation_root = section.root_dir

        # The stage writes its status file under this directory, so it has to
        # exist before the component runs.
        create_directories([validation_root])

        return DataValidationConfig(
            root_dir=Path(validation_root),
            STATUS_FILE=section.STATUS_FILE,
            ALL_REQUIRED_FILES=section.ALL_REQUIRED_FILES,
        )

## Updating Components 

In [20]:
from textSummarizer.logging import logger
class DataValiadition:
    """Compares the ingested dataset directory with the configured file list.

    The class name keeps its original spelling because the pipeline cell
    instantiates it under this name.
    """

    # Directory whose entries are validated. Kept as a class attribute so it
    # can be overridden on a subclass or instance without editing the method.
    DATA_DIR = os.path.join("artifacts", "data_ingestion", "samsum_dataset")

    def __init__(self, config: "DataValidationConfig"):
        """Store the validation settings.

        Args:
            config: Object exposing ``STATUS_FILE`` and ``ALL_REQUIRED_FILES``.
                The annotation is a string so that defining this class does
                not depend on the entity cell having been executed first.
        """
        self.config = config

    def validate_all_files_exist(self) -> bool:
        """Validate the dataset directory and record the outcome.

        Validation passes only when the directory listing and
        ``ALL_REQUIRED_FILES`` contain exactly the same names: every required
        entry must be present and no unexpected entry may exist. The status
        file is written exactly once, including when the directory is empty.

        Returns:
            True when validation passes, False otherwise (never None).

        Raises:
            OSError: If ``DATA_DIR`` cannot be listed or the status file
                cannot be written.
        """
        required = set(self.config.ALL_REQUIRED_FILES)
        present = set(os.listdir(self.DATA_DIR))

        # os.listdir returns entries in arbitrary order; sort so that the log
        # output and the status file are deterministic.
        unexpected = sorted(present - required)
        missing = sorted(required - present)
        validation_status = not (unexpected or missing)

        report = [f"Validation status: {validation_status}"]
        if unexpected:
            for name in unexpected:
                logger.warning(f"Validation failed! Unexpected file found: {name}")
            report.append(f"Failing file: {', '.join(unexpected)}")
        if missing:
            logger.warning(f"Validation failed! Missing required files: {', '.join(missing)}")
            report.append(f"Missing files: {', '.join(missing)}")

        # Single write after all checks instead of one rewrite per entry.
        with open(self.config.STATUS_FILE, "w", encoding="utf-8") as f:
            f.write("\n".join(report))

        return validation_status

## Pipeline Creation


In [22]:
# Run the data validation stage end to end. Exceptions are left to propagate
# unchanged; the former `except Exception as e: raise e` wrapper added nothing.

# 1. Load the YAML configuration and create the artifacts root directory.
config = ConfigurationManager()

# 2. Build the settings for the data validation stage.
data_validation_config = config.get_data_validation_config()

# 3. Create the validation component (the class name is spelled
#    "DataValiadition" in this notebook).
data_validation = DataValiadition(config=data_validation_config)

# 4. Run the check and keep the outcome instead of discarding it; the bare
#    expression on the last line displays it as the cell output.
validation_status = data_validation.validate_all_files_exist()
validation_status

[2025-10-28 14:25:36,995: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-28 14:25:36,997: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-28 14:25:36,999: INFO: common: created directory at: artifacts]
[2025-10-28 14:25:37,000: INFO: common: created directory at: artifacts/data_validation]
