In [1]:
import os

In [2]:
# os.chdir("..")

In [3]:
%reload_ext autoreload
%autoreload 2
# Environment bootstrap: read variables from a .env file, then make the project
# package importable before pulling in the project-local modules.
import os
import sys
from dotenv import load_dotenv, find_dotenv
from pathlib import Path
# Load the .env file located by find_dotenv() into the process environment.
load_dotenv(find_dotenv())
# PROJECT_FOLDER is presumably the repository root that contains `src` (confirm in .env).
# os.getenv returns None when the variable is unset, in which case None is appended
# and the `src` imports below only work if the project is already importable.
sys.path.append(os.getenv("PROJECT_FOLDER"))
from src.utils.common import logger
from src.constants import CONFIG_FILE_PATH, SCHEMA_FILE_PATH, PARAMS_FILE_PATH

In [5]:
%reload_ext autoreload
%autoreload 2
# NOTE(review): this repeats the environment bootstrap of the In [3] cell; the only
# additions are `dataclass`, `read_yaml` and `create_directories`. Consider merging
# both cells into a single setup cell at the top of the notebook.
import os
import sys
from dotenv import load_dotenv, find_dotenv
from dataclasses import dataclass
from pathlib import Path
# Load the .env file located by find_dotenv() into the process environment.
load_dotenv(find_dotenv())
# Appends the PROJECT_FOLDER value (None when the variable is unset) to the import path
# so that the `src` package can be imported below.
sys.path.append(os.getenv("PROJECT_FOLDER"))
from src.utils.common import logger, read_yaml, create_directories

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation step.

    Instances are built by ``ConfigurationManager.get_data_validation_config``.
    """

    # Working directory of the validation step; the configuration manager creates
    # it before building this object.
    root_dir: Path
    # Location of the CSV file that gets validated.
    source_path: Path
    # File that receives the "validation status: ..." line.
    STATUS_FILE: Path
    # Mapping of column name -> expected dtype. It is queried with ``.keys()`` and
    # ``schema[column]``, so it has to be a mapping rather than a list.
    schema: dict

In [None]:
# os.chdir("..")

In [6]:
from src.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML files and build the per-stage configuration objects."""

    def __init__(
        self,
        config_filepath: str = os.getenv("CONFIG_FILE_PATH"),
        params_filepath: str = os.getenv("PARAMS_FILE_PATH"),
        schema_filepath: str = os.getenv("SCHEMA_FILE_PATH"),
    ):
        """
        Read the config, params and schema YAML files and create the artifacts root.

        The defaults in the signature are evaluated once, when the class is defined.
        If an environment variable was not set at that moment the default is ``None``,
        so each missing path is looked up in the environment again at call time.

        Args:
            config_filepath (str): path to the main configuration YAML
                (falls back to the ``CONFIG_FILE_PATH`` environment variable)
            params_filepath (str): path to the parameters YAML
                (falls back to the ``PARAMS_FILE_PATH`` environment variable)
            schema_filepath (str): path to the schema YAML
                (falls back to the ``SCHEMA_FILE_PATH`` environment variable)

        Raises:
            ValueError: if a path is neither passed in nor available from its
                environment variable
        """
        self.config = read_yaml(self._resolve_path(config_filepath, "CONFIG_FILE_PATH"))
        self.params = read_yaml(self._resolve_path(params_filepath, "PARAMS_FILE_PATH"))
        self.schema = read_yaml(self._resolve_path(schema_filepath, "SCHEMA_FILE_PATH"))
        create_directories([self.config.artifacts_root])

    @staticmethod
    def _resolve_path(filepath, env_var: str) -> Path:
        """Return `filepath` as a `Path`, falling back to the `env_var` environment variable."""
        if not filepath:
            # The signature default may have been captured before the .env file was loaded.
            filepath = os.getenv(env_var)
        if not filepath:
            raise ValueError(
                f"No file path given and environment variable {env_var} is not set"
            )
        return Path(filepath)

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Get configuration for data validation

        Creates the validation root directory as a side effect.

        Returns:
            DataValidationConfig: Configuration for data validation
        """
        config = self.config.data_validation
        schema = self.schema.COLUMNS

        create_directories([config.root_dir])

        # Values are passed through exactly as loaded from the YAML files.
        data_validation_config = DataValidationConfig(
            root_dir=config.root_dir,
            source_path=config.source_path,
            STATUS_FILE=config.STATUS_FILE,
            schema=schema,
        )
        return data_validation_config

In [33]:
import pandas as pd

class DataValidation:
    """Check the columns and dtypes of a CSV file against the expected schema."""

    def __init__(self, config: DataValidationConfig):
        """
        Instantiate `DataValidation` class

        Args:
            config (DataValidationConfig): configuration for data validation
        """
        self.config = config

    def validate_data(self) -> bool:
        """
        Validate the source CSV against the schema and record the outcome.

        Every column found in the CSV must be listed in the schema and must have
        the dtype the schema declares for it. Columns that appear only in the
        schema are not checked. The outcome is written to ``STATUS_FILE`` as
        ``validation status: <True|False>``.

        Returns:
            bool: True when every column passed; False when at least one column
                failed, or when an error prevented validation (the error is
                logged, not raised, and the status file may not be written).
        """
        try:
            logger.info("Validate data")

            df = pd.read_csv(self.config.source_path)
            all_schema = self.config.schema

            # Start from True and let any single failure flip the result. Assigning
            # the status per column would let the last column overwrite earlier failures.
            validation_status = True
            for col in df.columns:
                if col not in all_schema.keys():
                    logger.error(f"Column not in schema: {col}")
                    validation_status = False
                elif df[col].dtype == all_schema[col]:
                    continue
                else:
                    logger.error(
                        f"Column {col} has dtype {df[col].dtype}, expected {all_schema[col]}"
                    )
                    validation_status = False

            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(f"validation status: {validation_status}")
            return validation_status
        except Exception as e:
            logger.error(e)
            return False
            

In [34]:
# Run the data-validation stage end to end: load the configuration, build the
# validator and write the status file.
try:
    configuration_manager = ConfigurationManager()
    data_validation = DataValidation(config=configuration_manager.get_data_validation_config())
    data_validation.validate_data()
except Exception as e:
    # Errors are logged rather than raised.
    logger.error(e)


2024-03-06 22:10:13,129 - sentiment-classifier-logger - INFO - yaml file: C:\Users\USER\Documents\GitHub\customer-product-reviews-sentiment-classifier\config\config.yaml loaded successfully
2024-03-06 22:10:13,131 - sentiment-classifier-logger - INFO - yaml file: C:\Users\USER\Documents\GitHub\customer-product-reviews-sentiment-classifier\params.yaml loaded successfully
2024-03-06 22:10:13,134 - sentiment-classifier-logger - INFO - yaml file: C:\Users\USER\Documents\GitHub\customer-product-reviews-sentiment-classifier\schema.yaml loaded successfully
2024-03-06 22:10:13,136 - sentiment-classifier-logger - INFO - Created directory at: artifacts
2024-03-06 22:10:13,137 - sentiment-classifier-logger - INFO - Created directory at: artifacts/data_validation
2024-03-06 22:10:13,138 - sentiment-classifier-logger - INFO - Validate data


In [18]:
# Spot-check one column: does the dtype pandas inferred for `overall` equal the
# dtype declared for it in the schema?
val_config = configuration_manager.get_data_validation_config()
df = pd.read_csv(val_config.source_path)

overall_dtype = df['overall'].dtype
val_config.schema['overall'] == overall_dtype

2024-03-06 22:05:14,154 - sentiment-classifier-logger - INFO - Created directory at: artifacts/data_validation


True

In [22]:
# Show the column names declared in the schema.
val_config.schema.keys()

dict_keys(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote', 'image'])