In [1]:
import os 

In [2]:
%pwd

'e:\\ml projects\\ai_or_human_text_detection\\research'

In [3]:
# Move from the notebook folder ("research", see the %pwd output) up to its
# parent so that the relative config paths used below resolve.
# The guard keeps this cell idempotent: re-running it after the move has
# already happened no longer climbs one more directory up.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
from dataclasses import dataclass
from pathlib import Path 

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings used by the data-validation stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Working directory of the validation stage.
    root_dir: Path
    # Location of the CSV file that the validation checks read.
    unzip_data_path: Path
    # Text file that receives the outcome of each validation check.
    STATUS_FILE_PATH: str
    # Mapping whose keys are compared with the CSV column names and whose
    # values are compared with the column dtypes.
    all_schema: dict

In [5]:
from src.ai_or_human_text.constants import * 
from src.ai_or_human_text.utils.common import read_yaml,create_dir

In [6]:
class ConfigManager:
    """Reads the project's YAML files and builds per-stage config objects.

    The parsed files are kept on ``self.config``, ``self.schema`` and
    ``self.params``.
    """

    def __init__(
        self,
        config_file=CONFIG_FILE_PATH,
        schema_file=SCHEMA_FILE_PATH,
        params_file=PARAMS_FILE_PATH,
    ):
        """Load the three YAML files and prepare the artifacts root.

        Args:
            config_file: Path of the main configuration YAML.
            schema_file: Path of the schema YAML.
            params_file: Path of the parameters YAML.
        """
        # Files are read one after another in this fixed order:
        # config, schema, params.
        yaml_sources = (
            ("config", config_file),
            ("schema", schema_file),
            ("params", params_file),
        )
        for attribute, yaml_path in yaml_sources:
            setattr(self, attribute, read_yaml(yaml_path))

        # create_dir is a project helper; judging by its log output it creates
        # each directory in the list it is given.
        artifacts_root = self.config.artifacts_root
        create_dir([artifacts_root])

    def get_data_validation_config(self)-> DataValidationConfig:
        """Build the settings object for the data-validation stage.

        Reads the ``data_validation`` section of the main config and the
        ``COLUMNS`` section of the schema, hands the stage's ``root_dir`` to
        ``create_dir``, and returns the populated ``DataValidationConfig``.
        """
        stage_settings = self.config.data_validation
        column_schema = self.schema.COLUMNS

        create_dir([stage_settings.root_dir])

        return DataValidationConfig(
            root_dir=stage_settings.root_dir,
            unzip_data_path=stage_settings.unzip_data_path,
            STATUS_FILE_PATH=stage_settings.STATUS_FILE_PATH,
            all_schema=column_schema,
        )

In [7]:
import pandas as pd
from src.ai_or_human_text.logging import logger

[2025-08-14 12:20:42,407 : INFO : utils : NumExpr defaulting to 12 threads.]


In [8]:
class DataValidation:
    """Validates the dataset CSV against the declared column schema.

    Each check reads the CSV at ``config.unzip_data_path``, records its result
    in the text file at ``config.STATUS_FILE_PATH`` and returns that result.
    """

    def __init__(self, config: "DataValidationConfig"):
        """Store the stage configuration (paths and the column schema)."""
        self.config = config

    def schema_validation(self):
        """Check that every CSV column is declared in the schema.

        All columns are inspected before a verdict is reached, and the status
        file is (re)created once with the final result.

        Returns:
            bool: ``True`` when no CSV column is missing from the schema keys,
            ``False`` otherwise.

        Raises:
            Any exception raised while reading the CSV or writing the status
            file propagates unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_path)
        declared_columns = set(self.config.all_schema.keys())

        # Collect every offending column instead of stopping at the first one.
        unexpected_columns = [
            col for col in data.columns if col not in declared_columns
        ]
        schema_status = not unexpected_columns

        if unexpected_columns:
            logger.info(f"schema does not match with this columns:{unexpected_columns}")

        # "w" starts a fresh status file; data_type_validation appends to it.
        with open(self.config.STATUS_FILE_PATH, "w") as file:
            file.write(f"schema status: {schema_status}")

        return schema_status

    def data_type_validation(self):
        """Check that each declared column has the dtype the schema expects.

        The dtype of every CSV column that appears in the schema is compared,
        as a string, with the value declared for that same column. Columns
        that are absent from the schema are not judged here; reporting them
        is the job of ``schema_validation``.

        Returns:
            bool: ``True`` when every declared column has its declared dtype,
            ``False`` otherwise.

        Raises:
            Any exception raised while reading the CSV or writing the status
            file propagates unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_path)
        schema = self.config.all_schema

        # Map of column -> actual dtype for every column whose dtype differs
        # from the one declared for that column.
        mismatched_columns = {
            col: str(dtype)
            for col, dtype in data.dtypes.items()
            if col in schema and str(dtype) != str(schema[col])
        }
        data_type_status = not mismatched_columns

        if mismatched_columns:
            logger.info(f"data type does not match the schema for these columns:{mismatched_columns}")

        # Append so the line written by schema_validation is preserved.
        with open(self.config.STATUS_FILE_PATH, "a") as file:
            file.write(f"\ndata type status: {data_type_status}")

        return data_type_status

            


In [9]:
# Run the data-validation stage end to end: load the configuration, build the
# stage settings, then perform both checks. Exceptions are left to propagate
# on their own (catching one only to re-raise it adds nothing), and the two
# results are kept so the cell shows them instead of discarding them.
config = ConfigManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
schema_status = data_validation.schema_validation()
data_type_status = data_validation.data_type_validation()
schema_status, data_type_status

[2025-08-14 12:20:42,700 : INFO : common : yaml file loaded sucessfully from config\config.yaml]
[2025-08-14 12:20:42,701 : INFO : common : yaml file loaded sucessfully from schema.yaml]
[2025-08-14 12:20:42,702 : INFO : common : yaml file loaded sucessfully from params.yaml]
[2025-08-14 12:20:42,702 : INFO : common : artifacts created sucessfully]
[2025-08-14 12:20:42,704 : INFO : common : artifacts/data_validation created sucessfully]
