In [42]:
import ipykernel
import os


In [43]:
%pwd

'e:\\ProjectPractice\\E2E-ML-Pipeline-MLOps-MLFlow\\research'

In [44]:
# Step up to the project root only while the kernel is still inside the
# research/ folder. An unconditional chdir climbs one more level every time
# the cell is re-run, which breaks the relative paths used further down.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [45]:
import pandas as pd

In [54]:
# Load the red-wine quality CSV from the data-ingestion artifacts folder.
# The path is relative, so it resolves against the current working directory.
data = pd.read_csv("./artifacts/data_ingestion/winequality-red.csv")

In [55]:
# Show the dtype pandas inferred for each column.
data.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [56]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [57]:
# Count the missing values in each column.
data.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [77]:
from dataclasses import dataclass
from pathlib import Path

In [78]:
@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings used by the data-validation step."""

    # Directory that is created for this step before the config is built.
    root_dir: Path
    # Location of the CSV file the validator loads with pandas.
    unzip_data_dir: Path
    # File that the validation results are written to.
    STATUS_FILE: str
    # Mapping of column name -> expected type name, taken from the schema's COLUMNS.
    all_schema: dict

In [79]:
from src.ML_MLOps_MLFlow_Pipeline import *
from src.ML_MLOps_MLFlow_Pipeline.constants import *
from src.ML_MLOps_MLFlow_Pipeline.utils.common import read_yaml, create_directories

In [88]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-step configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH,
    ):
        """Load the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Path handed to ``read_yaml`` for the main config.
            params_filepath: Path handed to ``read_yaml`` for the parameters.
            schema_filepath: Path handed to ``read_yaml`` for the data schema.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation step.

        Creates the step's ``root_dir`` as a side effect.

        Returns:
            A ``DataValidationConfig`` filled from the ``data_validation``
            section of the config and the ``COLUMNS`` entry of the schema.
        """
        validation_section = self.config.data_validation
        column_schema = self.schema.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            all_schema=column_schema,
            unzip_data_dir=validation_section.unzip_data_dir,
        )

In [89]:
import os
from src.ML_MLOps_MLFlow_Pipeline import logger

In [98]:
class DataValiadtion:
    """Validates the ingested CSV against the configured column schema.

    Each check returns a boolean and records its outcome in
    ``config.STATUS_FILE``. The class name keeps its existing spelling
    because the notebook instantiates it under this name.
    """

    # Schema type name -> pandas dtype strings accepted for that name.
    # NOTE(review): "str" is what the default string dtype is expected to
    # report under pandas >= 3.0 - confirm against the installed version.
    _DTYPE_ALIASES = {
        "int": ["int64", "Int64"],
        "float": ["float64", "Float64"],
        "object": ["object", "string", "str"],
    }

    def __init__(self, config: "DataValidationConfig"):
        """Store the configuration that supplies the data path, status file and schema."""
        self.config = config

    def validate_all_columns(self) -> bool:
        """Check that the data and the schema contain exactly the same columns.

        The status is computed over *all* columns before it is written, so a
        single unknown column fails the check regardless of its position.
        Columns listed in the schema but absent from the data also fail it.

        Returns:
            True when every data column is in the schema and every schema
            column is in the data, otherwise False.

        Side effects:
            Overwrites ``config.STATUS_FILE`` with one ``Validation status`` line.
        """
        data = pd.read_csv(self.config.unzip_data_dir)
        actual_cols = list(data.columns)
        expected_cols = list(self.config.all_schema.keys())

        unexpected = [col for col in actual_cols if col not in expected_cols]
        missing = [col for col in expected_cols if col not in actual_cols]
        validation_status = not unexpected and not missing

        # Written once, after the verdict is final; the line format is unchanged.
        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}\n")

        return validation_status

    def validate_all_columns_datatype(self) -> bool:
        """Check that every schema column has the expected pandas dtype.

        A schema column that is absent from the data is reported as a
        mismatch with the actual type ``"missing"`` instead of raising
        ``KeyError``.

        Returns:
            True when no mismatches were found, otherwise False.

        Side effects:
            Appends the result (and any mismatches) to ``config.STATUS_FILE``.
        """
        data = pd.read_csv(self.config.unzip_data_dir)

        mismatches = []
        for col, expected_type in self.config.all_schema.items():
            if col not in data.columns:
                mismatches.append((col, expected_type, "missing"))
                continue

            actual_type = str(data[col].dtype)
            # Unknown schema type names must match the dtype string exactly.
            accepted = self._DTYPE_ALIASES.get(expected_type, [expected_type])
            if actual_type not in accepted:
                mismatches.append((col, expected_type, actual_type))

        validation_status = len(mismatches) == 0

        with open(self.config.STATUS_FILE, 'a') as f:
            f.write(f"Data Type validation status: {validation_status}\n")
            if mismatches:
                f.write(f"Type mismatches: {mismatches}\n")

        return validation_status

In [99]:
# Run both validation checks end to end. Exceptions propagate unchanged, so
# no catch-and-reraise wrapper is needed around these calls.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValiadtion(config=data_validation_config)

# Keep the boolean outcomes instead of discarding them.
columns_valid = data_validation.validate_all_columns()
dtypes_valid = data_validation.validate_all_columns_datatype()

# Last expression of the cell, so the notebook displays the results.
{"columns_valid": columns_valid, "dtypes_valid": dtypes_valid}

[2025-08-12 15:05:04,881: INFO: common: yaml file : <_io.TextIOWrapper name='config\\config.yaml' mode='r' encoding='UTF-8'> loaded successfully]
[2025-08-12 15:05:04,884: INFO: common: yaml file : <_io.TextIOWrapper name='params.yaml' mode='r' encoding='UTF-8'> loaded successfully]
[2025-08-12 15:05:04,888: INFO: common: yaml file : <_io.TextIOWrapper name='schema.yaml' mode='r' encoding='UTF-8'> loaded successfully]
[2025-08-12 15:05:04,890: INFO: common: Created directory at path : artifacts]
[2025-08-12 15:05:04,891: INFO: common: Created directory at path : artifacts/data_validation]
