In [2]:
import os

In [46]:
%pwd

'/home/omar/Desktop/End-to-end-Machine-Learning-Project-with-MLflow'

In [4]:
# Step one directory up from the current working directory; the relative
# "artifacts/..." path read later in this notebook is resolved against it.
# NOTE(review): not idempotent - every re-run of this cell moves up one more level.
os.chdir("../")

In [5]:
%pwd

'/home/omar/Desktop/End-to-end-Machine-Learning-Project-with-MLflow'

In [47]:
import pandas as pd

In [48]:
# Load the ingested CSV (path is relative to the current working directory)
# and preview its first rows.
data = pd.read_csv("artifacts/data_ingestion/predictive_maintenance.csv")
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


In [49]:
# Print the frame summary: index range, per-column non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Target                   10000 non-null  int64  
 9   Failure Type             10000 non-null  object 
dtypes: float64(3), int64(4), object(3)
memory usage: 781.4+ KB


In [50]:
# Display the column labels of the loaded frame.
data.columns

Index(['UDI', 'Product ID', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Target', 'Failure Type'],
      dtype='object')

In [51]:
# Display the dtype pandas inferred for each column.
data.dtypes

UDI                          int64
Product ID                  object
Type                        object
Air temperature [K]        float64
Process temperature [K]    float64
Rotational speed [rpm]       int64
Torque [Nm]                float64
Tool wear [min]              int64
Target                       int64
Failure Type                object
dtype: object

In [52]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation step.

    Attributes:
        root_dir: Directory created for this step's artifacts.
        STATUS_FILE: Path of the text file the validation status is written to.
        unzip_dir: Path of the CSV file that is read and validated.
        all_schema: Mapping keyed by expected column name; only its keys are
            consulted during validation (values are presumably the expected
            dtypes - confirm against the schema file).
    """
    root_dir: Path
    STATUS_FILE: str
    unzip_dir: Path
    # Annotated as a mapping (not Path): the validator calls ``.keys()`` on it.
    all_schema: dict

In [53]:
from src.ML.constants import *
from src.ML.utils.common import read_yaml, create_directories

In [54]:
class ConfigurationManager:
    """Loads the project's YAML files and hands out per-step config objects."""

    def __init__(self,
                config_filepath=CONFIG_FILE_PATH,
                params_filepath=PARAMS_FILE_PATH,
                schema_filepath=SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root must exist before any step writes beneath it.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation step.

        Creates the step's ``root_dir`` as a side effect.

        Returns:
            A ``DataValidationConfig`` filled from the ``data_validation``
            section of the config and the ``COLUMNS`` section of the schema.
        """
        validation_section = self.config.data_validation
        expected_columns = self.schema.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            unzip_dir=validation_section.unzip_dir,
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            all_schema=expected_columns,
        )

In [55]:
import os
from src.ML import logger

In [56]:
class DataValidation:
    """Checks that a CSV file contains only columns named in the schema."""

    # The annotation is a quoted forward reference so defining this class does
    # not require ``DataValidationConfig`` to be bound at definition time.
    def __init__(self, config: "DataValidationConfig"):
        """Store the validation settings.

        Args:
            config: Object exposing ``unzip_dir`` (CSV to read), ``all_schema``
                (mapping keyed by expected column name) and ``STATUS_FILE``
                (file the result is written to).
        """
        self.config = config

    def validate_columns(self) -> bool:
        """Validate the CSV's column names against the schema keys.

        The status is ``True`` only when *every* column of the CSV appears in
        the schema; a single unexpected column makes it ``False`` regardless
        of its position. The result is written once to ``STATUS_FILE`` as
        ``Validation status:<bool>``.

        Returns:
            ``True`` if all CSV columns are present in the schema, else ``False``.

        Raises:
            Any exception raised by ``pd.read_csv`` or by opening/writing the
            status file propagates unchanged.
        """
        data = pd.read_csv(self.config.unzip_dir)
        expected_columns = set(self.config.all_schema.keys())

        # Aggregate over all columns instead of letting the last column
        # overwrite the verdict of the earlier ones.
        unexpected_columns = [
            column for column in data.columns if column not in expected_columns
        ]
        validation_status = not unexpected_columns

        # Write the final verdict exactly once.
        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status:{validation_status}")

        return validation_status


In [58]:
# Run the data-validation step end to end: load the configuration, build the
# step's settings, then check the CSV columns (the returned status is not used
# here; it is persisted to the status file by validate_columns).
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validate_columns()
except Exception:
    # Nothing is handled here; the error is re-raised to the notebook as-is.
    raise

[2024-10-23 17:41:49,604: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-10-23 17:41:49,609: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-23 17:41:49,616: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-10-23 17:41:49,619: INFO: common: created directory at: artifacts]
[2024-10-23 17:41:49,621: INFO: common: created directory at: artifacts/data_validation]
