In [1]:
import os
import yaml
from ensure import ensure_annotations
import pandas as pd

In [2]:
# Step one directory up so the relative paths used in later cells
# ("artifacts/...", "schema.yaml") resolve from the parent directory.
# NOTE: this is not idempotent - re-running the cell moves up one more level.
os.chdir(os.pardir)

In [3]:
# Build the path with os.path.join instead of a backslash-separated literal:
# the original string contained the invalid escape sequence "\d" and only
# resolved on Windows. The joined path works on every platform.
data = pd.read_csv(os.path.join("artifacts", "data_ingestion", "train.csv"))
data.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [4]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
# In the recorded output Geography, Age, HasCrCard and IsActiveMember each
# have one missing value (175035 of 175036 non-null).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175036 entries, 0 to 175035
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   CustomerId       175036 non-null  int64  
 1   Surname          175036 non-null  object 
 2   CreditScore      175036 non-null  int64  
 3   Geography        175035 non-null  object 
 4   Gender           175036 non-null  object 
 5   Age              175035 non-null  float64
 6   Tenure           175036 non-null  int64  
 7   Balance          175036 non-null  float64
 8   NumOfProducts    175036 non-null  int64  
 9   HasCrCard        175035 non-null  float64
 10  IsActiveMember   175035 non-null  float64
 11  EstimatedSalary  175036 non-null  float64
 12  Exited           175036 non-null  int64  
dtypes: float64(5), int64(5), object(3)
memory usage: 17.4+ MB


In [5]:
class DataValidation:
    """Validate a DataFrame against a column -> dtype schema stored in YAML.

    The schema file is read as a flat mapping whose keys are column names and
    whose values are compared against ``Series.dtype.name`` (for example
    ``CreditScore: int64``).

    Example usage:
        Assuming you have a DataFrame named 'data' and the path to the schema
        file is 'schema.yaml':

            data_validation = DataValidation(data, 'schema.yaml')
            data_validation.run_validation()

    The reasons for using ensure_annotations are given in reason.ipynb.
    """

    # Where run_validation() records its outcome. Kept as a class attribute so
    # it can be overridden per subclass or per instance without changing the
    # constructor signature.
    STATUS_PATH = "artifacts/data_validation/status.txt"

    @ensure_annotations
    def __init__(self, dataframe: pd.DataFrame, schema_path: str):
        """Store the frame to validate and the location of the schema file.

        Args:
            dataframe: the data to validate.
            schema_path: path to the YAML schema file.
        """
        self.dataframe = dataframe
        self.schema_path = schema_path
        # Stays False until run_validation() has completed successfully.
        self.validation_status = False

    @ensure_annotations
    def read_schema(self) -> dict:
        """Load the YAML schema file and return its content.

        Raises:
            ValueError: if the file cannot be opened or parsed, if it is
                empty, or if its top level is not a mapping.

        Returns:
            dict: mapping of column name to expected dtype name.
        """
        try:
            with open(self.schema_path) as schema_file:
                schema = yaml.safe_load(schema_file)
        except (OSError, yaml.YAMLError) as exc:
            # Keep ValueError as the raised type, but report the real cause
            # instead of claiming that the file is empty.
            raise ValueError(
                f"Could not load schema file '{self.schema_path}': {exc}"
            ) from exc

        # safe_load returns None for an empty document; an empty mapping
        # would make validation succeed vacuously, so reject both.
        if not schema:
            raise ValueError("Schema file is empty")
        if not isinstance(schema, dict):
            raise ValueError(
                f"Schema file '{self.schema_path}' must contain a mapping of "
                f"column name to dtype, got {type(schema).__name__}"
            )

        return schema

    @ensure_annotations
    def validate_data(self) -> bool:
        """Check that every schema column exists with the expected dtype.

        Columns present in the DataFrame but absent from the schema are
        ignored. The outcome is also reported on stdout.

        Returns:
            bool: True when all schema columns are present and their dtype
            names match, False otherwise.
        """
        schema = self.read_schema()

        # Check if all columns in schema are present in dataframe
        missing = [col for col in schema if col not in self.dataframe.columns]
        if missing:
            print("Validation Failed for columns, columns are not matching in dataframe")
            print(f"Missing columns: {missing}")
            return False

        # Check data types
        mismatched = {
            column: (self.dataframe[column].dtype.name, dtype)
            for column, dtype in schema.items()
            if self.dataframe[column].dtype.name != dtype
        }
        if mismatched:
            print("Validation Failed for columns, datatypes do not match")
            for column, (actual, expected) in mismatched.items():
                print(f"  {column}: found {actual}, expected {expected}")
            return False

        print("Validation Succedded for columns")
        return True

    def save_validation_status(self):
        """Write the current validation status to ``STATUS_PATH``."""
        status_path = self.STATUS_PATH
        # Create the target directory first so that a fresh checkout (without
        # the artifacts folder) does not fail with FileNotFoundError.
        status_dir = os.path.dirname(status_path)
        if status_dir:
            os.makedirs(status_dir, exist_ok=True)
        with open(status_path, "w") as status_file:
            status_file.write("Validation Status: " + str(self.validation_status))

    def run_validation(self):
        """Validate the DataFrame and persist the resulting status."""
        self.validation_status = self.validate_data()
        self.save_validation_status()
        



In [6]:
# Run the validation end to end. Exceptions propagate unchanged; the former
# `except Exception as e: raise e` wrapper did nothing except add a frame to
# the traceback, so it was removed.
data_validation = DataValidation(data, 'schema.yaml')
data_validation.run_validation()

<class 'dict'>
Validation Succedded for columns
