In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
import sklearn

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataValidationArtifact:
    valid_train_path: Path
    valid_test_path: Path
    invalid_train_path: Path
    invalid_test_path: Path
    validation_status: Path

In [4]:
import os
os.getcwd()

'd:\\Data Science\\github\\Projects\\ML\\Credit-Card-Transaction-Fraud-Detection\\Notebooks'

In [5]:
os.chdir("d:\\Data Science\\github\\Projects\\ML\\Credit-Card-Transaction-Fraud-Detection")

In [6]:
import os
class TrainingPipeline:
    def __init__(self):
        
        self.artifact = os.path.join("artifacts")


class DataValidationConfig:
    def __init__(self, training_pipeline: TrainingPipeline):
        
        data_validation_dir = os.path.join(training_pipeline.artifact, "data validation")
        self.validation_train_path = os.path.join(data_validation_dir, "valid", "train.csv")
        self.valid_test_path = os.path.join(data_validation_dir, "valid", "test.csv")
        self.invalid_train_path = os.path.join(data_validation_dir, "invalid", "train.csv")
        self.invalid_test_path = os.path.join(data_validation_dir, "valid", "test.csv")

In [41]:
from scipy.stats import ks_2samp
from sklearn.model_selection import train_test_split
class DataValidation:

    def __init__(self, data_validation_config: DataValidationConfig):
        self.data_validation_config = data_validation_config
        self.numeric_columns = ['cc_num', 'amt', 'zip', 'lat', 'long', 'city_pop', 
                                'unix_time',
                                'merch_lat', 'merch_long', 'is_fraud']
        
        self.cat_columns = ['trans_date_trans_time','merchant', 'category', 
                            'first', 'last', 'gender', 
                            'street', 'city',
                            'state', 'job', 'dob', 'trans_num']


    def numerical_exists(self, df:pd.DataFrame) -> bool:

        expected_columns = df.select_dtypes(exclude='object')

        if len(self.numeric_columns) == expected_columns.shape[1]:
            for i in range(len(self.numeric_columns)):
                if self.numeric_columns[i] in expected_columns:
                    return True
            else:
                raise Exception(f"the column {self.numeric_columns[i]} is not found in the dataset")
        else:              
            raise Exception(f"the number of expected columns is not equal to no. of required columns")

    
    def categorical_exits(self, df:pd.DataFrame):

        expected_columns = df.select_dtypes(include='object')

        if len(self.cat_columns) == expected_columns.shape[1]:
            for i in range(len(self.cat_columns)):
                print(self.cat_columns[i])
                if self.cat_columns[i] in expected_columns:
                    return True
            else:
                raise Exception(f"the column {self.cat_columns[i]} is not found in the dataset")
        else:              
            raise Exception(f"the number of expected columns is not equal to no. of required columns")

    def drift(self, base_df:pd.DataFrame, current_df:pd.DataFrame, threshold=0.05):
        report = {}
        status = True

        for column in base_df.columns:
            d1 = base_df[column]
            d2 = current_df[column]

            is_same_dist = ks_2samp(d1, d2)

            if threshold<=is_same_dist.pvalue:
                is_found=False
            else:
                is_found=True
                status=False
            report.update({column:{
                "p_value":float(is_same_dist.pvalue),
                "drift_status":is_found
                
                }})

        return report
        
    def initiate_data_validation(self):

        df = pd.read_csv("./dataset/CreditCardData.csv")

        train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

        train_data.drop(columns=['Unnamed: 0'], axis=1, inplace=True)
        test_data.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

        status = self.numerical_exists(train_data)
        if status:
            print("the numerical columns exists for train data")

        status = self.numerical_exists(test_data)
        if status:
            print("the numerical columns exists for test data")

        status = self.categorical_exits(train_data)
        if status:
            print("the categorical columns exists for train data")

        status = self.categorical_exits(test_data)
        if status:
            print("the categorical columns exists for test data")

        report = self.drift(train_data, test_data)
        return report    

In [42]:
p = TrainingPipeline()
config = DataValidationConfig(p)
data_va = DataValidation(config)
data_va.initiate_data_validation()

Exception: the number of expected columns is not equal to no. of required columns