In [1]:
import os

# Work from the project root: ROOT_DIR below is taken from os.getcwd() and the
# YAML files are expected under <root>/configs.
# Guarded so that re-running this cell does not climb one directory further
# on every execution (a bare os.chdir("..") is not idempotent).
if not os.path.isdir("configs"):
    os.chdir("..")
os.getcwd()

'c:\\Users\\princ\\OneDrive\\Desktop\\project-python\\creditcard'

In [2]:
import os
from datetime import datetime
from pathlib import Path


def get_current_time_stamp():
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string."""
    now = datetime.now()
    return now.strftime("%Y%m%d%H%M%S")


# Project root = the working directory at the moment this cell runs.
ROOT_DIR = os.getcwd()
# Captured once, when this cell runs.
CURRENT_TIME_STAMP = get_current_time_stamp()

# --- configuration files, all located under <root>/configs ---
CONFIG_FILE_NAME = "config.yaml"
CONFIG_DIR = os.path.join(ROOT_DIR, "configs")
CONFIG_FILE_PATH = os.path.join(CONFIG_DIR, CONFIG_FILE_NAME)  # kept as str

DATABASE_FILE_NAME = "database.yaml"
DATABASE_FILE = Path(CONFIG_DIR) / DATABASE_FILE_NAME

FEATURE_GENERATOR_FILE_NAME = "feature_generator.yaml"
FEATURE_GENERATOR_FILE_PATH = Path(CONFIG_DIR) / FEATURE_GENERATOR_FILE_NAME

In [3]:
from pathlib import Path

from pydantic import BaseModel, DirectoryPath, FilePath


class DataIngestionConfig(BaseModel):
    """Validated settings for the data-ingestion stage.

    The path fields are plain ``Path`` values, so pydantic does not require
    the files to exist when the model is built.
    """

    dataset_download_id: str
    raw_data_file_path: Path
    ingested_train_file_path: Path
    ingested_test_data_path: Path
    random_state: int


class TrainingPipelineConfig(BaseModel):
    """Pipeline-wide settings shared by every stage.

    ``artifact_dir`` is a ``DirectoryPath``: pydantic rejects the model unless
    the directory already exists.
    """

    artifact_dir: DirectoryPath
    pipeline_name: str
    experiment_code: str
    training_random_state: int


class DataValidationConfig(BaseModel):
    """Validated settings for the data-validation stage.

    ``data_validated_artifact_dir`` (DirectoryPath) and ``schema_file_path``
    (FilePath) must already exist when the model is built; ``report_file_dir``
    is a plain ``Path`` and is not checked for existence.
    """

    experiment_code: str
    data_validated_artifact_dir: DirectoryPath
    schema_file_path: FilePath
    report_file_dir: Path
    # Collection names for the validated test / train data.
    data_validated_test_collection: str
    data_validated_train_collection: str
    
class DataTransformationConfig(BaseModel):
    """Validated settings for the data-transformation stage.

    ``schema_file_path`` is a ``FilePath`` and must exist when the model is
    built; ``preprocessed_object_file_path`` is a plain ``Path``.

    NOTE(review): DataTransformation.initiate_data_transformation reads
    ``transformed_train_file_path`` and ``transformed_test_file_path`` from
    this config, but neither field is declared here.
    """

    data_validated_train_collection: str
    schema_file_path: FilePath
    random_state: int
    preprocessed_object_file_path: Path
    to_train_collection: str
    to_test_collection: str
    
    

In [4]:
from pathlib import Path

from pydantic import BaseModel, DirectoryPath, FilePath


class DataIngestionArtifact(BaseModel):
    """Output of the data-ingestion stage.

    Both fields are ``FilePath``: pydantic rejects the artifact unless the
    files already exist.
    """

    train_file_path: FilePath
    test_file_path: FilePath


class DataValidationArtifact(BaseModel):
    """Output of the data-validation stage.

    ``schema_file_path`` must be an existing file and ``report_file_dir`` an
    existing directory (pydantic ``FilePath`` / ``DirectoryPath``).
    """

    schema_file_path: FilePath
    report_file_dir: DirectoryPath
    is_validated: bool
    
class DataTransformationArtifact(BaseModel):
    """Output of the data-transformation stage.

    All three fields are ``FilePath``, so every referenced file must exist
    on disk before the artifact can be constructed.
    """

    transformed_train_file_path: FilePath
    transformed_test_file_path: FilePath
    preprocessed_object_file_path: FilePath

In [5]:
import sys
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from kneed import KneeLocator
from sklearn import set_config
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import pandas as pd
from CreditCard.entity import DataTransformationArtifact, DataTransformationConfig
from CreditCard.logging import logger
from CreditCard.exception import AppException
from CreditCard.utils import read_yaml , save_bin
from CreditCard.Database import MongoDB
from CreditCard.constants import FEATURE_GENERATOR_FILE_PATH

# Make scikit-learn transformers return DataFrames instead of arrays:
# FeatureGenerator indexes the imputer output by column name and assigns a
# "cluster" column onto the encoder output, both of which need a DataFrame.
set_config(transform_output="pandas")

class FeatureGenerator(BaseEstimator, TransformerMixin):
    """Custom transformer: bin the raw columns, one-hot encode them, add a cluster label.

    Steps applied to the input frame:
      1. ``SimpleImputer`` (default strategy) fills missing values.
      2. ``prepare_data`` caps the ``pay_x`` columns and bins the age, bill amount,
         pay amount and limit columns with ``pd.cut``.
      3. ``OneHotEncoder`` encodes the binned frame.
      4. ``KMeans`` (k-means++ init) assigns a ``cluster`` column; the number of
         clusters is picked by kneed's ``KneeLocator`` on the within-cluster sum
         of squares for k = 1..10.

    Column names and bin edges are read from the YAML file at
    ``feature_generator_file_path``.
    """

    def __init__(self, feature_generator_file_path: Path = FEATURE_GENERATOR_FILE_PATH):
        """Load the feature-generator YAML and create the (unfitted) sub-estimators.

        Raises:
            AppException: wrapping any error raised while reading the YAML.
        """
        try:
            # Store the constructor argument under its own name: BaseEstimator.get_params()
            # (used by repr() and sklearn.base.clone) reads it back with getattr().
            self.feature_generator_file_path = feature_generator_file_path
            self.feature_config_info = read_yaml(path_to_yaml=feature_generator_file_path)
            self.cluster = None  # fitted KMeans, set by fit()
            self.pay_x = self.feature_config_info.pay_x_columns
            self.age = self.feature_config_info.Age_column
            self.bill_amt = self.feature_config_info.bill_amt_columns
            self.pay_amt_columns = self.feature_config_info.pay_amt_columns
            self.limit_column = self.feature_config_info.limit_column
            # `sparse` was renamed to `sparse_output` in scikit-learn 1.2, the same
            # release that introduced set_config(transform_output=...) used by this file.
            self.encoder = OneHotEncoder(sparse_output=False)
            self.imputer = SimpleImputer()

        except Exception as e:
            raise AppException(e, sys) from e

    def fit(self, X, y=None):
        """Fit the imputer, the encoder and the KMeans model on ``X``.

        Returns:
            self

        Raises:
            ValueError: if KneeLocator cannot find an elbow in the WCSS curve.
        """
        data_imputed = self.imputer.fit_transform(X)
        data_generated = self.prepare_data(X=data_imputed)
        data_encoded = self.encoder.fit_transform(data_generated)

        # Within-cluster sum of squares for each candidate number of clusters.
        candidate_counts = range(1, 11)
        wcss = []
        for n_clusters in candidate_counts:
            kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
            kmeans.fit(data_encoded)
            wcss.append(kmeans.inertia_)

        kn = KneeLocator(candidate_counts, wcss, curve='convex', direction='decreasing')
        total_clusters = kn.knee
        logger.info(f"total cluster :{total_clusters}")
        if total_clusters is None:
            # KneeLocator reports None when it finds no knee; fail with a clear message
            # instead of an opaque KMeans parameter-validation error.
            raise ValueError(
                "KneeLocator found no elbow in the WCSS curve for k=1..10; "
                "cannot choose the number of clusters."
            )
        self.cluster = KMeans(n_clusters=int(total_clusters), init='k-means++', random_state=42)
        self.cluster.fit(data_encoded)
        return self

    def transform(self, X, y=None):
        """Return the one-hot encoded frame for ``X`` with an added ``cluster`` column.

        Raises:
            AppException: wrapping any error raised during transformation.
        """
        try:
            data_imputed = self.imputer.transform(X)
            data_generated = self.prepare_data(X=data_imputed)
            data_encoded = self.encoder.transform(data_generated)
            # Predict on the encoded features only, i.e. before the label column is added.
            data_encoded["cluster"] = self.cluster.predict(data_encoded)
            return data_encoded
        except Exception as e:
            raise AppException(e, sys) from e

    def prepare_data(self, X, y=None):
        """Build the categorical frame that is fed to the one-hot encoder.

        ``pay_x`` columns are capped at ``bins.pay_feature_ceil``; the age, bill
        amount, pay amount and limit columns are cut into the intervals given by
        the corresponding ``bins.*`` entries of the feature-generator YAML.

        Args:
            X: DataFrame containing every configured column.
            y: ignored.

        Returns:
            DataFrame with one categorical column per configured input column.

        Raises:
            AppException: wrapping any error (e.g. a missing column).
        """
        try:
            bin_config = self.feature_config_info.bins
            data_generated = pd.DataFrame()

            # Keep values below the ceiling, replace everything else by the ceiling.
            pay_feature_ceil = bin_config.pay_feature_ceil
            for col in self.pay_x:
                data_generated[col] = X[col].where(X[col] < pay_feature_ceil, pay_feature_ceil)

            age_column = self.age[0]
            data_generated[age_column] = pd.cut(X[age_column], bin_config.age_bins)

            for col in self.bill_amt:
                data_generated[col] = pd.cut(X[col], bin_config.bill_amount_bins)

            for col in self.pay_amt_columns:
                data_generated[col] = pd.cut(X[col], bin_config.pay_amt_bins)

            limit_column = self.limit_column[0]
            data_generated[limit_column] = pd.cut(X[limit_column], bin_config.limit_bins)

            # astype() returns a new frame; the result has to be kept for the cast to apply.
            return data_generated.astype("category")
        except Exception as e:
            raise AppException(e, sys) from e

class DataTransformation:
    """Split the validated training data, fit the preprocessing pipeline and store the results.

    The validated training collection is loaded through ``MongoDB`` at construction
    time. ``initiate_data_transformation`` splits it 80/20, fits the pipeline on the
    training part only, inserts both transformed parts into their target collections
    and saves the fitted pipeline with ``save_bin``.
    """

    def __init__(self, data_transformation_config: DataTransformationConfig):
        """Open the source/target connections and load the validated training frame.

        Raises:
            AppException: wrapping any error raised while connecting or loading.
        """
        try:
            self.data_transformation_config_info = data_transformation_config
            config = self.data_transformation_config_info

            self.train_connection = MongoDB(config.data_validated_train_collection, drop_collection=False)
            self.train_df = self.train_connection.find_many_as_df()

            self.to_train_connection = MongoDB(config.to_train_collection, drop_collection=False)
            self.to_test_connection = MongoDB(config.to_test_collection, drop_collection=False)
            logger.info(f"{'>>' * 30}Data Transformation log started.{'<<' * 30} ")
        except Exception as e:
            raise AppException(e, sys) from e

    def get_data_transformer_object(self) -> Pipeline:
        """Return an unfitted pipeline whose single step is a ``FeatureGenerator``.

        Raises:
            AppException: wrapping any error raised while building the pipeline.
        """
        try:
            preprocessing = Pipeline(steps=[('feature_generator', FeatureGenerator())])
            return preprocessing

        except Exception as e:
            raise AppException(e, sys) from e

    def initiate_data_transformation(self) -> DataTransformationArtifact:
        """Run the transformation stage and return its artifact.

        Raises:
            AppException: wrapping any error raised along the way.
        """
        try:
            logger.info("Obtaining preprocessing object.")
            preprocessing_obj = self.get_data_transformer_object()
            config = self.data_transformation_config_info

            # Read every config value up front so a missing setting fails before the
            # (comparatively expensive) fitting below.
            # NOTE(review): the DataTransformationConfig declared in this notebook has no
            # transformed_train_file_path / transformed_test_file_path fields, and nothing
            # in this method writes files at those locations (the frames go to MongoDB),
            # while DataTransformationArtifact requires existing files. Confirm the
            # intended contract and align config, artifact and this method.
            transformed_train_file_path = config.transformed_train_file_path
            transformed_test_file_path = config.transformed_test_file_path
            preprocessing_obj_file_path = config.preprocessed_object_file_path
            random_state = config.random_state

            dataset_schema = read_yaml(path_to_yaml=config.schema_file_path)
            target_column_name = dataset_schema.target_column
            logger.info(f"Target column name : {target_column_name}")

            logger.info("Splitting input and target feature from training and testing dataframe.")
            input_feature_train_df = self.train_df.drop(columns=[target_column_name])
            target_feature_train_df = self.train_df[target_column_name]
            X_train, X_test, y_train, y_test = train_test_split(
                input_feature_train_df,
                target_feature_train_df,
                test_size=0.20,
                random_state=random_state,
            )

            logger.info("Applying preprocessing object on training dataframe and testing dataframe")
            to_train_df = preprocessing_obj.fit_transform(X_train)
            # Only transform the hold-out part: refitting here would learn a different
            # imputer/encoder/clustering from the test data and would leave the object
            # saved below fitted on the test split instead of the training split.
            to_test_df = preprocessing_obj.transform(X_test)

            # Re-attach the target; assignment aligns on the row index kept from the split.
            to_test_df[target_column_name] = y_test
            to_train_df[target_column_name] = y_train

            logger.info("Saving transformed training and testing array.")
            self.to_test_connection.insert_many_(to_test_df.to_dict(orient="records"))
            self.to_train_connection.insert_many_(to_train_df.to_dict(orient="records"))

            logger.info("Saving preprocessing object.")
            save_bin(file_path=preprocessing_obj_file_path, obj=preprocessing_obj)

            data_transformation_artifact = DataTransformationArtifact(
                transformed_train_file_path=transformed_train_file_path,
                transformed_test_file_path=transformed_test_file_path,
                preprocessed_object_file_path=preprocessing_obj_file_path,
            )
            logger.info(f"Data transformations artifact: {data_transformation_artifact}")
            return data_transformation_artifact
        except Exception as e:
            raise AppException(e, sys) from e

    def __del__(self):
        # Logged whenever the instance is garbage-collected.
        logger.info(f"{'>>' * 30}Data Transformation log completed.{'<<' * 30} \n\n")

In [6]:
import sys
import  os
import  json

from CreditCard.entity import DataIngestionConfig ,  TrainingPipelineConfig , DataValidationConfig , DataTransformationConfig
from CreditCard.exception import AppException
from CreditCard.logging import logger
from CreditCard.utils import read_yaml , create_directories
from pathlib import Path
from CreditCard.constants import CONFIG_FILE_PATH ,  CURRENT_TIME_STAMP , ROOT_DIR , CONFIG_DIR



class ConfigurationManager:
    """Build the typed per-stage config objects from the project's YAML configuration.

    Every ``get_*_config`` method also creates the directories its stage writes to.
    All errors are re-raised as ``AppException`` chained to the original exception.
    """

    def __init__(self,
                 config_file_path: Path = CONFIG_FILE_PATH) -> None:
        """Read the YAML file and build the pipeline-wide config.

        Raises:
            AppException: wrapping any error raised while reading or validating.
        """
        try:
            self.config_info = read_yaml(path_to_yaml=Path(config_file_path))
            self.pipeline_config = self.get_training_pipeline_config()
            self.time_stamp = CURRENT_TIME_STAMP

        except Exception as e:
            raise AppException(e, sys) from e

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Return the data-ingestion config and create its raw/ingested directories.

        Layout: ``<artifact_dir>/<ingestion_dir>/<experiment_code>/`` containing the
        raw-data directory and the ingested directory.
        """
        try:
            data_ingestion_info = self.config_info.data_ingestion_config
            pipeline_config = self.pipeline_config
            artifact_dir = pipeline_config.artifact_dir
            experiment_code = pipeline_config.experiment_code
            random_state = pipeline_config.training_random_state

            dataset_download_id = data_ingestion_info.dataset_download_id
            data_ingestion_dir = os.path.join(artifact_dir, data_ingestion_info.ingestion_dir, experiment_code)
            raw_data_file_path = os.path.join(data_ingestion_dir,
                                              data_ingestion_info.raw_data_dir,
                                              data_ingestion_info.dataset_download_file_name)

            ingested_dir_path = os.path.join(data_ingestion_dir, data_ingestion_info.ingested_dir)
            ingested_train_file_path = os.path.join(ingested_dir_path, data_ingestion_info.ingested_train_file)
            ingested_test_file_path = os.path.join(ingested_dir_path, data_ingestion_info.ingested_test_file)

            # Train and test files share ingested_dir_path, so creating the train
            # file's parent directory covers both.
            create_directories([os.path.dirname(raw_data_file_path), os.path.dirname(ingested_train_file_path)])

            data_ingestion_config = DataIngestionConfig(dataset_download_id=dataset_download_id,
                                                        raw_data_file_path=raw_data_file_path,
                                                        ingested_train_file_path=ingested_train_file_path,
                                                        ingested_test_data_path=ingested_test_file_path,
                                                        random_state=random_state)

            return data_ingestion_config
        except Exception as e:
            raise AppException(e, sys) from e

    def get_training_pipeline_config(self) -> TrainingPipelineConfig:
        """Return the pipeline-wide config and create the artifact directory under ROOT_DIR."""
        try:
            training_config = self.config_info.training_pipeline_config
            training_pipeline_name = training_config.pipeline_name
            training_experiment_code = training_config.experiment_code
            training_random_state = training_config.random_state
            training_artifacts = os.path.join(ROOT_DIR, training_config.artifact_dir)
            # artifact_dir is a DirectoryPath, so it has to exist before the model is built.
            create_directories(path_to_directories=[training_artifacts])
            training_pipeline_config = TrainingPipelineConfig(artifact_dir=training_artifacts,
                                                              experiment_code=training_experiment_code,
                                                              pipeline_name=training_pipeline_name,
                                                              training_random_state=training_random_state)
            logger.info(f"Training pipeline config: {training_pipeline_config}")
            return training_pipeline_config
        except Exception as e:
            raise AppException(e, sys) from e

    def get_data_validation_config(self) -> DataValidationConfig:
        """Return the data-validation config and create its report directory.

        The schema file is looked up in CONFIG_DIR; the report directory lives under
        ``<artifact_dir>/<data_validation_dir>/<experiment_code>/``.
        """
        try:
            pipeline_config = self.pipeline_config
            artifact_dir = pipeline_config.artifact_dir
            experiment_code = pipeline_config.experiment_code
            data_validation_config_info = self.config_info.data_validation_config

            schema_file_path = os.path.join(CONFIG_DIR, data_validation_config_info.schema_file_name)
            data_validated_artifact_dir = Path(
                os.path.join(artifact_dir, data_validation_config_info.data_validation_dir, experiment_code))
            report_file_dir = os.path.join(data_validated_artifact_dir, data_validation_config_info.report_dir)
            create_directories([report_file_dir])

            data_validation_config = DataValidationConfig(
                experiment_code=experiment_code,
                data_validated_artifact_dir=data_validated_artifact_dir,
                schema_file_path=schema_file_path,
                report_file_dir=report_file_dir,
                data_validated_test_collection=data_validation_config_info.data_validated_test_collection_name,
                data_validated_train_collection=data_validation_config_info.data_validated_train_collection_name)
            return data_validation_config

        except Exception as e:
            # Chain the original exception, as every other handler in this class does.
            raise AppException(e, sys) from e

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Return the data-transformation config and create the preprocessing-object directory.

        The source collection name and schema path are taken from
        ``get_data_validation_config()``, so calling this method also creates the
        validation report directory.
        """
        try:
            pipeline_config = self.pipeline_config
            artifact_dir = pipeline_config.artifact_dir
            experiment_code = pipeline_config.experiment_code
            random_state = pipeline_config.training_random_state

            data_validation_config_info = self.get_data_validation_config()
            data_validated_train_collection = data_validation_config_info.data_validated_train_collection
            schema_file_path = data_validation_config_info.schema_file_path

            data_transformation_config_info = self.config_info.data_transformation_config
            data_transformation_dir = os.path.join(artifact_dir,
                                                   data_transformation_config_info.data_transformation_dir,
                                                   experiment_code)
            preprocessed_object_file_path = os.path.join(
                data_transformation_dir,
                data_transformation_config_info.preprocessing_object_dir,
                data_transformation_config_info.preprocessing_object_file_name)
            to_train_collection = data_transformation_config_info.to_train_collection
            to_test_collection = data_transformation_config_info.to_test_collection

            create_directories([os.path.dirname(preprocessed_object_file_path)])
            data_transformation_config = DataTransformationConfig(
                data_validated_train_collection=data_validated_train_collection,
                schema_file_path=schema_file_path,
                random_state=random_state,
                preprocessed_object_file_path=preprocessed_object_file_path,
                to_train_collection=to_train_collection,
                to_test_collection=to_test_collection)
            return data_transformation_config

        except Exception as e:
            # Chain the original exception, as every other handler in this class does.
            raise AppException(e, sys) from e

In [7]:
# Uses the default CONFIG_FILE_PATH: reads the YAML and builds the
# training-pipeline config, which also creates the artifact directory.
config = ConfigurationManager()

2023-03-19 11:42:47.329 | INFO     | CreditCard.utils.common:read_yaml:34 - yaml file: c:\Users\princ\OneDrive\Desktop\project-python\creditcard\configs\config.yaml loaded successfully
2023-03-19 11:42:47.331 | INFO     | CreditCard.utils.common:create_directories:53 - created directory at: c:\Users\princ\OneDrive\Desktop\project-python\creditcard\artifact
2023-03-19 11:42:47.333 | INFO     | __main__:get_training_pipeline_config:68 - Training pipeline config: artifact_dir=WindowsPath('c:/Users/princ/OneDrive/Desktop/project-python/creditcard/artifact') pipeline_name='CreditCard' experiment_code='Base_model' training_random_state=1961


In [8]:
# Build and display the data-transformation config. Side effect: this creates the
# validation report directory and the preprocessing-object directory.
config.get_data_transformation_config()

2023-03-19 11:42:48.267 | INFO     | CreditCard.utils.common:create_directories:53 - created directory at: c:\Users\princ\OneDrive\Desktop\project-python\creditcard\artifact\stage01_data_validation\Base_model\report
2023-03-19 11:42:48.270 | INFO     | CreditCard.utils.common:create_directories:53 - created directory at: c:\Users\princ\OneDrive\Desktop\project-python\creditcard\artifact\stage02_data_transformation\Base_model\preprocessing


DataTransformationConfig(data_validated_train_collection='data_validated_train', schema_file_path=WindowsPath('c:/Users/princ/OneDrive/Desktop/project-python/creditcard/configs/schema.yaml'), random_state=1961, preprocessed_object_file_path=WindowsPath('c:/Users/princ/OneDrive/Desktop/project-python/creditcard/artifact/stage02_data_transformation/Base_model/preprocessing/preprocessing_obj.pkl'), to_train_collection='to_train', to_test_collection='to_test')