In [1]:
import os 

# Move the kernel's working directory one level up; later cells derive
# ROOT_DIR from os.getcwd(), so this must run before them.
os.chdir(os.pardir)
# Last expression of the cell: display the resulting working directory.
os.getcwd()

'c:\\Users\\princ\\OneDrive\\Desktop\\project-python\\creditcard'

In [2]:
import os 
from datetime import datetime


def get_current_time_stamp():
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string."""
    # strftime already yields a str, so no extra formatting layer is needed.
    now = datetime.now()
    return now.strftime("%Y%m%d%H%M%S")


# Project root: the working directory at the moment this cell is executed.
ROOT_DIR = os.getcwd()

# Timestamp captured once, when the constants are evaluated.
CURRENT_TIME_STAMP = get_current_time_stamp()

# Location of the YAML configuration file, relative to the project root.
CONFIG_FILE_NAME = "config.yaml"
CONFIG_DIR = os.path.join(ROOT_DIR, "configs")
CONFIG_FILE_PATH = os.path.join(CONFIG_DIR, CONFIG_FILE_NAME)

In [3]:
from pathlib import Path

from pydantic import BaseModel, DirectoryPath, FilePath

class DataIngestionConfig(BaseModel):
    """Resolved settings for the data-ingestion stage."""

    # Identifier of the dataset to fetch (the ingestion code hands it to
    # pandas.read_csv, so a URL or a local path both work).
    dataset_download_id: str
    # Where the downloaded raw CSV is written.
    raw_data_file_path: Path
    # Destination of the training split.
    ingested_train_file_path: Path
    # Destination of the test split.
    ingested_test_data_path: Path
    
class TrainingPipelineConfig(BaseModel):
    """Top-level settings shared by every stage of the training pipeline."""

    # Root directory for stage artifacts; typed as pydantic's DirectoryPath,
    # so it is expected to exist when the model is built.
    artifact_dir: DirectoryPath
    # Name of the pipeline as given in the YAML file.
    pipeline_name: str

In [4]:
import sys
import  os
import  json

from CreditCard.entity import DataIngestionConfig ,  TrainingPipelineConfig
from CreditCard.exception import App_Exception
from CreditCard.logging import logger
from CreditCard.utils import read_yaml , create_directories
from pathlib import Path
from CreditCard.constants import CONFIG_FILE_PATH ,  CURRENT_TIME_STAMP , ROOT_DIR



class Configuration:
    """Turns the project YAML file into typed configuration objects."""

    def __init__(self,
                 config_file_path: Path = CONFIG_FILE_PATH) -> None:
        """Load the YAML file and resolve the training pipeline settings.

        Args:
            config_file_path: Location of the YAML configuration file. It is
                wrapped in ``Path`` before being handed to ``read_yaml``.

        Raises:
            App_Exception: wraps any error raised while loading or resolving.
        """
        try:
            yaml_path = Path(config_file_path)
            self.config_info = read_yaml(path_to_yaml=yaml_path)
            self.pipeline_config = self.get_training_pipeline_config()
            self.time_stamp = CURRENT_TIME_STAMP

        except Exception as e:
            raise App_Exception(e, sys) from e

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Resolve every path used by the data-ingestion stage.

        All paths live under ``<artifact_dir>/<ingestion_dir>``. The directory
        holding the raw file and the directory holding the train/test files
        are passed to ``create_directories`` before the config is returned.

        Raises:
            App_Exception: wraps any error raised while resolving the config.
        """
        try:
            ingestion_section = self.config_info.data_ingestion_config
            artifact_root = self.pipeline_config.artifact_dir
            download_id = ingestion_section.dataset_download_id

            # <artifact_dir>/<ingestion_dir> is the root of this stage.
            stage_root = os.path.join(artifact_root, ingestion_section.ingestion_dir)
            raw_path = os.path.join(
                stage_root,
                ingestion_section.raw_data_dir,
                ingestion_section.dataset_download_file_name,
            )

            # Train and test files share one "ingested" directory.
            ingested_root = os.path.join(stage_root, ingestion_section.ingested_dir)
            train_path = os.path.join(ingested_root, ingestion_section.ingested_train_file)
            test_path = os.path.join(ingested_root, ingestion_section.ingested_test_file)

            create_directories([os.path.dirname(raw_path), os.path.dirname(train_path)])

            return DataIngestionConfig(
                dataset_download_id=download_id,
                raw_data_file_path=raw_path,
                ingested_train_file_path=train_path,
                ingested_test_data_path=test_path,
            )
        except Exception as e:
            raise App_Exception(e, sys) from e

    def get_training_pipeline_config(self) -> TrainingPipelineConfig:
        """Resolve the pipeline name and the artifact root directory.

        The artifact directory is joined onto ``ROOT_DIR`` and passed to
        ``create_directories`` before the config object is built.

        Raises:
            App_Exception: wraps any error raised while resolving the config.
        """
        try:
            pipeline_section = self.config_info.training_pipeline_config
            name = pipeline_section.pipeline_name
            artifact_root = os.path.join(ROOT_DIR, pipeline_section.artifact_dir)
            create_directories(path_to_directories=[artifact_root])

            training_pipeline_config = TrainingPipelineConfig(
                artifact_dir=artifact_root,
                pipeline_name=name,
            )
            logger.info(f"Training pipeline config: {training_pipeline_config}")
            return training_pipeline_config
        except Exception as e:
            raise App_Exception(e, sys) from e

In [5]:
# Build the configuration from the default CONFIG_FILE_PATH; the constructor
# also resolves the training pipeline config.
config = Configuration()

2023-03-12 17:33:08.114 | INFO     | CreditCard.utils.common:read_yaml:30 - yaml file: c:\Users\princ\OneDrive\Desktop\project-python\creditcard\configs\config.yaml loaded successfully
2023-03-12 17:33:08.116 | INFO     | CreditCard.utils.common:create_directories:49 - created directory at: c:\Users\princ\OneDrive\Desktop\project-python\creditcard\artifact
2023-03-12 17:33:08.116 | INFO     | __main__:get_training_pipeline_config:59 - Training pipeline config: artifact_dir=WindowsPath('c:/Users/princ/OneDrive/Desktop/project-python/creditcard/artifact') pipeline_name='CreditCard'


In [6]:
# Inspect the runtime type of CONFIG_FILE_PATH: Configuration.__init__
# annotates its parameter as Path and wraps the value in Path() itself.
type(CONFIG_FILE_PATH)

str

In [7]:
# Rebuild and display the training pipeline config; this calls
# create_directories and logs the config again.
config.get_training_pipeline_config()

2023-03-12 17:33:08.187 | INFO     | CreditCard.utils.common:create_directories:49 - created directory at: c:\Users\princ\OneDrive\Desktop\project-python\creditcard\artifact
2023-03-12 17:33:08.188 | INFO     | __main__:get_training_pipeline_config:59 - Training pipeline config: artifact_dir=WindowsPath('c:/Users/princ/OneDrive/Desktop/project-python/creditcard/artifact') pipeline_name='CreditCard'


TrainingPipelineConfig(artifact_dir=WindowsPath('c:/Users/princ/OneDrive/Desktop/project-python/creditcard/artifact'), pipeline_name='CreditCard')

In [8]:
# Resolve the data-ingestion paths; the call also passes the raw-data and
# ingested-data directories to create_directories.
data_ingestion_config = config.get_data_ingestion_config()

2023-03-12 17:33:08.229 | INFO     | CreditCard.utils.common:create_directories:49 - created directory at: c:\Users\princ\OneDrive\Desktop\project-python\creditcard\artifact\Stage00_data_ingestion\raw_data
2023-03-12 17:33:08.232 | INFO     | CreditCard.utils.common:create_directories:49 - created directory at: c:\Users\princ\OneDrive\Desktop\project-python\creditcard\artifact\Stage00_data_ingestion\ingested_data


In [9]:
# Display the resolved DataIngestionConfig.
data_ingestion_config

DataIngestionConfig(dataset_download_id='https://raw.githubusercontent.com/pk1308/datasets/master/ucicredit/UCI_Credit_Card.csv', raw_data_file_path=WindowsPath('c:/Users/princ/OneDrive/Desktop/project-python/creditcard/artifact/Stage00_data_ingestion/raw_data/UCI_Credit_Card.csv'), ingested_train_file_path=WindowsPath('c:/Users/princ/OneDrive/Desktop/project-python/creditcard/artifact/Stage00_data_ingestion/ingested_data/train.csv'), ingested_test_data_path=WindowsPath('c:/Users/princ/OneDrive/Desktop/project-python/creditcard/artifact/Stage00_data_ingestion/ingested_data/test.csv'))

In [10]:
from pathlib import Path

from pydantic import BaseModel, DirectoryPath, FilePath

class DataIngestionArtifact(BaseModel):
    """Files produced by the data-ingestion stage."""

    # Both fields use pydantic's FilePath type, so the files are expected to
    # exist when the artifact is built.
    train_file_path: FilePath
    test_file_path: FilePath

In [11]:
from CreditCard.entity import DataIngestionConfig , DataIngestionArtifact
import sys,os
from CreditCard.exception import App_Exception
from CreditCard.logging import logger
import numpy as np
import pandas as pd
import gdown
from sklearn.model_selection import StratifiedShuffleSplit
from CreditCard.constants import *

class DataIngestion:
    """Stage 1 data ingestion: fetch the raw dataset and split it into train and test CSV files.

    Input:
        DataIngestionConfig providing ``dataset_download_id`` (any source
        ``pandas.read_csv`` accepts, e.g. a URL), ``raw_data_file_path``,
        ``ingested_train_file_path`` and ``ingested_test_data_path``.

    Output:
        DataIngestionArtifact(train_file_path, test_file_path)
    """

    # Label column the train/test split is stratified on.
    TARGET_COLUMN = "default.payment.next.month"
    # Fraction of rows held out for the test split.
    TEST_SIZE = 0.2
    # Fixed seed so the split is reproducible between runs.
    RANDOM_STATE = 42

    def __init__(self, data_ingestion_config_info: DataIngestionConfig):
        """Store the ingestion configuration.

        Args:
            data_ingestion_config_info: Resolved paths and dataset source.

        Raises:
            App_Exception: wraps any error raised during initialisation.
        """
        try:
            self.data_ingestion_config = data_ingestion_config_info
            logger.info(f"{'>>' * 20}Experiment : base Model {'<<' * 20}")
        except Exception as e:
            # Chain the cause, as every other handler in this class does.
            raise App_Exception(e, sys) from e

    def download_data(self, dataset_download_id: str, raw_data_file_path: str) -> bool:
        """Read the dataset with ``pandas.read_csv`` and store it as a local CSV.

        Args:
            dataset_download_id: Source handed to ``pandas.read_csv``.
            raw_data_file_path: Destination of the raw CSV (str or path-like).

        Returns:
            True once the file has been written.

        Raises:
            App_Exception: wraps any error raised while reading or writing.
        """
        try:
            logger.info(f"Downloading dataset from: [{dataset_download_id}]")
            raw_data_frame = pd.read_csv(dataset_download_id)
            raw_data_frame.to_csv(raw_data_file_path, index=False)
            # Nothing is unzipped here; report what was actually done.
            logger.info(f"Dataset saved to: [{raw_data_file_path}]")

            return True

        except Exception as e:
            raise App_Exception(e, sys) from e

    def split_data_as_train_test(self) -> DataIngestionArtifact:
        """Split the raw CSV into stratified train and test CSV files.

        The split is stratified on ``TARGET_COLUMN`` using ``TEST_SIZE`` and
        ``RANDOM_STATE``.

        Returns:
            DataIngestionArtifact pointing at the two written files.

        Raises:
            App_Exception: wraps any error raised while reading, splitting or
                writing.
        """
        try:
            logger.info(f"{'>>' * 20}Data splitting.{'<<' * 20}")
            raw_data_file_path = self.data_ingestion_config.raw_data_file_path
            train_file_path = self.data_ingestion_config.ingested_train_file_path
            test_file_path = self.data_ingestion_config.ingested_test_data_path

            logger.info(f"Reading csv file: [{raw_data_file_path}]")
            raw_data_frame = pd.read_csv(raw_data_file_path)

            logger.info("Splitting data into train and test")
            split = StratifiedShuffleSplit(n_splits=1,
                                           test_size=self.TEST_SIZE,
                                           random_state=self.RANDOM_STATE)

            # n_splits=1 yields exactly one (train, test) pair, so take it
            # directly instead of looping.
            train_index, test_index = next(
                split.split(raw_data_frame, raw_data_frame[self.TARGET_COLUMN])
            )
            # The splitter returns positional indices, hence iloc rather
            # than the label-based loc.
            strat_train_set = raw_data_frame.iloc[train_index]
            strat_test_set = raw_data_frame.iloc[test_index]

            logger.info(f"Exporting training dataset to file: [{train_file_path}]")
            strat_train_set.to_csv(train_file_path, index=False)

            logger.info(f"Exporting test dataset to file: [{test_file_path}]")
            strat_test_set.to_csv(test_file_path, index=False)

            data_ingestion_artifact = DataIngestionArtifact(train_file_path=train_file_path,
                                                            test_file_path=test_file_path)
            logger.info(f"Data Ingestion artifact:[{data_ingestion_artifact}]")
            return data_ingestion_artifact

        except Exception as e:
            raise App_Exception(e, sys) from e

    def initiate_data_ingestion(self) -> DataIngestionArtifact:
        """Run the whole stage: download the raw data, then split it.

        Returns:
            DataIngestionArtifact returned by ``split_data_as_train_test``.

        Raises:
            App_Exception: wraps any error raised by either step.
        """
        try:
            logger.info(f"{'>>' * 20}Data Ingestion started.{'<<' * 20}")
            data_ingestion_config = self.data_ingestion_config
            dataset_download_id = data_ingestion_config.dataset_download_id
            raw_data_file_path = data_ingestion_config.raw_data_file_path
            self.download_data(dataset_download_id, raw_data_file_path)

            data_ingestion_response = self.split_data_as_train_test()
            logger.info(f"{'>>' * 20}Data Ingestion artifact.{'<<' * 20}")
            logger.info(f" Data Ingestion Artifact{data_ingestion_response}")
            logger.info(f"{'>>' * 20}Data Ingestion completed.{'<<' * 20}")
            return data_ingestion_response
        except Exception as e:
            raise App_Exception(e, sys) from e

    def __del__(self):
        """Log a closing banner when the instance is finalised."""
        # Finalisers can run during interpreter shutdown, when the logger may
        # already be torn down; a failed log line must not surface from here.
        try:
            logger.info(f"{'>>' * 20}Data Ingestion log completed.{'<<' * 20} \n\n")
        except Exception:
            pass


# Run the ingestion stage end to end. The log output of earlier cells reports
# the module as __main__, so this guard is true when the cell is executed in
# the notebook. Configuration comes from the earlier cell in this notebook,
# not from an import in this cell.
if __name__ == "__main__":
    # Load the YAML settings and resolve the ingestion paths.
    config = Configuration()
    data_ingestion_config = config.get_data_ingestion_config()
    # Download the raw CSV and write the stratified train/test files.
    data_ingestion = DataIngestion(data_ingestion_config)
    data_ingestion_response = data_ingestion.initiate_data_ingestion()


2023-03-12 17:33:09.003 | INFO     | CreditCard.utils.common:read_yaml:30 - yaml file: c:\Users\princ\OneDrive\Desktop\project-python\creditcard\configs\config.yaml loaded successfully
2023-03-12 17:33:09.005 | INFO     | CreditCard.utils.common:create_directories:49 - created directory at: c:\Users\princ\OneDrive\Desktop\project-python\creditcard\artifact
2023-03-12 17:33:09.006 | INFO     | __main__:get_training_pipeline_config:59 - Training pipeline config: artifact_dir=WindowsPath('c:/Users/princ/OneDrive/Desktop/project-python/creditcard/artifact') pipeline_name='CreditCard'
2023-03-12 17:33:09.007 | INFO     | CreditCard.utils.common:create_directories:49 - created directory at: c:\Users\princ\OneDrive\Desktop\project-python\creditcard\artifact\Stage00_data_ingestion\raw_data
2023-03-12 17:33:09.008 | INFO     | CreditCard.utils.common:create_directories:49 - created directory at: c:\Users\princ\OneDrive\Desktop\project-python\creditcard\artifact\Stage00_data_ingestion\ingested_d