# Data Preprocessing Pipeline Development
- Author: Marcellinus Aditya Witarsah
- Date: 05 June 2024

In [6]:
# Imports
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import logging
import time
import pickle
import os
from pathlib import Path
from abc import ABC
from abc import abstractmethod
from scipy import stats
from typing import Tuple
from typing import Union
from dataclasses import dataclass
from src.utils.common import logger
from src.utils.common import read_yaml, create_directories
from src.constants import CONFIG_FILE_PATH, SCHEMA_FILE_PATH, PARAMS_FILE_PATH

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Run once only: this moves the working directory up one level, and every
# additional run moves it one level further up.
# NOTE(review): presumably this lands on the project root so that `src.*`
# imports and the relative config paths resolve — confirm against the repo layout.
os.chdir("..")

# Configuration

In [3]:
# src/entities/config_entity.py
@dataclass(frozen=True)
class DataPreprocessingConfig:
    """
    Immutable settings bundle for the data preprocessing stage.

    Instances are frozen: fields are set once at construction and cannot be
    reassigned afterwards.

    Attributes:
        root_dir (Path): Directory that belongs to the preprocessing stage.
        source_path (Path): Location of the input data to split.
        train_data_path (Path): Destination of the training split.
        test_data_path (Path): Destination of the testing split.
        target_column (str): Name of the column holding the target.
        test_size (float): Share of the data assigned to the test split.
        shuffle (bool): Whether the data is shuffled before splitting.
        random_state (int): Seed that makes the split reproducible.
    """
    # Filesystem locations
    root_dir: Path
    source_path: Path
    train_data_path: Path
    test_data_path: Path
    # Split parameters
    target_column: str
    test_size: float
    shuffle: bool
    random_state: int

# src/config/configuration_manager.py
class ConfigurationManager:
    """
    Load the pipeline's YAML files and build per-stage configuration objects.

    The three YAML files (configuration, parameters, schema) are read once at
    construction time and kept on the instance.

    Attributes:
        config: Content returned by `read_yaml` for the configuration file.
        params: Content returned by `read_yaml` for the parameters file.
        schema: Content returned by `read_yaml` for the schema file.
    """
    def __init__(
        self,
        config_filepath: str = CONFIG_FILE_PATH,
        params_filepath: str = PARAMS_FILE_PATH,
        schema_filepath: str = SCHEMA_FILE_PATH
    ):
        """
        Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath (str): Location of the configuration YAML file.
            params_filepath (str): Location of the parameters YAML file.
            schema_filepath (str): Location of the schema YAML file.
        """
        config_file = Path(config_filepath)
        self.config = read_yaml(config_file)

        params_file = Path(params_filepath)
        self.params = read_yaml(params_file)

        schema_file = Path(schema_filepath)
        self.schema = read_yaml(schema_file)

        # The artifacts root is created up front, as soon as the files are loaded.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        """
        Build the settings object for the data preprocessing stage.

        Paths come from the `data_preprocessing` section of the configuration
        file; split settings come from `data_preprocessing.split_data` in the
        parameters file. The stage's root directory is created as a side effect.

        Returns:
            DataPreprocessingConfig: Settings for the data preprocessing stage.
        """
        stage_paths = self.config.data_preprocessing
        stage_params = self.params.data_preprocessing

        create_directories([stage_paths.root_dir])

        return DataPreprocessingConfig(
            root_dir=stage_paths.root_dir,
            source_path=stage_paths.source_path,
            train_data_path=stage_paths.train_data_path,
            test_data_path=stage_paths.test_data_path,
            target_column=stage_params.split_data.target_column,
            test_size=stage_params.split_data.test_size,
            shuffle=stage_params.split_data.shuffle,
            random_state=stage_params.split_data.random_state,
        )

# Data Preprocessing

In [7]:
from sklearn.model_selection import train_test_split
import pandas as pd
from pathlib import Path
import logging
from typing import Tuple

# src/data/data_preprocessing.py
class DataPreprocessing:
    """
    Split the source dataset into train and test sets and persist both as CSV.
    """

    def __init__(self, config: DataPreprocessingConfig):
        """
        Instantiate `DataPreprocessing` class.

        Args:
            config (DataPreprocessingConfig): Configuration for data preprocessing.
        """
        self.config = config

    def split_data(self) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
        """
        Split the source data into train and test sets and write them to disk.

        The CSV at `config.source_path` is read, the target column is separated
        from the features, and the rows are split with `train_test_split`.
        When `config.shuffle` is true the split is stratified on the target so
        both sets keep the target's class proportions; stratification is
        skipped when shuffling is disabled because scikit-learn does not
        support stratifying without shuffling. The train and test sets
        (features plus target) are written to `config.train_data_path` and
        `config.test_data_path` without the index.

        Returns:
            Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
                `(X_train, y_train, X_test, y_test)`.

        Raises:
            Exception: Any error raised while reading, splitting or writing the
                data is logged and then re-raised, so callers never mistake a
                failed split for a successful one.
        """
        logger.info("Split data")
        try:
            df = pd.read_csv(self.config.source_path)
            target_column = self.config.target_column
            X, y = df.drop(columns=[target_column]), df[target_column]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                # `stratify` must be None when `shuffle` is False, otherwise
                # scikit-learn raises a ValueError.
                stratify=y if self.config.shuffle else None,
                test_size=self.config.test_size,
                shuffle=self.config.shuffle,
                random_state=self.config.random_state
            )
            train = pd.concat([X_train, y_train], axis=1)
            test = pd.concat([X_test, y_test], axis=1)
            train.to_csv(self.config.train_data_path, index=False)
            test.to_csv(self.config.test_data_path, index=False)
        except Exception as e:
            logger.error(e)
            # Propagate the failure: swallowing it here made callers report the
            # stage as completed even though no output was produced.
            raise
        return X_train, y_train, X_test, y_test

In [8]:
# Notebook smoke run: load the configuration, build the preprocessing step and
# execute the split. Any error is logged instead of interrupting the notebook.
try:
    configuration_manager = ConfigurationManager()
    preprocessing_config = configuration_manager.get_data_preprocessing_config()
    data_preprocessing = DataPreprocessing(config=preprocessing_config)
    data_preprocessing.split_data()
except Exception as e:
    logger.error(e)

2024-06-05 15:25:47,159 - credit-scorecard-logger - INFO - yaml file: config.yaml loaded successfully
2024-06-05 15:25:47,164 - credit-scorecard-logger - INFO - yaml file: params.yaml loaded successfully
2024-06-05 15:25:47,167 - credit-scorecard-logger - INFO - yaml file: schema.yaml loaded successfully
2024-06-05 15:25:47,168 - credit-scorecard-logger - INFO - Created directory at: artifacts
2024-06-05 15:25:47,170 - credit-scorecard-logger - INFO - Created directory at: artifacts/data_preprocessing
2024-06-05 15:25:47,171 - credit-scorecard-logger - INFO - Split data


# Testing
Restart the kernel, then run the cells below to test the pipeline using the modules under `src/`.

In [1]:
import os
# Move the working directory up one level before importing from `src`.
# Run this only once per kernel session: each extra run moves one level further up.
os.chdir("..")

In [11]:
from src.utils.common import logger
from src.config.configuration_manager import ConfigurationManager
from src.data.data_preprocessing import DataPreprocessing


class DataPreprocessingPipeline:
    """
    Pipeline stage that wires the configuration to the data preprocessing step.
    """

    def __init__(self):
        """
        Create the pipeline together with its own `ConfigurationManager`.
        """
        self.configuration_manager = ConfigurationManager()

    def run(self):
        """
        Build the preprocessing step from the stage configuration and split the data.
        """
        stage_config = self.configuration_manager.get_data_preprocessing_config()
        preprocessing_step = DataPreprocessing(config=stage_config)
        preprocessing_step.split_data()


if __name__ == "__main__":
    # Stage label used in the start/completion log lines.
    STAGE_NAME = "Data Preprocessing Stage"
    try:
        logger.info(f">>>>>> {STAGE_NAME} Started <<<<<<")
        DataPreprocessingPipeline().run()
        logger.info(f">>>>>> {STAGE_NAME} Completed <<<<<<")
    except Exception as e:
        # Top-level boundary: the error is logged and not propagated.
        logger.error(e)

2024-06-05 15:28:50,967 - credit-scorecard-logger - INFO - >>>>>> Data preprocessing Stage Started <<<<<<
2024-06-05 15:28:50,973 - credit-scorecard-logger - INFO - yaml file: config.yaml loaded successfully
2024-06-05 15:28:50,978 - credit-scorecard-logger - INFO - yaml file: params.yaml loaded successfully
2024-06-05 15:28:50,981 - credit-scorecard-logger - INFO - yaml file: schema.yaml loaded successfully
2024-06-05 15:28:50,985 - credit-scorecard-logger - INFO - Created directory at: artifacts
2024-06-05 15:28:50,986 - credit-scorecard-logger - INFO - Created directory at: artifacts/data_preprocessing
2024-06-05 15:28:50,987 - credit-scorecard-logger - INFO - Split data
2024-06-05 15:28:51,290 - credit-scorecard-logger - INFO - >>>>>> Data preprocessing Stage Completed <<<<<<
