In [1]:
import os

In [2]:
os.chdir('../')

In [3]:
pwd

'd:\\Nik\\UW-SoundSpeed'

In [4]:
from uwsoundspeed.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from uwsoundspeed.utils.common import read_yaml, create_directories
from pathlib import Path
from typing import List, Dict, Any
from dataclasses import dataclass

@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable settings for the data-preprocessing stage.

    Instances are assembled by
    ``ConfigurationManager.get_data_preprocessing_config`` from the
    ``data_preprocessing`` section of the config YAML and the
    ``preprocessing`` section of the params YAML, and are consumed by
    ``DataPreprocessor``.
    """
    # Stage directory: created by ConfigurationManager and used by
    # DataPreprocessor.save() as the parent of the saved-model sub-directory.
    root_dir: Path
    # Input file that DataPreprocessor loads with pandas.read_pickle().
    data_path: Path
    # Column names routed through the numerical pipeline
    # (median imputation followed by standard scaling).
    numerical_features: List[str]
    # Disabled field, kept for reference:
    # numerical_transformer: Dict[str, Any]
    # Column names routed through the categorical pipeline
    # (most-frequent imputation followed by one-hot encoding).
    categorical_features: List[str]
    # Disabled field, kept for reference:
    # categorical_transformer: Dict[str, Any]
    # Passed unchanged to sklearn's PCA(n_components=...).
    # NOTE(review): annotated as float, which suggests an explained-variance
    # fraction; confirm against params.yaml.
    pca_n_components: float

ImportError: cannot import name 'PreprocessingConfig' from 'uwsoundspeed.entity.config_entity' (d:\nik\uw-soundspeed\src\uwsoundspeed\entity\config_entity.py)

In [5]:

class ConfigurationManager:
    """Loads the config and params YAML files and builds stage configurations."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config['artifacts_root']
        create_directories([artifacts_root])

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        """Build the configuration object for the data-preprocessing stage.

        The stage's ``root_dir`` is created as a side effect before the
        configuration object is assembled.

        Returns:
            A ``DataPreprocessingConfig`` filled from the
            ``data_preprocessing`` config section and the ``preprocessing``
            params section.
        """
        stage_settings = self.config['data_preprocessing']
        stage_params = self.params['preprocessing']

        # Resolve the stage directory once; it is both created and stored.
        stage_root = Path(stage_settings['root_dir'])
        create_directories([stage_root])

        return DataPreprocessingConfig(
            root_dir=stage_root,
            data_path=Path(stage_settings['data_path']),
            numerical_features=stage_params['numerical_features'],
            categorical_features=stage_params['categorical_features'],
            pca_n_components=stage_params['pca_n_components'],
        )

In [6]:
from box.exceptions import BoxValueError
from uwsoundspeed.constants import *
from uwsoundspeed.utils.common import read_yaml, create_directories
from pathlib import Path

In [7]:
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import joblib
import os

class DataPreprocessor:
    """Imputes, scales and encodes the configured columns, then applies PCA.

    The column transformer and the PCA estimator are created unfitted in
    ``__init__``; they are fitted by ``preprocess_and_apply_pca`` and can be
    persisted with ``save``.
    """

    def __init__(self, config: DataPreprocessingConfig):
        """Store the configuration and build the unfitted estimators.

        Args:
            config: Stage settings (feature lists, input path, output
                directory and the PCA ``n_components`` value).
        """
        self.config = config
        self.column_transformer = self._create_column_transformer()
        self.pca = PCA(n_components=config.pca_n_components)

    def _create_column_transformer(self):
        """Build the unfitted ColumnTransformer for the configured columns.

        Returns:
            A ``ColumnTransformer`` that median-imputes and standard-scales
            the numerical features, and most-frequent-imputes and one-hot
            encodes the categorical features. Columns in neither list are
            dropped (the transformer's default ``remainder``).
        """
        # Numerical columns: fill gaps with the median, then standardise.
        numerical_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Categorical columns: fill gaps with the mode, then one-hot encode.
        # Categories unseen during fit are encoded as all zeros.
        categorical_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ])

        return ColumnTransformer(
            transformers=[
                ('num', numerical_pipeline, self.config.numerical_features),
                ('cat', categorical_pipeline, self.config.categorical_features),
            ],
            # OneHotEncoder produces a sparse matrix by default, and the
            # ColumnTransformer keeps the stacked result sparse whenever its
            # density falls below sparse_threshold (default 0.3). The result
            # is fed straight into PCA, which with its default solver cannot
            # take sparse input together with a fractional n_components (and
            # older scikit-learn releases reject sparse input outright).
            # A threshold of 0 always yields a dense array.
            sparse_threshold=0,
        )

    def preprocess_and_apply_pca(self):
        """Load the input data, fit the transformers and return the PCA output.

        Both the column transformer and the PCA estimator are (re)fitted on
        every call.

        Returns:
            The array returned by ``PCA.fit_transform`` on the transformed
            feature matrix.
        """
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only point data_path at files produced by a trusted pipeline stage.
        df = pd.read_pickle(self.config.data_path)

        # Fit the imputers/scaler/encoder and transform in one pass.
        X_transformed = self.column_transformer.fit_transform(df)

        # Fit PCA on the dense transformed matrix and project onto it.
        X_pca = self.pca.fit_transform(X_transformed)

        return X_pca

    def save(self, subdir_name="preprocessor"):
        """Save the column transformer and PCA estimator in one joblib file.

        The file is written to
        ``<config.root_dir>/<subdir_name>/preprocessor_and_pca.joblib`` as a
        dict with the keys ``'column_transformer'`` and ``'pca'``. The
        estimators are saved in their current state, so call
        ``preprocess_and_apply_pca`` first if fitted objects are needed.

        Args:
            subdir_name: Sub-directory of ``config.root_dir`` to write into;
                created if it does not exist.
        """
        save_dir = os.path.join(self.config.root_dir, subdir_name)
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, "preprocessor_and_pca.joblib")
        joblib.dump({'column_transformer': self.column_transformer, 'pca': self.pca}, save_path)


In [8]:
try:
    # Load the YAML configuration and build the preprocessing stage objects.
    config_manager = ConfigurationManager()
    data_preprocessing_config = config_manager.get_data_preprocessing_config()
    data_preprocessor = DataPreprocessor(config=data_preprocessing_config)

    # Read the input data, fit the column transformer and PCA, and keep the
    # PCA-transformed matrix.
    X_pca = data_preprocessor.preprocess_and_apply_pca()

    # Persist the fitted column transformer and PCA model for later use.
    data_preprocessor.save(subdir_name="data_preprocessor")

except Exception:
    # NOTE(review): `logger` is not imported by name in any visible cell;
    # presumably it is supplied by `from uwsoundspeed.constants import *`.
    # Confirm, otherwise this handler raises NameError and hides the real
    # failure.
    logger.error("An error occurred during data preprocessing", exc_info=True)
    # Re-raise the exception being handled as-is; no name binding is needed.
    raise


[2024-03-18 22:28:56,268: INFO: common: YAML file: config\config.yaml loaded successfully]
[2024-03-18 22:28:56,274: INFO: common: YAML file: params.yaml loaded successfully]
[2024-03-18 22:28:56,276: INFO: common: Directory created at: artifacts]
[2024-03-18 22:28:56,278: INFO: common: Directory created at: artifacts\data_transformation]
