In [1]:
import os
# Display the kernel's current working directory before any chdir happens,
# so the reader can see where the notebook was launched from.
%pwd

'c:\\Users\\pachp\\Desktop\\projects\\customer_churn\\research'

In [2]:
os.chdir("../")
%pwd

'c:\\Users\\pachp\\Desktop\\projects\\customer_churn'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Output directory of the stage; ConfigurationManager creates it before
    # building this object.
    root_dir: Path
    # Location the fitted preprocessor object is written to by
    # DataTransformation.initiate_data_transformation.
    preprocessor_path: Path

In [4]:
from customer_churn.constants import *
from customer_churn.utils.common_utils import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Load the project's YAML files and hand out per-stage config objects.

    On construction both the main config file and the params file are read
    with ``read_yaml`` and the top-level artifacts directory is created.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataTransformationConfig whose fields are ``pathlib.Path``
            objects, matching the dataclass' declared field types.
        """
        stage_cfg = self.config.data_transformation

        create_directories([stage_cfg.root_dir])

        # The dataclass declares Path fields; the values read from YAML are
        # presumably plain strings, so convert them explicitly instead of
        # passing them through unchanged.
        return DataTransformationConfig(
            root_dir=Path(stage_cfg.root_dir),
            preprocessor_path=Path(stage_cfg.preprocessor_path),
        )

In [6]:
import warnings
warnings.simplefilter('ignore')


import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from dataclasses import dataclass
from pathlib import Path
from box.exceptions import BoxValueError
from customer_churn import logger
from customer_churn.utils.common_utils import save_object

In [7]:

class DataTransformation:
    """Build, fit and persist the preprocessing pipeline for the churn data.

    The column names below are class-level constants so they can be
    overridden in a subclass (or on an instance) without editing the methods.
    """

    # Numeric columns: imputed, then standardised.
    NUM_FEATURES = (
        'Tenure Months', 'Monthly Charges', 'Total Charges', 'Churn Score',
        'CLTV',
    )

    # Categorical columns: imputed with the most frequent value, then one-hot encoded.
    CAT_FEATURES = (
        'City', 'Gender', 'Senior Citizen', 'Partner', 'Dependents',
        'Phone Service', 'Multiple Lines', 'Internet Service',
        'Online Security', 'Online Backup', 'Device Protection', 'Tech Support',
        'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing',
        'Payment Method',
    )

    # Column holding the label that is split off from the features.
    TARGET_COLUMN = "Churn Value"

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (``preprocessor_path`` is used when saving)."""
        self.config = config

    def get_data_transformer_object(self):
        """Create the (unfitted) preprocessing ``ColumnTransformer``.

        Returns:
            ColumnTransformer that applies imputation + standard scaling to
            ``NUM_FEATURES`` and most-frequent imputation + one-hot encoding
            to ``CAT_FEATURES``.

        Raises:
            ValueError: if a ``BoxValueError`` is raised while building it.
        """
        try:
            logger.info("Pipeline initiated.....")
            num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer()),
                    ("scaler", StandardScaler()),
                ]
            )

            cat_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    # handle_unknown="ignore": a category that was not present
                    # while fitting (likely for a high-cardinality column such
                    # as 'City') is encoded as all zeros instead of making
                    # transform() raise on test or inference data.
                    ("onehot", OneHotEncoder(
                        sparse_output=False,
                        drop="first",
                        dtype=np.int16,
                        handle_unknown="ignore",
                    )),
                ]
            )
            logger.info("Pipeline completed.")

            logger.info("columntransformation initiated.....")
            preprocessor = ColumnTransformer(
                [
                    ("Numerical Pipeline", num_pipeline, list(self.NUM_FEATURES)),
                    ("Categorical Pipeline", cat_pipeline, list(self.CAT_FEATURES)),
                ]
            )
            logger.info("columntransformation completed.")

            return preprocessor
        except BoxValueError as err:
            # Keep the ValueError contract, but chain the original error so
            # its traceback is not lost.
            raise ValueError("Error occured at data tranformation.....") from err

    def _split_features_and_target(self, frame):
        """Return ``(features, target)`` with ``TARGET_COLUMN`` removed from the features."""
        features = frame.drop(columns=[self.TARGET_COLUMN])
        target = frame[self.TARGET_COLUMN]
        return features, target

    def initiate_data_transformation(self, train_set, test_set):
        """Fit the preprocessor on the train CSV and transform both CSVs.

        Args:
            train_set: Path of the training CSV (read with ``pd.read_csv``).
            test_set: Path of the test CSV (read with ``pd.read_csv``).

        Returns:
            Tuple ``(train_arr, test_arr, preprocessor_path)``. Each array
            holds the transformed features with the target values appended
            as the last column; ``preprocessor_path`` is the configured
            location the fitted preprocessor was saved to.

        Raises:
            ValueError: if a ``BoxValueError`` is raised during the stage.
        """
        try:
            train_df = pd.read_csv(train_set)
            test_df = pd.read_csv(test_set)
            logger.info("Read train and test data completed.")

            preprocessor_obj = self.get_data_transformer_object()

            logger.info("separate the independent and dependent columns started")
            input_feature_train_df, target_feature_train_df = self._split_features_and_target(train_df)
            input_feature_test_df, target_feature_test_df = self._split_features_and_target(test_df)
            logger.info("separate the independent and dependent columns completed")

            logger.info("Applying preprocessing object on train and test dataset")
            # Fit on the training data only; the test data is transformed
            # with the statistics/categories learned from train.
            input_feature_train_arr = preprocessor_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessor_obj.transform(input_feature_test_df)

            # Append the target as the last column of each array.
            train_arr = np.c_[
                input_feature_train_arr, np.array(target_feature_train_df)
            ]
            test_arr = np.c_[
                input_feature_test_arr, np.array(target_feature_test_df)
            ]

            save_object(
                file_path = self.config.preprocessor_path,
                obj = preprocessor_obj
            )
            # Logged only after save_object returns, so the message is never
            # emitted for a save that failed.
            logger.info("Saved preprocessing object.")

            return (
                train_arr,
                test_arr,
                self.config.preprocessor_path
            )
        except BoxValueError as err:
            raise ValueError("Error occured at initiate data transformation.....") from err


In [8]:
# Run the data-transformation stage on the ingested train/test splits.
# No try/except here: a handler that only re-raises adds nothing, and the
# notebook already shows the full traceback of an uncaught exception.
TRAIN_CSV = "artifacts/data_ingestion/train.csv"
TEST_CSV = "artifacts/data_ingestion/test.csv"

config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)

# Keep the results so later cells can inspect or reuse them instead of
# discarding the return value.
train_arr, test_arr, preprocessor_path = data_transformation.initiate_data_transformation(
    TRAIN_CSV, TEST_CSV
)

[2024-03-13 21:43:35,134: INFO: common_utils: yaml file: config\config.yaml loaded successfully]
[2024-03-13 21:43:35,149: INFO: common_utils: yaml file: params.yaml loaded successfully]
[2024-03-13 21:43:35,153: INFO: common_utils: created directory at: artifacts]
[2024-03-13 21:43:35,158: INFO: common_utils: created directory at: artifacts/data_transformation]


[2024-03-13 21:43:35,298: INFO: 4267195676: Read train and test data completed.]
[2024-03-13 21:43:35,300: INFO: 4267195676: Pipeline initiated.....]
[2024-03-13 21:43:35,305: INFO: 4267195676: Pipeline completed.]
[2024-03-13 21:43:35,313: INFO: 4267195676: columntransformation initiated.....]
[2024-03-13 21:43:35,320: INFO: 4267195676: columntransformation completed.]
[2024-03-13 21:43:35,334: INFO: 4267195676: separate the independent and dependent columns started]
[2024-03-13 21:43:35,365: INFO: 4267195676: separate the independent and dependent columns completed]
[2024-03-13 21:43:35,367: INFO: 4267195676: Applying preprocessing object on train and test dataset]
[2024-03-13 21:43:35,793: INFO: 4267195676: Saved preprocessing object.]
