In [5]:
%pwd

'c:\\Users\\marni\\Desktop\\Customer_churn\\Customer_churn\\research'

In [6]:
os.chdir("../")

In [7]:
%pwd

'c:\\Users\\marni\\Desktop\\Customer_churn\\Customer_churn'

In [2]:
import sys 
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,FunctionTransformer
from Customer_churn.entity.config_entity import DataTransformationConfig

from Customer_churn.exception import CustomException
from Customer_churn.logging import logging
from Customer_churn.utils.common import save_object
import os
from Customer_churn.config.configuration import ConfigurationManager

In [26]:
class DataTransformation:
    def __init__(self,config: DataTransformationConfig):
        self.config=config
        
    def get_data_transformer_object(self):
        '''
        This function is responsible for data transformation
        '''
        try:
            numerical_columns = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts","HasCrCard", "IsActiveMember", "EstimatedSalary","Complain","SatisfactionScore","PointEarned"]
            categorical_columns = [
                "Gender",
                "CardType"
            ]
            card_type_mapping = {
                'DIAMOND': 1,
                'PLATINUM': 2,
                'GOLD': 3,
                'SILVER': 4
            }

            # Function to map Card_Type
            def map_card_type(card_type_series):
                return card_type_series.map(card_type_mapping)

            num_pipeline = Pipeline(
                steps=[
                    ("scaler", StandardScaler()),
                ]
            )
            cat_pipeline = ColumnTransformer(
                transformers=[
                    ("gender_pipeline", 
                    Pipeline(steps=[
                        ("one_hot_encoder", OneHotEncoder())
                    ]), ["Gender"]
                    ),
                    ("card_type_pipeline", 
                    Pipeline(steps=[
                        ("card_type_mapper", FunctionTransformer(map_card_type, validate=False)),
                        ("scaler", StandardScaler(with_mean=False))
                    ]), ["CardType"]
                    )
                ]
)

            logging.info(f"Numerical columns encoded: {numerical_columns}")
            logging.info(f"Categorical columns encoded: {categorical_columns}")
            

            preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pilpline", cat_pipeline, categorical_columns),
                ]
            )

            return preprocessor
        except Exception as e:
            raise CustomException(e,sys)

    def initiate_data_transformation(self):
        try:



            df=pd.read_csv(self.config.raw_data_path)
            logging.info('Read the dataset as dataframe')

            logging.info("Train test split initiated")
            train_set,test_set=train_test_split(df,test_size=0.2,random_state=42)

            train_set.to_csv(self.config.train_data_path,index=False,header=True)

            test_set.to_csv(self.config.test_data_path,index=False,header=True)

            logging.info("Ingestion of the data is completed")





            train_df=pd.read_csv(self.config.train_data_path)
            test_df=pd.read_csv(self.config.test_data_path)

            logging.info("Read train and test data completed")
            logging.info("Obtaining preprocessing object")

            preprocessing_obj=self.get_data_transformer_object()

            # target_column_name="Exited"
            
            # input_feature_train_df=train_df.drop(columns=[target_column_name],axis=1)
            # target_feature_train_df=train_df[target_column_name]

            # input_feature_test_df=test_df.drop(columns=[target_column_name],axis=1)
            # target_feature_test_df=test_df[target_column_name]

            input_feature_train_df = train_df[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'SatisfactionScore', 'CardType', 'PointEarned']]
            target_feature_train_df = train_df['Exited']

            input_feature_test_df = test_df[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'SatisfactionScore', 'CardType', 'PointEarned']]
            target_feature_test_df = test_df['Exited']

            logging.info(
                f"Applying preprocessing object on training dataframe and testing dataframe."
            )
            print("train df colomns = ********** : ",input_feature_train_df.columns)
            input_feature_train_arr=preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr=preprocessing_obj.transform(input_feature_test_df)

            train_arr = np.c_[
                input_feature_train_arr, np.array(target_feature_train_df)
            ]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            logging.info(f"Saved preprocessing object.")

            save_object(

                file_path=self.config.preprocessor_obj_file_path,
                obj=preprocessing_obj

            )

            return (train_arr,test_arr)

        except Exception as e:
            raise CustomException(e,sys)

In [27]:
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.initiate_data_transformation()

[2024-08-21 19:58:50,297: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-08-21 19:58:50,298: INFO: common: yaml file: params.yaml loaded successfully]
[2024-08-21 19:58:50,299: INFO: common: created directory at: artifacts]
[2024-08-21 19:58:50,300: INFO: common: created directory at: artifacts/data_transformation]
[2024-08-21 19:58:50,318: INFO: 3980172065: Read the dataset as dataframe]
[2024-08-21 19:58:50,319: INFO: 3980172065: Train test split initiated]
[2024-08-21 19:58:50,364: INFO: 3980172065: Ingestion of the data is completed]
[2024-08-21 19:58:50,395: INFO: 3980172065: Read train and test data completed]
[2024-08-21 19:58:50,396: INFO: 3980172065: Obtaining preprocessing object]
[2024-08-21 19:58:50,397: INFO: 3980172065: Numerical columns encoded: ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'SatisfactionScore', 'PointEarned']]
[2024-08-21 19:58:50,397: INFO: 3980172065: C

CustomException: Error occured in python script name [C:\Users\marni\AppData\Local\Temp\ipykernel_16412\3980172065.py] line number [109] error message [the first argument must be callable]

In [38]:
train_df=pd.read_csv("artifacts/data_transformation/train.csv")
test_df=pd.read_csv("artifacts/data_transformation/test.csv")

In [44]:
def map_card_type(card_type_series):
    card_type_mapping = {
        'DIAMOND': 1,
        'PLATINUM': 2,
        'GOLD': 3,
        'SILVER': 4
    }
    return card_type_series.map(card_type_mapping)

# Test on train_df
train_df['CardType'] = map_card_type(train_df['CardType'])
print(train_df['CardType'].head())

0    1
1    2
2    4
3    4
4    2
Name: CardType, dtype: int64


In [43]:
train_df.dtypes

RowNumber              int64
CustomerId             int64
Surname               object
CreditScore            int64
Geography             object
Gender                object
Age                    int64
Tenure                 int64
Balance              float64
NumOfProducts          int64
HasCrCard              int64
IsActiveMember         int64
EstimatedSalary      float64
Exited                 int64
Complain               int64
SatisfactionScore      int64
CardType              object
PointEarned            int64
dtype: object

In [45]:
def get_data_transformer_object():
    '''
    This function is responsible for data transformation
    '''
    try:
        numerical_columns = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts","HasCrCard", "IsActiveMember", "EstimatedSalary","Complain","SatisfactionScore","PointEarned"]
        categorical_columns = ["Gender", "CardType"]

        # Function to map Card_Type
        def map_card_type(card_type_series):
            card_type_mapping = {
                'DIAMOND': 1,
                'PLATINUM': 2,
                'GOLD': 3,
                'SILVER': 4
            }
            return card_type_series.map(card_type_mapping)

        # Numerical Pipeline
        num_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

        # Categorical Pipeline
        cat_pipeline = ColumnTransformer(
            transformers=[
                ("gender_pipeline", 
                Pipeline(steps=[
                    ("one_hot_encoder", OneHotEncoder())
                ]), ["Gender"]),

                ("card_type_pipeline", 
                Pipeline(steps=[
                    ("card_type_mapper", FunctionTransformer(lambda x: x.map(map_card_type), validate=False)),
                    ("scaler", StandardScaler(with_mean=False))
                ]), ["CardType"])
            ]
        )

        logging.info(f"Numerical columns encoded: {numerical_columns}")
        logging.info(f"Categorical columns encoded: {categorical_columns}")

        # Full Preprocessor
        preprocessor = ColumnTransformer(
            [
                ("num_pipeline", num_pipeline, numerical_columns),
                ("cat_pipeline", cat_pipeline, categorical_columns),  # Fixed the typo from 'cat_pilpline'
            ]
        )

        return preprocessor
    except Exception as e:
        raise CustomException(e, sys)

preprocessing_obj=get_data_transformer_object()
input_feature_train_df = train_df[['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'SatisfactionScore', 'CardType', 'PointEarned']]
target_feature_train_df = train_df['Exited']

input_feature_test_df = test_df[['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'SatisfactionScore', 'CardType', 'PointEarned']]
target_feature_test_df = test_df['Exited']
print("train df colomns = ********** : ",input_feature_train_df.columns)
input_feature_train_arr=preprocessing_obj.fit_transform(input_feature_train_df)

[2024-08-21 20:17:58,310: INFO: 3931572300: Numerical columns encoded: ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'SatisfactionScore', 'PointEarned']]
[2024-08-21 20:17:58,311: INFO: 3931572300: Categorical columns encoded: ['Gender', 'CardType']]
train df colomns = ********** :  Index(['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain',
       'SatisfactionScore', 'CardType', 'PointEarned'],
      dtype='object')


AttributeError: 'int' object has no attribute 'map'

In [31]:
preprocessing_obj=get_data_transformer_object()

[2024-08-21 20:06:24,263: INFO: 2814553181: Numerical columns encoded: ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'SatisfactionScore', 'PointEarned']]
[2024-08-21 20:06:24,263: INFO: 2814553181: Categorical columns encoded: ['Gender', 'CardType']]


In [50]:
train_df=pd.read_csv("artifacts/data_transformation/train.csv")
test_df=pd.read_csv("artifacts/data_transformation/test.csv")

In [35]:
input_feature_train_df = train_df[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'SatisfactionScore', 'CardType', 'PointEarned']]
target_feature_train_df = train_df['Exited']

input_feature_test_df = test_df[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'SatisfactionScore', 'CardType', 'PointEarned']]
target_feature_test_df = test_df['Exited']

logging.info(
                f"Applying preprocessing object on training dataframe and testing dataframe."
            )
print("train df colomns = ********** : ",input_feature_train_df.columns)
input_feature_train_arr=preprocessing_obj.fit_transform(input_feature_train_df)
# input_feature_test_arr=preprocessing_obj.transform(input_feature_test_df)

[2024-08-21 20:09:43,551: INFO: 3840580171: Applying preprocessing object on training dataframe and testing dataframe.]
train df colomns = ********** :  Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Complain', 'SatisfactionScore', 'CardType', 'PointEarned'],
      dtype='object')


TypeError: the first argument must be callable

In [51]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
import numpy as np

def create_preprocessor(numerical_columns, categorical_columns, card_type_mapping):
    # Custom transformer for mapping 'CardType'
    def map_card_type(X):
        return np.vectorize(card_type_mapping.get)(X)
    
    # Pipeline for numerical columns (Standardization)
    numerical_pipeline = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    
    # Pipeline for categorical columns (OneHot Encoding and Standardization)
    categorical_pipeline = Pipeline(steps=[
        ('card_type_mapper', FunctionTransformer(map_card_type, validate=False)), # Map CardType
        ('onehot', OneHotEncoder(sparse_output=False)),                           # OneHotEncode
        ('scaler', StandardScaler())                                              # Standardize
    ])
    
    # Create a column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_pipeline, numerical_columns),
            ('cat', categorical_pipeline, categorical_columns)
        ]
    )
    
    return preprocessor

# Usage:
numerical_columns = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts",
                     "HasCrCard", "IsActiveMember", "EstimatedSalary", "Complain",
                     "SatisfactionScore", "PointEarned"]

categorical_columns = ["Gender", "CardType"]

card_type_mapping = {'DIAMOND': 1, 'PLATINUM': 2, 'GOLD': 3, 'SILVER': 4}

preprocessor = create_preprocessor(numerical_columns, categorical_columns, card_type_mapping)


In [52]:
input_feature_train_df = train_df[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'SatisfactionScore', 'CardType', 'PointEarned']]
target_feature_train_df = train_df['Exited']

input_feature_test_df = test_df[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'SatisfactionScore', 'CardType', 'PointEarned']]
target_feature_test_df = test_df['Exited']

logging.info(
                f"Applying preprocessing object on training dataframe and testing dataframe."
            )
print("train df colomns = ********** : ",input_feature_train_df.columns)
input_feature_train_arr=preprocessor.fit_transform(input_feature_train_df)
input_feature_test_arr=preprocessor.transform(input_feature_test_df)

[2024-08-21 20:27:01,710: INFO: 2799058373: Applying preprocessing object on training dataframe and testing dataframe.]
train df colomns = ********** :  Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Complain', 'SatisfactionScore', 'CardType', 'PointEarned'],
      dtype='object')
