In [1]:
import os

In [2]:
%pwd

'c:\\Users\\karthikeya\\Insurance_Premium_Prediction\\notebooks'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\karthikeya\\Insurance_Premium_Prediction'

In [5]:
import sys
from typing import List
from dataclasses import dataclass
import joblib as jl
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from src.logger import logger

In [6]:

class Duplicate_Dropper(BaseEstimator, TransformerMixin):
    """
    Drops duplicated rows from the dataframe
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_=X.copy()
        X_ = X_.drop_duplicates(keep="first")
        X_ = X_.reset_index(drop=True)
        return X_

In [7]:
class Numerical_Imputer(BaseEstimator, TransformerMixin):
    
    """
    Simple median imputer for Numerical Columns
    """
    
    def __init__(self, variables:List=None):
         self.variables= variables
    
    def fit(self, X, y=None):

        self.imputer_dict_={}

        for feature in self.variables:
            self.imputer_dict_[feature] =X[feature].median()
        return self
    
    def transform(self, X):
        X_ = X.copy()
        for feature in self.variables:
            X_[feature].fillna(self.imputer_dict_[feature], inplace=True)
        return X_

In [8]:
class Categorical_Imputer(BaseEstimator, TransformerMixin):

    """
    Simple mode imputer for Categorical Columns
    """
    
    def __init__(self, variables:List=None):
         self.variables= variables
    
    def fit(self, X, y=None):

        self.imputer_dict_={}

        for feature in self.variables:
            self.imputer_dict_[feature] =X[feature].mode()
        return self
    
    def transform(self, X):
        X_ = X.copy()
        for feature in self.variables:
            X_[feature].fillna(self.imputer_dict_[feature], inplace=True)
        return X_

In [9]:
from src.constants import Numerical_Cols, Categorical_Cols, train_data, test_data, train_labels, test_labels, target_column_name

In [10]:
@dataclass
class DataColumns:
    Numerical_Columns = Numerical_Cols
    Categorical_Columns = Categorical_Cols
    train_df = train_data
    test_df = test_data
    train_target = train_labels
    test_target = test_labels
    target = target_column_name

In [11]:
@dataclass
class DataTransformationConfig:
    preprocessor_obj_file_path = os.path.join("artifacts", 'preprocessors.joblib')

In [None]:
class DataTransformation:
    def __init__(self):
        
        self.data_transformation_config = DataTransformationConfig()
        self.data_columns = DataColumns()

    def get_data_transformer_object(self):

        """
        This function is responisble for data transformation
    
        """
        try:

            

            num_pipeline = Pipeline(

                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler())]
            )

            cat_pipeline = Pipeline(

                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("OneHotEncoder", OneHotEncoder()),
                    ("scaler", StandardScaler(with_mean=False))]
            )
            logger.info("Numerical columns standard scaling completed")
            logger.info("Categorical columns encoding completed")
            
            preprocessor = ColumnTransformer(
                [
                    # ("Duplicate_Dropper", duplicate_pipeline, self.data_columns.Numerical_Columns+self.data_columns.Categorical_Columns),
                    ("Numerical_Pipeline", num_pipeline, self.data_columns.Numerical_Columns),
                    ("Categorical_Pipeline", cat_pipeline, self.data_columns.Categorical_Columns)
                     ]
            )


            preprocessor_obj = jl.dump(preprocessor,self.data_transformation_config.preprocessor_obj_file_path)
            
            logger.info(f"saved preprocessor as joblib file in the path {self.data_transformation_config.preprocessor_obj_file_path}")
            return preprocessor
        except Exception as e:
            raise e

    def initiate_data_transformation(self, train_path, test_path):

        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)
            
            logger.info("Reading train and test data completed")

            logger.info("Obtaining preprocessing object")

            prerprocessing_obj = self.get_data_transformer_object()

            logger.info("Applying preprocessor on train dataset and test dataset")
            
            input_feature_arr = prerprocessing_obj.fit_transform(train_df.drop(columns=self.data_columns.target, axis=1))


            input_test_arr = prerprocessing_obj.transform(test_df.drop(columns=self.data_columns.target, axis=1))

            train_arr = np.c_[
                input_feature_arr, np.array(train_df[self.data_columns.target].values)
            ]

            test_arr = np.c_[
                input_test_arr, np.array(test_df[self.data_columns.target].values)
            ]

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path
            )
        except Exception as e:
            raise e
            
            

In [34]:
from src.components.data_ingestion import DataIngestion, DataIngestionConfig

In [35]:
DI =DataIngestion()
_,Tr_D,Ts_D=DI.initiate_data_ingestion()
DT=DataTransformation()
DT.initiate_data_transformation(Tr_D, Ts_D)

[2024-11-19 16:17:24,377, INFO, data_ingestion, Entered the data ingestion method ]
[2024-11-19 16:17:24,379, INFO, data_ingestion, Establishing Connection with SQLite databse ]
[2024-11-19 16:17:24,381, INFO, utils, Successfully connected to the SQLite database. ]
[2024-11-19 16:17:31,199, INFO, data_ingestion, Successfuly read the raw data as dataframe ]
[2024-11-19 16:17:31,201, INFO, utils, Disconnected from the SQLite database. ]
[2024-11-19 16:17:31,202, INFO, data_ingestion, Disconnected from SQLite database ]
[2024-11-19 16:17:31,203, INFO, data_ingestion, Train Test Split Initiated ]
[2024-11-19 16:17:46,664, INFO, data_ingestion, Data ingestion is complete ]
[2024-11-19 16:17:50,248, INFO, 3617766053, Reading train and test data completed ]
[2024-11-19 16:17:50,249, INFO, 3617766053, Obtaining preprocessing object ]
[2024-11-19 16:17:50,250, INFO, 3617766053, Numerical columns standard scaling completed ]
[2024-11-19 16:17:50,251, INFO, 3617766053, Categorical columns encodin

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_[feature].fillna(self.imputer_dict_[feature], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_[feature].fillna(self.imputer_dict_[feature], inplace=True)


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 699981 and the array at index 1 has size 700000

In [45]:
DataColumns().target

['charges']