In [2]:
import os

In [3]:
%pwd

'c:\\Project\\end_to_end_project\\Flight_Fare_prediction\\research'

In [4]:
os.chdir('../')

In [5]:
%pwd

'c:\\Project\\end_to_end_project\\Flight_Fare_prediction'

In [25]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    preprocessor_obj_file_path: Path
    

In [8]:
from src.mlProject.constants import *
from src.mlProject.utils.common import read_yaml, create_directories, save_object

In [61]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            preprocessor_obj_file_path=config.preprocessor_obj_file_path
        )

        return data_transformation_config

In [31]:
import os
import pandas as pd
import numpy as np
import pickle
from src.mlProject import logger
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler


In [68]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def get_data_transformation(self):

        try:
            numerical_columns = ["Total_Stops","Journey_day","Journey_month","Journey_weekday","Journey_year","Dep_Time_Hr",
                                 "Dep_Time_Min","Arr_Time_Hr","Arr_Time_Min","Duration_Hour","Duration_Minute"]
            categorical_columns = [
                "Airline",
                "Source",
                "Destination",
                
            ]

            num_pipeline= Pipeline(
                steps=[
                ("imputer",SimpleImputer(strategy="median")),
                ("scaler",StandardScaler())

                ]
            )

            cat_pipeline=Pipeline(

                steps=[
                ("imputer",SimpleImputer(strategy="most_frequent")),
                ("one_hot_encoder",OneHotEncoder()),
                ("scaler",StandardScaler(with_mean=False))
                ]

            )

            logger.info(f"Categorical columns: {categorical_columns}")
            logger.info(f"Numerical columns: {numerical_columns}")

            preprocessor=ColumnTransformer(
                [
                ("num_pipeline",num_pipeline,numerical_columns),
                ("cat_pipelines",cat_pipeline,categorical_columns)

                ]
            )

            with open('preprocessor.pkl', 'wb') as file:
                pickle.dump(preprocessor, file)

            return preprocessor
        
        except Exception as e:
            raise e 
       
    def initiate_data_transformation(self):
        try:
            df = pd.read_csv("artifacts/data_cleaning/cleaned_data.csv")

            print(df.shape)

            logger.info("Obtaining preprocessing object")

            preprocessing_obj = self.get_data_transformation()
            target_column_name = "Price"

            numerical_columns = ["Total_Stops", "Journey_day", "Journey_month", "Journey_weekday", "Journey_year",
                                 "Dep_Time_Hr", "Dep_Time_Min", "Arr_Time_Hr", "Arr_Time_Min", "Duration_Hour",
                                 "Duration_Minute"]

            categorical_columns = [
                "Airline",
                "Source",
                "Destination",
            ]

            input_feature_df = df.drop(columns=[target_column_name], axis=1)
            target_feature_df = df[target_column_name]

            print(input_feature_df.shape)
            

            logger.info(
                "Applying preprocessing object on training dataframe and testing dataframe."
            )
            input_feature_arr = preprocessing_obj.fit_transform(input_feature_df)

            logger.info("Saved preprocessing object.")

            input_arr = np.c_[
                input_feature_arr, np.array(target_feature_df)
            ]

            transformed_data_df = pd.DataFrame(input_arr)
            
            save_object(
                file_path=self.config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )

            # Save the transformed data to CSV
            transformed_data_path = os.path.join(self.config.root_dir, "transformed_data.csv")
            transformed_data_df.to_csv(transformed_data_path, index=False)
            logger.info(f"Transformed data saved at: {transformed_data_path}")


        except Exception as e:
            raise e


In [69]:
try:
    config = ConfigurationManager()
    data_trans_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_trans_config)
    data_transformation.initiate_data_transformation()

except Exception as e:
    raise e

[2024-01-31 17:59:29,073: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-01-31 17:59:29,081: INFO: common: yaml file: params.yaml loaded successfully]
[2024-01-31 17:59:29,089: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-01-31 17:59:29,089: INFO: common: created directory at: artifacts]
[2024-01-31 17:59:29,097: INFO: common: created directory at: artifacts/data_cleaning]
(10462, 15)
[2024-01-31 17:59:29,145: INFO: 727000088: Obtaining preprocessing object]
[2024-01-31 17:59:29,145: INFO: 727000088: Categorical columns: ['Airline', 'Source', 'Destination']]
[2024-01-31 17:59:29,145: INFO: 727000088: Numerical columns: ['Total_Stops', 'Journey_day', 'Journey_month', 'Journey_weekday', 'Journey_year', 'Dep_Time_Hr', 'Dep_Time_Min', 'Arr_Time_Hr', 'Arr_Time_Min', 'Duration_Hour', 'Duration_Minute']]
(10462, 14)
[2024-01-31 17:59:29,161: INFO: 727000088: Applying preprocessing object on training dataframe and testing dataframe.]


[2024-01-31 17:59:29,305: INFO: 727000088: Saved preprocessing object.]
[2024-01-31 17:59:30,628: INFO: 727000088: Transformed data saved at: artifacts/data_cleaning\transformed_data.csv]
