In [31]:
import os

In [32]:
%pwd

'c:\\Project\\end_to_end_project\\Flight_Fare_prediction'

In [3]:
os.chdir('../')

In [30]:
%pwd

'c:\\Project\\end_to_end_project\\Flight_Fare_prediction'

In [33]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataCleaningConfig:
    root_dir: Path
    data_path: Path
    validated_path: Path

In [34]:
from src.mlProject.constants import *
from src.mlProject.utils.common import read_yaml, create_directories

In [35]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_cleaning_config(self) -> DataCleaningConfig:
        config = self.config.data_cleaning

        create_directories([config.root_dir])

        data_cleaning_config = DataCleaningConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            validated_path=config.validated_path
        )

        return data_cleaning_config

In [36]:
import os
from src.mlProject import logger
import pandas as pd
import numpy as np

In [53]:
class DataCleaning:
    def __init__(self, config: DataCleaningConfig):
        self.config = config

    def data_cleaning(self):
        data = self.load_data()

        if data is not None:
            cleaned_data = self.perform_data_cleaning(data)
            self.save_cleaned_data(cleaned_data)
        else:
            logger.error("Validation failed. Check the input data and schema")

    def load_data(self):
        data_path = self.config.data_path
        try:
            with open(data_path, 'r') as file:
                data = pd.read_csv(file)
            return data
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            return None

    def perform_data_cleaning(self, df):
        logger.info("Data cleaning process started")

        # Make a copy of the DataFrame to avoid modifying the original data
        data = df.copy()

        # Data cleaning operations here...
        data.dropna(inplace=True)
        data.drop_duplicates(inplace=True)
        data.reset_index(drop=True, inplace=True)
        data['Date_of_Journey'] = pd.to_datetime(data['Date_of_Journey'])
        data['Journey_day'] = pd.DatetimeIndex(data['Date_of_Journey']).day
        data['Journey_month'] = pd.DatetimeIndex(data['Date_of_Journey']).month
        data['Journey_weekday'] = pd.DatetimeIndex(data['Date_of_Journey']).weekday
        data['Journey_year'] = pd.DatetimeIndex(data['Date_of_Journey']).year
        data['Destination'] = np.where(data['Destination'] == 'Delhi', 'New Delhi', data['Destination'])
        data['Dep_Time_Hr'] = data['Dep_Time'].str.extract('(\d+):(\d+)').astype(int)[0]
        data['Dep_Time_Min'] = data['Dep_Time'].str.extract('(\d+):(\d+)').astype(int)[1]
        data['Arr_Time_Hr'] = data['Arrival_Time'].str.extract('(\d+):(\d+)').astype(int)[0]
        data['Arr_Time_Min'] = data['Arrival_Time'].str.extract('(\d+):(\d+)').astype(int)[1]
        data['Duration_Hour'] = data['Duration'].str.extract('(\d+)h', expand=False).fillna(0).astype(int)
        data['Duration_Minute'] = data['Duration'].str.extract('(\d+)m', expand=False).fillna(0).astype(int)
        data['Total_Stops'] = data['Total_Stops'].map({'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4})
        data.drop(columns=['Date_of_Journey', 'Dep_Time', 'Arrival_Time', 'Route', 'Additional_Info', 'Duration'], axis=1,
                  inplace=True)

        logger.info("Data cleaning process completed")
        return data

    def save_cleaned_data(self, df):
        try:
            cleaned_data_path = os.path.join(self.config.root_dir, "cleaned_data.csv")
            df.to_csv(cleaned_data_path, index=False)
            logger.info("Cleaned data is saved in artifacts folder.")
        except Exception as e:
            logger.error(f"Error saving cleaned data: {e}")


In [54]:
try:
    config = ConfigurationManager()
    data_cleaning_config = config.get_data_cleaning_config()
    data_cleaning_process = DataCleaning(config=data_cleaning_config)
    data_cleaning_process.data_cleaning()

except Exception as e:
    raise e


[2024-01-31 15:02:09,737: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-01-31 15:02:09,745: INFO: common: yaml file: params.yaml loaded successfully]
[2024-01-31 15:02:09,753: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-01-31 15:02:09,753: INFO: common: created directory at: artifacts]
[2024-01-31 15:02:09,761: INFO: common: created directory at: artifacts/data_cleaning]
[2024-01-31 15:02:09,817: INFO: 3731321430: Data cleaning process started]


  data['Date_of_Journey'] = pd.to_datetime(data['Date_of_Journey'])


[2024-01-31 15:02:10,443: INFO: 3731321430: Data cleaning process completed]
[2024-01-31 15:02:10,667: INFO: 3731321430: Cleaned data is saved in artifacts folder.]
