In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Omar\\Desktop\\Omar_Files\\Python_Analysis\\EndToEndMLProjectGenderClassification\\research'

In [3]:
# Step up one directory (the previous cell's output shows the kernel starting in research/)
# so that relative paths such as config\config.yaml and artifacts/ are resolved from the
# project root.
# NOTE: the target is relative to the *current* directory, so running this cell a second
# time in the same kernel moves up one more level.
os.chdir("../")

In [122]:
%pwd

'c:\\Users\\Omar\\Desktop\\Omar_Files\\Python_Analysis\\EndToEndMLProjectGenderClassification'

In [128]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransfornmationConfig:
    """Read-only bundle of settings for the data-transformation stage.

    Instances are frozen: assigning to a field after construction raises.
    Fields are positional in the order declared below.
    """

    # Directory that receives the generated train.csv / test.csv files.
    root_dir: Path
    # Location of the CSV file that the stage reads as its input.
    data_path: Path
    # Column label(s) removed from the raw frame before column names are cleaned.
    # NOTE(review): annotated as str, but may be a list coming from the schema - confirm.
    drop_cols: str
    # Filled from the schema's LABL_ENCODING entry.
    # NOTE(review): no code visible in this notebook reads this field - confirm intent.
    lblenc_cols: str
    # Column label(s) used as the sort key before encoding.
    # NOTE(review): annotated as str, but may be a list coming from the schema - confirm.
    ordinal_cols: str


In [129]:
from EndToEndMLProjectGenderClassification.constants import *
from EndToEndMLProjectGenderClassification.utils.common import read_yaml,create_directories

In [130]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ) -> None:
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: YAML file with the pipeline configuration.
            params_filepath: YAML file with parameters (stored on ``self.params``).
            schema_filepath: YAML file with the dataset schema.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root is created up front, before any stage asks for its config.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransfornmationConfig:
        """Create the stage directory and return the data-transformation settings.

        Paths come from the ``data_transformation`` section of the config file;
        the column selections come from the schema file.
        """
        stage_section = self.config.data_transformation
        column_schema = self.schema

        create_directories([stage_section.root_dir])

        return DataTransfornmationConfig(
            root_dir=stage_section.root_dir,
            data_path=stage_section.data_path,
            drop_cols=column_schema.DROP_COLUMNS,
            lblenc_cols=column_schema.LABL_ENCODING,
            ordinal_cols=column_schema.ORDINAL_ENCODING,
        )


In [131]:
import sys
import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,StandardScaler,OneHotEncoder,OrdinalEncoder
from EndToEndMLProjectGenderClassification import logger
from EndToEndMLProjectGenderClassification.utils.common import get_size
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
import janitor

In [134]:
class DataTransfornmation:
    """Data-transformation stage: drop columns, encode, split and persist.

    The three public methods form a chain: ``train_test_splitting`` calls
    ``data_encoding``, which calls ``data_dropping``. Each call re-reads the
    CSV from ``config.data_path``, so calling ``train_test_splitting`` alone
    is enough to run the whole stage.
    """

    def __init__(self,config:DataTransfornmationConfig):
        """Store the stage settings (paths and column selections)."""
        self.config = config

    def data_dropping(self):
        """Load the input CSV, drop the configured columns and clean the column names.

        Returns:
            DataFrame read from ``config.data_path`` with ``config.drop_cols``
            removed and ``clean_names()`` applied.
        """
        raw_df = pd.read_csv(self.config.data_path)
        # The drop happens before clean_names(), so drop_cols has to match the
        # column labels exactly as they appear in the CSV header.
        trimmed_df = raw_df.drop(columns=self.config.drop_cols)
        # clean_names() is not a built-in pandas method; presumably it is
        # provided by the `janitor` import at the top of the notebook.
        return trimmed_df.clean_names()

    def data_encoding(self):
        """Sort the cleaned frame and label-encode every object-dtype column.

        Returns:
            DataFrame sorted by ``config.ordinal_cols`` with a fresh 0..n-1
            index, in which each object-dtype column has been replaced by the
            integer codes from a freshly fitted ``LabelEncoder``.
        """
        df = self.data_dropping()

        # drop=True discards the old index instead of materialising it as a
        # column. With a plain reset_index() followed by .drop("index"), a
        # frame that already has a column called "index" would get the old
        # index inserted as "level_0" and lose its real "index" column.
        # NOTE(review): the sort changes row order (and hence which rows land
        # in each split) but not the LabelEncoder codes - confirm intent.
        df = df.sort_values(by=self.config.ordinal_cols).reset_index(drop=True)

        # NOTE(review): config.lblenc_cols is not consulted here; every
        # object-dtype column is encoded, and the fitted encoders are not kept.
        for col in df.select_dtypes(include="object").columns:
            df[col] = LabelEncoder().fit_transform(df[col])
        return df

    def train_test_splitting(self, test_size=0.2, random_state=42):
        """Split the encoded data and write train.csv / test.csv to ``config.root_dir``.

        Args:
            test_size: Passed to ``train_test_split``. Defaults to 0.2.
            random_state: Seed passed to ``train_test_split``. Defaults to 42.

        Returns:
            None. The two splits are written as CSV files (without the index
            column) and their shapes are logged.
        """
        encoded_df = self.data_encoding()

        train_set, test_set = train_test_split(
            encoded_df, test_size=test_size, random_state=random_state
        )

        train_set.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test_set.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Data Splitting is completed")
        logger.info(train_set.shape)
        logger.info(test_set.shape)
    

In [135]:
# Run the data-transformation stage end to end.
# train_test_splitting() calls data_encoding(), which calls data_dropping(),
# so one call runs the whole chain. Calling the earlier steps separately
# would only re-read and re-process the CSV and discard the result.
# Errors are left to propagate unchanged so the cell shows the full traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransfornmation(config=data_transformation_config)
data_transformation.train_test_splitting()

[2024-09-08 21:49:34,566: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-09-08 21:49:34,569: INFO: common: yaml file: params.yaml loaded successfully]
[2024-09-08 21:49:34,574: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-09-08 21:49:34,576: INFO: common: created directory at: artifacts]
[2024-09-08 21:49:34,578: INFO: common: created directory at: artifacts/data_transformation]
[2024-09-08 21:49:35,190: INFO: 2417638356: Data Splitting is completed]
[2024-09-08 21:49:35,191: INFO: 2417638356: (54284, 8)]
[2024-09-08 21:49:35,192: INFO: 2417638356: (13572, 8)]
