In [2]:
import os

In [3]:
%pwd

'c:\\development\\Machine-Learning\\end-to-end-ml-project\\research'

In [4]:
os.chdir("../")

In [5]:
%pwd

'c:\\development\\Machine-Learning\\end-to-end-ml-project'

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path

In [7]:
from ml_project.constants import *
from ml_project.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
        )

        return data_transformation_config

In [9]:
from ml_project import logger
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

import pandas as pd


[2024-02-25 13:00:01,611: INFO: utils:  Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.]
[2024-02-25 13:00:01,616: INFO: utils:  NumExpr defaulting to 8 threads.]


In [25]:
class DataTransformation:

    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.df = pd.read_csv(self.config.data_path , sep=';')

    def binary_encoding(df, columns, positive_value):
        for col in columns:
            df[col] = df[col].apply(lambda x: 1 if x == positive_value else 0)
        return df     

    def month_transformer(self) -> pd.DataFrame:
        # print(self.df.columns)
        month_mapping = {
            'jan': 1,
            'feb': 2,
            'mar': 3,
            'apr': 4,
            'may': 5,
            'jun': 6,
            'jul': 7,
            'aug': 8,
            'sep': 9,
            'oct': 10,
            'nov': 11,
            'dec': 12
        }
        self.df['month'] = self.df['month'].str.lower()
        self.df.month = self.df.month.map(month_mapping)

        self.df.day = self.df.day.astype(str).str.zfill(2)
        self.df.month = self.df.month.astype(str).str.zfill(2)

        self.df['date'] = self.df.day + '-' + self.df.month

        self.df['date_month'] = pd.to_datetime('2024-' + self.df['date'], format='%Y-%d-%m', errors='coerce')\
                            .dt.strftime('%d-%m')

        self.df.drop(['date', 'day', 'month'], axis=1, inplace=True)
        
        
    def age_transformer(self) -> pd.DataFrame:
        blanks = []
        
        for age in self.df['age']:
            if 18 <= age <= 30:
                blanks.append('18-30')
            elif 31 <= age <= 40:
                blanks.append('31-40')
            elif 41 <= age <= 50:
                blanks.append('41-50')
            elif 51 <= age <= 60:
                blanks.append('51-60')
            elif 61 <= age <= 70:
                blanks.append('61-70')
            elif 71 <= age <= 80:
                blanks.append('71-80')
            elif 81 <= age <= 90:
                blanks.append('81-90')
            elif 91 <= age <= 95:
                blanks.append('91-95')
            else:
                blanks.append('Unknown')
                
        self.df['age_group'] = blanks   
        self.df.drop('age', axis=1, inplace=True)
        
        
    
        

    def binary(self):
        def binary_encoding(df, columns, positive_value):
            for col in columns:
                df[col] = df[col].apply(lambda x: 1 if x == positive_value else 0)
            return df 
        binary_columns = ['default', 'housing', 'loan', 'y']
        self.df = binary_encoding(self.df, binary_columns, 'yes')
        
    
    def categorical_encoding(self):
        
        one_hot_columns = ['job', 'marital', 'education', 'contact', 'poutcome']
        label_columns = ['age_group' , 'date_month'] 
        
        self.df = pd.get_dummies(self.df, columns=one_hot_columns, drop_first=True ,  dtype='int64' )
        print(self.df.head())
        le = LabelEncoder()
        for col in label_columns:
            self.df[col] = le.fit_transform(self.df[col])
    
    
    def scaling(self):
        scaler = StandardScaler()
        continuous_features = self.df[[col for col in self.df.columns if self.df[col].dtype != 'object']]
        self.df[continuous_features.columns] = scaler.fit_transform(continuous_features)
    
    
    
    def train_test_split(self):
        X = self.df.drop('y', axis=1)
        y = self.df['y']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        logger.info("Splited data into training and test sets")
        logger.info(X_train.shape)
        logger.info(y_train.shape)
        X_train.to_csv(os.path.join(self.config.root_dir, "X_train.csv"),index = False)
        X_test.to_csv(os.path.join(self.config.root_dir, "X_test.csv"),index = False)
        y_train.to_csv(os.path.join(self.config.root_dir, "y_train.csv"),index = False)
        y_test.to_csv(os.path.join(self.config.root_dir, "y_test.csv"),index = False)

        return X_train, X_test, y_train, y_test
    
    def transform(self):
        self.month_transformer()
        self.age_transformer()
        self.binary()
        self.scaling()
        self.categorical_encoding()
        self.train_test_split()    
        
            
        
    

In [26]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.transform()
except Exception as e:
    raise e

[2024-02-25 13:10:34,114: INFO: common:  yaml file: config\config.yaml loaded successfully]
[2024-02-25 13:10:34,115: INFO: common:  yaml file: params.yaml loaded successfully]
[2024-02-25 13:10:34,117: INFO: common:  yaml file: schema.yaml loaded successfully]
[2024-02-25 13:10:34,118: INFO: common:  created directory at: artifacts]
[2024-02-25 13:10:34,119: INFO: common:  created directory at: artifacts/data_transformation]
   default   balance   housing      loan  duration  campaign     pdays  \
0 -0.13549  0.256419  0.893915 -0.436803  0.011016 -0.569351 -0.411453   
1 -0.13549 -0.437895  0.893915 -0.436803 -0.416127 -0.569351 -0.411453   
2 -0.13549 -0.446762  0.893915  2.289359 -0.707361 -0.569351 -0.411453   
3 -0.13549  0.047205  0.893915 -0.436803 -0.645231 -0.569351 -0.411453   
4 -0.13549 -0.447091 -1.118674 -0.436803 -0.233620 -0.569351 -0.411453   

   previous         y date_month  ... marital_married  marital_single  \
0  -0.25194 -0.363983      05-05  ...               