- Data Transformation

In [2]:
# features
from src.data.feature_engineering import FeatureEngineering

# Pandas and Numpy
import pandas as pd
import numpy as np

# helpers
from helpers.config import load_config
from helpers.logger import logger

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from typing import Tuple

class DataTransformation:
    """A utility class to scale training and testing features and split training and testing features."""
    
    def __init__(self, config: dict, data: FeatureEngineering | None = None):
        """Initialize DataTransformation class.
        
        Args:
            config (dict): A configuration file consisting of files, target, unused features, features ect.
            data (FeatureEngineering): Feature Engineering module with cleaned data and selected features.
        """
        
        self.config = config or load_config()
        self.data = data or FeatureEngineering(self.config)
        self.scaler = StandardScaler()
        
    def split_and_scale_features(self) -> Tuple[np.ndarray, np.ndarray]:
        """selected features scaled using StandardScaler and split 80/20.
        
        Returns:
        X_train_scaled (np.ndarray): A scaled dataframe consisting of 80% of features.
        X_test_scaled (np.ndarray): A scaled dataframe consisting of 20% of features.
        
        Raises:
            ValueError: If Splitting and scaling features fails.
        """
        try:
            data = self.data.select_features()
            if data is None:
                raise ValueError("Could not get Features from feature engineering")
            
            # features from config
            features = self.config['features']    
            
         
            X_train, X_test = train_test_split(data[features], test_size=0.20, random_state=42)
            logger.info("Features have been split")
            # scale X_train and X_test using Standard scaler.
            
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(X_test)
            
            logger.info(f"Shape of X_train_scaled: {X_train_scaled}")
            logger.info(f"Shape of X_test_scaled: {X_test_scaled.shape}")
            
            return X_train_scaled, X_test_scaled
        except Exception as exc:
            logger.error("Failed to split and scale features: %s", exc)
            raise ValueError(f"Could not split and scale data: {exc}") from exc
        
        
    def split_targets(self) -> Tuple[np.ndarray, np.ndarray]:
        """Split targets into training and testing sets.
        
        Returns:
            Tuple[np.ndarray, np.ndarray]: Training and testing target arrays.
            
        Raises:
            ValueError: If Splitting targets fails.
        """
        try:
            data = self.data.select_features()
            if data is None:
                raise ValueError("Could not split targets")
            
            # targets
            targets = self.config['target']
            
            y_train, y_test = train_test_split(data[targets], test_size=0.20, random_state=42)
            
            logger.info("y_train and y_test have been initialized")
            
            logger.info(f"Shape of y_train: {y_train.shape}")
            logger.info(f"Shape of y_test: {y_test.shape}")
            
            return y_train, y_test
        except Exception as exc:
            logger.error("Failed to split targets: %s", exc)
            raise ValueError(f"Could not split targets: {exc}") from exc



        


# X_train_scaled, X_test_scaled
X_train_scaled, X_test_scaled = DataTransformation(config=load_config()).split_and_scale_features()
X_train_scaled, X_test_scaled

[2026-01-04 08:37:41,810: INFO: 852684778: Features have been split]
[2026-01-04 08:37:41,816: INFO: 852684778: Shape of X_train_scaled: [[ 4.90283207  0.40849867  0.50869345 ... -0.6710645  -0.53641908
  -0.87616094]
 [-0.1817844   0.40849867  0.50869345 ...  2.32650402 -0.53641908
  -0.87616094]
 [-0.1817844   0.40849867  0.50869345 ... -0.07155079  1.86421408
  -0.87616094]
 ...
 [-0.1817844   0.40849867  0.50869345 ... -0.07155079 -0.53641908
   1.14134282]
 [-0.1817844   0.40849867  0.50869345 ... -0.6710645  -0.53641908
  -0.87616094]
 [-0.1817844   0.40849867  0.50869345 ... -0.6710645   1.86421408
  -0.87616094]]]
[2026-01-04 08:37:41,816: INFO: 852684778: Shape of X_test_scaled: (476, 15)]


(array([[ 4.90283207,  0.40849867,  0.50869345, ..., -0.6710645 ,
         -0.53641908, -0.87616094],
        [-0.1817844 ,  0.40849867,  0.50869345, ...,  2.32650402,
         -0.53641908, -0.87616094],
        [-0.1817844 ,  0.40849867,  0.50869345, ..., -0.07155079,
          1.86421408, -0.87616094],
        ...,
        [-0.1817844 ,  0.40849867,  0.50869345, ..., -0.07155079,
         -0.53641908,  1.14134282],
        [-0.1817844 ,  0.40849867,  0.50869345, ..., -0.6710645 ,
         -0.53641908, -0.87616094],
        [-0.1817844 ,  0.40849867,  0.50869345, ..., -0.6710645 ,
          1.86421408, -0.87616094]], shape=(1902, 15)),
 array([[-0.1817844 ,  0.40849867, -1.96582047, ..., -0.6710645 ,
          1.86421408,  1.14134282],
        [-0.1817844 ,  0.40849867,  0.50869345, ..., -0.6710645 ,
          1.86421408, -0.87616094],
        [-0.1817844 ,  0.40849867,  0.50869345, ..., -0.07155079,
         -0.53641908, -0.87616094],
        ...,
        [-0.1817844 ,  0.40849867,  

In [3]:
# y_train and y_test
y_train, y_test = DataTransformation(config=load_config()).split_targets()
y_train, y_test

[2026-01-04 08:37:45,409: INFO: 852684778: y_train and y_test have been initialized]
[2026-01-04 08:37:45,410: INFO: 852684778: Shape of y_train: (1902,)]
[2026-01-04 08:37:45,410: INFO: 852684778: Shape of y_test: (476,)]


(1909    0
 1378    0
 1655    0
 286     0
 759     0
        ..
 1638    0
 1095    0
 1130    0
 1294    0
 860     0
 Name: approved, Length: 1902, dtype: int64,
 2339    0
 443     0
 100     0
 1820    1
 56      0
        ..
 618     0
 457     0
 1736    0
 707     0
 1851    1
 Name: approved, Length: 476, dtype: int64)