In [109]:
# notebooks/FeatureEngineering.ipynb

# Import necessary libraries
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

# Make the sibling ``src`` directory importable for the project-local imports.
# abspath() resolves a relative path against the current working directory,
# so the result is the same as joining it with os.getcwd() explicitly.
src_path = os.path.abspath('../src')
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

from data_loader import DataLoader
from feature_engineering import FeatureEngineering


In [110]:
# Load the data
# Builds the project's DataLoader for the raw CSV (path is relative to the
# notebook's working directory) and binds the result of load_data() to `data`,
# the name every later cell reads and rebinds.
# NOTE(review): presumably load_data() returns a pandas DataFrame (later cells
# call select_dtypes / to_csv on `data`) -- confirm in data_loader.
file_path = '../data/raw/data.csv'
loader = DataLoader(file_path)
data = loader.load_data()

In [111]:
# Perform Feature Engineering
# Wraps the loaded data in the project's FeatureEngineering helper; the cells
# below call its methods and rebind `data` to whatever each one returns.
# NOTE(review): whether the helper copies `data` or keeps a reference to the
# same object is not visible from this notebook -- check feature_engineering.
fe = FeatureEngineering(data)

In [112]:
# Create Aggregate Features
# Rebinds `data` to the return value of the helper. Which aggregates are built
# (and over which keys) is defined in the feature_engineering module, not here.
data = fe.create_aggregate_features()

In [113]:
# Extract Temporal Features
# Rebinds `data` to the return value of the helper. The source timestamp column
# and the derived fields are defined in the feature_engineering module, not here.
data = fe.extract_temporal_features()

In [114]:
# Encode Categorical Variables
# Every object-dtype column is encoded according to its cardinality:
#   * fewer than MAX_ONE_HOT_CARDINALITY distinct values -> one-hot encoded
#     (first level dropped), original column removed;
#   * otherwise -> replaced in place by integer codes from LabelEncoder.
print("Encoding Categorical Variables...")
MAX_ONE_HOT_CARDINALITY = 10  # columns with fewer distinct values than this are one-hot encoded
cat_cols = data.select_dtypes(include=['object']).columns

for col in cat_cols:
    if data[col].nunique() < MAX_ONE_HOT_CARDINALITY:
        # One-Hot Encoding for categorical variables with fewer categories
        ohe = OneHotEncoder(sparse_output=False, drop='first')
        # Reuse data's index: pd.concat(axis=1) aligns on index labels, so a
        # frame with a fresh 0..n-1 RangeIndex would be misaligned (and NaNs
        # introduced) whenever `data` carries any other index.
        ohe_df = pd.DataFrame(
            ohe.fit_transform(data[[col]]),
            columns=ohe.get_feature_names_out([col]),
            index=data.index,
        )
        # NOTE(review): concat rebinds `data` to a NEW frame. If the
        # FeatureEngineering helper keeps its own frame, later fe.* calls may
        # not see these one-hot columns -- confirm against feature_engineering.
        data = pd.concat([data, ohe_df], axis=1).drop(columns=[col])
    else:
        # Label Encoding for categorical variables with more categories
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

Encoding Categorical Variables...


In [115]:
# Handle Missing Values
# NOTE(review): `data` is rebound to whatever the helper returns. The captured
# warning below this cell shows the helper operating on its own `self.df`, so
# the categorical encoding applied to the notebook-level `data` may be
# discarded here -- verify that the saved file contains the encoded columns.
data = fe.handle_missing_values()

# Normalize Numerical Features
data = fe.normalize_features()

# Standardize Numerical Features (optional, depending on the need)
# data = fe.standardize_features()

# Save the processed data
processed_path = '../data/processed/processed_data.csv'
# DataFrame.to_csv does not create missing parent directories (it raises
# OSError), so make sure the output folder exists on a fresh checkout.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
data.to_csv(processed_path, index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[col].fillna(self.df[col].mean(), inplace=True)


In [None]:
# Standardize Numerical Features
# Fits a StandardScaler on the float64/int64 columns of `data` and writes the
# scaled values back into those same columns.
print("Standardizing Numerical Features...")
scaler = StandardScaler()
num_cols = data.select_dtypes(include=['float64', 'int64']).columns
scaled_values = scaler.fit_transform(data[num_cols])
data[num_cols] = scaled_values