In [3]:
# Import necessary libraries
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

# Add the parent directory (project root) to sys.path so the `src` package can be imported
import sys
sys.path.append('..')

from src.data_loader import DataLoader
from src.eda import EDA
from src.feature_engineering import FeatureEngineering


In [4]:
# Load the raw dataset through the project's DataLoader wrapper.
# The relative path is resolved against the kernel's current working directory.
file_path = '../data/raw/data.csv'
loader = DataLoader(file_path)
data = loader.load_data()

2024-06-07 00:19:36,976 - src.logger - INFO - Data loaded successfully from ../data/raw/data.csv


In [5]:
# Perform Feature Engineering
# A single FeatureEngineering instance wraps the loaded frame; the fe.* calls
# in the following cells all go through this instance.
fe = FeatureEngineering(data)

In [6]:
# Create Aggregate Features
# Rebinds `data` to whatever the helper returns.
# NOTE(review): which columns get added is decided in src.feature_engineering and
# is not visible here. The recorded output logs this step twice, which suggests
# the logger has duplicate handlers attached — confirm in src.logger.
data = fe.create_aggregate_features()

2024-06-07 00:19:37,000 - src.logger - INFO - Creating aggregate features...
2024-06-07 00:19:37,000 - src.logger - INFO - Creating aggregate features...


In [7]:
# Extract Temporal Features
# Rebinds `data` to the helper's return value.
# NOTE(review): the source timestamp column and the derived fields are defined
# in src.feature_engineering — confirm there.
data = fe.extract_temporal_features()

2024-06-07 00:19:37,057 - src.logger - INFO - Extracting temporal features...
2024-06-07 00:19:37,057 - src.logger - INFO - Extracting temporal features...


In [8]:
# Encode Categorical Variables
# Object columns with fewer than 10 distinct values are one-hot encoded (first
# level dropped); every other object column is label encoded in place.
print("Encoding Categorical Variables...")
cat_cols = data.select_dtypes(include=['object']).columns

ohe_frames = []       # one-hot blocks, concatenated once after the loop
ohe_source_cols = []  # original columns that the one-hot blocks replace

for col in cat_cols:
    if data[col].nunique() < 10:
        # One-Hot Encoding for categorical variables with fewer categories
        ohe = OneHotEncoder(sparse_output=False, drop='first')
        ohe_frames.append(
            pd.DataFrame(
                ohe.fit_transform(data[[col]]),
                columns=ohe.get_feature_names_out([col]),
                # Reuse data's index: with a default RangeIndex, pd.concat(axis=1)
                # aligns on labels and yields NaN rows when data is not indexed 0..n-1.
                index=data.index,
            )
        )
        ohe_source_cols.append(col)
    else:
        # Label Encoding for categorical variables with more categories
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

# Single concat instead of rebuilding the whole frame once per encoded column.
if ohe_frames:
    data = pd.concat([data.drop(columns=ohe_source_cols), *ohe_frames], axis=1)

# NOTE(review): the concat rebinds `data` to a new frame that `fe` was never given;
# verify that later fe.* calls operate on (and return) this encoded frame.

Encoding Categorical Variables...


In [9]:
# Handle Missing Values
# Rebinds `data` to the helper's return value.
# NOTE(review): the encoding cell above rebinds `data` via pd.concat, so confirm the
# frame returned here still contains the one-hot columns. The recorded pandas warning
# shows the helper filling with the column mean through a chained
# `self.df[col].fillna(..., inplace=True)`, which pandas says will stop working in 3.0;
# that needs fixing in the helper itself.
data = fe.handle_missing_values()

2024-06-07 00:19:38,433 - src.logger - INFO - Handling missing values...
2024-06-07 00:19:38,433 - src.logger - INFO - Handling missing values...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[col].fillna(self.df[col].mean(), inplace=True)


In [10]:
# Normalize Numerical Features
# Rebinds `data` to the helper's return value.
# NOTE(review): the next cell also runs StandardScaler over the numeric columns —
# confirm that applying both scaling passes is intended.
data = fe.normalize_features()

2024-06-07 00:19:38,505 - src.logger - INFO - Normalizing features...
2024-06-07 00:19:38,505 - src.logger - INFO - Normalizing features...


In [11]:
# Standardize Numerical Features
# Fit a StandardScaler on the int64/float64 feature columns and overwrite them
# with their standardized values.
print("Standardizing Numerical Features...")
# 'FraudResult' is the label column (it is the WoE target later in this notebook).
# It must not be standardized, otherwise the saved labels are no longer the
# original class values.
target_col = 'FraudResult'
num_cols = (
    data.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop(target_col, errors='ignore')  # tolerate frames without the label
)
scaler = StandardScaler()
# StandardScaler raises on an empty column selection, so skip when nothing qualifies.
if len(num_cols) > 0:
    data[num_cols] = scaler.fit_transform(data[num_cols])

Standardizing Numerical Features...


In [12]:
# Save the processed data
processed_file_path = '../data/processed/processed_data.csv'
# Create the output directory first: to_csv does not create missing parent
# directories and fails on a fresh checkout without it.
os.makedirs(os.path.dirname(processed_file_path), exist_ok=True)
data.to_csv(processed_file_path, index=False)

In [4]:
# notebooks/FeatureEngineering.ipynb

# Import necessary libraries
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from xverse.transformer import WOE

# Add the parent directory (project root) to sys.path so the `src` package can be imported
import sys
sys.path.append('..')

from src.data_loader import DataLoader
from src.eda import EDA
from src.feature_engineering import FeatureEngineering

# Load the data
# The relative path is resolved against the kernel's current working directory.
file_path = '../data/raw/data.csv'
loader = DataLoader(file_path)
data = loader.load_data()

# Perform Feature Engineering
# A single FeatureEngineering instance wraps the loaded frame; every fe.* call
# below goes through it.
fe = FeatureEngineering(data)

# Create Aggregate Features
# NOTE(review): the columns added are defined in src.feature_engineering, not here.
# The recorded output repeats each log line three times, which suggests logger
# handlers accumulate across runs — confirm in src.logger.
data = fe.create_aggregate_features()

# Extract Temporal Features
# Rebinds `data` to the helper's return value.
data = fe.extract_temporal_features()

# Encode Categorical Variables
# Object columns with fewer than 10 distinct values are one-hot encoded (first
# level dropped); every other object column is label encoded in place.
print("Encoding Categorical Variables...")
cat_cols = data.select_dtypes(include=['object']).columns

ohe_frames = []       # one-hot blocks, concatenated once after the loop
ohe_source_cols = []  # original columns that the one-hot blocks replace

for col in cat_cols:
    if data[col].nunique() < 10:
        # One-Hot Encoding for categorical variables with fewer categories
        ohe = OneHotEncoder(sparse_output=False, drop='first')
        ohe_frames.append(
            pd.DataFrame(
                ohe.fit_transform(data[[col]]),
                columns=ohe.get_feature_names_out([col]),
                # Reuse data's index: with a default RangeIndex, pd.concat(axis=1)
                # aligns on labels and yields NaN rows when data is not indexed 0..n-1.
                index=data.index,
            )
        )
        ohe_source_cols.append(col)
    else:
        # Label Encoding for categorical variables with more categories
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

# Single concat instead of rebuilding the whole frame once per encoded column.
if ohe_frames:
    data = pd.concat([data.drop(columns=ohe_source_cols), *ohe_frames], axis=1)

# NOTE(review): the concat rebinds `data` to a new frame that `fe` was never given;
# verify that the fe.handle_missing_values() call below still sees these columns.

# Handle Missing Values
# Rebinds `data` to the helper's return value.
# NOTE(review): the encoding step above rebinds `data` via pd.concat, so confirm the
# frame returned here still contains the one-hot columns. The recorded pandas warning
# shows the helper using a chained `self.df[col].fillna(..., inplace=True)`, which
# pandas says will stop working in 3.0.
data = fe.handle_missing_values()

# Apply WoE Transformation
print("Applying WoE Transformation...")
woe = WOE()
# 'FraudResult' is the target variable column. Fit WoE on the feature columns only:
# leaving the target inside X leaks the label into the features and also replaces
# the label itself with a WoE value.
target_col = 'FraudResult'
woe_target = data[target_col]
woe_features = data.drop(columns=[target_col])
data_woe = woe.fit_transform(woe_features, woe_target)
# Re-attach the untouched label so the saved dataset still carries it. Assigning by
# position avoids any dependence on the index of the transformer's output.
data_woe[target_col] = woe_target.to_numpy()
# NOTE(review): the recorded run of this cell ended with KeyError: 'target'. No
# 'target' key is referenced in this notebook, so the error is presumably raised
# inside the WOE implementation — confirm against the installed xverse/pandas versions.

# Standardize Numerical Features
# Fit a StandardScaler on the int64/float64 feature columns of the WoE frame and
# overwrite them with their standardized values.
print("Standardizing Numerical Features...")
# 'FraudResult' is the label column; it must not be standardized, otherwise the
# saved labels are no longer the original class values.
target_col = 'FraudResult'
num_cols = (
    data_woe.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop(target_col, errors='ignore')  # tolerate frames without the label
)
scaler = StandardScaler()
# StandardScaler raises on an empty column selection, so skip when nothing qualifies.
if len(num_cols) > 0:
    data_woe[num_cols] = scaler.fit_transform(data_woe[num_cols])

# Write the WoE-transformed frame to disk as CSV (row index omitted).
processed_file_path = '../data/processed/processed_data.csv'
output_dir = os.path.dirname(processed_file_path)
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(output_dir, exist_ok=True)
data_woe.to_csv(processed_file_path, index=False)

print(f"Processed data saved to {processed_file_path}")

2024-06-08 10:09:20,916 - src.logger - INFO - Data loaded successfully from ../data/raw/data.csv
2024-06-08 10:09:20,916 - src.logger - INFO - Data loaded successfully from ../data/raw/data.csv
2024-06-08 10:09:20,918 - src.logger - INFO - Creating aggregate features...
2024-06-08 10:09:20,918 - src.logger - INFO - Creating aggregate features...
2024-06-08 10:09:20,918 - src.logger - INFO - Creating aggregate features...
2024-06-08 10:09:20,974 - src.logger - INFO - Extracting temporal features...
2024-06-08 10:09:20,974 - src.logger - INFO - Extracting temporal features...
2024-06-08 10:09:20,974 - src.logger - INFO - Extracting temporal features...


Encoding Categorical Variables...


2024-06-08 10:09:21,985 - src.logger - INFO - Handling missing values...
2024-06-08 10:09:21,985 - src.logger - INFO - Handling missing values...
2024-06-08 10:09:21,985 - src.logger - INFO - Handling missing values...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[col].fillna(self.df[col].mean(), inplace=True)


Applying WoE Transformation...


KeyError: 'target'