# Feature Engineering

In this notebook, we will apply the preprocessing pipeline to our raw data and then engineer new features to improve our model's performance.

In [None]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
from src.data_preprocessing import preprocess_data

# Read the three raw splits (train, test, validation) from ../data/raw,
# one dataframe per CSV file, in that order
raw_paths = (
    '../data/raw/train.csv',
    '../data/raw/test.csv',
    '../data/raw/validation.csv',
)
train_df, test_df, validation_df = map(pd.read_csv, raw_paths)

print("Raw data loaded.")

## 1. Apply Preprocessing

In [None]:
# Run every raw split through the shared preprocessing function so that
# train, test and validation all receive the same treatment
train_processed, test_processed, validation_processed = (
    preprocess_data(raw_frame)
    for raw_frame in (train_df, test_df, validation_df)
)

print("Preprocessing complete.")

## 2. Inspect Processed Data

In [None]:
# Preview the processed training data: show its first 5 rows
train_processed.head(n=5)

In [None]:
# Print a concise summary of the processed training data (columns,
# non-null counts and dtypes) to spot missing values or unexpected types
# before engineering new features
train_processed.info()

## 3. Feature Engineering

In [None]:
import warnings

# Bin edges in months. With right=False every bin is half-open [lo, hi), so
# the last edge is 73 to keep a tenure of 72 (the stated maximum) in the
# final bin.
tenure_bins = [0, 12, 24, 48, 60, 73]
tenure_labels = ['0-12 Months', '12-24 Months', '24-48 Months', '48-60 Months', '60-72 Months']


def _encode_tenure(df, split_name):
    """Replace 'Tenure in Months' with one-hot encoded tenure-bin columns.

    Args:
        df: Dataframe that contains a 'Tenure in Months' column.
        split_name: Name of the split, used only in the warning message.

    Returns:
        A new dataframe without 'Tenure in Months' and with the tenure-bin
        dummy columns appended. The first bin ('0-12 Months') is dropped and
        acts as the baseline. The input dataframe is not modified.

    Warns:
        UserWarning: if any row has a tenure that is missing or outside the
            bin edges. Such rows get no bin and end up with all-zero dummy
            columns, which looks exactly like the dropped baseline bin.
    """
    binned = pd.cut(
        df['Tenure in Months'],
        bins=tenure_bins,
        labels=tenure_labels,
        right=False,
    )

    # pd.cut yields NaN for missing or out-of-range values; surface that
    # instead of letting those rows silently pass as the baseline category.
    unbinned = int(binned.isna().sum())
    if unbinned:
        warnings.warn(
            f"{split_name}: {unbinned} row(s) have a 'Tenure in Months' value "
            f"that is missing or outside [{tenure_bins[0]}, {tenure_bins[-1]}); "
            "they get no tenure bin and are encoded as all-zero dummy columns.",
            stacklevel=2,
        )

    # The binned column is categorical with a fixed label set, so every split
    # receives the same dummy columns even if a bin is empty in that split.
    with_bins = df.assign(**{'Tenure Bins': binned})
    encoded = pd.get_dummies(with_bins, columns=['Tenure Bins'], drop_first=True)

    # The raw tenure is now represented by the bin indicators.
    return encoded.drop(columns=['Tenure in Months'])


train_processed = _encode_tenure(train_processed, 'train')
test_processed = _encode_tenure(test_processed, 'test')
validation_processed = _encode_tenure(validation_processed, 'validation')

print("Feature engineering complete.")
train_processed.head()

## 4. Save Processed Data

In [None]:
# Persist each engineered split as a CSV under data/processed, leaving the
# dataframe index out of the file
outputs = (
    (train_processed, '../data/processed/train_processed.csv'),
    (test_processed, '../data/processed/test_processed.csv'),
    (validation_processed, '../data/processed/validation_processed.csv'),
)
for frame, destination in outputs:
    frame.to_csv(destination, index=False)

print("Processed data saved.")