# Feature Engineering for Dynamic Pricing

This notebook demonstrates the feature engineering pipeline for dynamic pricing:
- Time-based feature extraction
- Demand-supply pressure index calculation
- Location-based features
- Customer behavior features
- Interaction features
- Feature importance analysis

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression
import sys
from pathlib import Path

# Make the project importable.
# The project imports below use the `src.` package prefix, so the directory that
# CONTAINS `src` (the parent of the working directory) has to be on sys.path.
# The `src` directory itself is still added as well, exactly as before.
# NOTE(review): assumes the notebook is run from a direct subdirectory of the
# project root (e.g. notebooks/) — confirm.
project_root = Path().absolute().parent
for import_dir in (project_root, project_root / "src"):
    if str(import_dir) not in sys.path:  # avoid duplicate entries when the cell is re-run
        sys.path.append(str(import_dir))

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Import project modules
from src.data.load_data import load_raw_data
from src.data.clean import clean_data
from src.features.build_features import build_features, FeatureBuilder
from src.features.time_features import extract_time_features, create_time_buckets
from src.features.pressure_index import calculate_pressure_index, create_surge_indicators
from src.config import NUMERICAL_FEATURES, CATEGORICAL_FEATURES, TARGET_COLUMN

## Load and Prepare Data

In [None]:
# Hold-out split utility (used for this demonstration only)
from sklearn.model_selection import train_test_split

# Read the raw data, then pass it through the cleaning step;
# the second value returned by clean_data is not used in this notebook
df = load_raw_data()
df_clean, _ = clean_data(df)

print(
    f"Original dataset shape: {df.shape}",
    f"Cleaned dataset shape: {df_clean.shape}",
    f"Target variable: {TARGET_COLUMN}",
    sep="\n",
)

# 80/20 train/test partition of the cleaned data with a fixed seed
train_df, test_df = train_test_split(df_clean, test_size=0.2, random_state=42)

print(f"Training set: {train_df.shape}", f"Test set: {test_df.shape}", sep="\n")

## Time-Based Feature Engineering

In [None]:
# Run time-feature extraction on a copy of the training frame
df_time = extract_time_features(train_df.copy())

# Columns present in df_time that train_df does not have, in df_time's column order
time_features = df_time.columns[~df_time.columns.isin(train_df.columns)].tolist()

print("Time-based features added:")
for feature in time_features:
    print(f"  - {feature}")

print(f"\nDataset shape after time features: {df_time.shape}")

# Summary statistics restricted to the newly added columns
time_feature_stats = df_time[time_features].describe()
display(time_feature_stats)

In [None]:
# Visualize time features
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Time-Based Features Distribution', fontsize=16)

# Candidate features to plot; keep only those that actually exist in df_time
numerical_time_features = ['booking_hour', 'hour_sin', 'hour_cos', 'is_rush_hour', 'demand_score', 'time_multiplier']
numerical_time_features = [f for f in numerical_time_features if f in df_time.columns]

# One histogram per available feature, filling the 2x3 grid row by row.
# zip() stops at the shorter sequence, so at most six features are drawn.
panels = axes.flatten()
for ax, feature in zip(panels, numerical_time_features):
    df_time[feature].hist(bins=30, ax=ax, alpha=0.7)
    ax.set_title(feature)
    ax.set_xlabel('')
    ax.set_ylabel('Frequency')

# Hide panels that received no feature so missing columns do not leave empty axes
for ax in panels[len(numerical_time_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## Demand-Supply Pressure Features

In [None]:
# Compute the pressure-index columns from a copy of the time-feature frame
df_pressure = calculate_pressure_index(df_time.copy())

# Columns present in df_pressure that df_time does not have, in df_pressure's column order
pressure_features = df_pressure.columns[~df_pressure.columns.isin(df_time.columns)].tolist()

print("Pressure index features added:")
for feature in pressure_features:
    print(f"  - {feature}")

print(f"\nDataset shape after pressure features: {df_pressure.shape}")

# Summary statistics restricted to the newly added columns
pressure_feature_stats = df_pressure[pressure_features].describe()
display(pressure_feature_stats)

In [None]:
# Visualize pressure features
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Pressure Index Features Distribution', fontsize=16)

# Candidate features to plot; keep only those that actually exist in df_pressure
key_pressure_features = ['demand_supply_ratio', 'supply_demand_ratio', 'pressure_index',
                         'market_imbalance', 'base_surge_multiplier', 'surge_probability']
key_pressure_features = [f for f in key_pressure_features if f in df_pressure.columns]

# One histogram per available feature, filling the 2x3 grid row by row.
# zip() stops at the shorter sequence, so at most six features are drawn.
panels = axes.flatten()
for ax, feature in zip(panels, key_pressure_features):
    df_pressure[feature].hist(bins=30, ax=ax, alpha=0.7)
    ax.set_title(feature)
    ax.set_xlabel('')
    ax.set_ylabel('Frequency')

# Hide panels that received no feature so missing columns do not leave empty axes
for ax in panels[len(key_pressure_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## Surge Indicator Features

In [None]:
# Build surge indicators from a copy of the pressure-feature frame
df_surge = create_surge_indicators(df_pressure.copy())

# Columns present in df_surge that df_pressure does not have, in df_surge's column order
surge_features = df_surge.columns[~df_surge.columns.isin(df_pressure.columns)].tolist()

print("Surge indicator features added:")
for feature in surge_features:
    print(f"  - {feature}")

print(f"\nDataset shape after surge features: {df_surge.shape}")

# Report how often each surge level occurs, but only when that column exists
if 'surge_level' in df_surge.columns:
    surge_level_counts = df_surge['surge_level'].value_counts()
    print("\nSurge level distribution:", surge_level_counts, sep="\n")

## Comprehensive Feature Engineering Pipeline

In [None]:
# Use the comprehensive feature builder
feature_builder = FeatureBuilder()
df_features, fitted_builder = build_features(train_df.copy(), fit_transform=True)

# feature_builder is used further down to save encoders and to transform the
# test set, so it should be the object that build_features actually fitted.
# Previously the second return value was discarded and only the fresh
# FeatureBuilder() above was used.
# NOTE(review): assumes build_features returns (features, fitted FeatureBuilder) — confirm.
# If it returns anything else, the fresh instance is kept, as before.
if isinstance(fitted_builder, FeatureBuilder):
    feature_builder = fitted_builder

print(f"Final feature dataset shape: {df_features.shape}")
print(f"Total features created: {df_features.shape[1] - 1}")  # -1 for target

# Display feature groups: group size plus a preview of at most three names
feature_groups = feature_builder.get_feature_importance_groups()
print("\nFeature groups:")
for group, features in feature_groups.items():
    print(f"  {group}: {len(features)} features")
    for feature in features[:3]:  # Show first 3
        print(f"    - {feature}")
    if len(features) > 3:
        print(f"    ... and {len(features) - 3} more")
    print()

## Feature Correlation Analysis

In [None]:
# Correlation analysis with target
feature_columns = [col for col in df_features.columns if col != TARGET_COLUMN]

# Correlation is only computed for numeric columns. Restricting to them up front
# avoids the error DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
numeric_df = df_features.select_dtypes(include=[np.number])
correlations = (
    numeric_df.corr()[TARGET_COLUMN]
    .drop(TARGET_COLUMN)  # remove the target's self-correlation by label, not by position
    .sort_values(key=abs, ascending=False)
)

print("Top 20 features correlated with target:")
top_correlations = correlations.head(20)
for feature, corr in top_correlations.items():
    print(f"  {feature}: {corr:.4f}")

In [None]:
# Heatmap of pairwise correlations among the 15 strongest features and the target
plt.figure(figsize=(12, 8))
top_15_features = top_correlations.head(15).index
correlation_matrix = df_features[[*top_15_features, TARGET_COLUMN]].corr()

# Annotated, zero-centred diverging colour map with square cells
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.2f',
    cbar_kws={'shrink': 0.8},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Top Features')
plt.tight_layout()
plt.show()

## Feature Selection

In [None]:
# Prepare data for feature selection
X = df_features[feature_columns]
y = df_features[TARGET_COLUMN]

# Remove any remaining non-numeric columns
numeric_features = X.select_dtypes(include=[np.number]).columns
X_numeric = X[numeric_features]

print(f"Numeric features for selection: {len(numeric_features)}")

# Univariate feature selection.
# k is capped at the number of available columns: SelectKBest rejects (or, in
# newer scikit-learn releases, warns about) a k larger than n_features.
n_selected = min(20, len(numeric_features))
selector = SelectKBest(score_func=f_regression, k=n_selected)
X_selected = selector.fit_transform(X_numeric, y)
support_mask = selector.get_support()
selected_features = numeric_features[support_mask]  # kept in column order for later cells

# Print in descending score order so the rank number shown is meaningful
print(f"\nTop {n_selected} selected features:")
feature_scores = selector.scores_[support_mask]
ranked_features = sorted(
    zip(selected_features, feature_scores),
    key=lambda pair: pair[1],
    reverse=True,
)
for i, (feature, score) in enumerate(ranked_features):
    print(f"  {i+1:2d}. {feature}: {score:.2f}")

## Feature Importance Analysis

In [None]:
# Random Forest regressor, fitted here to obtain per-feature importances
from sklearn.ensemble import RandomForestRegressor

# Restrict the numeric matrix to the columns kept by the univariate selector
X_top = X_numeric[selected_features]

# 100 trees, fixed seed; fit() returns the estimator itself
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_top, y)

# Pair each selected feature with its importance, highest first
importance_scores = rf.feature_importances_
feature_importance = pd.DataFrame(
    {'feature': selected_features, 'importance': importance_scores}
).sort_values(by='importance', ascending=False)

print("Feature Importance (Random Forest):")
display(feature_importance.head(15))

In [None]:
# Horizontal bar chart of the 15 highest-ranked features
plt.figure(figsize=(12, 8))
top_15_importance = feature_importance.head(15)
bar_positions = range(len(top_15_importance))

plt.barh(bar_positions, top_15_importance['importance'])
plt.yticks(bar_positions, top_15_importance['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 15 Feature Importance (Random Forest)')

# Flip the y-axis so the first row of the table is drawn at the top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## Feature Engineering Summary

In [None]:
# Collect the headline results of the notebook into a single dictionary
feature_summary = dict(
    original_features=len(train_df.columns),
    engineered_features=len(df_features.columns),
    feature_groups=feature_groups,
    top_correlated_features=top_correlations.head(10).to_dict(),
    selected_features=selected_features.tolist(),
    feature_importance=feature_importance.head(10).to_dict('records'),
)

n_original = feature_summary['original_features']
n_engineered = feature_summary['engineered_features']

print("Feature Engineering Summary:")
print(f"  Original features: {n_original}")
print(f"  Engineered features: {n_engineered}")
print(f"  Features created: {n_engineered - n_original}")

# Size of every feature group
print("\nFeature Groups:")
for group, features in feature_summary['feature_groups'].items():
    print(f"  {group}: {len(features)} features")

# First five entries of the importance records, numbered from 1
print("\nTop 5 Most Important Features:")
for rank, record in enumerate(feature_summary['feature_importance'][:5], start=1):
    print(f"  {rank}. {record['feature']}: {record['importance']:.4f}")

## Save Feature Engineered Dataset

In [None]:
# Save the feature engineered dataset
output_path = Path().absolute().parent / "data" / "processed"
# parents=True: also create "data/" when it does not exist yet, instead of
# failing with FileNotFoundError on a fresh checkout
output_path.mkdir(parents=True, exist_ok=True)

# Save training features
train_features_path = output_path / "train_features.csv"
df_features.to_csv(train_features_path, index=False)
print(f"Training features saved to: {train_features_path}")

# Save feature builder
encoders_path = output_path / "feature_encoders.json"
feature_builder.save_encoders(str(encoders_path))
print(f"Feature encoders saved to: {encoders_path}")

# Apply same transformation to test set.
# A copy is passed, as is done for the training frame earlier in the notebook,
# so that test_df itself cannot be modified by the transform.
test_features = feature_builder.transform(test_df.copy())
test_features_path = output_path / "test_features.csv"
test_features.to_csv(test_features_path, index=False)
print(f"Test features saved to: {test_features_path}")

print(f"\nFinal datasets:")
print(f"  Train: {df_features.shape}")
print(f"  Test: {test_features.shape}")

## Key Insights

### Feature Engineering Results:
1. **Time Features**: Successfully extracted cyclical time features, rush hour indicators, and demand scores
2. **Pressure Index**: Created comprehensive demand-supply dynamics metrics
3. **Surge Indicators**: Developed multi-level surge classification and probability estimates
4. **Interaction Features**: Generated location-time, loyalty-pressure, and other interaction terms

### Most Important Features:
1. _TODO: fill in the top feature from the Random Forest importance table above_
2. _TODO: fill in the second most important feature_
3. _TODO: fill in the third most important feature_

### Feature Selection Insights:
- _TODO: number of features selected vs. total numeric features_
- _TODO: feature reduction percentage_
- _TODO: performance implications of the reduced feature set_

### Recommendations for Modeling:
1. Use top 20-30 features for baseline models
2. Consider feature groups for interpretable models
3. Monitor feature importance drift over time
4. Implement feature validation in production