# 6. Feature Engineering
**Purpose**: Create enhanced features using EDA insights while preventing data leakage

In [None]:
# Feature Engineering - Create Enhanced Features
# Builds the engineered frame with engineer_features(), reports how many
# columns were added, screens the predictor names for vibration-derived
# columns, and lists a sample of the newly created features.
print("🔧 Feature Engineering")
print("=" * 50)

# Run the feature-engineering function on the cleaned data
df_features = engineer_features(df_clean, target_column)

# Column counts before and after engineering (both include the target column)
n_original = df_clean.shape[1]
n_enhanced = df_features.shape[1]

print("\n📊 Feature Engineering Results:")
print(f"  • Original features: {n_original}")
print(f"  • Enhanced features: {n_enhanced}")
print(f"  • New features created: {n_enhanced - n_original}")
print(f"  • Data shape: {df_features.shape}")

# Leakage screen: every non-target column is a predictor; any predictor whose
# name contains "VIBRATION" (case-insensitive) is flagged.
feature_columns = [name for name in df_features.columns if name != target_column]
vibration_features = [name for name in feature_columns if 'VIBRATION' in name.upper()]

print("\n🛡️ Data Leakage Check:")
if not vibration_features:
    print("  ✅ PASSED: No vibration features in predictor set")
    print(f"  • Total predictive features: {len(feature_columns)}")
    print(f"  • Target variable: {target_column}")
else:
    print(f"  ❌ WARNING: Found {len(vibration_features)} vibration features in predictors!")
    for leaked_name in vibration_features:
        print(f"    - {leaked_name}")

# List up to ten engineered columns: rolling-window statistics and the
# calendar features, in the order they appear in the frame.
print("\n🔍 Sample of Engineered Features:")
temporal_names = ['hour', 'day_of_week', 'month']
engineered_cols = [
    name
    for name in df_features.columns
    if '_rolling_' in name or name in temporal_names
][:10]
for position, name in enumerate(engineered_cols, start=1):
    print(f"  {position:2d}. {name}")

print(f"\n✅ Feature engineering complete - {n_enhanced} features ready for importance analysis")

In [None]:
# Feature Engineering Quality Assessment
# Summarises the engineered dataset in four parts:
#   1. how many columns fall into each feature category,
#   2. how much missing data the engineered frame contains,
#   3. how strongly engineered features correlate with the target compared
#      with the original features,
#   4. a closing status that reflects the actual leakage check result.
print("\n📈 Feature Engineering Quality Assessment")
print("="*50)

# Calendar-derived column names, shared by the category filters below
temporal_cols = ['hour', 'day_of_week', 'month']

# Analyze feature types
feature_types = {
    'Original': [col for col in df_features.columns if '_rolling_' not in col and col not in temporal_cols + [target_column]],
    'Rolling_Mean': [col for col in df_features.columns if '_rolling_mean_' in col],
    'Rolling_Std': [col for col in df_features.columns if '_rolling_std_' in col],
    'Temporal': [col for col in df_features.columns if col in temporal_cols]
}

print("📊 Feature Type Breakdown:")
total_features = 0
for feat_type, feat_list in feature_types.items():
    print(f"  • {feat_type:<12}: {len(feat_list):3d} features")
    total_features += len(feat_list)

# The target is excluded from every category above, so it is counted separately
print(f"  • {'Target':<12}: {1:3d} feature")
print(f"  • {'Total':<12}: {total_features + 1:3d} features")

# Check feature completeness after engineering
missing_after_engineering = df_features.isnull().sum()
features_with_missing = missing_after_engineering[missing_after_engineering > 0]

# Guard the overall percentage against an empty frame (zero rows or columns)
total_cells = len(df_features) * len(df_features.columns)
avg_missing_pct = (missing_after_engineering.sum() / total_cells) * 100 if total_cells else 0.0

print(f"\n🔍 Data Quality After Engineering:")
print(f"  • Features with missing data: {len(features_with_missing)}/{len(df_features.columns)}")
print(f"  • Average missing percentage: {avg_missing_pct:.2f}%")

if len(features_with_missing) > 0:
    print(f"  • Worst missing features:")
    worst_missing = (features_with_missing / len(df_features) * 100).nlargest(5)
    for feat, pct in worst_missing.items():
        # str() keeps the slice/format valid for non-string column labels
        print(f"    - {str(feat)[:40]:<40}: {pct:.1f}%")

# Quick correlation check of engineered features with target
print(f"\n🎯 Engineered Features Correlation Check:")
# NOTE(review): the [:5] slice binds only to the Rolling_Std list, so this is
# every rolling mean plus the first five rolling stds - confirm that is the
# intended sample.
rolling_features = feature_types['Rolling_Mean'] + feature_types['Rolling_Std'][:5]  # Sample
if rolling_features:
    # Restrict to numeric/bool columns before correlating: DataFrame.corr()
    # raises on non-numeric columns in pandas >= 2.0.
    eng_frame = df_features[rolling_features + [target_column]].select_dtypes(include=['number', 'bool'])
    # Drop the target's self-correlation by label. Removing "the first row
    # after sorting" discards a real feature when the target's own entry is
    # NaN (constant target) or another feature ties it at |r| = 1.
    eng_correlations = (
        eng_frame.corr()[target_column]
        .drop(labels=target_column)
        .abs()
        .sort_values(ascending=False)
    )

    print("  Top 5 Engineered Features by Correlation:")
    for i, (feat, corr) in enumerate(eng_correlations.head(5).items(), 1):
        print(f"    {i}. {str(feat)[:45]:<45} | r = {corr:.3f}")

    avg_eng_corr = eng_correlations.mean()
    print(f"  • Average engineered feature correlation: {avg_eng_corr:.3f}")

    # Compare with original feature average correlation
    orig_frame = df_features[feature_types['Original'] + [target_column]].select_dtypes(include=['number', 'bool'])
    orig_correlations = orig_frame.corr()[target_column].abs()
    avg_orig_corr = orig_correlations[orig_correlations.index != target_column].mean()
    print(f"  • Average original feature correlation: {avg_orig_corr:.3f}")

    # A NaN or zero baseline makes the relative change undefined; report 0 then
    improvement = ((avg_eng_corr - avg_orig_corr) / avg_orig_corr) * 100 if avg_orig_corr > 0 else 0
    print(f"  • Correlation improvement: {improvement:+.1f}%")

# Re-run the name-based leakage screen so the closing status reflects the
# data instead of being asserted unconditionally.
leaky_features = [col for col in feature_columns if 'VIBRATION' in str(col).upper()]

print(f"\n✅ Feature engineering quality assessment complete")
print(f"  • Dataset ready for feature importance analysis")
if leaky_features:
    print(f"  • ❌ Data leakage NOT cleared: {len(leaky_features)} vibration feature(s) still in predictors")
else:
    print(f"  • Zero data leakage confirmed")
print(f"  • {len(feature_columns)} predictive features available")