# 7. Feature Selection

Based on the feature importance analysis, we'll select the most impactful features for model training. We'll use multiple selection strategies and compare their effectiveness.

In [None]:
# Split the data, then run four feature-selection strategies on the training
# portion and compare the two top-k strategies against each other.
print("=== PREPARING DATA FOR FEATURE SELECTION ===")

# Number of features requested from the two top-k strategies (1 and 4).
# Kept in one place so the selector calls and the printed labels cannot drift.
TOP_K = 20

# Split first: every selector below is given only X_train / y_train
X_train, X_test, y_train, y_test = prepare_model_data(df_features, target_column, test_size=0.2)

print("✅ Data split complete:")
print(f"  • Training samples: {len(X_train):,}")
print(f"  • Testing samples: {len(X_test):,}")
print(f"  • Total features: {X_train.shape[1]}")

# Apply different feature selection strategies
print("\n=== FEATURE SELECTION ANALYSIS ===\n")

# Column names in positional order; the selectors return (names, indices) pairs
feature_names = list(X_train.columns)
print(f"Total available features: {len(feature_names)}")

# Strategy 1: top-k features by importance
top20_features, top20_indices = select_top_k_features(X_train, y_train, feature_names, k=TOP_K)
print(f"\n1. TOP {TOP_K} FEATURES BY IMPORTANCE:")
for i, feature in enumerate(top20_features, 1):
    print(f"   {i:2d}. {feature}")

# Strategy 2: features contributing to 80% cumulative importance
cum80_features, cum80_indices = select_by_cumulative_importance(X_train, y_train, feature_names, threshold=0.8)
print("\n2. FEATURES FOR 80% CUMULATIVE IMPORTANCE:")
print(f"   Number of features needed: {len(cum80_features)}")
for i, feature in enumerate(cum80_features, 1):
    print(f"   {i:2d}. {feature}")

# Strategy 3: features contributing to 90% cumulative importance
# (only the count is reported; the individual names are not listed)
cum90_features, cum90_indices = select_by_cumulative_importance(X_train, y_train, feature_names, threshold=0.9)
print("\n3. FEATURES FOR 90% CUMULATIVE IMPORTANCE:")
print(f"   Number of features needed: {len(cum90_features)}")

# Strategy 4: statistical feature selection (top-k)
stat_features, stat_indices = select_statistical_features(X_train, y_train, feature_names, k=TOP_K)
print(f"\n4. TOP {TOP_K} STATISTICAL FEATURES:")
for i, feature in enumerate(stat_features, 1):
    print(f"   {i:2d}. {feature}")

# Compare overlap between the two top-k methods
print("\n=== FEATURE SELECTION COMPARISON ===")
top20_set = set(top20_features)
stat_set = set(stat_features)
overlap = top20_set.intersection(stat_set)

# Report against the largest overlap actually possible: a selector may return
# fewer than TOP_K names (e.g. when fewer features are available), in which
# case a fixed "/20" denominator would be misleading.
max_overlap = min(len(top20_set), len(stat_set))
print(f"Overlap between RF Importance and Statistical: {len(overlap)}/{max_overlap} features")
print(f"Common features: {sorted(overlap)}")

In [None]:
# Freeze the feature subset that the following sections will train on.
print("=== PREPARING SELECTED FEATURES FOR MODEL TRAINING ===")

# The 80% cumulative-importance strategy is taken as the selected set
selected_features, selected_feature_indices = cum80_features, cum80_indices

# Positional column slices of the train/test frames; .copy() makes them
# independent objects rather than slices of X_train / X_test
X_train_selected = X_train.iloc[:, selected_feature_indices].copy()
X_test_selected = X_test.iloc[:, selected_feature_indices].copy()

# Report how much the feature space shrank
n_original = X_train.shape[1]
n_selected = X_train_selected.shape[1]
reduction_pct = (n_original - n_selected) / n_original * 100
print(f"Original feature count: {n_original}")
print(f"Selected feature count: {n_selected}")
print(f"Feature reduction: {reduction_pct:.1f}%")

print(f"Training set shape: {X_train_selected.shape}")
print(f"Test set shape: {X_test_selected.shape}")

print("Selected features saved for model training phase:")
for rank, name in enumerate(selected_features, start=1):
    print(f"  {rank:2d}. {name}")

# Name-based leakage check: flag any selected feature whose name contains
# "vib" (case-insensitive). This also matches every name containing
# "vibration", so a single substring test is sufficient.
vibration_features = [name for name in selected_features if 'vib' in name.lower()]
if not vibration_features:
    print("✅ CONFIRMED: No vibration-related features in selected set - data leakage prevented")
else:
    print(f"⚠️  WARNING: Found vibration-related features in selection: {vibration_features}")

In [None]:
# Compare candidate feature sets with a quick random-forest fit, plot the
# results, and recommend one set.
#
# Each entry maps a display name to (positional column indices, feature count).
feature_sets = {
    'All Features': (list(range(len(feature_names))), len(feature_names)),
    'Top 20 RF': (top20_indices, len(top20_features)),
    'Top 20 Statistical': (stat_indices, len(stat_features)),
    'Cumulative 80%': (cum80_indices, len(cum80_features)),
    'Cumulative 90%': (cum90_indices, len(cum90_features)),
}

print("=== FEATURE SET SUMMARY ===")
for name, (indices, count) in feature_sets.items():
    print(f"{name}: {count} features")

# Quick model performance comparison with different feature sets
print("\n=== QUICK PERFORMANCE COMPARISON ===")
results = []

for set_name, (indices, count) in feature_sets.items():
    # Select this set's columns by position
    X_train_subset = X_train.iloc[:, indices]
    X_test_subset = X_test.iloc[:, indices]

    # Small forest with a fixed seed: quick to fit and repeatable across sets
    rf_quick = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
    rf_quick.fit(X_train_subset, y_train)

    # Predict and evaluate on both splits
    train_pred = rf_quick.predict(X_train_subset)
    test_pred = rf_quick.predict(X_test_subset)

    train_r2 = r2_score(y_train, train_pred)
    test_r2 = r2_score(y_test, test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))

    results.append({
        'Feature Set': set_name,
        'Features': count,
        'Train R²': train_r2,
        'Test R²': test_r2,
        'Train RMSE': train_rmse,
        'Test RMSE': test_rmse,
        # Train/test R² gap, used below as the overfitting indicator
        'Overfitting': train_r2 - test_r2,
    })

    print(f"{set_name:15s} ({count:3d} features): R² = {test_r2:.4f}, RMSE = {test_rmse:.3f}")

# Create results DataFrame (one row per feature set)
results_df = pd.DataFrame(results)
print("\n=== DETAILED RESULTS ===")
print(results_df.round(4))

# Visualize feature selection results
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# 1. Performance comparison: test R² against feature count
performance_data = results_df[['Feature Set', 'Features', 'Test R²', 'Test RMSE']].copy()
ax1.scatter(performance_data['Features'], performance_data['Test R²'], s=100, alpha=0.7)
for _, row in performance_data.iterrows():
    ax1.annotate(row['Feature Set'],
                 (row['Features'], row['Test R²']),
                 xytext=(5, 5), textcoords='offset points', fontsize=9)
ax1.set_xlabel('Number of Features')
ax1.set_ylabel('Test R²')
ax1.set_title('Feature Set Performance: R² vs Number of Features')
ax1.grid(True, alpha=0.3)

# 2. Overfitting analysis
ax2.bar(results_df['Feature Set'], results_df['Overfitting'], alpha=0.7)
ax2.set_xlabel('Feature Set')
ax2.set_ylabel('Overfitting (Train R² - Test R²)')
ax2.set_title('Overfitting Analysis by Feature Set')
ax2.tick_params(axis='x', rotation=45)
ax2.grid(True, alpha=0.3)

# 3. Feature type distribution for the 80% cumulative set, bucketed by name
feature_types = {'Rolling_Mean': [], 'Rolling_Std': [], 'Original': [], 'Temporal': []}

for feature in cum80_features:
    if '_rolling_mean_' in feature:
        feature_types['Rolling_Mean'].append(feature)
    elif '_rolling_std_' in feature:
        feature_types['Rolling_Std'].append(feature)
    elif feature in ['hour', 'day_of_week', 'month']:
        feature_types['Temporal'].append(feature)
    else:
        # Anything not matched by the patterns above is counted as "Original"
        feature_types['Original'].append(feature)

# Plot only the categories that actually occur: zero-size wedges add
# overlapping "0.0%" labels, and an all-empty selection has nothing to draw.
type_labels = [label for label, members in feature_types.items() if members]
type_counts = [len(feature_types[label]) for label in type_labels]
if type_counts:
    ax3.pie(type_counts, labels=type_labels, autopct='%1.1f%%', startangle=90)
ax3.set_title(f'Feature Type Distribution\n(80% Cumulative Set - {len(cum80_features)} features)')

# 4. Performance vs complexity tradeoff: test RMSE against feature count
ax4.scatter(results_df['Features'], results_df['Test RMSE'], s=100, alpha=0.7, color='red')
for _, row in results_df.iterrows():
    ax4.annotate(row['Feature Set'],
                 (row['Features'], row['Test RMSE']),
                 xytext=(5, 5), textcoords='offset points', fontsize=9)
ax4.set_xlabel('Number of Features')
ax4.set_ylabel('Test RMSE')
ax4.set_title('Feature Set Performance: RMSE vs Number of Features')
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Recommend optimal feature set
print("\n=== FEATURE SELECTION RECOMMENDATION ===")
best_r2 = results_df.loc[results_df['Test R²'].idxmax()]
# Trade-off candidates: test R² above 0.85 with fewer than 50 features
best_tradeoff = results_df.loc[(results_df['Test R²'] > 0.85) & (results_df['Features'] < 50)]
if not best_tradeoff.empty:
    # Among the qualifying sets, prefer the smallest one
    best_tradeoff = best_tradeoff.loc[best_tradeoff['Features'].idxmin()]
    recommended = best_tradeoff
    print(f"Recommended: {best_tradeoff['Feature Set']} ({best_tradeoff['Features']} features)")
    print(f"  - Test R²: {best_tradeoff['Test R²']:.4f}")
    print(f"  - Test RMSE: {best_tradeoff['Test RMSE']:.3f}")
    print("  - Good balance of performance and complexity")
else:
    # No set meets the trade-off thresholds: fall back to the best test R²
    recommended = best_r2
    print(f"Best overall: {best_r2['Feature Set']} ({best_r2['Features']} features)")
    print(f"  - Test R²: {best_r2['Test R²']:.4f}")
    print(f"  - Test RMSE: {best_r2['Test RMSE']:.3f}")

# Report the set that was actually recommended above, and list that set's own
# features by mapping its positional indices back to column names (previously
# the top-20 list was printed regardless of which set was named).
recommended_name = recommended['Feature Set']
recommended_indices, _ = feature_sets[recommended_name]
recommended_features = [feature_names[idx] for idx in recommended_indices]
print(f"\nFinal selected features for modeling: {recommended_name}")
print(f"Features: {sorted(recommended_features)}")