## 7. Feature Importance
**Purpose**: Identify the most predictive features using machine learning techniques

In [None]:
# Feature Selection Functions - Define before using
def select_top_k_features(X, y, feature_names, k=20):
    """Select the top ``k`` features ranked by Random Forest importance.

    A small forest (25 trees, depth 4, fixed seed) is fit on ``(X, y)`` and
    its ``feature_importances_`` are used to rank the columns of ``X``.

    Args:
        X: Feature matrix accepted by ``RandomForestRegressor.fit``.
        y: Target values.
        feature_names: Column names, positionally aligned with the columns
            of ``X``. Any iterable works (list, ``pandas.Index``, array).
        k: Maximum number of features to return.

    Returns:
        A ``(selected_features, selected_indices)`` tuple of two lists,
        ordered from most to least important. ``selected_indices`` holds the
        column position of each selected feature.
    """
    rf = RandomForestRegressor(n_estimators=25, max_depth=4, random_state=42)
    rf.fit(X, y)

    # Carry each column's position through the sort instead of looking it up
    # afterwards with ``feature_names.index(...)``: that lookup only exists on
    # lists, returns the first match for duplicated names, and is O(n) per
    # feature. ``sorted`` is stable, so ties keep their original column order.
    ranked = sorted(
        enumerate(zip(feature_names, rf.feature_importances_)),
        key=lambda item: item[1][1],
        reverse=True,
    )
    top = ranked[:k]

    selected_features = [name for _, (name, _score) in top]
    selected_indices = [position for position, _ in top]

    return selected_features, selected_indices

def select_by_cumulative_importance(X, y, feature_names, threshold=0.8):
    """Select the most important features until ``threshold`` is covered.

    A small forest (25 trees, depth 4, fixed seed) is fit on ``(X, y)``.
    Features are then taken in decreasing order of importance until their
    summed importance is at least ``threshold``. If the threshold is never
    reached, every feature is returned.

    Args:
        X: Feature matrix accepted by ``RandomForestRegressor.fit``.
        y: Target values.
        feature_names: Column names, positionally aligned with the columns
            of ``X``. Any iterable works (list, ``pandas.Index``, array).
        threshold: Cumulative importance to reach before stopping.

    Returns:
        A ``(selected_features, selected_indices)`` tuple of two lists,
        ordered from most to least important. ``selected_indices`` holds the
        column position of each selected feature.
    """
    rf = RandomForestRegressor(n_estimators=25, max_depth=4, random_state=42)
    rf.fit(X, y)

    # Keep the column position attached to every (name, importance) pair so
    # no ``feature_names.index(...)`` lookup is needed later; that lookup only
    # works on lists and is wrong for duplicated names.
    ranked = sorted(
        enumerate(zip(feature_names, rf.feature_importances_)),
        key=lambda item: item[1][1],
        reverse=True,
    )

    cumulative_importance = 0
    selected_features = []
    selected_indices = []

    for position, (feature, importance) in ranked:
        selected_features.append(feature)
        selected_indices.append(position)
        cumulative_importance += importance
        if cumulative_importance >= threshold:
            break

    return selected_features, selected_indices

def select_statistical_features(X, y, feature_names, k=20):
    """Select features using a univariate statistical test (``f_regression``).

    Args:
        X: Feature matrix accepted by ``SelectKBest.fit``.
        y: Target values.
        feature_names: Column names, positionally aligned with the columns
            of ``X``; must support integer indexing.
        k: Number of features to keep, or ``"all"``. An integer larger than
            the number of names is reduced to that number.

    Returns:
        A ``(selected_features, selected_indices)`` tuple:
        ``selected_features`` is a list of names and ``selected_indices`` is
        the integer array returned by ``get_support(indices=True)``.
    """
    # The default k=20 can exceed the number of available columns, which
    # SelectKBest rejects (or warns about, depending on the version).
    # Clamping keeps the call valid; "all" is passed through untouched.
    if k != "all":
        k = min(k, len(feature_names))

    selector = SelectKBest(score_func=f_regression, k=k)
    # Only the support mask is needed, so fit without building the
    # transformed matrix.
    selector.fit(X, y)

    selected_indices = selector.get_support(indices=True)
    selected_features = [feature_names[i] for i in selected_indices]

    return selected_features, selected_indices

# Confirmation message shown in the cell output once the helpers above are defined.
print("✅ Feature selection functions defined")

In [None]:
# Feature Importance Analysis using Random Forest
#
# Fits a Random Forest on (a subsample of) the training split, ranks features
# by importance, prints the top 20 and a per-feature-type summary, and stores
# the results in `feature_importance`, `importance_by_type` and
# `importance_results` for later cells.
print("🎯 Feature Importance Analysis")
print("="*50)

# Display order of the feature-type summary.
FEATURE_TYPE_ORDER = ('Original', 'Rolling Mean', 'Rolling Std', 'Temporal')
TEMPORAL_FEATURES = ('hour', 'day_of_week', 'month')


def _feature_type(name):
    """Classify a feature name into one of FEATURE_TYPE_ORDER.

    Single source of truth for both the top-20 table and the by-type
    summary, so a feature is always counted under the label it is shown with.
    """
    if '_rolling_mean_' in name:
        return 'Rolling Mean'
    if '_rolling_std_' in name:
        return 'Rolling Std'
    if name in TEMPORAL_FEATURES:
        return 'Temporal'
    return 'Original'


# Prepare data for importance analysis (the held-out split is not used here)
X_temp, _, y_temp, _ = prepare_model_data(df_features, target_column, test_size=0.3)

# Use only a subset of data for faster training
sample_size = min(5000, len(X_temp))  # Limit to 5000 samples max
if len(X_temp) > sample_size:
    from sklearn.utils import resample
    # replace=False: resample() bootstraps (draws WITH replacement) by default,
    # which would duplicate rows instead of taking a true subset.
    X_temp, y_temp = resample(
        X_temp, y_temp, replace=False, n_samples=sample_size, random_state=42
    )

print("📊 Importance Analysis Setup:")
print(f"  • Features for analysis: {X_temp.shape[1]}")
print(f"  • Samples for training: {X_temp.shape[0]:,}")
print(f"  • Target variable: {target_column}")

# Train Random Forest for feature importance (using a subset for speed)
print("\n🌲 Training Random Forest for Feature Importance...")
rf_importance = RandomForestRegressor(
    n_estimators=50,        # Reduced from 100 for speed
    random_state=42,
    n_jobs=-1,
    max_depth=8,           # Reduced from 10
    min_samples_split=10,  # Added to prevent overfitting
    min_samples_leaf=5     # Added to speed up training
)

# Fit the model
rf_importance.fit(X_temp, y_temp)

# Get feature importances, most important first
feature_importance = pd.DataFrame({
    'feature': X_temp.columns,
    'importance': rf_importance.feature_importances_
}).sort_values('importance', ascending=False)

# NOTE: this R² is measured on the same rows the forest was fit on, so it is
# an in-sample score, not an estimate of generalisation. Computed once and
# reused below.
model_r2 = rf_importance.score(X_temp, y_temp)

print("✅ Random Forest trained successfully")
print(f"  • Model R² score: {model_r2:.3f}")
print(f"  • Feature importance calculated for {len(feature_importance)} features")

# Display top features
print("\n🏆 Top 20 Most Important Features:")
print(f"{'Rank':<4} {'Feature':<50} {'Importance':<12} {'Type'}")
print("-" * 80)

for rank, row in enumerate(feature_importance.head(20).itertuples(index=False), 1):
    print(f"{rank:<4} {row.feature[:48]:<50} {row.importance:<12.4f} {_feature_type(row.feature)}")

# Calculate importance by feature type, using the same classifier as the table
feature_types = feature_importance['feature'].map(_feature_type)
importance_by_type = {}
for feat_type in FEATURE_TYPE_ORDER:
    group = feature_importance[feature_types == feat_type]
    if group.empty:
        continue
    importance_by_type[feat_type] = {
        'total_importance': group['importance'].sum(),
        'avg_importance': group['importance'].mean(),
        'count': len(group),
        # `feature_importance` is sorted descending, so the first row of the
        # group is its most important feature.
        'top_feature': group.iloc[0]['feature']
    }

print("\n📊 Feature Importance by Type:")
print(f"{'Type':<15} {'Count':<7} {'Total Imp':<11} {'Avg Imp':<10} {'Top Feature'}")
print("-" * 80)

# `type_stats` rather than `stats` to avoid shadowing a possible `scipy.stats` import.
for feat_type, type_stats in importance_by_type.items():
    print(f"{feat_type:<15} {type_stats['count']:<7} {type_stats['total_importance']:<11.4f} {type_stats['avg_importance']:<10.4f} {type_stats['top_feature'][:25]}")

print("\n✅ Feature importance analysis complete")

# Store results for next steps
importance_results = {
    'feature_importance_df': feature_importance,
    'model_r2': model_r2,
    'importance_by_type': importance_by_type
}

In [None]:
# Feature Importance Visualization
#
# Draws a 2x2 figure (top-15 bar chart, total importance by type, cumulative
# importance curve, average importance by type) from `feature_importance` and
# `importance_by_type`, then prints summary insights.
print("\n📊 Feature Importance Visualization")
print("="*50)


def _features_needed(cumulative, threshold):
    """Return how many top-ranked features reach ``threshold`` cumulative importance.

    Uses a positional argmax: ``Series.idxmax`` would return the index *label*,
    which after sorting by importance is the original column position, not
    the rank. Falls back to the total feature count if the threshold is never
    reached.
    """
    reached = (cumulative >= threshold).to_numpy()
    if not reached.any():
        return len(reached)
    return int(reached.argmax()) + 1


# Create visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Feature Importance Analysis', fontsize=16, fontweight='bold')

# 1. Top 15 features bar plot
top_15 = feature_importance.head(15)
axes[0, 0].barh(range(len(top_15)), top_15['importance'])
axes[0, 0].set_yticks(range(len(top_15)))
axes[0, 0].set_yticklabels([f[:30] + '...' if len(f) > 30 else f for f in top_15['feature']])
axes[0, 0].set_xlabel('Importance')
axes[0, 0].set_title('Top 15 Most Important Features')
axes[0, 0].grid(True, alpha=0.3)
axes[0, 0].invert_yaxis()  # most important feature at the top

# 2. Importance by feature type
types = list(importance_by_type.keys())
total_importances = [importance_by_type[t]['total_importance'] for t in types]
colors = ['skyblue', 'lightcoral', 'lightgreen', 'gold']

bars = axes[0, 1].bar(types, total_importances, color=colors[:len(types)], alpha=0.7)
axes[0, 1].set_title('Total Importance by Feature Type')
axes[0, 1].set_ylabel('Total Importance')
axes[0, 1].tick_params(axis='x', rotation=45)
axes[0, 1].grid(True, alpha=0.3)

# Add value labels on bars
for bar, value in zip(bars, total_importances):
    height = bar.get_height()
    axes[0, 1].text(bar.get_x() + bar.get_width()/2., height + 0.001,
                   f'{value:.3f}', ha='center', va='bottom', fontweight='bold')

# 3. Cumulative importance (features are already sorted most important first)
cumulative_importance = feature_importance['importance'].cumsum()
axes[1, 0].plot(range(1, len(cumulative_importance) + 1), cumulative_importance)
axes[1, 0].axhline(y=0.8, color='red', linestyle='--', alpha=0.7, label='80% threshold')
axes[1, 0].axhline(y=0.9, color='orange', linestyle='--', alpha=0.7, label='90% threshold')
axes[1, 0].set_xlabel('Number of Features')
axes[1, 0].set_ylabel('Cumulative Importance')
axes[1, 0].set_title('Cumulative Feature Importance')
axes[1, 0].grid(True, alpha=0.3)
axes[1, 0].legend()

# Find how many features needed for 80% and 90% importance
features_for_80 = _features_needed(cumulative_importance, 0.8)
features_for_90 = _features_needed(cumulative_importance, 0.9)

# 4. Average importance by feature type
avg_importances = [importance_by_type[t]['avg_importance'] for t in types]
avg_bars = axes[1, 1].bar(types, avg_importances, color=colors[:len(types)], alpha=0.7)
axes[1, 1].set_title('Average Importance by Feature Type')
axes[1, 1].set_ylabel('Average Importance')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].grid(True, alpha=0.3)

# Add value labels
for bar, value in zip(avg_bars, avg_importances):
    height = bar.get_height()
    axes[1, 1].text(bar.get_x() + bar.get_width()/2., height + 0.0001,
                   f'{value:.4f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Analysis insights
top_row = feature_importance.iloc[0]
top_feature_name = top_row['feature']
# Classify the top feature with the full set of types; reporting only
# 'Rolling Mean' vs 'Original' mislabels rolling-std and temporal features.
if '_rolling_mean_' in top_feature_name:
    top_feature_type = 'Rolling Mean'
elif '_rolling_std_' in top_feature_name:
    top_feature_type = 'Rolling Std'
elif top_feature_name in ['hour', 'day_of_week', 'month']:
    top_feature_type = 'Temporal'
else:
    top_feature_type = 'Original'

print("\n🎯 Key Feature Importance Insights:")
print(f"  • Most important feature: {top_feature_name}")
print(f"    - Importance: {top_row['importance']:.4f}")
print(f"    - Type: {top_feature_type}")

print("\n📊 Feature Selection Recommendations:")
print(f"  • Features needed for 80% importance: {features_for_80}")
print(f"  • Features needed for 90% importance: {features_for_90}")
print(f"  • Total features available: {len(feature_importance)}")

# Feature type performance: the type with the highest average importance
best_type = max(importance_by_type.items(), key=lambda x: x[1]['avg_importance'])
print("\n🏆 Best performing feature type:")
print(f"  • Type: {best_type[0]}")
print(f"  • Average importance: {best_type[1]['avg_importance']:.4f}")
print(f"  • Total contribution: {best_type[1]['total_importance']:.4f}")

print("\n✅ Feature importance visualization complete - ready for feature selection")