# 03 - Feature Selection

This notebook covers:
1. Loading preprocessed data
2. Feature importance using Random Forest
3. Recursive Feature Elimination (RFE)
4. Statistical test (ANOVA F-test) for feature significance
5. Comparing different feature selection methods
6. Selecting optimal features for modeling


In [None]:
# Import necessary libraries
# NOTE(review): chi2, classification_report and accuracy_score are not
# referenced in the cells visible in this notebook — confirm whether they
# are still needed.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib
import warnings
# Installs a blanket "ignore" filter for all warning categories.
# NOTE(review): this presumably also hides any warnings raised while the
# estimators below are fitted — confirm that is intended.
warnings.filterwarnings('ignore')

# Set style for better plots
# Global matplotlib style and seaborn colour palette for every figure below.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Simple progress marker so the reader sees the setup cell completed.
print("Libraries imported successfully!")


In [None]:
# Load the train/test splits written by the preprocessing notebook.
# NOTE(review): joblib.load unpickles its input, so these files should only
# come from a trusted source.
print("Loading preprocessed data...")
X_train, X_test, y_train, y_test = map(joblib.load, (
    '../data/X_train.pkl',
    '../data/X_test.pkl',
    '../data/y_train.pkl',
    '../data/y_test.pkl',
))

# Quick sanity report: shapes of both feature matrices and the column names.
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Feature names: {list(X_train.columns)}")

# Peek at the first rows of the training features.
print("\nFirst 5 rows of training data:")
print(X_train.head())


In [None]:
# Method 1: Random Forest Feature Importance
print("Method 1: Random Forest Feature Importance")
print("=" * 50)

# Fit a seeded forest; its impurity-based importances drive the ranking.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# One row per feature, most important first.
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("Feature Importance (Random Forest):")
print(feature_importance)

# Horizontal bar chart, top-ranked feature at the top of the axis.
fig, ax = plt.subplots(figsize=(12, 8))
bar_positions = list(range(len(feature_importance)))
ax.barh(bar_positions, feature_importance['importance'],
        color='skyblue', alpha=0.7, edgecolor='black')
ax.set_yticks(bar_positions)
ax.set_yticklabels(feature_importance['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Random Forest Feature Importance')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

# Keep the eight highest-ranked feature names for the later comparison.
top_features_rf = feature_importance['feature'].iloc[:8].tolist()
print(f"\nTop 8 features selected by Random Forest: {top_features_rf}")


In [None]:
# Method 2: Recursive Feature Elimination (RFE)
print("\nMethod 2: Recursive Feature Elimination (RFE)")
print("=" * 50)

# Use Logistic Regression as base estimator for RFE
estimator = LogisticRegression(random_state=42, max_iter=1000)

# Apply RFE to select top 8 features
rfe = RFE(estimator=estimator, n_features_to_select=8)
rfe.fit(X_train, y_train)

# Names of the features RFE kept, in X_train column order.
selected_features_rfe = X_train.columns[rfe.support_].tolist()

# One row per feature with its elimination rank and keep/drop flag,
# ordered so the selected (rank 1) features come first.
feature_ranking_rfe = pd.DataFrame({
    'feature': X_train.columns,
    'ranking': rfe.ranking_,
    'selected': rfe.support_
}).sort_values('ranking')

print("RFE Feature Ranking:")
print(feature_ranking_rfe)

print(f"\nTop 8 features selected by RFE: {selected_features_rfe}")

# Visualize RFE results
plt.figure(figsize=(12, 8))
# Take the colours from the *sorted* frame so every bar is coloured by its own
# row. Building them from rfe.support_ (X_train column order) would attach the
# green/red flags to the wrong bars once the rows have been re-ordered.
colors = ['green' if selected else 'red' for selected in feature_ranking_rfe['selected']]
plt.barh(range(len(feature_ranking_rfe)), feature_ranking_rfe['ranking'],
         color=colors, alpha=0.7, edgecolor='black')
plt.yticks(range(len(feature_ranking_rfe)), feature_ranking_rfe['feature'])
plt.xlabel('RFE Ranking (1 = selected)')
plt.title('Recursive Feature Elimination Results')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()


In [None]:
# Method 3: Statistical test (ANOVA F-test)
# Only the F-test is run in this cell; no Chi-Square selector is fitted here.
print("\nMethod 3: Statistical Tests")
print("=" * 50)

# F-test (ANOVA F-test): keep the 8 features with the highest F-score.
f_selector = SelectKBest(score_func=f_classif, k=8)
X_train_f = f_selector.fit_transform(X_train, y_train)
selected_features_f = X_train.columns[f_selector.get_support()].tolist()

# One row per feature with its score, p-value and keep/drop flag,
# ordered from the highest F-score down.
f_scores = pd.DataFrame({
    'feature': X_train.columns,
    'f_score': f_selector.scores_,
    'p_value': f_selector.pvalues_,
    'selected': f_selector.get_support()
}).sort_values('f_score', ascending=False)

print("F-test Results:")
print(f_scores)

print(f"\nTop 8 features selected by F-test: {selected_features_f}")

# Visualize F-test results
plt.figure(figsize=(12, 8))
# Take the colours from the *sorted* frame so every bar is coloured by its own
# row. get_support() is in X_train column order, which no longer matches the
# bar order after sorting by F-score.
colors = ['green' if selected else 'red' for selected in f_scores['selected']]
plt.barh(range(len(f_scores)), f_scores['f_score'],
         color=colors, alpha=0.7, edgecolor='black')
plt.yticks(range(len(f_scores)), f_scores['feature'])
plt.xlabel('F-Score')
plt.title('F-test Feature Selection Results')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()


In [None]:
# Compare all feature selection methods
print("\nFeature Selection Comparison")
print("=" * 50)

feature_names = X_train.columns

# feature_importance is sorted by importance, not by column position.
# Passing it to the DataFrame constructor as a Series makes pandas use the
# Series' index as the frame index while the other (array-like) columns are
# filled positionally, which pairs each importance with the wrong feature.
# Re-order to X_train's column order and pass plain values so every column
# below is aligned row by row.
rf_importance_aligned = (
    feature_importance.set_index('feature')['importance']
    .reindex(feature_names)
    .to_numpy()
)

# Create comparison DataFrame (one row per feature, X_train column order)
comparison_df = pd.DataFrame({
    'Feature': feature_names,
    'RF_Importance': rf_importance_aligned,
    'RFE_Selected': feature_names.isin(selected_features_rfe).astype(int),
    'F_Test_Selected': feature_names.isin(selected_features_f).astype(int)
})

# Calculate consensus score: one vote each from RFE, the F-test, and an
# above-median Random Forest importance (so the score ranges from 0 to 3).
comparison_df['Consensus_Score'] = (
    comparison_df['RFE_Selected'] +
    comparison_df['F_Test_Selected'] +
    (comparison_df['RF_Importance'] > comparison_df['RF_Importance'].median()).astype(int)
)

# Sort by consensus score, breaking ties by Random Forest importance so the
# ordering (and therefore the final pick) is deterministic and meaningful.
comparison_df = comparison_df.sort_values(
    ['Consensus_Score', 'RF_Importance'], ascending=False
)

print("Feature Selection Comparison:")
print(comparison_df)

# Features backed by at least two of the three methods
consensus_features = comparison_df.loc[
    comparison_df['Consensus_Score'] >= 2, 'Feature'
].tolist()
print(f"\nFinal selected features (consensus >= 2): {consensus_features}")

# We want exactly 8 features (or all of them if fewer exist). Because the
# frame is ordered by consensus and then importance, its first 8 rows are the
# consensus features truncated to 8 when there are too many, or topped up
# with the best remaining features when there are too few.
n_final_features = 8
final_features = comparison_df['Feature'].head(n_final_features).tolist()

print(f"\nFinal 8 features for modeling: {final_features}")


In [None]:
# Restrict both splits to the chosen feature columns.
X_train_selected = X_train[final_features]
X_test_selected = X_test[final_features]

print(f"Original training data shape: {X_train.shape}")
print(f"Selected training data shape: {X_train_selected.shape}")
print(f"Features removed: {X_train.shape[1] - X_train_selected.shape[1]}")

# Three side-by-side panels summarising the final selection.
fig, (ax_consensus, ax_status, ax_importance) = plt.subplots(1, 3, figsize=(15, 5))
all_positions = list(range(len(comparison_df)))

# Panel 1: consensus score of every feature.
ax_consensus.barh(all_positions, comparison_df['Consensus_Score'],
                  color='lightblue', alpha=0.7, edgecolor='black')
ax_consensus.set_yticks(all_positions)
ax_consensus.set_yticklabels(comparison_df['Feature'])
ax_consensus.set_xlabel('Consensus Score')
ax_consensus.set_title('Feature Selection Consensus')
ax_consensus.invert_yaxis()

# Panel 2: unit-length bars, green when the feature made the final list.
status_colors = [
    'green' if name in final_features else 'red'
    for name in comparison_df['Feature']
]
ax_status.barh(all_positions, [1] * len(comparison_df),
               color=status_colors, alpha=0.7, edgecolor='black')
ax_status.set_yticks(all_positions)
ax_status.set_yticklabels(comparison_df['Feature'])
ax_status.set_xlabel('Selected (Green) vs Not Selected (Red)')
ax_status.set_title('Final Feature Selection')
ax_status.invert_yaxis()

# Panel 3: Random Forest importance, restricted to the selected features.
is_final = comparison_df['Feature'].isin(final_features)
selected_importance = comparison_df[is_final]
kept_positions = list(range(len(selected_importance)))
ax_importance.barh(kept_positions, selected_importance['RF_Importance'],
                   color='orange', alpha=0.7, edgecolor='black')
ax_importance.set_yticks(kept_positions)
ax_importance.set_yticklabels(selected_importance['Feature'])
ax_importance.set_xlabel('Random Forest Importance')
ax_importance.set_title('Importance of Selected Features')
ax_importance.invert_yaxis()

fig.tight_layout()
plt.show()


In [None]:
# Persist the reduced datasets, the fitted selectors and the comparison table.
import os

# Make sure both output directories exist before writing into them.
for out_dir in ('../data', '../models'):
    os.makedirs(out_dir, exist_ok=True)

# Each artifact paired with its destination: the feature-selected data and
# feature list first, then the fitted selection models and their results.
artifacts = [
    (X_train_selected, '../data/X_train_selected.pkl'),
    (X_test_selected, '../data/X_test_selected.pkl'),
    (final_features, '../models/selected_features.pkl'),
    (rf, '../models/rf_feature_selector.pkl'),
    (rfe, '../models/rfe_selector.pkl'),
    (f_selector, '../models/f_test_selector.pkl'),
    (comparison_df, '../models/feature_comparison.pkl'),
]
for payload, destination in artifacts:
    joblib.dump(payload, destination)

# List what was written.
print("Feature selection completed and data saved!")
print("Files saved:")
print("- ../data/X_train_selected.pkl")
print("- ../data/X_test_selected.pkl")
print("- ../models/selected_features.pkl")
print("- ../models/rf_feature_selector.pkl")
print("- ../models/rfe_selector.pkl")
print("- ../models/f_test_selector.pkl")
print("- ../models/feature_comparison.pkl")

# Closing summary: feature counts before/after and the percentage removed.
print(f"\nFeature Selection Summary:")
print(f"- Original features: {X_train.shape[1]}")
print(f"- Selected features: {len(final_features)}")
print(f"- Features removed: {X_train.shape[1] - len(final_features)}")
print(f"- Dimensionality reduction: {((X_train.shape[1] - len(final_features)) / X_train.shape[1] * 100):.1f}%")
print(f"- Selected features: {final_features}")
