## Initial Feature Ranking with Tree-Based Methods

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Separate the label column from the predictor columns.
y = df['occurrence']
X = df.drop(columns=['occurrence'])

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a 100-tree random forest on the training portion.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Pair each column name with the importance score the fitted forest reports.
feature_names = X.columns
importances = rf.feature_importances_

# Tabulate the scores, most important feature first.
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
)
feature_importance_df = feature_importance_df.sort_values(
    by='Importance', ascending=False
)

# Horizontal bar chart of the ranked importances.
plt.figure(figsize=(12, 8))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
plt.title('Feature Importances from Random Forest')
plt.show()

# Also emit the ranked table as text.
print(feature_importance_df)


## Detailed Feature Selection with Recursive Feature Elimination (RFE)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Recursive Feature Elimination (RFE) followed by a refit on the surviving
# features. Expects X_train, X_test, y_train and y_test to already exist.

# RFE fits a clone of the estimator it is given, so build a dedicated forest
# for it instead of rebinding `rf`: rebinding would replace any previously
# fitted `rf` with an unfitted model.
rfe_estimator = RandomForestClassifier(n_estimators=100, random_state=42)

# Recursively eliminate features until 11 remain.
rfe = RFE(estimator=rfe_estimator, n_features_to_select=11)
rfe.fit(X_train, y_train)

# Get RFE rankings (rank 1 == selected). A stable sort keeps tied features in
# their original column order, so the selected-feature order is deterministic.
rfe_ranking = pd.DataFrame({
    'Feature': X_train.columns,
    'Ranking': rfe.ranking_
}).sort_values(by='Ranking', kind='mergesort')

# Print RFE rankings
print("RFE Rankings:")
print(rfe_ranking)

# Keep only the features RFE selected (ranking of 1)
selected_features = rfe_ranking.loc[rfe_ranking['Ranking'] == 1, 'Feature'].tolist()
print("Selected Features:", selected_features)

# Restrict both splits to the selected columns
X_train_rfe = X_train[selected_features]
X_test_rfe = X_test[selected_features]

# Train a fresh RandomForestClassifier on the reduced feature set
rf_rfe = RandomForestClassifier(n_estimators=100, random_state=42)
rf_rfe.fit(X_train_rfe, y_train)

# Evaluate the reduced model on the held-out split
y_pred_rfe = rf_rfe.predict(X_test_rfe)
print("Accuracy with RFE-selected features:", accuracy_score(y_test, y_pred_rfe))
print("Classification Report:\n", classification_report(y_test, y_pred_rfe))
