In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, balanced_accuracy_score

def objective(params):
    """
    Hyperopt objective: fit a random forest and score it on the validation set.

    Parameters
    ----------
    params : dict
        One sample drawn from the search space. The entries ``n_estimators``,
        ``max_depth``, ``min_samples_split``, ``min_samples_leaf`` and
        ``max_leaf_nodes`` are cast to ``int`` before being handed to the
        estimator (the search space draws them with ``hp.quniform``).
        The dict passed in is not modified.

    Returns
    -------
    dict
        ``{'loss': -balanced_accuracy, 'status': STATUS_OK}``. The balanced
        accuracy is negated so that a lower loss means a better model.

    Raises
    ------
    KeyError
        If one of the integer-valued hyperparameters is missing from ``params``.

    Notes
    -----
    Reads ``X_train``, ``y_train``, ``X_val`` and ``y_val`` from the notebook's
    global namespace.
    """
    # Work on a copy so the sampled configuration owned by the caller stays intact
    model_params = dict(params)

    # Convert integer parameters
    for name in ('n_estimators', 'max_depth', 'min_samples_split',
                 'min_samples_leaf', 'max_leaf_nodes'):
        model_params[name] = int(model_params[name])

    # Add fixed parameters
    model_params.update({
        'bootstrap': True,
        'random_state': 42,
        'n_jobs': -1
    })
    # Keep the sampled class_weight so the search actually evaluates that
    # dimension; only fall back to a default when the sample does not carry one.
    model_params.setdefault('class_weight', 'balanced_subsample')

    # Create and train model
    model = RandomForestClassifier(**model_params)
    model.fit(X_train.values, y_train.values.astype(int))

    # Evaluate on validation set
    y_pred = model.predict(X_val.values)
    balanced_accuracy = balanced_accuracy_score(y_val.values.astype(int), y_pred)

    return {'loss': -balanced_accuracy, 'status': STATUS_OK}

# Option lists for the categorical hyperparameters. They are kept as named
# lists because later code indexes into them to decode the best trial.
criterion_choices = ['entropy', 'gini']
class_weight_choices = ['balanced', 'balanced_subsample']

# Hyperparameter search space handed to fmin; every hyperopt label matches
# its dictionary key.
search_space = dict(
    # Quantized (step 1) ranges; cast to int inside objective()
    n_estimators=hp.quniform('n_estimators', 600, 1000, 1),
    max_depth=hp.quniform('max_depth', 70, 100, 1),
    min_samples_split=hp.quniform('min_samples_split', 5, 15, 1),
    min_samples_leaf=hp.quniform('min_samples_leaf', 1, 5, 1),
    # Continuous range
    max_features=hp.uniform('max_features', 0.8, 1.0),
    # Categorical
    criterion=hp.choice('criterion', criterion_choices),
    max_leaf_nodes=hp.quniform('max_leaf_nodes', 400, 600, 1),
    min_impurity_decrease=hp.uniform('min_impurity_decrease', 0.04, 0.08),
    class_weight=hp.choice('class_weight', class_weight_choices),
)

# Run optimization: TPE search over search_space, 50 evaluations of objective()
trials = Trials()
best = fmin(fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials,
            show_progressbar=True)

# Print results
# objective() reports -balanced_accuracy as its loss, so the negated minimum
# loss is the best validation balanced accuracy (not an F1 score).
print("\nBest trial:")
print(f"  Value (Balanced Accuracy): {-min(trials.losses()):.4f}")
print("\nBest parameters:")
# `best` stores hp.choice parameters as indices into their option lists;
# translate them for display only, leaving `best` itself untouched.
choice_options = {
    'criterion': criterion_choices,
    'class_weight': class_weight_choices,
}
for key, value in best.items():
    if key in choice_options:
        value = choice_options[key][int(value)]
    print(f"    {key}: {value}")

# Turn the best trial into estimator keyword arguments
best_params = best.copy()

# Count-like hyperparameters must be integers for the estimator
best_params.update({
    name: int(best_params[name])
    for name in ('n_estimators', 'max_depth', 'min_samples_split',
                 'min_samples_leaf', 'max_leaf_nodes')
})

# Categorical hyperparameters are stored as indices into their option lists
best_params.update(
    criterion=criterion_choices[int(best_params['criterion'])],
    class_weight=class_weight_choices[int(best_params['class_weight'])],
)

# Fixed settings shared with the search
best_params.update(bootstrap=True, random_state=42, n_jobs=-1)

# Fit the final model on the training split
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train.values, y_train.values.astype(int))

# Score the held-out test split
y_pred = final_model.predict(X_test.values)
print("\nTest Set Performance:")
print(f"F1 Score: {f1_score(y_test.values.astype(int), y_pred):.4f}")

In [None]:
# Rebuild the estimator keyword arguments from the best trial in one merge:
# start from the raw values, then overwrite the entries that need decoding.
best_params = {
    **best,
    # Count-like hyperparameters must be integers for the estimator
    **{
        name: int(best[name])
        for name in ('n_estimators', 'max_depth', 'min_samples_split',
                     'min_samples_leaf', 'max_leaf_nodes')
    },
    # Categorical hyperparameters are stored as indices into their option lists
    'criterion': criterion_choices[int(best['criterion'])],
    'class_weight': class_weight_choices[int(best['class_weight'])],
    # Fixed settings
    'bootstrap': True,
    'random_state': 42,
    'n_jobs': -1,
}

# Fit the final model on the training split
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train.values, y_train.values.astype(int))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import (accuracy_score, roc_auc_score, balanced_accuracy_score, 
                           precision_score, recall_score, confusion_matrix, f1_score)

# Make predictions
y_pred = final_model.predict(X_test.values)
y_true = y_test.values.astype(int)
# Scores for the ranking metric: probability of the second entry of
# final_model.classes_.
# NOTE(review): assumes a binary target whose larger label is the
# "Extreme Event" class — confirm against the data preparation.
y_score = final_model.predict_proba(X_test.values)[:, 1]

# Calculate metrics
metrics = {
    'F1 Score': f1_score(y_true, y_pred),
    'Precision': precision_score(y_true, y_pred),
    'Recall': recall_score(y_true, y_pred),
    'Accuracy': accuracy_score(y_true, y_pred),
    # AUC needs continuous scores; hard labels would reduce the ROC curve to a
    # single operating point.
    'AUC': roc_auc_score(y_true, y_score),
    'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred)
}

# Print metrics in an organized way
print("Test Set Metrics:")
print("-" * 40)
for metric, value in metrics.items():
    print(f"{metric:20s}: {value:.4f}")
print("-" * 40)

# Plot confusion matrix using seaborn (rows: true labels, columns: predictions)
fig, ax = plt.subplots(figsize=(6, 4))
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No Event', 'Extreme Event'],
            yticklabels=['No Event', 'Extreme Event'],
            ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()
