In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ===================  Optimizer Functions =====================
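# Note: the optimizer implementation is not included in this section.
# FloatVar and OriginalCircleSA, used below, are not defined or imported here
# and must already be in scope.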
# Sources: https://github.com/thieu1995/mealpy
# https://github.com/Valdecy/pyMetaheuristic

# =================== Random Forest & Data Preparation =====================

def load_and_prepare_data(file_path):
    """Load a dataset from Excel and return scaled train/test splits.

    The last column of the sheet is used as the target and every preceding
    column as a feature. Rows are split 80/20 with ``random_state=42``; the
    features are then transformed with a ``PowerTransformer`` that is fitted
    on the training rows only, so no statistics of the held-out test rows
    leak into the transformation.

    Args:
        file_path: Path of the Excel file, passed to ``pandas.read_excel``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, train_indices,
        test_indices, scaler_X)``. ``train_indices`` / ``test_indices`` are
        the parts of the DataFrame index that ended up in each split, and
        ``scaler_X`` is the transformer fitted on the training features.
    """
    data = pd.read_excel(file_path)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values

    # Split before scaling so the transformer never sees the test rows.
    X_train_raw, X_test_raw, y_train, y_test, train_indices, test_indices = train_test_split(
        X, y, data.index, test_size=0.2, random_state=42
    )

    # Fit on the training features only, then reuse the fitted transformer
    # for the test features.
    scaler_X = PowerTransformer()
    X_train = scaler_X.fit_transform(X_train_raw)
    X_test = scaler_X.transform(X_test_raw)

    return X_train, X_test, y_train, y_test, train_indices, test_indices, scaler_X

def create_objective_function_with_kfold(X_train, y_train, n_splits=5):
    """Build a cross-validated RMSE objective for Random Forest tuning.

    Args:
        X_train: Training features, indexed by positional row arrays.
        y_train: Training targets, indexed the same way as ``X_train``.
        n_splits: Number of folds for the shuffled ``KFold`` (seed 42).

    Returns:
        A tuple ``(objective_function, kfold_splits)``. ``kfold_splits`` is
        the list of ``(train_idx, val_idx)`` pairs, and
        ``objective_function(params)`` returns the mean validation RMSE over
        those folds for one candidate parameter vector.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Materialise the folds once so every candidate is scored on the same splits.
    kfold_splits = list(splitter.split(X_train))

    def objective_function(params):
        # Candidate layout: n_estimators, max_depth, min_samples_split,
        # min_samples_leaf, max_features, bootstrap, max_samples.
        (raw_trees, raw_depth, raw_split, raw_leaf,
         feature_fraction, raw_bootstrap, sample_fraction) = params

        # The optimizer works on floats; cast the integer/boolean settings.
        forest_kwargs = {
            'n_estimators': int(raw_trees),
            'max_depth': int(raw_depth) if raw_depth > 0 else None,
            'min_samples_split': int(raw_split),
            'min_samples_leaf': int(raw_leaf),
            'max_features': feature_fraction,
            'bootstrap': bool(round(raw_bootstrap)),
            'random_state': 42,
            'n_jobs': -1
        }

        # max_samples is only passed along when bootstrapping is enabled.
        if forest_kwargs['bootstrap']:
            forest_kwargs['max_samples'] = sample_fraction

        def fold_rmse(fit_rows, val_rows):
            # Fit on one fold's training rows and score on its validation rows.
            forest = RandomForestRegressor(**forest_kwargs)
            forest.fit(X_train[fit_rows], y_train[fit_rows])
            predicted = forest.predict(X_train[val_rows])
            return np.sqrt(mean_squared_error(y_train[val_rows], predicted))

        return np.mean([fold_rmse(fit_rows, val_rows) for fit_rows, val_rows in kfold_splits])

    return objective_function, kfold_splits

def optimize_rf_hyperparameters(X_train, y_train, iterations=10, population=50, n_splits=5):
    """Search Random Forest hyperparameters with OriginalCircleSA and K-fold CV.

    Args:
        X_train: Training features handed to the cross-validated objective.
        y_train: Training targets handed to the cross-validated objective.
        iterations: Passed to the optimizer as ``epoch``.
        population: Passed to the optimizer as ``pop_size``.
        n_splits: Number of cross-validation folds used by the objective.

    Returns:
        A tuple ``(best_params, best_fitness, kfold_splits)``: the decoded
        keyword arguments for ``RandomForestRegressor``, the fitness reported
        for the best solution, and the folds the objective was scored on.
    """
    # One (name, lower bound, upper bound) row per dimension; the row order is
    # the order in which the objective function unpacks a candidate.
    search_space = [
        ('n_estimators', 100, 500),
        ('max_depth', 5, 30),
        ('min_samples_split', 2, 20),
        ('min_samples_leaf', 1, 10),
        ('max_features', 0.1, 1.0),
        ('bootstrap', 0, 1),
        ('max_samples', 0.5, 1.0),
    ]
    lb = [low for _, low, _ in search_space]
    ub = [high for _, _, high in search_space]

    # Objective = mean validation RMSE over fixed folds.
    objective_function, kfold_splits = create_objective_function_with_kfold(X_train, y_train, n_splits)

    # Problem definition in the dictionary form expected by the optimizer.
    problem_dict = {
        "bounds": FloatVar(lb=lb, ub=ub, name="hyperparams"),
        "minmax": "min",
        "obj_func": objective_function
    }

    # Run the optimization and read back the best candidate.
    g_best = OriginalCircleSA(epoch=iterations, pop_size=population).solve(problem_dict)
    best_solution = g_best.solution
    best_fitness = g_best.target.fitness

    # Decode the float vector with the same casts the objective applies.
    best_params = {
        'n_estimators': int(best_solution[0]),
        'max_depth': int(best_solution[1]) if best_solution[1] > 0 else None,
        'min_samples_split': int(best_solution[2]),
        'min_samples_leaf': int(best_solution[3]),
        'max_features': best_solution[4],
        'bootstrap': bool(round(best_solution[5])),
        'random_state': 42,
        'n_jobs': -1
    }

    # max_samples is only included when bootstrapping is enabled.
    if best_params['bootstrap']:
        best_params['max_samples'] = best_solution[6]

    return best_params, best_fitness, kfold_splits

def evaluate_fold_performance(X_train, y_train, kfold_splits, best_params, train_indices):
    """Refit the model on every fold and collect per-fold predictions and metrics.

    Args:
        X_train: Training features, indexed by each fold's row arrays.
        y_train: Training targets, indexed the same way as ``X_train``.
        kfold_splits: Iterable of ``(train_idx, val_idx)`` pairs.
        best_params: Keyword arguments for ``RandomForestRegressor``.
        train_indices: Labels of the training rows; indexed with each fold's
            row arrays to report which rows were used.

    Returns:
        A dict keyed ``'fold_1'``, ``'fold_2'``, ... whose values hold the
        fold's train/validation indices, the predictions on both parts, and a
        ``'metrics'`` dict with R2, RMSE and MAE for both parts.
    """
    fold_results = {}

    for fold_number, (fit_rows, val_rows) in enumerate(kfold_splits, start=1):
        fit_X, fit_y = X_train[fit_rows], y_train[fit_rows]
        val_X, val_y = X_train[val_rows], y_train[val_rows]

        # Fresh model per fold, fitted on that fold's training part only.
        forest = RandomForestRegressor(**best_params)
        forest.fit(fit_X, fit_y)

        fit_pred = forest.predict(fit_X)
        val_pred = forest.predict(val_X)

        entry = {
            'train_indices': train_indices[fit_rows],
            'val_indices': train_indices[val_rows],
            'fold_train_pred': fit_pred,
            'fold_val_pred': val_pred,
        }
        # Same three metrics for the fitted part and the held-out part.
        entry['metrics'] = {
            'train_r2': r2_score(fit_y, fit_pred),
            'train_rmse': np.sqrt(mean_squared_error(fit_y, fit_pred)),
            'train_mae': mean_absolute_error(fit_y, fit_pred),
            'val_r2': r2_score(val_y, val_pred),
            'val_rmse': np.sqrt(mean_squared_error(val_y, val_pred)),
            'val_mae': mean_absolute_error(val_y, val_pred)
        }
        fold_results[f'fold_{fold_number}'] = entry

    return fold_results

def train_and_evaluate_model(X_train, X_test, y_train, y_test, params):
    """Fit a Random Forest with the given parameters and score it on both splits.

    Args:
        X_train: Features the model is fitted on.
        X_test: Features of the held-out split.
        y_train: Targets matching ``X_train``.
        y_test: Targets matching ``X_test``.
        params: Keyword arguments for ``RandomForestRegressor``.

    Returns:
        A tuple ``(rf_model, metrics)`` where ``metrics`` maps
        ``train_``/``test_`` prefixed ``r2``, ``rmse`` and ``mae`` keys to
        their values.
    """
    def _scores(actual, predicted):
        # R2, RMSE and MAE for one split, in that order.
        return (
            r2_score(actual, predicted),
            np.sqrt(mean_squared_error(actual, predicted)),
            mean_absolute_error(actual, predicted),
        )

    rf_model = RandomForestRegressor(**params)
    rf_model.fit(X_train, y_train)

    fitted_pred = rf_model.predict(X_train)
    heldout_pred = rf_model.predict(X_test)

    train_r2, train_rmse, train_mae = _scores(y_train, fitted_pred)
    test_r2, test_rmse, test_mae = _scores(y_test, heldout_pred)

    metrics = {
        'train_r2': train_r2,
        'train_rmse': train_rmse,
        'train_mae': train_mae,
        'test_r2': test_r2,
        'test_rmse': test_rmse,
        'test_mae': test_mae
    }

    return rf_model, metrics

def main(file_path="P03.xlsx", iterations=10, population=50, n_splits=5):
    """Run the full pipeline: load data, tune, fit the final model, score folds.

    Args:
        file_path: Excel file handed to ``load_and_prepare_data``.
        iterations: Optimizer iterations for the hyperparameter search.
        population: Optimizer population size for the hyperparameter search.
        n_splits: Number of cross-validation folds used during the search.

    Returns:
        A tuple ``(model, best_params, metrics, fold_results)``: the final
        fitted model, the tuned parameters, its train/test metrics, and the
        per-fold evaluation of those parameters.
    """
    # The test indices and the fitted scaler are returned by the loader but
    # are not needed here.
    (X_train, X_test, y_train, y_test,
     train_indices, _test_indices, _scaler) = load_and_prepare_data(file_path)

    # The best fitness value is not part of this function's result.
    best_params, _best_fitness, kfold_splits = optimize_rf_hyperparameters(
        X_train, y_train, iterations=iterations, population=population, n_splits=n_splits
    )

    # Final model on the whole training split, scored on train and test.
    model, metrics = train_and_evaluate_model(X_train, X_test, y_train, y_test, best_params)

    # Re-evaluate the tuned parameters on the folds used during the search.
    fold_results = evaluate_fold_performance(
        X_train, y_train, kfold_splits, best_params, train_indices
    )

    return model, best_params, metrics, fold_results

# Entry point: run main() with its default arguments and keep the four results
# in module-level names.
if __name__ == "__main__":
    model, params, metrics, fold_results = main()