In [26]:
import time
import os
import import_ipynb
import pandas as pd
import matplotlib.pyplot as plt

import pickle
from collections import defaultdict

from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from skopt.plots import plot_convergence, plot_objective

from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

from helpers import get_dataset_path, initialize_model, evaluate_model


In [27]:

# Bayesian-search space shared (same object) by every configuration below.
TUNABLE_PARAMS = {
    "n_estimators": Integer(100, 1000),
    "max_depth": Integer(10, 100),
    "min_samples_split": Integer(2, 20),
    "min_samples_leaf": Integer(1, 10),
    "max_features": Categorical([0.3, 0.5, 0.7, 0.9, "sqrt", "log2"]),
    "criterion": Categorical(["gini", "entropy"]),
}


def _forest_config(dataset_template, fixed_params, model=RandomForestClassifier):
    """Build one configuration entry.

    Args:
        dataset_template: CSV filename template containing a ``{sample_size}`` placeholder.
        fixed_params: Keyword arguments stored under ``"fixed_params"`` (not tuned).
        model: Estimator class stored under ``"model"``.

    Returns:
        dict with the keys ``dataset_template``, ``model``, ``fixed_params`` and
        ``tunable_params`` (always the shared ``TUNABLE_PARAMS``).
    """
    return {
        "dataset_template": dataset_template,
        "model": model,
        "fixed_params": fixed_params,
        "tunable_params": TUNABLE_PARAMS,
    }


# One entry per training strategy; the key is the display name used in results.
configurations = {
    # Plain random forests trained on differently sampled datasets
    "Simple Random Sampling": _forest_config(
        "simple_random_{sample_size}.csv",
        {"bootstrap": True},
    ),
    "SMOTE Balanced Sampling": _forest_config(
        "smote_balanced_{sample_size}.csv",
        {"bootstrap": True},
    ),
    "Stratified Balanced Sampling": _forest_config(
        "stratified_balanced_{sample_size}.csv",
        {"bootstrap": True},
    ),
    # Imbalance handled inside the estimator, on the simple random sample
    "Balanced RF": _forest_config(
        "simple_random_{sample_size}.csv",
        {
            "sampling_strategy": "all",
            "replacement": False,
            "bootstrap": True,
            "oob_score": False,
        },
        model=BalancedRandomForestClassifier,
    ),
    "Weighted RF": _forest_config(
        "simple_random_{sample_size}.csv",
        {"class_weight": "balanced", "bootstrap": True},
    ),
    "Weighted Subsample RF": _forest_config(
        "simple_random_{sample_size}.csv",
        {"class_weight": "balanced_subsample", "bootstrap": True},
    ),
}

In [28]:
# Root folder holding the per-site sample CSVs
data_path = "../data/samples"
# Sites taking part in the leave-one-site-out evaluation (aoi1 … aoi4)
aoi_sites = [f"aoi{site_number}_2024_06_02" for site_number in range(1, 5)]
# Sample sizes to evaluate; the full sweep is [100, 1000, 10000]
sample_sizes = [100]
# Number of times to repeat the evaluation
iterations = 1

In [29]:
# Centralized results storage
all_results = []  # Store results from all iterations

# Columns dropped from the CSVs before fitting/predicting (missing ones are ignored)
NON_FEATURE_COLUMNS = ['label', 'longitude', 'latitude', 'synthetic']


def tune_model_hyperparameters(configurations, X_train, y_train, config_name, iteration, sample_size, left_out_site=None):
    """
    Tune the hyperparameters of the model using Bayesian optimization.

    Builds the estimator from ``configurations[config_name]`` ("model" called with
    "fixed_params"), searches "tunable_params" with ``BayesSearchCV`` (20 iterations,
    3-fold CV, weighted F1) and saves the skopt objective plot as an SVG.

    Args:
        configurations: Mapping of configuration name to its settings dict.
        X_train: Training features passed to ``BayesSearchCV.fit``.
        y_train: Training labels passed to ``BayesSearchCV.fit``.
        config_name: Key into ``configurations``; also used in the plot filename.
        iteration: Iteration number, used in the plot filename.
        sample_size: Sample size, used in the plot filename.
        left_out_site: Optional test-site name appended to the plot filename so
            plots from different leave-one-site-out folds do not overwrite each
            other. When None the original filename is used.

    Returns:
        Tuple ``(best_estimator, best_params)`` from the fitted search.
    """
    config = configurations[config_name]
    model = config["model"](**config["fixed_params"])

    # Initialize BayesSearchCV for hyperparameter tuning
    bayes_search = BayesSearchCV(
        estimator=model,
        search_spaces=config["tunable_params"],
        n_iter=20,
        scoring="f1_weighted",
        cv=3,
        n_jobs=-1,
        random_state=42,
        return_train_score=True
    )

    # Perform the search
    bayes_search.fit(X_train, y_train)

    # Create directory for optimization plots
    plot_dir = "../data/results/optimization_objective_plot"
    os.makedirs(plot_dir, exist_ok=True)

    # Save plot with informative filename (one file per fold when the site is given)
    site_suffix = f"_test{left_out_site}" if left_out_site is not None else ""
    plot_path = f"{plot_dir}/objective_iter{iteration}_{config_name}_sample{sample_size}{site_suffix}.svg"

    small_fonts = {
        'font.size': 6,
        'axes.labelsize': 6,
        'axes.titlesize': 6,
        'xtick.labelsize': 6,
        'ytick.labelsize': 6,
        'legend.fontsize': 6
    }
    # rc_context restores whatever rcParams were active before, instead of
    # overwriting them with hard-coded "defaults" afterwards.
    with plt.rc_context(small_fonts):
        # plot_objective creates its own figure, so grab that one (the current
        # figure) rather than pre-creating an empty figure that is never drawn on.
        plot_objective(bayes_search.optimizer_results_[0])
        fig = plt.gcf()
        try:
            plt.tight_layout()
            fig.savefig(plot_path, bbox_inches='tight')
        finally:
            # Close the figure that was actually drawn, even if saving fails,
            # so figures do not accumulate across the evaluation loop.
            plt.close(fig)

    return bayes_search.best_estimator_, bayes_search.best_params_


for iteration in range(1, iterations + 1):
    print(f"Starting Iteration {iteration}...")
    iteration_results = []  # Store results for this iteration

    # Leave-One-Site-Out Cross-Validation for each sample size
    for sample_size in sample_sizes:
        for left_out_site in aoi_sites:
            train_sites = [site for site in aoi_sites if site != left_out_site]
            test_site = left_out_site

            for config_name in configurations.keys():
                start_time = time.time()  # Start time for the evaluation

                # Load training data dynamically
                train_data = pd.concat([
                    pd.read_csv(get_dataset_path(configurations, config_name, site, sample_size, data_path, is_train=True))
                    for site in train_sites
                ])
                # Load testing data: the regular test set and the balanced test set
                test_data = pd.read_csv(get_dataset_path(configurations, config_name, test_site, sample_size, data_path, is_train=False))
                test_data_balanced = pd.read_csv(get_dataset_path(configurations, config_name, test_site, sample_size, data_path, is_train=False, is_balanced=True))

                # Prepare features and labels
                y_train = train_data['label']
                y_test = test_data['label']
                y_test_balanced = test_data_balanced['label']

                X_train = train_data.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
                X_test = test_data.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
                X_test_balanced = test_data_balanced.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

                # Initialize and train the untuned model
                model = initialize_model(configurations, config_name)
                model.fit(X_train, y_train)

                # Evaluate the untuned model
                oa, f1 = evaluate_model(model, X_test, y_test)
                oa_balanced, f1_balanced = evaluate_model(model, X_test_balanced, y_test_balanced)

                # Tune hyperparameters and get the best model
                best_model, best_params = tune_model_hyperparameters(
                    configurations,
                    X_train,
                    y_train,
                    config_name,
                    iteration,
                    sample_size,
                    left_out_site=test_site
                )
                # Evaluate the best model
                oa_tuned, f1_tuned = evaluate_model(best_model, X_test, y_test)
                oa_balanced_tuned, f1_balanced_tuned = evaluate_model(best_model, X_test_balanced, y_test_balanced)

                # Elapsed time covers loading, the untuned fit and the tuning run
                elapsed_time = time.time() - start_time

                # Store results
                iteration_results.append({
                    "iteration": iteration,
                    "configuration": config_name,
                    "site_left_out": test_site,
                    "sample_size": sample_size,
                    "OA": oa,
                    "balanced_OA": oa_balanced,
                    "F1": f1,
                    "balanced_F1": f1_balanced,
                    "OA_tuned": oa_tuned,
                    "balanced_OA_tuned": oa_balanced_tuned,
                    # Was `f1` (the untuned score), which made F1 and F1_tuned identical
                    "F1_tuned": f1_tuned,
                    "balanced_F1_tuned": f1_balanced_tuned,
                    "time_seconds": elapsed_time,
                    **{"param_" + key: value for key, value in best_params.items()}
                })

                # Print completion message with time taken
                print(f"Iteration {iteration}: Completed {config_name}, Sample size: {sample_size}, Testing site: {test_site} in {elapsed_time:.2f} seconds")

    # Append results of this iteration to the overall results
    all_results.extend(iteration_results)

# Convert results to DataFrame
results_df = pd.DataFrame(all_results)

# Save results to CSV (make sure the target folder exists first)
results_csv_path = "../data/results/evaluation_results_tuned.csv"
os.makedirs(os.path.dirname(results_csv_path), exist_ok=True)
results_df.to_csv(results_csv_path, index=False)

# Display summary of results
print(f"Completed {iterations} iterations. Final Results:")
print(results_df)

Starting Iteration 1...
Iteration 1: Completed Simple Random Sampling, Sample size: 100, Testing site: aoi1_2024_06_02 in 14.13 seconds
Iteration 1: Completed SMOTE Balanced Sampling, Sample size: 100, Testing site: aoi1_2024_06_02 in 10.44 seconds
Iteration 1: Completed Stratified Balanced Sampling, Sample size: 100, Testing site: aoi1_2024_06_02 in 9.58 seconds
Iteration 1: Completed Balanced RF, Sample size: 100, Testing site: aoi1_2024_06_02 in 12.35 seconds


In [24]:
# NOTE(review): the recorded output of this cell is an ImportError —
# `generate_optimal_parameter_table` could not be imported from
# `hyperparameter_tuning`. Confirm the module is importable and exposes this
# name before relying on this cell.
from hyperparameter_tuning import generate_optimal_parameter_table

# Reads the tuned-evaluation CSV written by the evaluation cell and writes to
# `output_path`; presumably a table of the best parameters — verify in the helper.
generate_optimal_parameter_table(
    input_path="../data/results/evaluation_results_tuned.csv",
    output_path="../data/results/optimal_parameters"
)



ImportError: cannot import name 'generate_optimal_parameter_table' from 'hyperparameter_tuning' (unknown location)

In [9]:
# NOTE(review): the recorded output of this cell is a NameError for
# `configurations`; it depends on the configuration and evaluation cells having
# been run in the same kernel session (it also needs `results_df`).
from plotting_functions import plot_metric_comparison

# Compare untuned vs. tuned overall accuracy for sample size 100.
plot_metric_comparison(
    results_df=results_df,  # In-memory results from the evaluation cell
    configurations=configurations,
    metric_pair=("OA", "OA_tuned"),
    sample_size=100,
    y_limit = (0.5, 1.0),
    output_path="../data/plots/evaluation_results_tuned.svg"  # Figure is written as SVG
)

NameError: name 'configurations' is not defined

In [8]:
# NOTE(review): the recorded output of this cell is a NameError for
# `configurations`; it depends on the configuration and evaluation cells having
# been run in the same kernel session (it also needs `results_df`).
from plotting_functions import plot_metric_comparison

# Compare untuned vs. tuned F1 for sample size 100.
plot_metric_comparison(
    results_df=results_df,  # In-memory results from the evaluation cell
    configurations=configurations,
    metric_pair=("F1", "F1_tuned"),
    sample_size=100,
    # Dedicated filename: the OA comparison already writes
    # evaluation_results_tuned.svg, so reusing it would overwrite that figure.
    output_path="../data/plots/evaluation_results_tuned_f1.svg"
)

NameError: name 'configurations' is not defined

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Seaborn styling for the figures below: white-grid theme with the Arial font,
# then the "talk" plotting context with a line width of 2.5.
# The context call comes second, so it is the one in effect for context settings.
sns.set_theme(style="whitegrid", font="Arial", font_scale=1)
sns.set_context("talk",  font_scale=1, rc={"lines.linewidth": 2.5})

def plot_f1_boxplot(csv_filepath, configurations,
                    output_filepath="results/figures/configurations/f1_boxplot.svg",
                    y_limit=(0.85, 1)):
    """
    Plot F1 scores per configuration as box plots, grouped by sample size.

    Reads the results CSV, draws a Seaborn catplot (kind="box") with one box per
    configuration and sample size using the "colorblind" palette, saves the
    figure as SVG and shows it.

    Args:
        csv_filepath: Path to a CSV with at least the columns
            "configuration", "sample_size" and "F1".
        configurations: Unused; kept so existing callers keep working.
        output_filepath: Where the SVG is written. Parent folders are created
            if they do not exist.
        y_limit: (lower, upper) limits for the F1 axis.
    """
    # Load results from the CSV file
    results_df = pd.read_csv(csv_filepath)

    # Filter to only include the F1 metric
    filtered_results = results_df[["configuration", "sample_size", "F1"]]

    # One palette colour per sample size actually present in the data
    # (previously hard-coded to 3).
    n_sample_sizes = filtered_results["sample_size"].nunique()

    # Create the Seaborn Catplot. catplot builds its own figure, so no
    # plt.figure() call is needed beforehand (it only produced an empty figure).
    catplot = sns.catplot(
        data=filtered_results,
        x="configuration",
        y="F1",
        hue="sample_size",
        kind="box",
        palette=sns.color_palette("colorblind", n_sample_sizes),
        height=6,
        aspect=3,
        legend=False,
        linewidth=1.5
    )

    # Adjust y-axis limits
    catplot.set(ylim=y_limit)

    # Customize the plot
    catplot.set_axis_labels("", "F1-Score")
    # FacetGrid.set_titles only titles row/col facets; this grid has none, so
    # the title is set on the single Axes directly.
    catplot.ax.set_title("F1 Scores Across Configurations and Sample Sizes")
    catplot.set_xticklabels(rotation=45, horizontalalignment="right")

    # Save the figure as an SVG, creating the target folder when needed
    output_dir = os.path.dirname(output_filepath)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    plt.savefig(output_filepath, format="svg", bbox_inches="tight")
    print(f"Plot saved to {output_filepath}")

    # Show the plot
    plt.show()

# Example call: draw the F1 box plot from a saved results CSV.
# NOTE(review): this path differs from the one the evaluation cell writes
# ("../data/results/evaluation_results_tuned.csv") — confirm which file is intended.
# `configurations` must already be defined by the configuration cell.
csv_filepath = "results/data/evaluation_results.csv"  # Path to the CSV file
plot_f1_boxplot(csv_filepath, configurations)
