# Create Reproducibility Subset

- The code below subsets the random forest and GLMM model-fitting results to the nine most common species.
- This skips the data produced in `01_model_fitting.ipynb` and the model fitting part in `03_glmm_analysis.ipynb`.
- This data subset makes it possible to reproduce the results in `02_randomforests_analysis.ipynb` and `03_glmm_analysis.ipynb`.
- Important: Results that are based on analysing all species at once are NOT correctly reproduced when using this subset.


## Setup


In [5]:
import os
import shutil


def _copy_existing_files(source_dir, destination_dir, file_names):
    """Copy every file of *file_names* that exists in *source_dir* to *destination_dir*."""
    for file_name in file_names:
        file_source = os.path.join(source_dir, file_name)
        if os.path.exists(file_source):
            shutil.copy2(file_source, os.path.join(destination_dir, file_name))


def copy_subset(source_root, destination_root):
    """Copy the selected species results from every ``run_*`` folder.

    For each directory directly below ``source_root`` whose name starts with
    ``run_``, the folders of the selected species are recreated under
    ``destination_root``. Per species, the selected subfolders are copied in
    full, except ``glmm``, from which only the listed files are taken, and the
    listed files from the species folder itself are copied as well. Species,
    subfolders and files that do not exist in a run are skipped silently.

    Args:
        source_root: Directory that contains the ``run_{seed}`` folders.
        destination_root: Directory to create and fill; it must not exist yet.

    Raises:
        FileExistsError: If ``destination_root`` already exists.
        OSError: If ``source_root`` cannot be listed (for example because it
            is missing). This is raised before anything is created.
    """
    species_to_copy = [
        "Fagus sylvatica",
        "Quercus robur",
        "Quercus petraea",
        "Carpinus betulus",
        "Castanea sativa",
        "Quercus pubescens",
        "Pinus sylvestris",
        "Abies alba",
        "Picea abies",
    ]

    subfolders_to_copy = [
        "final_model",
        "glmm",
        "shap",
        "rf_performance",
        "treeid",
    ]

    glmm_files_to_copy = [
        "pr_auc.csv",
        "roc_auc.csv",
        "summary.csv",
        "glmm_model.pkl",
    ]

    root_files_to_copy = [
        "__user_input.txt",
        "final_model_performance.csv",
        "shap_variable_importance.csv",
        "refcv_metrics.cvs",
        # NOTE(review): ".cvs" above looks like a typo for ".csv". The original
        # spelling is kept and the ".csv" spelling is tried as well, so the
        # file is copied under whichever name exists. Confirm the real name.
        "refcv_metrics.csv",
    ]

    # Never overwrite or merge into an existing export; the caller has to
    # remove it explicitly.
    if os.path.exists(destination_root):
        raise FileExistsError(
            "Destination folder already exists. Please delete it first."
        )

    # List the source before creating anything: a wrong source path must not
    # leave an empty destination behind that would block the next attempt.
    run_folders = sorted(os.listdir(source_root))

    os.makedirs(destination_root, exist_ok=True)

    for run_folder in run_folders:
        run_path = os.path.join(source_root, run_folder)

        if not os.path.isdir(run_path) or not run_folder.startswith("run_"):
            continue  # Skip non-run folders

        destination_run_path = os.path.join(destination_root, run_folder)
        os.makedirs(destination_run_path, exist_ok=True)

        for species in species_to_copy:
            species_source = os.path.join(run_path, species)
            if not os.path.exists(species_source):
                continue  # Species not present in this run

            species_dest = os.path.join(destination_run_path, species)
            os.makedirs(species_dest, exist_ok=True)

            for subfolder in subfolders_to_copy:
                subfolder_source = os.path.join(species_source, subfolder)
                if not os.path.exists(subfolder_source):
                    continue

                subfolder_dest = os.path.join(species_dest, subfolder)
                if subfolder == "glmm":
                    # Only the listed files are taken from glmm.
                    os.makedirs(subfolder_dest, exist_ok=True)
                    _copy_existing_files(
                        subfolder_source, subfolder_dest, glmm_files_to_copy
                    )
                else:
                    # Every other selected subfolder is copied in full.
                    shutil.copytree(
                        subfolder_source, subfolder_dest, dirs_exist_ok=True
                    )

            # Files that live directly in the species folder
            _copy_existing_files(species_source, species_dest, root_files_to_copy)

    print("File copy process completed.")

In [7]:
# Build the reproducibility subset: read every run below `dir_root` and write
# the reduced copy to `dir_dest` (copy_subset refuses an existing destination).
dir_root = "model_runs/all_runs"
# Alternative destination inside the repository: "model_runs/all_runs_subset"
dir_dest = "/Volumes/SAMSUNG 1TB/data_for_zenodo/all_runs"

copy_subset(source_root=dir_root, destination_root=dir_dest)

File copy process completed.


---
