# Testing Predictivness of Datasets


In [52]:


# Imports

# Magic
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Data wrangling
import pandas as pd
import numpy as np

# Data visualisation
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt

# My functions
import sys

sys.path.insert(0, "../../src")
from run_mp import *
from utilities import *
from random_forest_utils import *

# Other
from os import error
import datetime
from io import StringIO
import re
import warnings
import chime

from pyprojroot import here

chime.theme("mario")
import statsmodels.api as sm
from scipy.stats import pearsonr
import os
from sklearn.inspection import PartialDependenceDisplay


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## User Input


In [62]:
target_of_interest = "mort_nat_vol_prc_yr"  # ! Set Target Variable
species_subset = "all"  # ! Define species (lowercase)
height_class_subset = "all"  # ! Define height subset (lowercase)
dataset_to_compare_to = "edo"  # ! Define predictor database (see below for options)
share_of_0s_allowed = 0.01  # ! Set allowed number of 0s in target variable

## Load Data


### Predictors


In [63]:
# Show all available datasets
dir_predictors = here("data/final/predictor_datasets")
predictor_datasets = [f for f in os.listdir(dir_predictors) if not f.startswith(".")]
predictor_datasets = [re.sub(".feather", "", f) for f in predictor_datasets]
predictor_datasets = predictor_datasets + ["growth_mortality"]
print("Predictor datasets available:")
for i, dataset in enumerate(predictor_datasets):
    print(f" {i+1}. {dataset}")

Predictor datasets available:
 1. apt
 2. edo
 3. forest_structure-idp
 4. dem_derivatives
 5. forest_health
 6. soil
 7. safran
 8. growth_mortality


In [64]:
# Get files for growth and mortality
file_dir = here("data/tmp/nfi/growth_and_mortality_data/idp").as_posix() + "/"

# List top 10 files
file_list = os.listdir(file_dir)
print(f" - Number of files in directory: {len(file_list)} | {file_list[:10]}")

 - Number of files in directory: 73 | ['species_fraxinus-height_0-10.feather', 'species_acer-height_0-10.feather', 'species_quercus-height_all.feather', 'species_picea-height_10-15.feather', 'species_carpinus-height_10-15.feather', 'species_fagus-height_20-25.feather', 'species_quercus-height_10-15.feather', 'species_abies-height_0-10.feather', 'species_populus-height_20-25.feather', 'species_carpinus-height_0-10.feather']


In [65]:
# Load requested predictor dataset
if dataset_to_compare_to == "growth_mortality":
    print(
        "Not loading additional predictor dataset because comparison to growth_mortality data itself."
    )
else:
    file_predictors = here(
        f"data/final/predictor_datasets/{dataset_to_compare_to}.feather"
    )
    if not os.path.exists(file_predictors):
        raise ValueError(f"Predictor dataset {dataset_to_compare_to} does not exist.")
    else:
        df_pr_org = pd.read_feather(file_predictors)
        print(
            f"Shape of predictor dataset '{dataset_to_compare_to}': {df_pr_org.shape}"
        )
        # display(df_pr_org.head())

Shape of predictor dataset 'edo': (40022, 317)


### Growth & Mortality


In [66]:
# Get requested growth and mortality subset
file_name = f"species_{species_subset}-height_{height_class_subset}.feather"
if file_name in file_list:
    print(f" ✅ Requested file `{file_name}` found in directory.")
    df_gm_org = pd.read_feather(file_dir + file_name)
    print(f"     - Shape of data: {df_gm_org.shape}")
    print(f"     - Variables in data: {df_gm_org.columns.tolist()}")
    # display(df.head())
else:
    raise ValueError(f" ❌ Requested file `{file_name}` not found in directory.")

if dataset_to_compare_to == "growth_mortality":
    df_pr_org = df_gm_org.copy()

 ✅ Requested file `species_all-height_all.feather` found in directory.
     - Shape of data: (40231, 48)
     - Variables in data: ['idp', 'n_plots', 'n_a1', 'n_a2', 'n_aa', 'n_ad', 'n_ac', 'n_na', 'ba_ax_v1', 'ba_ax_v2', 'ba_aa_v1', 'ba_aa_v2', 'ba_ad_v1', 'ba_ac_v1', 'ba_na_v2', 'vol_ax_v1', 'vol_aa_v1', 'vol_ad_v1', 'vol_ac_v1', 'mort_tot_stems_prc_yr_esq', 'mort_nat_stems_prc_yr_esq', 'mort_cut_stems_prc_yr_esq', 'mort_tot_stems_prc_yr', 'mort_nat_stems_prc_yr', 'mort_cut_stems_prc_yr', 'mort_tot_ba_yr', 'mort_tot_ba_prc_yr', 'mort_nat_ba_yr', 'mort_nat_ba_prc_yr', 'mort_cut_ba_yr', 'mort_cut_ba_prc_yr', 'mort_tot_vol_yr', 'mort_tot_vol_prc_yr', 'mort_nat_vol_yr', 'mort_nat_vol_prc_yr', 'mort_cut_vol_yr', 'mort_cut_vol_prc_yr', 'grwt_stems_prc_yr', 'grwt_tot_ba_yr', 'grwt_tot_ba_prc_yr', 'grwt_tot_ba_prc_yr_hos', 'grwt_sur_ba_yr', 'grwt_sur_ba_prc_yr', 'grwt_sur_ba_prc_yr_hos', 'grwt_rec_ba_yr', 'grwt_rec_ba_prc_yr', 'change_tot_ba_yr', 'change_tot_ba_prc_yr']


### Functions


In [67]:
# Remove share of 0s in target variable
def remove_na_and_reduce_zero_share(df, target_column, max_zero_share, verbose=False):
    # Make a copy of the original dataframe
    df_org = df.copy()

    # Remove NA values
    df_nona = df.dropna(subset=[target_column])

    if verbose:
        print(f"Removing NA values from target variable '{target_column}'")
        print(
            f" - Shape of data before removing NA values:\t {df_org.shape} \t | % of NAs in target:\t {(df_org[target_column] == np.nan).mean():.2%}"
        )
        print(
            f" - Shape of data after removing NA values:\t {df_nona.shape} \t | % of NAs in target:\t {(df_nona[target_column] == np.nan).mean():.2%}"
        )
        print(f" - Number of NA values removed:\t\t\t {len(df_org) - len(df_nona)}")

    # Calculate the current share of 0 values in the target column
    df = df_nona.copy()
    zero_share = (df[target_column] == 0).mean()

    # Check if the current share exceeds the maximum allowed share
    while zero_share > max_zero_share * 1.25:
        # Calculate the number of 0 values to remove
        num_zeros_to_remove = int((zero_share - max_zero_share) * len(df))

        # Get the indices of the 0 values
        zero_indices = df[df[target_column] == 0].index

        # Randomly select indices to remove
        indices_to_remove = np.random.choice(
            zero_indices, size=num_zeros_to_remove, replace=False
        )

        # Remove the selected indices from the dataframe
        df = df.drop(indices_to_remove)

        # Recalculate the share of 0 values in the target column
        zero_share = (df[target_column] == 0).mean()

    # Verbose
    if verbose:
        print(f"\nRemoving 0 values from target variable '{target_column}'")
        print(
            f" - Shape of data before removing 0s:\t {df_nona.shape} \t | % of 0s in target:\t {(df_nona[target_column] == 0).mean():.2%}"
        )
        print(
            f" - Shape of data after removing 0s:\t {df.shape} \t | % of 0s in target:\t {(df[target_column] == 0).mean():.2%}"
        )
        print(f" - Number of 0s removed:\t\t {len(df_nona) - len(df)}")

    return df

### Directories


In [71]:
# Set dirs here to avoid changing later
dir_base = f"correlation_exploration/{dataset_to_compare_to}/species_{species_subset}-height_{height_class_subset}"
dir_target = f"{dir_base}/{target_of_interest}"
dir_heatmaps = f"{dir_base}/heatmaps"
dir_histogra = f"{dir_base}/histograms"
dir_scatters = f"{dir_target}/scatterplots"
dir_rf = f"{dir_target}/random_forest"

# Make sure all dirs exist
os.makedirs(dir_base, exist_ok=True)
# os.makedirs(dir_rf, exist_ok=True)  # Created later
os.makedirs(dir_heatmaps, exist_ok=True)
os.makedirs(dir_histogra, exist_ok=True)
os.makedirs(dir_scatters, exist_ok=True)

# Print all directories
print(f"dir_base:\t\t{dir_base}")
print(f"dir_heatmaps:\t\t{dir_heatmaps}")
print(f"dir_histogra:\t\t{dir_histogra}")
print(f"dir_scatters:\t\t{dir_scatters}")

dir_base:		correlation_exploration/edo/species_all-height_all
dir_heatmaps:		correlation_exploration/edo/species_all-height_all/heatmaps
dir_histogra:		correlation_exploration/edo/species_all-height_all/histograms
dir_scatters:		correlation_exploration/edo/species_all-height_all/mort_nat_vol_prc_yr/scatterplots


## Explore General Correlations


### Histograms of Predictors


In [72]:
# Set plot grid size nxn
grid_n = 4
grid_n_sq = grid_n**2

# List of predictors
predictors = df_pr_org.columns

# Number of figures
n_figures = np.ceil(len(predictors) / grid_n_sq)

# Ignore divide by zero warning from LOESS
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Create and save figures
for i in range(int(n_figures)):
    # Verbose
    print(f"Working on figure: {i+1} | {int(n_figures)}")
    # Check if figure already exists
    figure_path = os.path.join(dir_histogra, f"figure_{i+1}.png")
    if os.path.exists(figure_path):
        print(f" ✅ Figure {figure_path} already exists. Skipping...")
        continue

    # ! Create a new figure
    fig, axs = plt.subplots(grid_n, grid_n, figsize=(20, 20))
    axs = axs.ravel()

    # Plot each predictor
    for j in range(grid_n_sq):
        index = i * grid_n_sq + j
        if index < len(predictors):
            # Get predictor to plot
            predictor = predictors[index]
            # Verbose
            print(f"    Working on {predictor}:\t {j+1} | {grid_n_sq}", end="\t")

            # * Histogram
            print(f"adding histogram...", end="    ")
            sns.histplot(
                data=df_pr_org,
                x=predictor,
                ax=axs[j],
                color="grey",
            )

            # * Set the title
            axs[j].set_title(f"Distribution of {predictor}")

        else:
            axs[j].axis("off")

    # ! Save the figure
    plt.tight_layout()
    plt.savefig(figure_path, dpi=300)
    plt.close(fig)

# Reset warnings
warnings.resetwarnings()

Working on figure: 1 | 20


### Heatmap


In [None]:
# Get dfs
df_pr = df_pr_org.copy()
df_gm = df_gm_org.copy()

# Check if inputed df is df_gm already, else merge it on idp
if dataset_to_compare_to == "growth_mortality":
    comparing_to_gm = True
    df_merged = df_gm
    pass
else:
    comparing_to_gm = False
    df_merged = df_gm.merge(df_pr, on="idp", how="left")

# Get all meta targets
meta_targets = ["grwt", "mort_tot", "mort_cut", "mort_nat"]

# Loop over all meta targets
debug_counter = 0
for mt in meta_targets:
    print(f"Working on: {mt}")
    # Get empty correlation df
    df_corr = pd.DataFrame()

    # Get all sub targets
    sub_targets = [c for c in df_merged.columns if c.startswith(mt)]
    # Loop over all sub targets
    for st in tqdm(sub_targets):
        # print(f"Working on: {st}")
        # Remove NA and 0s from target variable
        df_loop = remove_na_and_reduce_zero_share(
            df_merged, st, share_of_0s_allowed, verbose=False
        )
        # Remove other growth_mortality variables if not comparing to it
        if not comparing_to_gm:
            to_remove = [c for c in df_gm if c != st]
            df_loop = df_loop.drop(to_remove, axis=1)
        # Calculate correlation between target and all predictors
        corr = df_loop.corr()[st].sort_values(ascending=False)
        # Save to df with subtarget as column name
        df_corr[st] = corr

    # ! Create heatmaps
    max_rows = 50
    num_heatmaps = len(df_corr) // max_rows + 1
    last_heatmap_rows = len(df_corr) % max_rows
    heatmap_counter = 0
    for i in range(num_heatmaps):
        heatmap_counter += 1
        start_index = i * max_rows
        end_index = start_index + max_rows
        if i == num_heatmaps - 1 and last_heatmap_rows < max_rows / 2:
            figsize = (20, 10)
        else:
            figsize = (20, 20)
        df_corr_subset = df_corr.iloc[start_index:end_index]

        # Create heatmap
        fig, ax = plt.subplots(figsize=figsize)
        heatmap = ax.pcolor(df_corr_subset, cmap=plt.cm.RdBu, vmin=-1, vmax=1)

        # put the major ticks at the middle of each cell
        ax.set_xticks(np.arange(df_corr_subset.shape[1]) + 0.5, minor=False)
        ax.set_yticks(np.arange(df_corr_subset.shape[0]) + 0.5, minor=False)

        # want a more natural, table-like display
        ax.invert_yaxis()
        ax.xaxis.tick_top()

        ax.set_xticklabels(df_corr_subset.columns, minor=False, rotation=45, ha="left")
        ax.set_yticklabels(df_corr_subset.index, minor=False)

        # Add colorbar
        cbar = plt.colorbar(heatmap)

        # Add correlation text annotations
        for i in range(df_corr_subset.shape[0]):
            for j in range(df_corr_subset.shape[1]):
                text = ax.text(
                    j + 0.5,
                    i + 0.5,
                    f"{df_corr_subset.iloc[i, j]:.2f}",
                    ha="center",
                    va="center",
                    color="black",
                )

        # Add title
        plt.title(
            f"Correlation between '{dataset_to_compare_to} predictors' and '{mt} targets' ({heatmap_counter}/{num_heatmaps})\n",
            fontsize=20,
            fontweight="bold",
        )

        # plt.show()
        # raise ValueError("Stop here")
        plt.savefig(
            f"{dir_heatmaps}/{mt}_{heatmap_counter}.png",
            bbox_inches="tight",
        )
        plt.close("all")

Working on: grwt


100%|██████████| 9/9 [00:00<00:00,  9.67it/s]


Working on: mort_tot


100%|██████████| 6/6 [00:00<00:00, 18.84it/s]


Working on: mort_cut


100%|██████████| 6/6 [00:00<00:00, 23.85it/s]


Working on: mort_nat


100%|██████████| 6/6 [00:00<00:00, 16.32it/s]


## Explore Target Specific Correlations


In [None]:
# Get dataframe for predicting on target
df_target = pd.merge(df_gm_org[["idp", target_of_interest]], df_pr_org, how="left")

# Remove NA and 0s from target variable
df_target = remove_na_and_reduce_zero_share(
    df_target, target_of_interest, share_of_0s_allowed, verbose=True
)

# Remove idp index
df_target = df_target.drop("idp", axis=1)

Removing NA values from target variable 'mort_nat_vol_prc_yr'
 - Shape of data before removing NA values:	 (22900, 48) 	 | % of NAs in target:	 0.00%
 - Shape of data after removing NA values:	 (22520, 48) 	 | % of NAs in target:	 0.00%
 - Number of NA values removed:			 380

Removing 0 values from target variable 'mort_nat_vol_prc_yr'
 - Shape of data before removing 0s:	 (22520, 48) 	 | % of 0s in target:	 88.13%
 - Shape of data after removing 0s:	 (2817, 48) 	 | % of 0s in target:	 5.11%
 - Number of 0s removed:		 19703


### Target distribution


In [None]:
# Create density plot of target variable
filename = f"{dir_target}/density_plot.png"
if os.path.exists(filename):
    print(f" ✅ File `{filename}` already exists.")
else:
    print(f" ❌ File `{filename}` does not exist. Creating...")
    plt.figure(figsize=(10, 6))
    plt.title(f"Density of {target_of_interest}")
    plt.xlabel(target_of_interest)
    plt.ylabel("Density")

    # Plot the KDE line after adding the text
    sns.kdeplot(data=df_target, x=target_of_interest, zorder=1)

    # Add ticks along the x-axis to indicate sample count
    sample_counts = df_target[target_of_interest].value_counts()
    ymax = plt.gca().get_ylim()[1]
    for value, _ in sample_counts.items():
        plt.text(value, ymax, "|", ha="center", va="top", alpha=0.1)

    # Save the plot under dir_target
    plt.savefig(filename, dpi=300)
    plt.close()

# Create histo plot of target variable
filename = f"{dir_target}/histo_plot.png"
if os.path.exists(filename):
    print(f" ✅ File `{filename}` already exists.")
else:
    print(f" ❌ File `{filename}` does not exist. Creating...")
    plt.figure(figsize=(10, 6))
    plt.title(f"Density of {target_of_interest}")
    plt.xlabel(target_of_interest)
    plt.ylabel("Density")

    # Plot the KDE line after adding the text
    sns.histplot(data=df_target, x=target_of_interest)

    # Add ticks along the x-axis to indicate sample count
    sample_counts = df_target[target_of_interest].value_counts()
    ymax = plt.gca().get_ylim()[1]
    for value, _ in sample_counts.items():
        plt.text(value, ymax, "|", ha="center", va="top", alpha=0.1)

    # Save the plot under dir_target
    plt.savefig(filename, dpi=300)
    plt.close()

 ✅ File `correlation_exploration/growth_mortality/species_quercus-height_all/mort_nat_vol_prc_yr/density_plot.png` already exists.
 ❌ File `correlation_exploration/growth_mortality/species_quercus-height_all/mort_nat_vol_prc_yr/histo_plot.png` does not exist. Creating...


### Scatterplots


In [None]:
# Set plot grid size nxn
grid_n = 4
grid_n_sq = grid_n**2

# List of predictors
predictors = [col for col in df_merged.columns if col != target_of_interest]

# Number of figures
n_figures = np.ceil(len(predictors) / grid_n_sq)

# y-axis limits
y_max = np.percentile(df_merged[target_of_interest].dropna(), 90)
y_min = min(0, np.percentile(df_merged[target_of_interest].dropna(), 10))

# Ignore divide by zero warning from LOESS
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Create and save figures
for i in range(int(n_figures)):
    # Verbose
    print(f"Working on figure: {i+1} | {int(n_figures)}")
    # Check if figure already exists
    figure_path = os.path.join(dir_scatters, f"figure_{i+1}.png")
    if os.path.exists(figure_path):
        print(f" ✅ Figure {figure_path} already exists. Skipping...")
        continue

    # ! Create a new figure
    fig, axs = plt.subplots(grid_n, grid_n, figsize=(20, 20))
    axs = axs.ravel()

    # Plot each predictor
    for j in range(grid_n_sq):
        index = i * grid_n_sq + j
        if index < len(predictors):
            # Get predictor to plot
            predictor = predictors[index]
            # Verbose
            print(f"    Working on {predictor}:\t {j+1} | {grid_n_sq}", end="\t")

            # Check for NaN values in predictor and target variables
            df_loop = df_merged.dropna(subset=[target_of_interest, predictor])[
                [target_of_interest, predictor]
            ]

            # * KDE plot
            # print(f"adding KDE...", end="    ")
            # sns.kdeplot(
            #     data=df_loop,
            #     x=predictor,
            #     y=target_of_interest,
            #     ax=axs[j],
            #     fill=True,
            #     warn_singular=False,
            #     # levels=100,
            #     # thresh=0,
            # )

            # * Scatterplot (much faster)
            print(f"adding scatterplot...", end="    ")
            sns.scatterplot(
                data=df_loop,
                x=predictor,
                y=target_of_interest,
                ax=axs[j],
                alpha=0.1,
                # size=1,
                linewidth=0,
                color="black",
            )

            # * LOESS smoother
            print(f"adding LEOSS...", end="    ")
            lowess = sm.nonparametric.lowess(
                df_loop[target_of_interest], df_loop[predictor], frac=0.30
            )
            axs[j].plot(lowess[:, 0], lowess[:, 1], color="red")

            # * Pearson correlation
            print(f"adding corr...")
            r, _ = pearsonr(df_loop[predictor], df_loop[target_of_interest])
            axs[j].set_title(f"{target_of_interest} ~ {predictor}\nPearson r: {r:.2f}")

            # * Set the axis limits
            x_max = np.percentile(df_loop[predictor], 99)
            x_min = min(0, np.percentile(df_loop[predictor], 1))
            axs[j].set_xlim(xmin=x_min, xmax=x_max)
            axs[j].set_ylim(ymin=y_min, ymax=y_max)
        else:
            axs[j].axis("off")

    # ! Save the figure
    plt.tight_layout()
    plt.savefig(figure_path, dpi=300)
    plt.close(fig)

# Reset warnings
warnings.resetwarnings()

Working on figure: 1 | 3
    Working on idp:	 1 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on n_plots:	 2 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on n_a1:	 3 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on n_a2:	 4 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on n_aa:	 5 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on n_ad:	 6 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on n_ac:	 7 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on n_na:	 8 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on ba_ax_v1:	 9 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on ba_ax_v2:	 10 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on ba_aa_v1:	 11 | 16	adding scatterplot...    adding LEOSS...    adding

  grid_kw = {k[5:]: v for k, v in kwargs.items()}


adding LEOSS...    adding corr...
    Working on mort_cut_stems_prc_yr:	 9 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on mort_tot_ba_yr:	 10 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on mort_tot_ba_prc_yr:	 11 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on mort_nat_ba_yr:	 12 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on mort_nat_ba_prc_yr:	 13 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on mort_cut_ba_yr:	 14 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on mort_cut_ba_prc_yr:	 15 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on mort_tot_vol_yr:	 16 | 16	adding scatterplot...    adding LEOSS...    adding corr...
Working on figure: 3 | 3
    Working on mort_tot_vol_prc_yr:	 1 | 16	adding scatterplot...    adding LEOSS...    adding corr...
    Working on mort_nat_vol

### Random Forest


In [None]:
# Run a small random forest to get feature importances for all variables
def run_rf(df_in, target_of_interest, dir_rf):
    os.makedirs(dir_rf, exist_ok=True)
    # ! Data Preparation ------------------------------------------------------
    print(f" - Prepare data...")
    # Remove NAs
    df_in = df_in.dropna(subset=target_of_interest)

    # Get X and y
    X = df_in.drop(target_of_interest, axis=1)
    y = df_in[target_of_interest]

    # Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=42
    )

    # Replace NA values in train data with mean
    X_train = X_train.fillna(X_train.mean())
    X_test = X_test.fillna(X_train.mean())

    # ! Fit ------------------------------------------------------
    print(f" - Fit random forest...")
    # Fit a random forest
    rf = RandomForestRegressor(n_estimators=100, max_depth=5, n_jobs=9)
    # Fit the model
    rf.fit(X_train, y_train)

    # ! Evaluate ------------------------------------------------------
    print(f" - Feature Importance...")
    # Get feature importances
    show_top_predictors(X_train, rf_model=rf, current_dir=dir_rf)

    print(f" - Partial Dependence...")
    # Get the top 10 predictors
    feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
    top_predictors = feature_importances[:10].index.tolist()
    # Create a 2x5 grid for subplots
    fig, axes = plt.subplots(2, 5, figsize=(20, 10))
    # Iterate over the top predictors and plot the PDP for each one
    for i, predictor in enumerate(top_predictors):
        # Create the PartialDependenceDisplay object
        pdp_display = PartialDependenceDisplay.from_estimator(
            estimator=rf,
            X=X_train,
            features=[predictor],
            response_method="auto",
            n_jobs=9,
            n_cols=5,
            ax=axes[i // 5, i % 5],
        )
        # Plot the PDP
        # pdp_display.plot()

    # Adjust the spacing between subplots
    fig.tight_layout()

    # Save the plot
    fig.savefig(f"{dir_rf}/pdp_plot.png")
    plt.close(fig)

    # Compare predicted and actual values for both, train and test
    print(f" - Modobs...")
    model_evaluation_regression(
        rf, X_train, y_train, X_test, y_test, save_directory=dir_rf
    )

In [None]:
# Create an identical copy of the data but replace every variable with random values within the same range
df_random = df_target.copy()
for col in df_random.columns:
    df_random[col] = np.random.uniform(
        low=df_target[col].min(), high=df_target[col].max(), size=len(df_random)
    )

In [None]:
# Run random forest on original data
print("Original Data RF")
run_rf(df_target, target_of_interest, f"{dir_rf}_predictors")

# Run random forest on random data
print("Random Data RF")
run_rf(df_random, target_of_interest, f"{dir_rf}_random")

Original Data RF
 - Prepare data...
 - Fit random forest...
 - Feature Importance...
 - Partial Dependence...


  ax.set_ylim([min_val, max_val])


 - Modobs...
Random Data RF
 - Prepare data...
 - Fit random forest...
 - Feature Importance...
 - Partial Dependence...


  ax.set_ylim([min_val, max_val])


 - Modobs...


In [None]:
chime.success()
raise ValueError("Stop here")

  _warn("subprocess %s is still running" % self.pid,


ValueError: Stop here

---

# Code Archive


## Heatmaps


In [None]:
def calculate_corr_and_order_cols(df, target_of_interest):
    """
    Calculate correlation between target of interest and all other variables.
    Order variables by correlation.
    """
    # Compute the correlation matrix
    corr = df.corr()

    # Sort columns by correlation with the target variable
    sorted_columns = corr[target_of_interest].sort_values(ascending=False).index

    # Order df by correlation with the target variable
    df = df[sorted_columns]

    # Return sorted df
    return df

In [None]:
#
# ! Heatmap for growth and mortality variables
# Get filename
filename = f"{dir_heatmap_growhtmort}/correlations_growth_and_mortality.png"
# Skip if file already exists
if os.path.exists(filename):
    print(f" ✅ File `{filename}` already exists.")
else:
    print(f" ❌ File `{filename}` does not exist. Creating...")
    # Order cols by correlation with target variable
    df_hm = calculate_corr_and_order_cols(df, target_of_interest)

    # Make a correlation heatmap of df
    plt.figure(figsize=(40, 40))
    sns.heatmap(
        df_hm.corr(),
        annot=True,
        cmap="RdBu",
        cbar=True,
        vmin=-1,
        vmax=1,
    )
    plt.title("Correlations of Growth and Mortality Variables")
    plt.savefig(filename, dpi=300)
    plt.close()
    # plt.show()

 ❌ File `correlation_exploration/species_quercus-heightall/mort_nat_vol_prc_yr/correlations_growth_and_mortality.png` does not exist. Creating...


In [None]:
#
# ! Heatmaps for target and predictor variables
# Sort columns by correlation with target variable
df_merged = calculate_corr_and_order_cols(df_merged, target_of_interest)

# Make heatmaps of 15 variables at a time to keep readability
map_counter = 0
for i in range(0, len(df_merged.columns), 15):
    # Check if file exists already
    map_counter += 1
    filename = f"{dir_heatmap_predictors}/correlations_{map_counter}.png"
    if os.path.exists(filename):
        print(f" ✅ File `{filename}` already exists.")
    else:
        print(f" ❌ File `{filename}` does not exist. Creating...")
        # Plot file
        # Reorder columns with target_of_interest at first position
        cols = [target_of_interest] + df_merged.columns[i : i + 15].tolist()
        df_subset = df_merged[cols]

        plt.figure(figsize=(15, 15))
        sns.heatmap(
            df_subset.corr(),
            annot=True,
            cmap="RdBu",
            cbar=True,
            vmin=-1,
            vmax=1,
        )
        plt.title(f"Correlations of Target {target_of_interest} and Predictors")
        plt.tight_layout()  # Adjust figure layout
        plt.savefig(
            filename, dpi=300, bbox_inches="tight"
        )  # Save the entire figure without cutting off

        plt.close()
        # plt.show()

 ❌ File `correlation_exploration/species_quercus-heightall/mort_nat_vol_prc_yr/edo/heatmaps/correlations_1.png` does not exist. Creating...
 ❌ File `correlation_exploration/species_quercus-heightall/mort_nat_vol_prc_yr/edo/heatmaps/correlations_2.png` does not exist. Creating...
 ❌ File `correlation_exploration/species_quercus-heightall/mort_nat_vol_prc_yr/edo/heatmaps/correlations_3.png` does not exist. Creating...
 ❌ File `correlation_exploration/species_quercus-heightall/mort_nat_vol_prc_yr/edo/heatmaps/correlations_4.png` does not exist. Creating...
 ❌ File `correlation_exploration/species_quercus-heightall/mort_nat_vol_prc_yr/edo/heatmaps/correlations_5.png` does not exist. Creating...
 ❌ File `correlation_exploration/species_quercus-heightall/mort_nat_vol_prc_yr/edo/heatmaps/correlations_6.png` does not exist. Creating...
 ❌ File `correlation_exploration/species_quercus-heightall/mort_nat_vol_prc_yr/edo/heatmaps/correlations_7.png` does not exist. Creating...
 ❌ File `correlation

In [None]:
# Correlation bar chart
# For each variable in the dataset, calculate the correlation with the target variable
corr = df_merged.corr()[target_of_interest].sort_values(ascending=False)
# Remove the target variable from the list
corr = corr.drop(target_of_interest)

# Plot the correlations
plt.figure(figsize=(20, 10))
plt.title(f"Correlation of {target_of_interest} with Predictors")
plt.xlabel("Predictor")
plt.ylabel("Pearson r")
plt.bar(corr.index, corr.values, color="RdBu")
plt.xticks(rotation=90)
plt.savefig(f"{dir_base}/correlation_bar_chart.png", dpi=300)
plt.close()