# 3.0 Training a Random Forest Regressor for DACs on N-Doped Carbon Materials

## Notebook Setup: Imports and Configuration


In [1]:
# ─────────────────────────────────────────────────────────────
#  Standard Library Imports
# ─────────────────────────────────────────────────────────────
import os
import sys
import time
import copy
import re
import itertools as it
import warnings
from pprint import pprint

# ─────────────────────────────────────────────────────────────
#  Project Source Imports
# ─────────────────────────────────────────────────────────────
sys.path.append(os.path.abspath(os.path.join('..', '..', 'src')))

from vis import *
from ml import *
from settings import *

# ─────────────────────────────────────────────────────────────
#  Numerical & Data Manipulation
# ─────────────────────────────────────────────────────────────
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.linalg import LinAlgWarning

# ─────────────────────────────────────────────────────────────
#  Plotting Libraries
# ─────────────────────────────────────────────────────────────
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, plot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.io as pio

# ─────────────────────────────────────────────────────────────
#  Machine Learning (Scikit-learn)
# ─────────────────────────────────────────────────────────────
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# ─────────────────────────────────────────────────────────────
#  Notebook Environment Settings
# ─────────────────────────────────────────────────────────────
from IPython.core.interactiveshell import InteractiveShell
from tqdm.notebook import tqdm
from tqdm import trange

# Pandas display: never truncate rows or columns, and allow up to 200
# characters of text per cell before eliding.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200
# Turn off pandas' chained-assignment check (no SettingWithCopyWarning).
pd.options.mode.chained_assignment = None

# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Initialise plotly's notebook rendering (connected=True is passed through as-is).
init_notebook_mode(connected=True)

# ─────────────────────────────────────────────────────────────
#  Warning Filters
# ─────────────────────────────────────────────────────────────
# Hide LinAlgWarning raised from sklearn modules.
warnings.filterwarnings("ignore", category=LinAlgWarning, module="sklearn")
from warnings import simplefilter
# Hide pandas PerformanceWarning wherever it is raised.
simplefilter("ignore", category=pd.errors.PerformanceWarning)


## Preparing the DACs on N-doped carbon datasets for LOGOCV

#### Loading and Preparing DFT Energy Datasets with 10-fold Cross Validation

This block of code loads preprocessed adsorption energy datasets from `.pkl` files and prepares them for **K-Fold cross-validation** using a custom utility (`add_cv_columns`). Each dataset is labeled and stored dynamically for convenient downstream use.

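> ⚠️ **Note**: Unlike the two LOGO-CV alternatives below, this code cell is **active** (not commented out) and runs as-is.  
> Its K-Fold DataFrames (e.g., `Edft_balanced_df_kfold`) are the inputs of the training and hyperparameter-tuning cells further down.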

#### 🛠️ Steps Performed:
- Define the directory containing `.pkl` data files.
- Load each dataset into memory.
- Apply **K-Fold cross-validation** (with 10 splits).
- Store each CV-ready DataFrame using both:
  - Global variable assignment (e.g., `Edft_din6_df_kfold`)
  - Central `dataframe_dict` for easier access.

This step is **required before training the Random Forest Regressor (RFR)** on K-Fold data.  
➡️ The corresponding model-training block (*RFR with k-fold CV*) is already active, so nothing needs to be uncommented for the K-Fold workflow.

In [2]:
# Directory that holds the pickled DFT adsorption-energy tables.
data_dir = "../../../data/external/dacs_energies_out"

# Pickle file name -> dataset label. The label is the stem of every variable
# name created below (label + "_kfold").
file_variables = {
    "Edacs_dft.pkl": "Edacs_dft",
    "Edft_din6_as_df.pkl": "Edft_din6_as_df",
    "Edft_din6_s_df.pkl": "Edft_din6_s_df",
    "Edft_din4_x2_df.pkl": "Edft_din4_x2_df",
    "Edft_din6_df.pkl": "Edft_din6_df",
    "Edft_din6_s_din4_x2_df.pkl": "Edft_din6_s_din4_x2_df",
    "Edft_din6_as_din4_x2_df.pkl": "Edft_din6_as_din4_x2_df",
    "Edft_balanced_df.pkl": "Edft_balanced_df",
}

# Raw tables keyed by dataset label.
# SECURITY NOTE: unpickling can execute arbitrary code, so only point data_dir
# at files you produced yourself or otherwise trust.
data_frames = {
    variable_name: pd.read_pickle(os.path.join(data_dir, file_name))
    for file_name, variable_name in file_variables.items()
}

# Cross-validation specification handed to add_cv_columns: 10-fold K-Fold.
cv_setup = {"cv_type": "kfold", "cv_spec": 10}

# CV-ready tables keyed by "<label>_kfold". add_cv_columns is not defined in
# this notebook (presumably it arrives through the project star-imports).
dataframe_dict = {
    f"{key}_kfold": add_cv_columns(df_in=df, cv_setup=cv_setup)
    for key, df in data_frames.items()
}

# Expose every CV-ready table as a notebook-level variable named after its key:
#   Edacs_dft_kfold, Edft_din6_as_df_kfold, Edft_din6_s_df_kfold,
#   Edft_din4_x2_df_kfold, Edft_din6_df_kfold, Edft_din6_s_din4_x2_df_kfold,
#   Edft_din6_as_din4_x2_df_kfold, Edft_balanced_df_kfold
# This replaces the former one-by-one re-assignments, which re-bound the very
# same objects under the very same names.
globals().update(dataframe_dict)

# The only alias whose name differs from its dictionary key ("_df" inserted);
# kept so that existing references to Edacs_dft_df_kfold continue to work.
Edacs_dft_df_kfold = dataframe_dict["Edacs_dft_kfold"]

#### Loading and Preparing DFT Energy Datasets with Dual Metal Leave-One-Group-Out Cross-Validation

This block loads adsorption energy datasets and applies **Leave-One-Group-Out cross-validation (LOGO-CV)** by metal identity using the `"M1"` group column.  
The processed data is stored both as dynamically named variables and in a central dictionary for easy access.

> ⚠️ **Note**: This code is **commented out**.  
> To use it, **uncomment** all relevant lines (remove `#` at the beginning of each line).

#### ✅ What this code does:
- Loads `.pkl` data files from a local directory
- Applies LOGO-CV using a custom `add_cv_columns()` function
- Stores the resulting DataFrames with `_logocv_metal` suffixes

This step is **required before training the Random Forest Regressor (RFR)** on LOGO-CV data.  
➡️ **Also remember to uncomment the corresponding model training block below this section**.


In [3]:
# # NOTE: this cell is intentionally disabled. To build the dual-metal LOGO-CV
# # frames (groups taken from the "M1" column), strip one leading "# " from
# # every line below.
# # Running it rebinds data_dir, file_variables, data_frames, cv_setup and
# # dataframe_dict, which the k-fold preparation cell also assigns.

# data_dir = "../../../data/external/dacs_energies_out"

# # Define the file names and corresponding variable names
# file_variables = {
#     "Edacs_dft.pkl": "Edacs_dft",
#     "Edft_din6_as_df.pkl":"Edft_din6_as_df",
#     "Edft_din6_s_df.pkl":"Edft_din6_s_df",
#     "Edft_din4_x2_df.pkl":"Edft_din4_x2_df",
#     "Edft_din6_df.pkl":"Edft_din6_df",
#     "Edft_din6_s_din4_x2_df.pkl":"Edft_din6_s_din4_x2_df",
#     "Edft_din6_as_din4_x2_df.pkl":"Edft_din6_as_din4_x2_df",
#     "Edft_balanced_df.pkl": "Edft_balanced_df"
#     }

# # Create an empty dictionary to store the DataFrames
# data_frames = {}

# # Load every pickle once and store it under its variable name
# # (a second, identical copy of this loop was removed: it only re-read the same files)
# for file_name, variable_name in file_variables.items():
#     file_path = os.path.join(data_dir, file_name)
#     data_frame = pd.read_pickle(file_path)
#     data_frames[variable_name] = data_frame


# # Leave-one-group-out CV with the "M1" column as the group label
# cv_setup = {"cv_type": "logocv", "cv_spec": 'M1'}


# # Create a dictionary to store the dataframes for easy access
# dataframe_dict = {}

# # Iterate over the data_frames dictionary
# for key, df in data_frames.items():
#     # Apply the add_cv_columns function to each dataframe
#     logocv_df = add_cv_columns(df_in=df, cv_setup=cv_setup)
#     # Create a variable name based on the original dataframe name with "_logocv_metal" appended
#     var_name = key + "_logocv_metal"
#     # Bind the LOGO-CV dataframe to that name in the notebook namespace
#     globals()[var_name] = logocv_df

#     # Store the LOGO-CV dataframe in the dataframe dictionary
#     dataframe_dict[var_name] = logocv_df



# # Explicit aliases; only Edacs_dft_df_logocv_metal differs from its dictionary key
# Edacs_dft_df_logocv_metal = dataframe_dict["Edacs_dft_logocv_metal"]
# Edft_din6_as_df_logocv_metal = dataframe_dict["Edft_din6_as_df_logocv_metal"]
# Edft_din6_s_df_logocv_metal = dataframe_dict["Edft_din6_s_df_logocv_metal"]
# Edft_din4_x2_df_logocv_metal = dataframe_dict["Edft_din4_x2_df_logocv_metal"]
# Edft_din6_df_logocv_metal = dataframe_dict["Edft_din6_df_logocv_metal"]
# Edft_din6_s_din4_x2_df_logocv_metal = dataframe_dict["Edft_din6_s_din4_x2_df_logocv_metal"]
# Edft_din6_as_din4_x2_df_logocv_metal = dataframe_dict["Edft_din6_as_din4_x2_df_logocv_metal"]
# Edft_balanced_df_logocv_metal= dataframe_dict["Edft_balanced_df_logocv_metal"]

#### Loading and preparing DFT energy datasets with Cavity Leave-One-Group-Out Cross-Validation

This section prepares the adsorption energy datasets for machine learning by applying **Leave-One-Group-Out Cross-Validation (LOGO-CV)** with the structural descriptor **`cavity_3`** as the grouping variable (any other suitable property column could be used in its place).

> ⚠️ **Note**: The entire code block is currently **commented out**.  
> To activate it, **uncomment** all the lines by removing the leading `#` symbols.

#### ✅ Key functionality:
- Loads several `.pkl` data files containing DFT-calculated adsorption energies.
- Applies LOGO-CV using the `add_cv_columns()` function, grouping by the `cavity_3` column.
- Dynamically creates variables (e.g., `Edft_din6_as_df_logocv_cavity`) and stores them in a dictionary (`dataframe_dict`) for easy access.

> 🔁 This cross-validation setup is used to assess model generalizability across different **cavity types**, making it particularly useful for structure-driven performance predictions.

➡️ You’ll need to **uncomment this section** before training models that use `cavity_3`-based LOGO-CV.


In [4]:
# # NOTE: this cell is intentionally disabled. To build the cavity LOGO-CV
# # frames (groups taken from the "cavity_3" column), strip one leading "# "
# # from every line below.
# # Running it rebinds data_dir, file_variables, data_frames, cv_setup and
# # dataframe_dict, which the k-fold preparation cell also assigns.

# data_dir = "../../../data/external/dacs_energies_out"

# # Define the file names and corresponding variable names
# file_variables = {
#     "Edacs_dft.pkl": "Edacs_dft",
#     "Edft_din6_as_df.pkl":"Edft_din6_as_df",
#     "Edft_din6_s_df.pkl":"Edft_din6_s_df",
#     "Edft_din4_x2_df.pkl":"Edft_din4_x2_df",
#     "Edft_din6_df.pkl":"Edft_din6_df",
#     "Edft_din6_s_din4_x2_df.pkl":"Edft_din6_s_din4_x2_df",
#     "Edft_din6_as_din4_x2_df.pkl":"Edft_din6_as_din4_x2_df",
#     "Edft_balanced_df.pkl": "Edft_balanced_df"
#     }

# # Create an empty dictionary to store the DataFrames
# data_frames = {}

# # Load every pickle once and store it under its variable name
# # (a second, identical copy of this loop was removed: it only re-read the same files)
# for file_name, variable_name in file_variables.items():
#     file_path = os.path.join(data_dir, file_name)
#     data_frame = pd.read_pickle(file_path)
#     data_frames[variable_name] = data_frame


# # Leave-one-group-out CV with the "cavity_3" column as the group label
# cv_setup = {"cv_type": "logocv", "cv_spec": 'cavity_3'}


# # Create a dictionary to store the dataframes for easy access
# dataframe_dict = {}

# # Iterate over the data_frames dictionary
# for key, df in data_frames.items():
#     # Apply the add_cv_columns function to each dataframe
#     logocv_df = add_cv_columns(df_in=df, cv_setup=cv_setup)
#     # Create a variable name based on the original dataframe name with "_logocv_cavity" appended
#     var_name = key + "_logocv_cavity"
#     # Bind the LOGO-CV dataframe to that name in the notebook namespace
#     globals()[var_name] = logocv_df

#     # Store the LOGO-CV dataframe in the dataframe dictionary
#     dataframe_dict[var_name] = logocv_df


# # TODO: Write within a loop and have access at any point of the code

# # Explicit aliases; only Edacs_dft_df_logocv_cavity differs from its dictionary key
# Edacs_dft_df_logocv_cavity = dataframe_dict["Edacs_dft_logocv_cavity"]
# Edft_din6_as_df_logocv_cavity = dataframe_dict["Edft_din6_as_df_logocv_cavity"]
# Edft_din6_s_df_logocv_cavity = dataframe_dict["Edft_din6_s_df_logocv_cavity"]
# Edft_din4_x2_df_logocv_cavity = dataframe_dict["Edft_din4_x2_df_logocv_cavity"]
# Edft_din6_df_logocv_cavity = dataframe_dict["Edft_din6_df_logocv_cavity"]
# Edft_din6_s_din4_x2_df_logocv_cavity = dataframe_dict["Edft_din6_s_din4_x2_df_logocv_cavity"]
# Edft_din6_as_din4_x2_df_logocv_cavity = dataframe_dict["Edft_din6_as_din4_x2_df_logocv_cavity"]
# Edft_balanced_df_logocv_cavity= dataframe_dict["Edft_balanced_df_logocv_cavity"]

## Feature Space and Target Specification

In [5]:
# ── Target variable ──────────────────────────────────────────
# Name of the regression target passed to the training / tuning calls.
target = "E_dft_M1M2"

# ── Metal features ───────────────────────────────────────────
# Keep "HOMO " (trailing space) and " E_Fermi" (leading space) byte-for-byte:
# ' E_Fermi' appears verbatim as a key in the recorded k-fold feature
# importances. "HOMO " presumably mirrors a raw column name too — TODO confirm
# against the pickled frames before normalising either string.
metal_features = [
    "atomic_mass", "vdw_radius", "r_cov_sb", "r_cov_db",
    "dipole_polarizability", "ionic_radii_crystals", "d_center_sp",
    "Paul_electroneg", "MB_electroneg", "electron_affinity",
    "covalent_radius", "atomic_number", "Ion_energ_I", "Ion_energ_II",
    "Zung_radius", "Coh_radius", "Waber_radius",
    "mied_param_h", "mied_param_phi",
    "HOMO ", "LUMO", "mag_moment_bulk_d", " E_Fermi", "E_Fermi2",
]

# ── Cavity features ──────────────────────────────────────────
cavity_features = [
    "ncoord", "number_hetero", "number_C", "frac_hetero", "frac_C",
    "number_hetero_six", "frac_hetero_six",
    "number_hetero_five", "frac_hetero_five",
    "delta_min_ds", "delta_max_ds", "fermi_energy_cavity",
    "surface", "convex_hull_area", "convex_hull_volume",
]

# ── Electronegativity and atomic-radius statistics ───────────
# Column names follow the pattern "<statistic>(<quantity>)".
en_features = [f"{stat}(en)" for stat in ("min", "mean", "max", "std", "sum")]
r_features = [f"{stat}(r)" for stat in ("min", "mean", "max", "std", "sum")]

# ── Geometrical features ─────────────────────────────────────
# Only the "*_cavity_ds" statistics are used. The matching "*_cavity_ang"
# columns (min/max/mean/std of posc_cavity_ang and cont_cavity_ang) stay
# disabled, exactly as before.
posc_cd_features = [f"{stat}(posc_cavity_ds)" for stat in ("min", "max", "mean", "std")]
cont_cd_features = [f"{stat}(cont_cavity_ds)" for stat in ("min", "max", "mean", "std")]

# ── Cross-validation flavours ────────────────────────────────
cv_types = ["kfold", "metal", "cavity"]

# ── Primary features: every group above, concatenated in order ──
primary_features = [
    *metal_features,
    *cavity_features,
    *en_features,
    *r_features,
    *posc_cd_features,
    *cont_cd_features,
]

# ── Named feature sets ───────────────────────────────────────
# The values are the list objects defined above, not copies.
primary_feature_sets = dict(
    metal=metal_features,
    cavity=cavity_features,
    full=primary_features,
)

# ── Selected features from RF-SFS for SACs on doped carbon ───
selected_features = [
    "surface", "mied_param_h", "fermi_energy_cavity", "r_cov_sb",
    "sum(r)", "MB_electroneg", "ncoord", " E_Fermi",
]

# Feature sets that the training cell iterates over. Only the selected set is
# active; the broader combinations are kept here for reference.
trial_feature_sets = [
    # metal_features,
    # cavity_features,
    # metal_features + cavity_features,
    # metal_features + cavity_features + en_features,
    # metal_features + cavity_features + r_features,
    # metal_features + cavity_features + en_features + r_features,
    # metal_features + cavity_features + en_features + r_features + posc_cd_features,
    # metal_features + cavity_features + en_features + r_features + cont_cd_features,
    # metal_features + cavity_features + en_features + r_features + posc_cd_features + cont_cd_features,
    selected_features,
]

## Hyperparameter Optimization Setup

| Parameter                  | Description                                                                 |
|---------------------------|-----------------------------------------------------------------------------|
| `bootstrap`               | Whether bootstrap samples are used when building trees (`True` = yes).      |
| `ccp_alpha`               | Complexity parameter used for Minimal Cost-Complexity Pruning (`0.0` = no pruning). |
| `criterion`               | Function to measure the quality of a split (`squared_error` = MSE).         |
| `max_depth`               | Maximum depth of each decision tree (`8` here limits model complexity).     |
| `max_features`            | Fraction of features considered at each split (`0.4` = 40% of total features). |
| `max_leaf_nodes`          | Limits the number of leaf nodes in each tree (`None` = unlimited).          |
| `max_samples`             | Number or fraction of samples to draw from X to train each base estimator (`None` = use all). |
| `min_impurity_decrease`   | Node split occurs only if the impurity decrease is at least this value.     |
| `min_samples_leaf`        | Minimum number of samples required to be at a leaf node (`1` = default).    |
| `min_samples_split`       | Minimum number of samples required to split an internal node (`2` = default). |
| `min_weight_fraction_leaf`| Minimum weighted fraction of the sum total of weights required to be at a leaf node. |
| `n_estimators`            | Number of trees in the forest (`128` = moderately sized forest).            |
| `n_jobs`                  | Number of jobs to run in parallel (`-1` = use all available cores).         |
| `oob_score`               | Whether to use out-of-bag samples to estimate generalization accuracy (`False` = no). |
| `random_state`            | Controls randomness for reproducibility (`0` = fixed seed).                 |
| `verbose`                 | Controls the verbosity of the output (`0` = silent mode).                   |
| `warm_start`              | If `True`, reuse solution of the previous call to `fit` and add more estimators. |


In [6]:
# ── Random forest configuration ──────────────────────────────
# Baseline keyword arguments for RandomForestRegressor; the training cell
# receives this dict through its rf_dict argument.
rf_dict = dict(
    bootstrap=True,
    ccp_alpha=0.0,
    criterion="squared_error",
    max_depth=8,
    max_features=0.4,
    max_leaf_nodes=None,
    max_samples=None,
    min_impurity_decrease=0.0,
    # min_impurity_split=None,  (left disabled, as before)
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=128,
    n_jobs=-1,
    oob_score=False,
    random_state=0,
    verbose=0,
    warm_start=False,
)

## Random Forest Regression Model training

The `evaluate_features_rf` function automates the process of training and evaluating a Random Forest Regression model across multiple feature sets.

##### Function Inputs
- `df_in` (`pd.DataFrame`): The input dataframe containing the features, target, and metadata (e.g., system labels).
- `feature_sets` (`list of lists`): A list where each element is a list of feature names to be evaluated as a group.
- `rf_dict` (`dict`): A dictionary specifying the hyperparameters for the `RandomForestRegressor`. This allows full control over the model configuration.
- `target` (`str`): The name of the column in `df_in` that will be used as the regression target.

##### What It Does
For each set of features:
1. Fits a `RandomForestRegressor` using the specified hyperparameters and target.
2. Extracts and prints the feature importances.
3. Uses a plotting utility to visualize predicted vs. actual values on both training and test sets.
4. Appends the best-performing model's plot to a list of figures.
5. Saves the result dataframe and error metrics to disk.

##### Output
- Returns a list of `plotly.graph_objects.Figure` objects — one for each feature set evaluated — that visualize the regression results.
- Saves:
  - A CSV of the prediction results (`df_rfr_dacs_results_metal_logocv.csv`)
  - A CSV of the error metrics (`data_errors_mlogocv.csv`)

### RFR with k-fold CV

In [7]:
# K-Fold: run evaluate_features_rf on the balanced k-fold dataset for every
# entry of trial_feature_sets; the call hands back the figures shown below.

#%%capture
figs = evaluate_features_rf(
    df_in=Edft_balanced_df_kfold,
    feature_sets=trial_feature_sets,
    rf_dict=rf_dict,
    target=target,
)

# Static exports are disabled. NOTE(review): each export writes to one fixed
# file name, so with more than one figure later ones would overwrite earlier ones.
for fig in figs:
    # pio.write_image(fig, '../../../data/figures/rfr_results/rfr_selected_features.svg', format='svg')
    # pio.write_image(fig, '../../../data/figures/rfr_results/rfr_selected_features.png', format='png',scale = 3)
    # pio.write_image(fig, '../../../data/figures/rfr_results/rfr_selected_features.pdf', format='pdf')
    fig.show()




'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calcu

{'surface': 0.1789605100468911, 'mied_param_h': 0.07562911334485176, 'fermi_energy_cavity': 0.25755186565197313, 'r_cov_sb': 0.07219847240772191, 'sum(r)': 0.19276336374781003, 'MB_electroneg': 0.05354033186152385, 'ncoord': 0.13430702430978664, ' E_Fermi': 0.03504931862944167}
['Fe' 'Ir' 'Au' 'Pd' 'Cu' 'Ag' 'Zn' 'Pt' 'Ni' 'Os' 'Rh' 'Co' 'Ru' 'Cd']
4
{'rmse_trains': [0.2644409181862694, 0.2629466604322748, 0.26326758983502796, 0.26519519361537924, 0.26511461185944746, 0.26470856211319355, 0.26820872022347897, 0.2662983663560893, 0.2679562583703541, 0.26273548012056236], 'rmse_tests': [0.5251725630455284, 0.5276514334898431, 0.5715019477394517, 0.37120847894447545, 0.3296872846672559, 0.3503224189530382, 0.3818683415325559, 0.4079001299429188, 0.4614485763632119, 0.37819084032204897], 'rmse_fulls': [0.30101980947587864, 0.3002769508858519, 0.3084809960397449, 0.2776810074143462, 0.2722948767159288, 0.27451911684654623, 0.2815543971551403, 0.28353954384330665, 0.2929420655332866, 0.27636

### RFR with Dual-Metal LOGOCV

In [8]:
# #Dual Metal LOGOCV
# # Disabled by default. Needs Edft_balanced_df_logocv_metal, which only exists
# # after the (also commented-out) dual-metal LOGO-CV data-preparation cell has
# # been enabled and run. Strip one leading "# " from every line to activate.

# #%%capture
# figs = evaluate_features_rf(df_in=Edft_balanced_df_logocv_metal, feature_sets=trial_feature_sets, rf_dict=rf_dict,
#      target=target)
# for fig in figs:
#     # pio.write_image(fig, '../../../data/figures/rfr_results/rfr_metal_logocv.svg', format='svg')
#     # pio.write_image(fig, '../../../data/figures/rfr_results/rfr_metal_logocv.png', format='png',scale = 3)
#     # pio.write_image(fig, '../../../data/figures/rfr_results/rfr_metal_logocv.pdf', format='pdf')
#     fig.show()



### RFR with Cavity LOGOCV

In [9]:
# # Cavity LOGOCV
# # Disabled by default. Needs Edft_balanced_df_logocv_cavity, which only exists
# # after the (also commented-out) cavity LOGO-CV data-preparation cell has been
# # enabled and run. Strip one leading "# " from every line to activate.

# #%%capture
# figs = evaluate_features_rf(df_in=Edft_balanced_df_logocv_cavity, feature_sets=trial_feature_sets, rf_dict=rf_dict,
#      target=target)
# for fig in figs:
#     # pio.write_image(fig, '../../../data/figures/rfr_results/rfr_cavity_logocv.svg', format='svg')
#     # pio.write_image(fig, '../../../data/figures/rfr_results/rfr_cavity_logocv.png', format='png',scale = 3)
#     # NOTE(review): the PDF path below goes through 'dacs_ml/data/...' while the SVG/PNG paths use 'data/...' — confirm which is intended
#     # pio.write_image(fig, '../../../dacs_ml/data/figures/rfr_results/rfr_cavity_logocv.pdf', format='pdf')
#     fig.show()



## Hyperparameter Tuning for RFR

This code performs hyperparameter tuning for a `RandomForestRegressor` by varying the `max_depth`. It evaluates model performance using selected features and returns an error plot showing training and test errors. 

In [10]:
# Hyperparameters held fixed while max_depth is scanned; max_depth itself is
# left out here and supplied through ml_param_dict below.
# NOTE(review): this rebinds the notebook-level rf_dict. Re-running the
# training cell after this one would pick up this dict (no "max_depth" key)
# instead of the baseline configuration.
rf_dict = dict(
    bootstrap=True,
    ccp_alpha=0.0,
    criterion="squared_error",
    # max_depth=8,  (scanned below)
    max_features=0.4,
    max_leaf_nodes=None,
    max_samples=None,
    min_impurity_decrease=0.0,
    # min_impurity_split=None,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=128,
    n_jobs=-1,
    oob_score=False,
    random_state=0,
    verbose=0,
    warm_start=False,
)

# Scan max_depth over 1, 3, ..., 15 on the balanced k-fold dataset.
# color_setup is not defined in this notebook (presumably provided by the
# project star-imports — confirm).
rf_estimator_dict = vary_ml_param(
    df_in=Edft_balanced_df_kfold,
    ml_base_model=RandomForestRegressor(**rf_dict),
    ml_features=selected_features,
    ml_targets=target,
    ml_param_dict=dict(max_depth=[*range(1, 16, 2)]),
    verbose=False,
    color_setup=color_setup,
)

# Display the figure stored under the 'error_fig' key of the result.
rf_estimator_dict["error_fig"].show()


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calcu

This code performs hyperparameter tuning for a `RandomForestRegressor` by varying the `n_estimators`. It evaluates model performance using selected features and returns an error plot showing training and test errors. 

In [11]:
# Hyperparameters held fixed while n_estimators is scanned; n_estimators
# itself is left out here and supplied through ml_param_dict below.
# NOTE(review): this rebinds the notebook-level rf_dict. Re-running the
# training cell after this one would pick up this dict (no "n_estimators" key)
# instead of the baseline configuration.
rf_dict = dict(
    bootstrap=True,
    ccp_alpha=0.0,
    criterion="squared_error",
    max_depth=8,
    max_features=0.4,
    max_leaf_nodes=None,
    max_samples=None,
    min_impurity_decrease=0.0,
    # min_impurity_split=None,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    # n_estimators=128,  (scanned below)
    n_jobs=-1,
    oob_score=False,
    random_state=0,
    verbose=0,
    warm_start=False,
)

# Scan n_estimators over 1, 51, 101, ..., 301 on the balanced k-fold dataset.
# color_setup is not defined in this notebook (presumably provided by the
# project star-imports — confirm).
rf_estimator_dict = vary_ml_param(
    df_in=Edft_balanced_df_kfold,
    ml_base_model=RandomForestRegressor(**rf_dict),
    ml_features=selected_features,
    ml_targets=target,
    ml_param_dict=dict(n_estimators=[*range(1, 350, 50)]),
    verbose=False,
    color_setup=color_setup,
)

# Display the figure stored under the 'error_fig' key of the result.
rf_estimator_dict["error_fig"].show()


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calcu

This code performs hyperparameter tuning for a `RandomForestRegressor` by varying the `max_features`. It evaluates model performance using selected features and returns an error plot showing training and test errors. 

In [12]:
# Hyperparameters held fixed while max_features is scanned; max_features
# itself is left out here and supplied through ml_param_dict below.
# NOTE(review): this rebinds the notebook-level rf_dict. Re-running the
# training cell after this one would pick up this dict (no "max_features" key)
# instead of the baseline configuration.
rf_dict = dict(
    bootstrap=True,
    ccp_alpha=0.0,
    criterion="squared_error",
    max_depth=8,
    # max_features=0.4,  (scanned below)
    max_leaf_nodes=None,
    max_samples=None,
    min_impurity_decrease=0.0,
    # min_impurity_split=None,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=128,
    n_jobs=-1,
    oob_score=False,
    random_state=0,
    verbose=0,
    warm_start=False,
)

# Scan max_features over ten evenly spaced fractions from 0.1 to 1 on the
# balanced k-fold dataset.
# color_setup is not defined in this notebook (presumably provided by the
# project star-imports — confirm).
rf_estimator_dict = vary_ml_param(
    df_in=Edft_balanced_df_kfold,
    ml_base_model=RandomForestRegressor(**rf_dict),
    ml_features=selected_features,
    ml_targets=target,
    ml_param_dict=dict(max_features=np.linspace(0.1, 1, 10)),
    verbose=False,
    color_setup=color_setup,
)

# Display the figure stored under the 'error_fig' key of the result.
rf_estimator_dict["error_fig"].show()


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calcu