# 0.2 Merging SACs-on-doped-carbon features into the DACs dataset


## Notebook Setup: Imports and Configuration

In [None]:
# ─────────────────────────────
# Standard Library Imports
# ─────────────────────────────
import os
import sys

# ─────────────────────────────
# Adjust sys.path for Local Modules
# ─────────────────────────────
src_dir = os.path.abspath(os.path.join('..', '..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

# ─────────────────────────────
# Project-Specific Imports
# ─────────────────────────────
from utils import *
from vis import *
from settings import *

## Load Processed Datasets and Feature Files

This section loads all necessary input data for model evaluation and energy analysis, including:
- Main energy datasets (`Edft_coh`, `Eint`, `Edft_fe`)
- ML prediction results (`rfr_mlogocv_df`)
- Geometric and electronic features for SACs and DACs

In [None]:
# Input directories (paths are relative to the notebook's working directory)
rfr_ads_out_dir = '../../../data/external/rfr_logocv_out'
dacs_energies_out_dir = '../../../data/external/dacs_energies_out'
sacs_features_out_dir = '../../../data/raw'
ratio_features_out_dir = '../../../data/external/features_out'


def _load_csv(folder, filename):
    """Read the CSV file `filename` located in `folder` into a DataFrame."""
    return pd.read_csv(os.path.join(folder, filename))


# Main datasets
rfr_mlogocv_df = _load_csv(rfr_ads_out_dir, 'rfr_mlogocv.csv')
Edft_coh_df = _load_csv(dacs_energies_out_dir, 'Edft_coh_df.csv')
Edft_coh_df_iqr = _load_csv(dacs_energies_out_dir, 'Edft_coh_df_iqr.csv')

# Feature datasets
sacs_features_df = _load_csv(sacs_features_out_dir, 'features_sacs_ml.csv')
bimetallic_int_energies_df = _load_csv(sacs_features_out_dir, 'bimetallic_interaction_energies.csv')
Eratio_df = _load_csv(ratio_features_out_dir, 'aspect_ratio.csv')
convex_hull_df = _load_csv(ratio_features_out_dir, 'convex_hull.csv')
mean_dist_df = _load_csv(ratio_features_out_dir, 'mean_distance.csv')

# Energies
Eint_df_iqr = _load_csv(dacs_energies_out_dir, 'Eint_df_iqr.csv')
Edft_fe_df_iqr = _load_csv(dacs_energies_out_dir, 'Edft_fe_df_iqr.csv')


# DACs on N-doped carbon dataset

| Feature                     | Description                                                                 |
|----------------------------|-----------------------------------------------------------------------------|
| `system_dacs`              | Identifier for the full DAC (dual-atom catalyst) system.                    |
| `tot_energy_dacs`          | Total DFT-calculated energy of the DAC system.                              |
| `M1`                       | Symbol of the first metal atom in the DAC.                                  |
| `M2`                       | Symbol of the second metal atom in the DAC.                                 |
| `heteroatom_dacs`          | Indicates whether a heteroatom is present in the DAC structure.             |
| `basic_cavity_dacs`        | Basic cavity label defining the DAC geometry/topology.                      |
| `cavity`                   | Standardized label for the DAC cavity.                                      |
| `cavity_v2`                | Refined version of `cavity` with cleaner or grouped labels.                 |
| `cavity_3`                 | Variant of the cavity label (e.g., for plotting/grouping).                  |
| `M1_cavity`                | Cavity structure specific to the M1 metal atom.                              |
| `cavity_4`                 | Another variant of cavity, potentially for multilevel grouping.             |
| `M2_cavity`                | Cavity structure specific to the M2 metal atom.                              |
| `system_sacs_pristine`     | Identifier for the pristine SAC system used as a reference.                |
| `tot_energy_sacs_pristine`| Total energy of the pristine single-atom catalyst structure.                |
| `heteroatom_sacs_pristine`| Indicates heteroatom presence in the pristine SAC.                          |
| `basic_cavity_sacs_pristine`| Basic cavity geometry of the pristine SAC structure.                      |
| `Ecoh_m1`                  | Cohesive energy of the M1 metal atom (used as reference).                   |
| `Ecoh_m2`                  | Cohesive energy of the M2 metal atom (used as reference).                   |
| `E_dft_M1M2`               | Adsorption energy of the M1–M2 DAC system, referenced to cohesive energies. |
| `color`                    | Color assigned to the system or metal for plotting/visualization.           |


> 💡 **Note:** We selected `Edft_coh_df_iqr` as the main dataset for constructing the final feature set.  

In [None]:
# Dimensions of the selected dataset as (rows, columns)
Edft_coh_df_iqr.shape

In [None]:
# Print every column name on a single comma-separated line
print(", ".join(Edft_coh_df_iqr.columns))

In [None]:
# Display one randomly chosen row as a quick look at the data
Edft_coh_df_iqr.sample()

In [None]:
# Summary statistics, transposed (one row per column) and rounded to 2 decimals
Edft_coh_df_iqr.describe().T.round(2)


## Adding M1 and M2 System Columns


This step creates two new columns — `M1_System` and `M2_System` — that uniquely represent the identity of each metal (M1 and M2) together with its associated coordination environment (cavity).  
The cavity roles are mapped using `dacs_dict`, which defines the position of each metal in the DAC structure.

In [None]:
# Build the "<metal>_<site label>" identifiers:
#   M1_System = M1 + first entry of dacs_dict[cavity]
#   M2_System = M2 + second entry of dacs_dict[cavity]


def _metal_site_label(row, metal_col, position):
    """Return '<metal>_<label>' for one row.

    The label is entry `position` of dacs_dict[row['cavity']]; a cavity that is
    missing from dacs_dict falls back to [None, None], giving '<metal>_None'.
    """
    site_labels = dacs_dict.get(row['cavity'], [None, None])
    return f"{row[metal_col]}_{site_labels[position]}"


for metal_col, position in (('M1', 0), ('M2', 1)):
    Edft_coh_df_iqr[f'{metal_col}_System'] = Edft_coh_df_iqr.apply(
        _metal_site_label, axis=1, args=(metal_col, position))

## Merging rfr_mlogocv_df into Edft_coh_df_iqr

This code merges the `Edft_coh_df_iqr` dataset with the SACs on doped carbon dataset `rfr_mlogocv_df`:

1. **First Merge**: Joins with `rfr_mlogocv_df` on `M1_System` to fetch the DFT simulated adsorption energy (`y`) and stores it as `y_M1_System`.
2. **Second Merge**: Repeats the process for `M2_System`, storing the result in `y_M2_System`.

In [None]:
# Only the system identifier and its 'y' value are needed from the SAC results
sac_targets = rfr_mlogocv_df[['System', 'y']]

# Left-join the SAC 'y' value once per metal site:
#   on 'M1_System' -> new column 'y_M1_System'
#   on 'M2_System' -> new column 'y_M2_System'
# A left join keeps every DAC row; sites without a matching SAC entry get NaN.
for system_col in ('M1_System', 'M2_System'):
    lookup = sac_targets.rename(columns={'System': system_col, 'y': f'y_{system_col}'})
    Edft_coh_df_iqr = Edft_coh_df_iqr.merge(lookup, on=system_col, how='left')


3. **Final Step**: Adds a new feature, `y_System`, calculated as the sum of `y_M1_System` and `y_M2_System`, representing the combined DFT simulated adsorption energy of the dual-metal system.

In [None]:
# Combined value for the dual-metal system: element-wise sum of the two site values
# (NaN in either site propagates to the sum)
y_first_site = Edft_coh_df_iqr['y_M1_System']
y_second_site = Edft_coh_df_iqr['y_M2_System']
Edft_coh_df_iqr['y_System'] = y_first_site.add(y_second_site)

| Feature                    | Description                                                                 |
|---------------------------|-----------------------------------------------------------------------------|
| `system_dacs`             | Identifier for the dual-atom catalyst (DAC) system.                         |
| `tot_energy_dacs`         | Total DFT energy of the DAC structure with both M1 and M2 adsorbed.        |
| `M1`                      | First metal atom in the DAC system.                                         |
| `M2`                      | Second metal atom in the DAC system.                                        |
| `heteroatom_dacs`         | Heteroatom(s) present in the DAC structure.                                 |
| `basic_cavity_dacs`       | Description of the base cavity type in the DAC.                             |
| `cavity`                  | Canonical cavity label used in the DAC.                                     |
| `cavity_v2`               | Alternative or updated naming version for the cavity.                       |
| `cavity_3`                | Variant or subclassification of the cavity.                                 |
| `M1_cavity`               | Cavity type associated with the M1 atom.                                    |
| `cavity_4`                | Further detailed version of the cavity label.                               |
| `M2_cavity`               | Cavity type associated with the M2 atom.                                    |
| `system_sacs_pristine`    | Identifier for the pristine SAC (single-atom catalyst) system.              |
| `tot_energy_sacs_pristine`| Total energy of the pristine (unmodified) SAC structure.                    |
| `heteroatom_sacs_pristine`| Heteroatoms in the pristine SAC.                                            |
| `basic_cavity_sacs_pristine` | Base cavity type in the pristine SAC.                                 |
| `Ecoh_m1`                 | Cohesive energy of metal M1.                                                |
| `Ecoh_m2`                 | Cohesive energy of metal M2.                                                |
| `E_dft_M1M2`              | Adsorption energy for the DAC system: total energy minus individual contributions. |
| `color`                   | Color label used for plotting (e.g., by metal or cavity).                   |
| `M1_System`               | Unique label combining M1 and its cavity (e.g., `Fe_C1`).                   |
| `M2_System`               | Unique label combining M2 and its cavity.                                   |
| `y_M1_System`             | DFT simulated adsorption energy (`y` in `rfr_mlogocv_df`) for the M1 single-metal system. |
| `y_M2_System`             | DFT simulated adsorption energy (`y` in `rfr_mlogocv_df`) for the M2 single-metal system. |
| `y_System`                | Combined DFT simulated adsorption energy for the DAC (`y_M1_System + y_M2_System`). |


In [None]:
# Dimensions after adding the M1/M2 system labels and their 'y' columns
Edft_coh_df_iqr.shape

In [None]:
# Print every column name on a single comma-separated line
print(", ".join(Edft_coh_df_iqr.columns))

In [None]:
# Display one randomly chosen row
Edft_coh_df_iqr.sample()

In [None]:
# Summary statistics, transposed (one row per column) and rounded to 2 decimals
Edft_coh_df_iqr.describe().T.round(2)


## Merging SACs on doped carbon features with the DACs on N-doped carbon dataset

We filtered the `sacs_features_df` dataset to include only entries where the heteroatom in the local coordination environment is either **carbon (C)** or **nitrogen (N)**. 

In [None]:
# Keep only SAC entries whose heteroatom is nitrogen or carbon
allowed_heteroatoms = ['N', 'C']
has_allowed_heteroatom = sacs_features_df['hetatom'].isin(allowed_heteroatoms)
sacs_features_df = sacs_features_df[has_allowed_heteroatom]

| `Feature`                | Description                                                 |
|--------------------------|-------------------------------------------------------------|
| `System`                 | Identifier for each catalytic system                         |
| `Eads_gas`               | Adsorption energy in gas phase                               |
| `Eads`                   | Adsorption energy                                           |
| `calc_energy`            | Calculated energy of the system                             |
| `metal`                  | Metal atom type                                            |
| `cavity`                 | Identifier for the cavity structure                         |
| `plot_label`             | Label used for plotting                                    |
| `metal_energy`           | Energy related to the metal atom                           |
| `atomic_mass`            | Atomic mass of the metal                                   |
| `vdw_radius`             | Van der Waals radius                                      |
| `r_cov_sb`               | Covalent radius single bond                               |
| `r_cov_db`               | Covalent radius double bond                               |
| `dipole_polarizability`  | Dipole polarizability of the metal atom                   |
| `ionic_radii_crystals`   | Ionic radius in crystal form                              |
| `d_center_sp`            | Distance to center of sp orbitals                         |
| `Paul_electroneg`        | Pauling electronegativity                                |
| `MB_electroneg`          | Mulliken–Becke electronegativity                         |
| `electron_affinity`      | Electron affinity                                        |
| `covalent_radius`        | Covalent radius                                         |
| `atomic_number`          | Atomic number                                          |
| `Ion_energ_I`            | First ionization energy                                 |
| `Ion_energ_II`           | Second ionization energy                                |
| `Zung_radius`            | Zunger radius                                         |
| `Coh_radius`             | Cohesion radius                                      |
| `Waber_radius`           | Waber radius                                         |
| `mied_param_h`           | Miedema parameter h                                  |
| `mied_param_phi`         | Miedema parameter phi                                |
| `HOMO`                   | Highest occupied molecular orbital                   |
| `LUMO`                   | Lowest unoccupied molecular orbital                  |
| `mag_moment_bulk_d`      | Magnetic moment in bulk metal                         |
| `E_Fermi`                | Fermi energy                                         |
| `E_Fermi2`               | Secondary Fermi energy                               |
| `d_electrons`            | Number of d-electrons                                |
| `vdw_radius2`            | Secondary Van der Waals radius                      |
| `total_metal_gas_energy` | Total energy of metal in gas phase                    |
| `n_e`                    | Number of electrons                                 |
| `U_diss_0`               | Dissociation energy                                |
| `ncoord`                 | Coordination number                               |
| `number_C`               | Number of carbon atoms                            |
| `number_hetero`          | Number of heteroatoms                           |
| `frac_hetero`            | Fraction of heteroatoms                         |
| `frac_C`                 | Fraction of carbon atoms                       |
| `number_hetero_six`      | Number of heteroatoms in six coordination       |
| `frac_hetero_six`        | Fraction heteroatoms in six coordination        |
| `number_hetero_five`     | Number of heteroatoms in five coordination      |
| `frac_hetero_five`       | Fraction heteroatoms in five coordination       |
| `hetatom`                | Type of heteroatom present                      |
| `r1, r2, ..., r8`        | Distances to first to eighth nearest neighbors  |
| `en1, en2, ..., en8`     | Electronegativity of neighbors 1 to 8             |
| `min(r), mean(r), ...`   | Minimum, mean, max, std, and sum of distances   |
| `min(en), mean(en), ...` | Minimum, mean, max, std, and sum of electronegativities |
| `abs_path, rel_path`     | Absolute and relative file paths                  |
| `sc, dc`                 | Structural and distance coordination metrics     |
| `chem_formula`           | Chemical formula of the system                    |
| `num_atoms`              | Number of atoms in the system                     |
| `cavity_ids`             | Identifiers for cavities                          |
| `cont_cavity_ds`         | Continuous cavity distance metrics                |
| `cont_cavity_ang`        | Continuous cavity angle metrics                   |
| `cavity_nd`              | Number density in cavity                          |
| `cavity_ang`             | Cavity angle                                      |
| `posc_cavity_ds`         | Positive cavity distance                          |
| `posc_cavity_ang`        | Positive cavity angle                             |
| `min(cont_cavity_ds), max(cont_cavity_ds), mean(cont_cavity_ds), std(cont_cavity_ds)` | Statistical metrics on continuous cavity distances |
| `min(posc_cavity_ds), max(posc_cavity_ds), mean(posc_cavity_ds), std(posc_cavity_ds)` | Statistical metrics on positive cavity distances   |
| `min(cont_cavity_ang), max(cont_cavity_ang), mean(cont_cavity_ang), std(cont_cavity_ang)` | Statistical metrics on continuous cavity angles    |
| `min(posc_cavity_ang), max(posc_cavity_ang), mean(posc_cavity_ang), std(posc_cavity_ang)` | Statistical metrics on positive cavity angles      |
| `delta_min_ds, delta_max_ds` | Change in min and max cavity distances          |
| `cavity_energy`          | Energy associated with the cavity                   |
| `fermi_energy_cavity`    | Fermi energy of cavity                              |
| `surface`                | Surface area                                      |
| `perimeter`              | Perimeter of the cavity                            |
| `z_max, z_min, z_std, z_mean` | Z-coordinates max, min, std, mean             |
| `z_max_sec, z_min_sec, z_std_sec, z_mean_sec` | Secondary z-coordinate statistics         |
| `phi`                    | Work function or related property                  |


## Merging sacs_features_df into  Edft_coh_df_iqr

This code renames the `cavity` column in `sacs_features_df` to `cavity_dc` to prevent naming conflicts. Then, it merges `Edft_coh_df_iqr` with `sacs_features_df` by matching the `M1_cavity` column in the former with the `plot_label` column in the latter, using an inner join to retain only common entries. After the merge, the redundant `cavity_dc` column is dropped. The resulting merged DataFrame is saved as `Edft_coh_df_iqr_2` for further analysis.


In [None]:
# sacs_features_df has its own 'cavity' column; rename it so it cannot clash
# with the 'cavity' column of the DAC table during the merge
sacs_features_df = sacs_features_df.rename(columns={'cavity': 'cavity_dc'})

# Attach the SAC features to each DAC row: 'M1_cavity' (DAC side) is matched
# against 'plot_label' (SAC side). Inner join: unmatched rows are dropped.
merged_df = Edft_coh_df_iqr.merge(
    sacs_features_df,
    how='inner',
    left_on='M1_cavity',
    right_on='plot_label',
)

# The renamed SAC cavity column is not needed in the merged table
merged_df = merged_df.drop(columns=['cavity_dc'])

# Next stage of the feature table
Edft_coh_df_iqr_2 = merged_df


In [None]:
# Dimensions after the inner join with the SAC features
Edft_coh_df_iqr_2.shape

💡 **Note:**  
The merged dataset has fewer rows than the original because an inner join was used. This means only rows with matching keys in both dataframes (`M1_cavity` and `plot_label`) were retained. Rows without a match were excluded, resulting in a smaller dataset.


## Merging various other properties into Edft_coh_df_iqr

💡 The dataset `Edft_coh_df_iqr_2` was merged with the `bimetallic_int_energies_df` dataframe on the `M1` column to include bimetallic interaction energy features. The resulting enriched dataset is stored in `Edft_coh_df_iqr_3`.


In [None]:

# Attach the bimetallic interaction-energy columns, matched on the first metal 'M1'.
# Inner join: rows whose 'M1' is absent from bimetallic_int_energies_df are dropped.
merged_df_bimetallic_int = Edft_coh_df_iqr_2.merge(
    bimetallic_int_energies_df,
    how='inner',
    on='M1',
)

# Next stage of the feature table
Edft_coh_df_iqr_3 = merged_df_bimetallic_int


💡 The dataset `Edft_coh_df_iqr_3` was merged with the `convex_hull_df` on the `cavity_v2` column to incorporate structural features related to the convex hull of the coordination environment. The updated dataset is stored as `Edft_coh_df_iqr_4`.

In [None]:

# Attach the convex-hull features, matched on the cavity label 'cavity_v2'.
# Inner join: rows whose 'cavity_v2' is absent from convex_hull_df are dropped.
merged_df_convex_hull = Edft_coh_df_iqr_3.merge(
    convex_hull_df,
    how='inner',
    on='cavity_v2',
)

# Next stage of the feature table
Edft_coh_df_iqr_4 = merged_df_convex_hull


💡 The dataset `Edft_coh_df_iqr_4` was merged with the `mean_dist_df` on the `cavity_v2` column to add mean distance features related to the cavity's geometry. The resulting dataset is stored as `Edft_coh_df_iqr_5`.

In [None]:

# Attach the mean-distance features, matched on the cavity label 'cavity_v2'.
# Inner join: rows whose 'cavity_v2' is absent from mean_dist_df are dropped.
merged_df_mean_distance = Edft_coh_df_iqr_4.merge(
    mean_dist_df,
    how='inner',
    on='cavity_v2',
)

# Next stage of the feature table
Edft_coh_df_iqr_5 = merged_df_mean_distance


In [None]:
# Dimensions after the bimetallic, convex-hull and mean-distance merges
Edft_coh_df_iqr_5.shape

## Merging Eint_df into  Edft_coh_df_iqr

In [None]:
# Bring in the 'Eint' column for each DAC, matched on 'system_dacs' (inner join).
# NOTE(review): the result overwrites Edft_coh_df_iqr_5, so running this cell a
# second time would merge 'Eint' again (pandas then suffixes the clashing columns).
eint_lookup = Eint_df_iqr[['system_dacs', 'Eint']]
Edft_coh_df_iqr_5 = Edft_coh_df_iqr_5.merge(
    eint_lookup,
    how='inner',
    on='system_dacs',
)

# TODO: detect NaN values after the merge

# Keep only the first occurrence of every column name
is_repeated_column = Edft_coh_df_iqr_5.columns.duplicated()
Edft_coh_df_iqr_6 = Edft_coh_df_iqr_5.loc[:, ~is_repeated_column]


In [None]:
# Dimensions after adding 'Eint' and removing repeated column names
Edft_coh_df_iqr_6.shape

💡 The dataset `Edft_coh_df_iqr_6` was merged with `Eratio_df` on the `cavity_v2` column to incorporate aspect ratio and distance-related geometric features (`r_min_max`, `dist_min`, and `dist_max`). The resulting dataset, with duplicate columns removed, is stored as `Edft_coh_df_iqr_7`. A `TODO` is noted to handle any missing (`NaN`) values in future preprocessing.


In [None]:
# Attach the aspect-ratio / distance features, matched on 'cavity_v2'.
# Left join: every row is kept; cavities without an entry in Eratio_df get NaN.
# NOTE(review): the result overwrites Edft_coh_df_iqr_6, so running this cell a
# second time would merge the same columns again.
ratio_lookup = Eratio_df[['cavity_v2', 'r_min_max', 'dist_min', 'dist_max']]
Edft_coh_df_iqr_6 = Edft_coh_df_iqr_6.merge(
    ratio_lookup,
    how='left',
    on='cavity_v2',
)

# TODO: detect NaN values introduced by the left join

# Keep only the first occurrence of every column name
is_repeated_column = Edft_coh_df_iqr_6.columns.duplicated()
Edft_coh_df_iqr_7 = Edft_coh_df_iqr_6.loc[:, ~is_repeated_column]


In [None]:
# Dimensions of the final merged feature table
Edft_coh_df_iqr_7.shape

In [None]:
# Print every column name on a single comma-separated line
print(", ".join(Edft_coh_df_iqr_7.columns))

In [None]:
# Display one randomly chosen row
Edft_coh_df_iqr_7.sample()

In [None]:
# Summary statistics, transposed (one row per column) and rounded to 2 decimals
Edft_coh_df_iqr_7.describe().T.round(2)


## Exporting Full and Subset Datasets by Cavity Type

This block saves the final processed dataset `Edft_coh_df_iqr_7` as both `.csv` and `.pkl` files. In addition, several filtered subsets based on specific cavity patterns (`din6_s`, `din6_as`, `din4_x2`, etc.) are extracted and saved individually. 

In [None]:

# Work on an independent deep copy of the final feature table. One copy is
# enough: nothing in this cell modifies the frame afterwards.
Edft_coh_df_iqr_7 = Edft_coh_df_iqr_7.copy(deep=True)


def _export_csv_and_pickle(df, stem):
    """Write `df` as `<stem>.csv` and `<stem>.pkl` inside `dacs_energies_out_dir`.

    Both files are always written from the same DataFrame, so the CSV and the
    pickle of one dataset cannot get out of sync.

    Returns:
        The (csv_path, pkl_path) pair that was written.
    """
    csv_path = os.path.join(dacs_energies_out_dir, stem + '.csv')
    pkl_path = os.path.join(dacs_energies_out_dir, stem + '.pkl')
    df.to_csv(csv_path, index=False, header=True)
    df.to_pickle(pkl_path)
    return csv_path, pkl_path


def _rows_with_cavity(pattern):
    """Return the rows of `Edft_coh_df_iqr_7` whose 'cavity' matches the regex `pattern`."""
    cavity_matches = Edft_coh_df_iqr_7['cavity'].str.contains(pattern, regex=True)
    return Edft_coh_df_iqr_7[cavity_matches]


# Full dataset
save_path_csv, save_path_pkl = _export_csv_and_pickle(Edft_coh_df_iqr_7, 'Edacs_dft')

# din6_s cavities
Edft_din6_s_df = _rows_with_cavity('din6_s')
save_path_csv, save_path_pkl = _export_csv_and_pickle(Edft_din6_s_df, 'Edft_din6_s_df')

# din6_as cavities. Filtered from the final table (Edft_coh_df_iqr_7) like every
# other subset; the earlier code filtered the pre-deduplication frame
# Edft_coh_df_iqr_6 here.
Edft_din6_as_df = _rows_with_cavity('din6_as')
save_path_csv, save_path_pkl = _export_csv_and_pickle(Edft_din6_as_df, 'Edft_din6_as_df')

# din4_x2 cavities
Edft_din4_x2_df = _rows_with_cavity('din4_x2')
save_path_csv, save_path_pkl = _export_csv_and_pickle(Edft_din4_x2_df, 'Edft_din4_x2_df')

# Every din6 cavity (pattern 'din6' matches both din6_s and din6_as). The CSV is
# now written from Edft_din6_df; the earlier code wrote the din6_s subset into
# 'Edft_din6_df.csv' by mistake.
Edft_din6_df = _rows_with_cavity('din6')
save_path_csv, save_path_pkl = _export_csv_and_pickle(Edft_din6_df, 'Edft_din6_df')

# din6_s or din4_x2 cavities
Edft_din6_s_din4_x2_df = _rows_with_cavity('din4_x2|din6_s')
save_path_csv, save_path_pkl = _export_csv_and_pickle(
    Edft_din6_s_din4_x2_df, 'Edft_din6_s_din4_x2_df')

# din6_as or din4_x2 cavities
Edft_din6_as_din4_x2_df = _rows_with_cavity('din6_as|din4_x2')
save_path_csv, save_path_pkl = _export_csv_and_pickle(
    Edft_din6_as_din4_x2_df, 'Edft_din6_as_din4_x2_df')


## Balancing the Dataset

To ensure a more uniform representation across different cavity types, we grouped the dataset by the `basic_cavity_dacs` column and sampled up to 600 entries per group (groups with fewer than 600 entries are kept in full). This helps avoid biases in downstream analysis or modeling. The balanced dataset was saved in both `.csv` and `.pkl` formats for future use.

In [None]:
# Work on an independent deep copy of the final feature table
Edft_coh_df_iqr_7 = Edft_coh_df_iqr_7.copy(deep=True)

# Upper bound on the number of rows kept per 'basic_cavity_dacs' category
desired_samples = 600

# Draw at most `desired_samples` rows from every category; categories with
# fewer rows are kept in full. random_state=42 makes the draw reproducible.
# Each group is sampled explicitly instead of through groupby(...).apply(...):
# applying a function to the grouping column is deprecated in recent pandas,
# and once grouping columns are excluded the 'basic_cavity_dacs' column would
# be missing from the balanced table. Iterating the groups yields the same
# rows in the same order and always keeps every column.
sampled_groups = [
    group.sample(n=min(len(group), desired_samples), random_state=42)
    for _, group in Edft_coh_df_iqr_7.groupby('basic_cavity_dacs')
]

if sampled_groups:
    balanced_df = pd.concat(sampled_groups, ignore_index=True)
else:
    # No groups at all (empty input, or every key is NaN): pd.concat would
    # raise on an empty list, so return an empty frame with the same columns.
    balanced_df = Edft_coh_df_iqr_7.iloc[0:0].copy()

# Save the balanced dataset as CSV and pickle
save_path_csv = os.path.join(dacs_energies_out_dir, 'Edft_balanced_df.csv')
balanced_df.to_csv(save_path_csv, index=False, header=True)

save_path_pkl = os.path.join(dacs_energies_out_dir, 'Edft_balanced_df.pkl')
balanced_df.to_pickle(save_path_pkl)


In [None]:
# Dimensions of the balanced dataset as (rows, columns)
balanced_df.shape