# 0.1 Importing the DACs on N-doped carbon dataset

## Notebook Setup: Imports and Configuration


In [None]:
# ─────────────────────────────
# Standard Library Imports
# ─────────────────────────────
import os
import sys

# ─────────────────────────────
# Adjust sys.path for Local Modules
# ─────────────────────────────
src_dir = os.path.abspath(os.path.join('..', '..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

# ─────────────────────────────
# Project-Specific Imports
# ─────────────────────────────
from settings import *
from utils import *
from vis import *

## Import the DACs on N-doped carbon data

In [None]:
# Directory the raw DFT input CSV files are read from
dft_dir = '../../../data/raw'
# Directory the derived energy tables (CSV / pickle) are written to
dacs_energies_out_dir = '../../../data/external/dacs_energies_out'

In [None]:
def _read_raw(csv_name):
    """Read one CSV file located in the raw DFT data directory (`dft_dir`)."""
    return pd.read_csv(os.path.join(dft_dir, csv_name))


# Cohesive energies (and other per-metal data) of the individual metals
metal_df = _read_raw('metal_data.csv')

# DFT total energies of the DACs on N-doped carbon
total_energy_dacs_df = _read_raw('total_energy_dacs.csv')

# DFT total energies of the pristine DAC local coordination environments (no adsorbates)
total_energy_sacs_dc_prist_df = _read_raw('total_energy_sacs_dc_prist.csv')

# DFT total energies of a single metal on the DAC local coordination environments
total_energy_sacs_df = _read_raw('total_energy_sacs_dc.csv')


### Cohesive metal dataset

In [None]:
metal_df.shape

In [None]:
print(", ".join(metal_df.columns))

In [None]:
metal_df.sample()

In [None]:
metal_df.describe().T.round(2)


### DACs on N-doped carbon total energy dataset

In [None]:
total_energy_dacs_df.shape

In [None]:
print(", ".join(total_energy_dacs_df.columns))

In [None]:
total_energy_dacs_df.sample()

In [None]:
total_energy_dacs_df.head() 


In [None]:
total_energy_dacs_df.describe().T.round(2)


### Pristine DACs on N-doped carbon total energy dataset

In [None]:
# Rows/columns of the pristine total-energy table described in this section
total_energy_sacs_dc_prist_df.shape

In [None]:
print(", ".join(total_energy_sacs_dc_prist_df.columns))

In [None]:
total_energy_sacs_dc_prist_df.sample()

In [None]:
total_energy_sacs_dc_prist_df.describe().T.round(2)


### Single atom on DACs local coordination environment total energies dataset

In [None]:
total_energy_sacs_df.shape

In [None]:
print(", ".join(total_energy_sacs_df.columns))

In [None]:
total_energy_sacs_df.sample()

In [None]:
total_energy_sacs_df.describe().T.round(2)


## Extract Structural Features for DACs on N-doped carbon total energy dataset

This cell parses the `system` string column in the DAC total energy dataset to extract key structural and compositional features. 
New columns are created for metal identifiers (`M1`, `M2`), heteroatoms, cavity types, and engineered cavity labels used for analysis. 
This enables downstream grouping, filtering, or visualization based on atomic configuration.


In [None]:
# Every feature below is a fixed-position slice of the `system` identifier.
_system = total_energy_dacs_df['system']

# Metal symbols, heteroatom and 4-character cavity code
total_energy_dacs_df['M1'] = _system.str.slice(stop=2)
total_energy_dacs_df['M2'] = _system.str.slice(3, 5)
total_energy_dacs_df['heteroatom'] = _system.str.get(6)
total_energy_dacs_df['basic_cavity'] = _system.str.slice(8, 12)

# Full cavity label (heteroatom onward); `cavity_v2` starts out as an identical copy
total_energy_dacs_df['cavity'] = _system.str.slice(start=6)
total_energy_dacs_df['cavity_v2'] = _system.str.slice(start=6)

# Single digit that follows a 'c' in the cavity label
total_energy_dacs_df['cavity_3'] = total_energy_dacs_df['cavity'].str.extract(r'c(\d)')

# Everything after the heteroatom separator, shared by the two per-metal labels below
_cavity_suffix = _system.str.slice(start=8)

total_energy_dacs_df['M1_cavity'] = (
    total_energy_dacs_df['M1'] + '_' + total_energy_dacs_df['heteroatom'] + '_' + _cavity_suffix
)

# Families are tested in this order; the first one contained in the label wins.
_CAVITY_FAMILIES = ('din4_x2', 'din6_s', 'din6_as')


def _cavity_family(label):
    """Return the first known cavity family contained in `label`, or 'other'."""
    for family in _CAVITY_FAMILIES:
        if family in label:
            return family
    return 'other'


total_energy_dacs_df['cavity_4'] = total_energy_dacs_df['cavity'].apply(_cavity_family)

total_energy_dacs_df['M2_cavity'] = (
    total_energy_dacs_df['M2'] + '_' + total_energy_dacs_df['heteroatom'] + '_' + _cavity_suffix
)


### Column Descriptions

| Column         | Description |
|----------------|-------------|
| `system`       | Unique identifier for each DAC structure, encoding metal atoms, heteroatom, and cavity configuration. |
| `tot_energy`   | Total DFT-calculated energy of the DAC system (in eV). |
| `M1`           | Symbol of the first metal atom (first two characters of `system`). |
| `M2`           | Symbol of the second metal atom (characters 4–5 of `system`). |
| `heteroatom`   | The non-metal heteroatom (e.g., N, O, S) present in the cavity (character 7 of `system`, i.e. index 6). |
| `basic_cavity` | Basic label for the cavity configuration (characters 9–12 of `system`, i.e. slice `[8:12]`). |
| `cavity`       | Substring of `system` starting from the heteroatom, describing full cavity configuration. |
| `cavity_v2`    | Duplicate of `cavity` (used for flexible processing or alternate mappings). |
| `cavity_3`     | Numeric identifier of the cavity (extracted from pattern `c#` in the `cavity` string). |
| `M1_cavity`    | Concatenation of `M1`, `heteroatom`, and the full cavity suffix (character 9 onward of `system`), joined by `_`, to create a composite identifier for the M1-cavity configuration. |
| `cavity_4`     | Simplified cavity label categorizing into `din4_x2`, `din6_s`, `din6_as`, or `other`, based on string patterns. |
| `M2_cavity`    | Concatenation of `M2`, `heteroatom`, and the full cavity suffix (character 9 onward of `system`), joined by `_`, to create a composite identifier for the M2-cavity configuration. |


In [None]:
# Rows/columns of the DAC table after the structural feature columns were added
total_energy_dacs_df.shape

In [None]:
print(", ".join(total_energy_dacs_df.columns))

In [None]:
total_energy_dacs_df.sample()

In [None]:
total_energy_dacs_df.describe().T.round(2)


## Extract Structural Features for pristine DACs on N-doped carbon total energy dataset

In [None]:
# Both features are fixed-position slices of the pristine `system` identifier.
_prist_system = total_energy_sacs_dc_prist_df['system']
total_energy_sacs_dc_prist_df['heteroatom'] = _prist_system.str.get(0)
total_energy_sacs_dc_prist_df['basic_cavity'] = _prist_system.str.slice(2, 6)


### Column Descriptions

| Column         | Description |
|----------------|-------------|
| `system`       | Unique identifier for each pristine DAC system, encoding the heteroatom and cavity structure. |
| `tot_energy`   | Total DFT-calculated energy of the pristine DAC system (in eV). |
| `heteroatom`   | The non-metal heteroatom present in the cavity (first character of `system`). |
| `basic_cavity` | Basic cavity configuration label extracted from characters 3 to 6 of `system` (slice `[2:6]`). |


In [None]:
total_energy_sacs_dc_prist_df.shape

In [None]:
print(", ".join(total_energy_sacs_dc_prist_df.columns))

In [None]:
total_energy_sacs_dc_prist_df.sample()

In [None]:
total_energy_sacs_dc_prist_df.describe().T.round(2)


## Extract Structural Features for single atoms on DACs on N-doped carbon total energy dataset

In [None]:
# All features are fixed-position slices of the SAC `system` identifier.
_sac_system = total_energy_sacs_df['system']
total_energy_sacs_df['metal'] = _sac_system.str.slice(stop=2)
total_energy_sacs_df['heteroatom'] = _sac_system.str.get(3)
total_energy_sacs_df['basic_cavity'] = _sac_system.str.slice(5, 9)
total_energy_sacs_df['cavity'] = _sac_system.str.slice(start=3)

### Column Descriptions

| Column         | Description |
|----------------|-------------|
| `system`       | Unique identifier for each SAC system, encoding the metal, heteroatom, and cavity configuration. |
| `tot_energy`   | Total DFT-calculated energy of the SAC system (in eV). |
| `metal`        | The single metal atom in the SAC, extracted from the first two characters of `system`. |
| `heteroatom`   | The non-metal heteroatom coordinated to the metal atom, extracted from the 4th character of `system`. |
| `basic_cavity` | Basic cavity structure, 4 characters long, extracted from characters 6 to 9 of `system` (slice `[5:9]`). |
| `cavity`       | Full cavity description including heteroatom and structure, extracted from character 4 onward in `system`. |


In [None]:
total_energy_sacs_df.shape

In [None]:
print(", ".join(total_energy_sacs_df.columns))

In [None]:
total_energy_sacs_df.sample()

In [None]:
total_energy_sacs_df.describe().T.round(2)


# Calculating the Energies of the DACs on N-doped carbon dataset

The adsorption energy for the DACs is defined:

$$
E_{\text{ads}} = E_{\text{M1,M2,cc}} - E_{\text{cc}} - E_{\text{M1}} - E_{\text{M2}}
$$

where:\
$E_{\text{M1,M2,cc}}$ is the total energy of the adsorbed metal pair
on the N-doped carbon\
$E_{\text{cc}}$ is the energy of the pristine N-doped
carbon\
$E_{\text{M1}}$ is the cohesive energy of the metal M1 \
$E_{\text{M2}}$ is the cohesive energy of the metal M2

## Merging DFT Energies and Calculating Adsorption Energy

This step merges the DAC total energies with the pristine-support energies and the cohesive energies of the individual metals to compute the adsorption energy (`E_dft_M1M2`) of the metal pair on the N-doped carbon support. The energy is calculated by subtracting the pristine-support energy and the cohesive energies of both metals from the total DAC energy. The resulting dataset is saved for further analysis.

In [None]:
# Rewrite the DAC cavity labels with the project helper `adjust_names`
# NOTE(review): presumably this maps them onto the pristine `system` naming used as merge key below — confirm
total_energy_dacs_df['cavity'] = total_energy_dacs_df['cavity'].apply(adjust_names)

# Pair every DAC with its pristine reference (default inner join: DACs without a match are dropped)
merged_df = pd.merge(
    total_energy_dacs_df,
    total_energy_sacs_dc_prist_df,
    left_on='cavity',
    right_on='system',
    suffixes=('_dacs', '_sacs_pristine'),
)

# Cohesive-energy lookup reused for both metal slots
_ecoh_lookup = metal_df[['metal', 'Ecoh']]

# Attach Ecoh of M1; the helper key column 'metal' is discarded after the join
merged_m1_df = (
    pd.merge(merged_df, _ecoh_lookup, left_on='M1', right_on='metal', how='left')
    .rename(columns={'Ecoh': 'Ecoh_m1'})
    .drop(columns=['metal'])
)

# Attach Ecoh of M2 the same way
merged_m1_m2_df = (
    pd.merge(merged_m1_df, _ecoh_lookup, left_on='M2', right_on='metal', how='left')
    .rename(columns={'Ecoh': 'Ecoh_m2'})
    .drop(columns=['metal'])
)

# E_dft_M1M2 = E(DAC) - E(pristine) - Ecoh(M1) - Ecoh(M2), evaluated left to right
merged_m1_m2_df['E_dft_M1M2'] = (
    merged_m1_m2_df['tot_energy_dacs']
    .sub(merged_m1_m2_df['tot_energy_sacs_pristine'])
    .sub(merged_m1_m2_df['Ecoh_m1'])
    .sub(merged_m1_m2_df['Ecoh_m2'])
)

# Independent copy that the rest of the notebook filters and annotates
Edft_coh_df = merged_m1_m2_df.copy()

# Persist the table as CSV ...
save_path = os.path.join(dacs_energies_out_dir, 'Edft_coh_df.csv')
Edft_coh_df.to_csv(save_path, index=False, header=True)

# ... and as pickle
save_path_pkl = os.path.join(dacs_energies_out_dir, 'Edft_coh_df.pkl')
Edft_coh_df.to_pickle(save_path_pkl)

| **Feature**               | **Description**                                                                                  |
|---------------------------|------------------------------------------------------------------------------------------------|
| `system_dacs`             | Identifier for the DAC system (dual-atom catalyst)                                             |
| `tot_energy_dacs`         | Total DFT energy of the DAC system                                                             |
| `M1`                      | Metal 1 element symbol in the DAC system                                                       |
| `M2`                      | Metal 2 element symbol in the DAC system                                                       |
| `heteroatom_dacs`         | Heteroatom element symbol in the DAC system                                                    |
| `basic_cavity_dacs`       | Basic cavity structure identifier in the DAC system                                           |
| `cavity`                  | Adjusted cavity identifier used for merging                                                    |
| `cavity_v2`               | Alternative cavity identifier (same as `cavity`)                                               |
| `cavity_3`                | Extracted cavity subtype (e.g., numeric pattern)                                               |
| `M1_cavity`               | Combination of M1 metal, heteroatom, and cavity details                                        |
| `cavity_4`                | Categorized cavity type (e.g., `din4_x2`, `din6_s`, `din6_as`, `other`)                        |
| `M2_cavity`               | Combination of M2 metal, heteroatom, and cavity details                                        |
| `system_sacs_pristine`    | Identifier for the pristine SAC system (single-atom catalyst)                                 |
| `tot_energy_sacs_pristine`| Total DFT energy of the pristine SAC system                                                   |
| `heteroatom_sacs_pristine`| Heteroatom element symbol in the pristine SAC system                                          |
| `basic_cavity_sacs_pristine` | Basic cavity structure identifier in the pristine SAC system                             |
| `Ecoh_m1`                 | Cohesive energy of metal M1                                                                   |
| `Ecoh_m2`                 | Cohesive energy of metal M2                                                                   |
| `E_dft_M1M2`              | Calculated adsorption energy: total DAC energy minus pristine SAC energy and cohesive energies |


In [None]:
Edft_coh_df.shape

In [None]:
print(", ".join(Edft_coh_df.columns))

In [None]:
Edft_coh_df.sample()

In [None]:
Edft_coh_df.describe().T.round(2)


In [None]:
# # Select specific configuration
# print(Edft_coh_df[Edft_coh_df['system_dacs'] == 'Ni_Fe_N_din6_as_c0']['E_dft_M1M2'])

#### Remove the unstable cavity M_N_din4_x2_c4_f and the `_v2` cavity variants

In [None]:
# Rows to exclude: the unstable `N_din4_x2_c4_f` cavity and every cavity label ending in `_v2`.
_is_unstable_cavity = Edft_coh_df['cavity_v2'] == 'N_din4_x2_c4_f'
_is_v2_variant = Edft_coh_df['cavity_v2'].str.endswith('_v2')
# Optional extra exclusion left disabled by the author: Edft_coh_df['cavity_v2'] == 'N_din6_s_c4_0134'

# .copy() makes the result an independent frame, so the later
# `Edft_coh_df['color'] = ...` assignments do not write into a slice
# (which would raise pandas' SettingWithCopyWarning).
Edft_coh_df = Edft_coh_df[~_is_unstable_cavity & ~_is_v2_variant].copy()

In [None]:
Edft_coh_df.shape


In [None]:
total_energy_dacs_df.shape

## Boxplot of the E_dft_M1M2 categorized by metal

In [None]:
fig = plot_categorical_energy_boxplot(
    df=Edft_coh_df,
    x_col='M1',
    color_map=metal_colors,
    y_col='E_dft_M1M2',
    hover_col='system_dacs',
    save_dir='../../../data/figures/dacs_eda',
    x_title='Metal',
    y_title='E_dft_M1M2 Energy',
    file_name='Ecoh_M1_boxplot_1'
)

fig.show()


## Boxplot of the E_dft_M1M2 categorized by metal - no outliers

In [None]:

# Uses Edft_coh_df from the cells above; metal_colors, detect_outliers and
# plot_categorical_energy_boxplot are not defined in this notebook
# (presumably provided by the star-imported project modules).

# Add a per-row colour looked up from the M1 metal symbol
Edft_coh_df['color'] = Edft_coh_df['M1'].map(metal_colors)


# Rows flagged as E_dft_M1M2 outliers, with 'M1' passed as the grouping column
# NOTE(review): detect_outliers is a project helper; presumably a per-group IQR rule — confirm
outliers_dacs_metal = detect_outliers(Edft_coh_df, 'M1', 'E_dft_M1M2')


# Save the outliers to a CSV file
outliers_path = os.path.join(dacs_energies_out_dir, 'outliers_dacs_metal.csv')
outliers_dacs_metal.to_csv(outliers_path, index=False, header=True)

# Keep only the rows whose index label is not among the flagged outliers
# NOTE(review): assumes detect_outliers preserves the index labels of Edft_coh_df — confirm
Edft_coh_df_no_outliers_metal = Edft_coh_df[~Edft_coh_df.index.isin(outliers_dacs_metal.index)]

# Same boxplot as before, drawn from the outlier-free subset
fig_no_outliers = plot_categorical_energy_boxplot(
    df=Edft_coh_df_no_outliers_metal,
    x_col='M1',
    color_map=metal_colors,
    y_col='E_dft_M1M2',
    hover_col='system_dacs',
    save_dir='../../../data/figures/dacs_eda',
    x_title='Metal',
    y_title='E_dft_M1M2 Energy',
    file_name='Ecoh_M1_boxplot_no_outliers'
)

fig_no_outliers.show()



In [None]:
metal_summary = Edft_coh_df_no_outliers_metal.groupby('M1')['E_dft_M1M2'].describe().T
print(metal_summary)

## Boxplot of the E_dft_M1M2 categorized by cavity 

In [None]:
fig_cavity = plot_categorical_energy_boxplot(
    df=Edft_coh_df,
    x_col='cavity_v2',
    color_map=cavity_colors,
    y_col='E_dft_M1M2',
    hover_col='system_dacs',
    x_title='Cavity',
    y_title='E_dft_M1M2 Energy',
    save_dir='../../../data/figures/dacs_eda',
    file_name='Ecoh_M1_boxplot_cavity'
)

fig_cavity.show()


## Boxplot of the E_dft_M1M2 categorized by cavity - no outliers

In [None]:
# Map colors to the cavity labels; this overwrites the metal-based 'color' column set earlier
Edft_coh_df['color'] = Edft_coh_df['cavity_v2'].map(cavity_colors)

# Rows flagged as E_dft_M1M2 outliers, with 'cavity_v2' passed as the grouping column
# NOTE(review): detect_outliers is a project helper; presumably a per-group IQR rule — confirm
outliers_cavity_dacs = detect_outliers(Edft_coh_df, 'cavity_v2', 'E_dft_M1M2')

# Save the outliers to a CSV file
outliers_path = os.path.join(dacs_energies_out_dir, 'outliers_dacs_cavity.csv')
outliers_cavity_dacs.to_csv(outliers_path, index=False, header=True)

# Keep only the rows whose index label is not among the flagged outliers
# NOTE(review): assumes detect_outliers preserves the index labels of Edft_coh_df — confirm
Edft_coh_df_no_outliers_cav = Edft_coh_df[~Edft_coh_df.index.isin(outliers_cavity_dacs.index)]

# Per-cavity boxplot drawn from the outlier-free subset
fig_no_outliers_cavity = plot_categorical_energy_boxplot(
    df=Edft_coh_df_no_outliers_cav,
    x_col='cavity_v2',
    color_map=cavity_colors,
    y_col='E_dft_M1M2',
    hover_col='system_dacs',
    x_title='Cavity',
    y_title='E_dft_M1M2 Energy',
    save_dir='../../../data/figures/dacs_eda',
    file_name='Ecoh_M1_boxplot_cavity_no_outliers'
)

fig_no_outliers_cavity.show()



In [None]:
cavity_summary = Edft_coh_df_no_outliers_cav.groupby('cavity_v2')['E_dft_M1M2'].describe().T
print(cavity_summary)

## Data Cleaning: Removing Metal and Cavity Outliers

In [None]:
# Union of the index labels flagged per metal (outliers_dacs_metal) and per cavity (outliers_cavity_dacs)
combined_outliers_indices = outliers_dacs_metal.index.union(outliers_cavity_dacs.index)

# Keep only the rows flagged by neither outlier pass
Edft_coh_df_iqr = Edft_coh_df[~Edft_coh_df.index.isin(combined_outliers_indices)]

In [None]:
Edft_coh_df_iqr.shape

In [None]:
print(", ".join(Edft_coh_df_iqr.columns))

In [None]:
Edft_coh_df_iqr.sample()

In [None]:
Edft_coh_df_iqr.describe().T.round(2)


## Boxplot of the E_dft_M1M2 categorized by metal - no total outliers

In [None]:
fig = plot_categorical_energy_boxplot(
    df=Edft_coh_df_iqr,
    x_col='M1',
    color_map=metal_colors,
    y_col='E_dft_M1M2',
    hover_col='system_dacs',
    x_title='Metal',
    y_title='E_dft_M1M2 Energy',
    save_dir='../../../data/figures/dacs_eda',
    file_name='Ecoh_M1_boxplot_metal_iqr'
)

fig.show()


In [None]:
metal_summary = Edft_coh_df_iqr.groupby('M1')['E_dft_M1M2'].describe().T
print(metal_summary)


## Boxplot of the E_dft_M1M2 categorized by cavity - no total outliers

In [None]:
fig = plot_categorical_energy_boxplot(
    df=Edft_coh_df_iqr,
    x_col='cavity_v2',
    color_map=cavity_colors,
    y_col='E_dft_M1M2',
    hover_col='system_dacs',
    x_title='Cavity',
    y_title='E_dft_M1M2 Energy',
    save_dir='../../../data/figures/dacs_eda',
    file_name='Ecoh_M1_boxplot_cavity_iqr'
)

fig.show()


In [None]:
cavity_summary = Edft_coh_df_iqr.groupby('cavity_v2')['E_dft_M1M2'].describe().T
print(cavity_summary)

## Dataset with the DACs on N-doped carbon without outliers

Filtered Dataset of DACs on N-Doped Carbon After Outlier Removal Using IQR Method

In [None]:
# Define the path where you want to save the CSV file
save_path_csv = os.path.join(dacs_energies_out_dir, 'Edft_coh_df_iqr.csv')

# Save the dataframe to a CSV file
Edft_coh_df_iqr.to_csv(save_path_csv, index=False, header=True)

# Define the path where you want to save the pickle file
save_path_pkl = os.path.join(dacs_energies_out_dir, 'Edft_coh_df_iqr.pkl')

# Save the dataframe to a pickle file
Edft_coh_df_iqr.to_pickle(save_path_pkl)


| Feature                  | Description                                                                                       |
|--------------------------|-------------------------------------------------------------------------------------------------|
| 'system_dacs'          | Identifier for the dual-atom catalyst (DAC) system                                              |
| 'tot_energy_dacs'      | Total DFT energy of the DAC system                                                              |
| 'M1'                   | Symbol of the first metal atom in the DAC                                                       |
| 'M2'                   | Symbol of the second metal atom in the DAC                                                      |
| 'heteroatom_dacs'      | Heteroatom type present in the DAC system                                                       |
| 'basic_cavity_dacs'    | Basic cavity environment descriptor of the DAC                                                  |
| 'cavity'               | Detailed cavity environment string related to the DAC                                           |
| 'cavity_v2'            | Modified/standardized cavity environment name                                                   |
| 'cavity_3'             | Numeric identifier extracted from cavity description                                            |
| 'M1_cavity'            | Concatenation of M1 metal, heteroatom, and cavity environment                                   |
| 'cavity_4'             | Categorized cavity type based on specific substrings (e.g., 'din4_x2', 'din6_s', 'din6_as', 'other') |
| 'M2_cavity'            | Concatenation of M2 metal, heteroatom, and cavity environment                                   |
| 'system_sacs_pristine' | Identifier for the single-atom catalyst (SAC) pristine system                                   |
| 'tot_energy_sacs_pristine' | Total DFT energy of the pristine SAC system                                               |
| 'heteroatom_sacs_pristine' | Heteroatom type in the pristine SAC system                                               |
| 'basic_cavity_sacs_pristine' | Basic cavity environment descriptor of the pristine SAC                               |
| 'Ecoh_m1'              | Cohesive energy of the first metal (M1)                                                        |
| 'Ecoh_m2'              | Cohesive energy of the second metal (M2)                                                       |
| 'E_dft_M1M2'           | Calculated adsorption energy: total DAC energy minus the pristine-support energy and the cohesive energies of M1 and M2 |
| 'color'                | Color code mapped to the metal type or cavity for visualization                                 |


In [None]:
Edft_coh_df_iqr.shape

In [None]:
print(", ".join(Edft_coh_df_iqr.columns))

In [None]:
Edft_coh_df_iqr.sample()

In [None]:
Edft_coh_df_iqr.describe().T.round(2)

## Interaction Energy

The interaction energy between the two metals M1 and M2 is defined:
$$
E_{\text{int}} = E_{\text{M1,M2,cc}} + E_{\text{cc}} - E_{\text{M1/cc}} - E_{\text{M2/cc}}
$$

where:\
$E_{\text{M1,M2,cc}}$ is the total energy of the dual-atom
catalyst with both metals co-adsorbed\
$E_{\text{cc}}$ is the total energy of
the pristine support\
$E_{\text{M1/cc}}$ is total energy of the M1 metal center adsorbed individually into the support\
$E_{\text{M2/cc}}$ is total energy of the M2 metal center adsorbed individually into the support

# Merging Datasets of Single-Atom Adsorption on DAC Local Coordination Environments 

This script merges datasets related to dual-atom catalysts (DACs), pristine supports, and single-atom adsorbed systems. It aligns the local coordination environments of adsorbed metal atoms (M1 and M2) on DACs with their corresponding reference energies, enabling the calculation of interaction energies (Eint) for the adsorbed configurations. The result is a unified dataset suitable for analyzing adsorption behavior and energetics across different catalyst structures.

In [None]:

# Rewrite the DAC cavity labels with the project helper `adjust_names`
# NOTE(review): the same helper was already applied to this column in the adsorption-energy cell;
# running it again assumes it is idempotent — confirm
total_energy_dacs_df['cavity'] = total_energy_dacs_df['cavity'].apply(adjust_names)

# Pair every DAC with its pristine reference (default inner join)
merged_df = pd.merge(
    total_energy_dacs_df,
    total_energy_sacs_dc_prist_df,
    left_on='cavity',
    right_on='system',
    suffixes=('_dacs', '_sacs_pristine'),
)
print(merged_df.shape)

# Single-metal total energies, renamed once per metal slot
_sac_energy = total_energy_sacs_df[['system', 'tot_energy']]
_sac_energy_m1 = _sac_energy.rename(columns={'tot_energy': 'tot_energy_M1'})
_sac_energy_m2 = _sac_energy.rename(columns={'tot_energy': 'tot_energy_M2'})

# Energy of M1 alone in the same cavity, then drop fully duplicated rows
merged_n1_df = pd.merge(
    merged_df, _sac_energy_m1, left_on='M1_cavity', right_on='system'
).drop_duplicates()
print(merged_n1_df.shape)

# Energy of M2 alone in the same cavity; the two `system` key columns collide
# here and get pandas' default `_x` / `_y` suffixes
merged_n1_n2_df = pd.merge(
    merged_n1_df, _sac_energy_m2, left_on='M2_cavity', right_on='system'
)

# Eint = E(DAC) + E(pristine) - E(M1 alone) - E(M2 alone), evaluated left to right
merged_n1_n2_df['Eint'] = (
    merged_n1_n2_df['tot_energy_dacs']
    .add(merged_n1_n2_df['tot_energy_sacs_pristine'])
    .sub(merged_n1_n2_df['tot_energy_M1'])
    .sub(merged_n1_n2_df['tot_energy_M2'])
)

# Drop fully duplicated rows
merged_n1_n2_df = merged_n1_n2_df.drop_duplicates()
print(merged_n1_n2_df.shape)

# Same object under the name used by the rest of the notebook (not a copy)
Eint_df = merged_n1_n2_df

# Persist the table as CSV
save_path = os.path.join(dacs_energies_out_dir, 'Eint_df.csv')
Eint_df.to_csv(save_path, index=False, header=True)

In [None]:
Eint_df.shape

In [None]:
print(", ".join(Eint_df.columns))

In [None]:
Eint_df.sample()

In [None]:
Eint_df.describe().T.round(2)


## Boxplot of the Eint categorized by metal type

In [None]:
plot_categorical_energy_boxplot(
    df=Eint_df,
    x_col='M1',
    y_col='Eint',
    color_map=metal_colors,
    hover_col='system_dacs',
    save_dir='../../../data/figures/dacs_eda',
    x_title='Metal Type',
    y_title='Interaction Energy (Eint)',
    file_name='Eint_M1_metal_boxplot'
)


## Boxplot of the Eint categorized by metal type - no outliers

In [None]:
# Add a per-row colour looked up from the M1 metal symbol
Eint_df['color'] = Eint_df['M1'].map(metal_colors)

# Rows flagged as Eint outliers, with 'M1' passed as the grouping column
# NOTE(review): detect_outliers is a project helper; presumably a per-group IQR rule — confirm
outliers_dacs_metal_Eint = detect_outliers(Eint_df, 'M1', 'Eint')

# Save the outliers to a CSV file
outliers_path = os.path.join(dacs_energies_out_dir, 'outliers_dacs_metal_Eint.csv')
outliers_dacs_metal_Eint.to_csv(outliers_path, index=False, header=True)

# Keep only the rows whose index label is not among the flagged outliers
# NOTE(review): assumes detect_outliers preserves the index labels of Eint_df — confirm
Eint_df_no_outliers_metal = Eint_df[~Eint_df.index.isin(outliers_dacs_metal_Eint.index)]

# Per-metal Eint boxplot drawn from the outlier-free subset
plot_categorical_energy_boxplot(
    df=Eint_df_no_outliers_metal,
    x_col='M1',
    y_col='Eint',
    color_map=metal_colors,
    hover_col='system_dacs',
    save_dir='../../../data/figures/dacs_eda',
    x_title='Metal Type',
    y_title='Interaction Energy (Eint)',
    file_name='Eint_M1_metal_no_outliers_boxplot'
)

## Boxplot of the Eint categorized by cavity type

In [None]:
# Eint distribution per cavity label, all systems included.
# The y-axis title names the plotted quantity (Eint), matching the per-metal Eint plots.
plot_categorical_energy_boxplot(
    df=Eint_df,
    x_col='cavity_v2',
    y_col='Eint',
    color_map=cavity_colors,
    hover_col='system_dacs',
    save_dir='../../../data/figures/dacs_eda',
    x_title='Cavity',
    y_title='Interaction Energy (Eint)',
    file_name='Eint_M1_cavity_boxplot'
)


## Boxplot of the Eint categorized by cavity type - no outliers

In [None]:
# Map colors to the cavity labels; this overwrites the metal-based 'color' column set earlier
Eint_df['color'] = Eint_df['cavity_v2'].map(cavity_colors)

# Rows flagged as Eint outliers, with 'cavity_v2' passed as the grouping column
# NOTE(review): detect_outliers is a project helper; presumably a per-group IQR rule — confirm
outliers_cavity_dacs_Eint = detect_outliers(Eint_df, 'cavity_v2', 'Eint')

# Save the outliers to a CSV file
outliers_path = os.path.join(dacs_energies_out_dir, 'outliers_dacs_cavity_Eint.csv')
outliers_cavity_dacs_Eint.to_csv(outliers_path, index=False, header=True)

# Keep only the rows whose index label is not among the flagged outliers
# NOTE(review): assumes detect_outliers preserves the index labels of Eint_df — confirm
Eint_df_no_outliers_cav = Eint_df[~Eint_df.index.isin(outliers_cavity_dacs_Eint.index)]


# Plot the outlier-free subset (not the full Eint_df) so the figure saved as
# 'Eint_M1_cavity_no_outliers_boxplot' really excludes the flagged rows;
# the y-axis title names the plotted quantity (Eint).
plot_categorical_energy_boxplot(
    df=Eint_df_no_outliers_cav,
    x_col='cavity_v2',
    y_col='Eint',
    color_map=cavity_colors,
    hover_col='system_dacs',
    save_dir='../../../data/figures/dacs_eda',
    x_title='Cavity',
    y_title='Interaction Energy (Eint)',
    file_name='Eint_M1_cavity_no_outliers_boxplot'
)


## Data Cleaning: Removing Metal and Cavity outliers

In [None]:
# Union of the index labels flagged per metal (outliers_dacs_metal_Eint) and per cavity (outliers_cavity_dacs_Eint)
combined_outliers_indices = outliers_dacs_metal_Eint.index.union(outliers_cavity_dacs_Eint.index)

# Keep only the rows flagged by neither outlier pass
Eint_df_iqr = Eint_df[~Eint_df.index.isin(combined_outliers_indices)]

In [None]:
# Define the path where you want to save the CSV file
save_path_csv = os.path.join(dacs_energies_out_dir, 'Eint_df_iqr.csv')

# Save the dataframe to a CSV file
Eint_df_iqr.to_csv(save_path_csv, index=False, header=True)

# Define the path where you want to save the pickle file
save_path_pkl = os.path.join(dacs_energies_out_dir, 'Eint_df_iqr.pkl')

# Save the dataframe to a pickle file
Eint_df_iqr.to_pickle(save_path_pkl)


### 🔍 Feature Description Table

| **Feature Name**         | **Description**                                                                 |
|--------------------------|----------------------------------------------------------------------------------|
| `system_dacs`            | Identifier for the dual-atom catalyst (DAC) system (e.g., M1-M2 bound to cavity). |
| `tot_energy_dacs`        | Total DFT energy of the DAC system with both M1 and M2 adsorbed.                |
| `M1`                     | First metal atom type in the DAC (e.g., Fe, Co, Ni).                             |
| `M2`                     | Second metal atom type in the DAC.                                               |
| `heteroatom_dacs`        | Type of heteroatom(s) present in the DAC support (e.g., N, B, O).               |
| `basic_cavity_dacs`      | Simplified representation of the DAC binding site or cavity.                    |
| `cavity`                 | Original cavity identifier from the DAC structure.                              |
| `cavity_v2`              | Cleaned or renamed version of `cavity` (standardized for analysis/plotting).    |
| `cavity_3`               | Digit that follows `c` in the cavity label (extracted with the pattern `c(\d)`). |
| `M1_cavity`              | Identifier of the single-metal site where only M1 is adsorbed.                  |
| `cavity_4`               | Cavity family label: `din4_x2`, `din6_s`, `din6_as`, or `other`.                |
| `M2_cavity`              | Identifier of the single-metal site where only M2 is adsorbed.                  |
| `system_sacs_pristine`   | Identifier for the pristine (metal-free) single-atom catalyst (SAC) system.     |
| `tot_energy_sacs_pristine` | Total DFT energy of the pristine SACS system (without any metal).             |
| `heteroatom_sacs_pristine` | Type of heteroatom(s) in the pristine SACS.                                  |
| `basic_cavity_sacs_pristine` | Simplified representation of the pristine cavity site.                     |
| `system_x`               | Reference to M1-cavity system used in energy comparison.                        |
| `tot_energy_M1`          | Total DFT energy of the system with only M1 adsorbed.                           |
| `system_y`               | Reference to M2-cavity system used in energy comparison.                        |
| `tot_energy_M2`          | Total DFT energy of the system with only M2 adsorbed.                           |
| `Eint`                   | Interaction energy of the full DAC system with both metals, relative to parts.  |
| `color`                  | Color code (used for plotting) mapped from either `M1` or `cavity_v2`.          |


In [None]:
Eint_df_iqr.shape

In [None]:
print(", ".join(Eint_df_iqr.columns))

In [None]:
Eint_df_iqr.sample()

In [None]:
Eint_df_iqr.describe().T.round(2)

## Adsorption energy calculation (after Fe adsorption)

The adsorption energy for the DACs on N-doped carbon (after Fe adsorption) is defined as:

$$
E_{\text{DFT}} = E_{\text{total, M1M2}} - E_{\text{M1}} - E_{\text{M2/cavity}}
$$

where\
$E_{\text{total, M1M2}}$ is the total energy of the DAC with the two metals adsorbed\
$E_{\text{M1}}$ is the cohesive energy of the M1 adsorbate\
$E_{\text{M2/cavity}}$ is the total energy of the M2 adsorbed in the pristine cavity\
The total energy of the pristine structure, $E_{\text{pristine}}$, is not subtracted separately: the reference state is the cavity with M2 already adsorbed, whose energy $E_{\text{M2/cavity}}$ already includes the support.

This code computes the **adsorption energy** (`Edft_fe`) for dual-atom catalysts by assessing the energy change when metal **M1** is added to a site where **M2** is already adsorbed. Also, it adds the features from the SACs on doped carbon.

In [None]:
# Rewrite the DAC cavity labels with the project helper `adjust_names`
# NOTE(review): the helper was already applied to this column in earlier cells;
# running it again assumes it is idempotent — confirm
total_energy_dacs_df['cavity'] = total_energy_dacs_df['cavity'].apply(adjust_names)

# Pair every DAC with its pristine reference (default inner join)
merged_df = pd.merge(
    total_energy_dacs_df,
    total_energy_sacs_dc_prist_df,
    left_on='cavity',
    right_on='system',
    suffixes=('_dacs', '_sacs_pristine'),
)

# Bring in every metal_df column for M1 (inner join on the metal symbol)
merged_x1_df = pd.merge(merged_df, metal_df, left_on='M1', right_on='metal')

# Keep the cohesive energy of M1 under an explicit name; the original 'Ecoh' column stays too
merged_x1_df['Ecoh_m1'] = merged_x1_df['Ecoh']

# Total energy of M2 alone in the same cavity
_sac_energy_m2 = total_energy_sacs_df[['system', 'tot_energy']].rename(
    columns={'tot_energy': 'tot_energy_M2'}
)
merged_x1_x2_df = pd.merge(
    merged_x1_df, _sac_energy_m2, left_on='M2_cavity', right_on='system'
)

# Edft_fe = E(DAC) - Ecoh(M1) - E(M2 alone in the cavity), evaluated left to right
merged_x1_x2_df['Edft_fe'] = (
    merged_x1_x2_df['tot_energy_dacs']
    .sub(merged_x1_x2_df['Ecoh_m1'])
    .sub(merged_x1_x2_df['tot_energy_M2'])
)

# Drop fully duplicated rows
merged_x1_x2_df = merged_x1_x2_df.drop_duplicates()

# Same object under the name used for saving (not a copy)
Edft_fe_df = merged_x1_x2_df

# Persist the table as CSV
save_path = os.path.join(dacs_energies_out_dir, 'Edft_fe_df.csv')
Edft_fe_df.to_csv(save_path, index=False, header=True)


| **Feature Name**             | **Description**                                                                                   |
|------------------------------|-------------------------------------------------------------------------------------------------|
| `system_dacs`                | Identifier for the dual-atom catalyst (DAC) system, combining M1 and M2 adsorbed on a cavity.   |
| `tot_energy_dacs`            | Total DFT energy of the DAC system with both M1 and M2 metals adsorbed.                         |
| `M1`                        | Symbol or type of the first metal atom in the DAC.                                              |
| `M2`                        | Symbol or type of the second metal atom in the DAC.                                             |
| `heteroatom_dacs`            | Type of heteroatom(s) present in the DAC support structure (e.g., N, B, O).                     |
| `basic_cavity_dacs`          | Simplified or generalized descriptor of the DAC binding site or cavity.                         |
| `cavity`                    | Original cavity identifier associated with the DAC structure.                                  |
| `cavity_v2`                 | Cleaned or standardized version of `cavity` for consistent analysis and plotting.              |
| `cavity_3`                  | Additional refined or grouped cavity classification.                                            |
| `M1_cavity`                 | Identifier for the site or cavity associated with the first metal (M1) adsorbed singly.         |
| `cavity_4`                  | Another variant of cavity categorization used for filtering or grouping.                        |
| `M2_cavity`                 | Identifier for the site or cavity associated with the second metal (M2) adsorbed singly.        |
| `system_sacs_pristine`      | Identifier for pristine single-atom catalyst (SAC) systems without metals adsorbed.             |
| `tot_energy_sacs_pristine`  | Total DFT energy of the pristine SAC system (without metal adsorption).                         |
| `heteroatom_sacs_pristine`  | Heteroatom type(s) present in the pristine SAC support.                                        |
| `basic_cavity_sacs_pristine`| Simplified descriptor of the pristine SAC cavity site.                                         |
| `metal`                     | Metal type or symbol associated with cohesive and atomic properties.                           |
| `Ecoh`                      | Cohesive energy of the metal: the energy needed to separate the bulk solid into isolated atoms. |
| `atomic_mass`               | Atomic mass of the metal (in atomic mass units).                                               |
| `vdw_radius`                | Van der Waals radius of the metal atom (in angstroms).                                        |
| `r_cov_sb`                  | Covalent radius for single bonds (in angstroms).                                              |
| `r_cov_db`                  | Covalent radius for double bonds (in angstroms).                                              |
| `dipole_polarizability`    | Dipole polarizability of the metal atom, indicating how easily its electron cloud distorts.    |
| `ionic_radii_crystals`     | Ionic radius of the metal in crystal structures (in angstroms).                               |
| `d_center_sp`               | d-band center position relative to the Fermi level, an electronic descriptor relevant to catalysis. |
| `Paul_electroneg`           | Pauling electronegativity of the metal atom.                                                  |
| `MB_electroneg`             | Martynov–Batsanov electronegativity, another measure of electronegativity.                    |
| `electron_affinity`         | Electron affinity of the metal atom (energy released when adding an electron).                 |
| `covalent_radius`           | Covalent radius of the metal atom (in angstroms).                                             |
| `atomic_number`             | Atomic number (number of protons) of the metal element.                                       |
| `Ion_energ_I`               | First ionization energy of the metal atom (energy to remove first electron).                   |
| `Ion_energ_II`              | Second ionization energy of the metal atom.                                                   |
| `Zung_radius`               | Zunger effective ionic radius (a specific empirical radius).                                 |
| `Coh_radius`                | Cohn effective atomic radius.                                                                 |
| `Waber_radius`              | Waber atomic radius, another empirical measure of atomic size.                                |
| `mied_param_h`              | Miedema model parameter h, related to enthalpy calculations in alloys.                        |
| `mied_param_phi`            | Miedema model parameter phi, related to electronegativity differences in alloys.              |
| `HOMO`                      | Highest Occupied Molecular Orbital energy level of the metal atom or system.                   |
| `LUMO`                      | Lowest Unoccupied Molecular Orbital energy level.                                             |
| `mag_moment_bulk_d`         | Magnetic moment of the metal in bulk form, from d-electrons.                                 |
| `E_Fermi`                   | Fermi energy level of the metal or system (in eV).                                           |
| `E_Fermi2`                  | Secondary Fermi energy measurement or corrected value.                                       |
| `Ecoh_m1`                   | Cohesive energy specifically assigned to M1 in the merged dataset.                           |
| `system`                    | Generic system identifier used in merged datasets for M2 or other references.                 |
| `tot_energy_M2`             | Total DFT energy of the system with only M2 adsorbed.                                        |
| `Edft_fe`                   | Calculated stepwise adsorption energy measuring the energy change when adding M1 to M2/cavity.|


In [None]:
# (number of rows, number of columns) of Edft_fe_df
Edft_fe_df.shape

In [None]:
# Column names of Edft_fe_df on one comma-separated line
print(", ".join(Edft_fe_df.columns))

In [None]:
# One randomly drawn row (sample() defaults to n=1); the output changes between runs
Edft_fe_df.sample()

In [None]:
# describe() summary, transposed (one row per described column) and rounded to 2 decimals
Edft_fe_df.describe().T.round(2)

## Boxplot of the Edft_fe by metal 

In [None]:
# Boxplot of Edft_fe grouped by the M1 metal, using every row of Edft_fe_df.
# The save_dir / file_name pair is handed to the project plotting helper
# (presumably where the figure is written — see the helper for the exact format).
fig = plot_categorical_energy_boxplot(
    df=Edft_fe_df, x_col='M1', y_col='Edft_fe',
    hover_col='system_dacs', color_map=metal_colors,
    x_title='Metal', y_title='Edft_fe Energy',
    save_dir='../../../data/figures/dacs_eda',
    file_name='Edft_fe_M1_metal_boxplot',
)
fig.show()


## Boxplot of the Edft_fe by metal - no outliers

In [None]:
# Map colors to the metal types in the dataframe
Edft_fe_df['color'] = Edft_fe_df['M1'].map(metal_colors)

# Detect outliers
outliers_dacs_metal_Edft_fe = detect_outliers(Edft_fe_df, 'M1', 'Edft_fe')

# Save the outliers to a CSV file
outliers_path = os.path.join(dacs_energies_out_dir, 'outliers_dacs_metal_dft_Edft_fe.csv')
outliers_dacs_metal_Edft_fe.to_csv(outliers_path, index=False, header=True)

# Remove outliers from the dataset
Edft_fe_df_no_outliers_metal = Edft_fe_df[~Edft_fe_df.index.isin(outliers_dacs_metal_Edft_fe.index)]

fig = plot_categorical_energy_boxplot(
    df=Edft_fe_df_no_outliers_metal,
    x_col='M1',
    y_col='Edft_fe',
    color_map=metal_colors,
    hover_col='system_dacs',
    x_title='Metal',
    y_title='Edft_fe Energy',
    save_dir='../../../data/figures/dacs_eda',
    file_name='Edft_fe_M1_metal_boxplot_no_outliers'
)

fig.show()


## Boxplot of the Edft_fe by cavity

In [None]:
# Boxplot of Edft_fe grouped by cavity_v2, using every row of Edft_fe_df.
# The call is the cell's last expression, so its return value is the cell output.
plot_categorical_energy_boxplot(
    df=Edft_fe_df, x_col='cavity_v2', y_col='Edft_fe',
    hover_col='system_dacs', color_map=cavity_colors,
    x_title='Cavity', y_title='E_dft_Fe Energy',
    save_dir='../../../data/figures/dacs_eda',
    file_name='Edft_fe_M1_cavity_boxplot',
)



## Boxplot of the Edft_fe by cavity - no outliers

In [None]:
# Requires Edft_fe_df and cavity_colors from earlier cells.

# Colour every row by its cavity_v2 class; this overwrites any 'color' values
# already stored in Edft_fe_df
Edft_fe_df['color'] = Edft_fe_df['cavity_v2'].map(cavity_colors)

# Flag Edft_fe outliers with cavity_v2 as the grouping column and write them to CSV
# (the exact outlier criterion lives in detect_outliers)
outliers_cavity_dacs_Edft_fe = detect_outliers(Edft_fe_df, 'cavity_v2', 'Edft_fe')
outliers_path = os.path.join(dacs_energies_out_dir, 'outliers_dacs_cavity_Edft_fe.csv')
outliers_cavity_dacs_Edft_fe.to_csv(outliers_path, header=True, index=False)

# Keep only the rows whose index label does not appear in the outlier table
Edft_fe_df_no_outliers_cav = Edft_fe_df.loc[
    ~Edft_fe_df.index.isin(outliers_cavity_dacs_Edft_fe.index)
]

# Cavity boxplot on the filtered rows; as the last expression of the cell its
# return value is the cell output
plot_categorical_energy_boxplot(
    df=Edft_fe_df_no_outliers_cav, x_col='cavity_v2', y_col='Edft_fe',
    hover_col='system_dacs', color_map=cavity_colors,
    x_title='Cavity', y_title='E_dft_Fe Energy',
    save_dir='../../../data/figures/dacs_eda',
    file_name='Edft_fe_M1_cavity_boxplot_no_outliers',
)



## Data Cleaning: Removing Metal and Cavity Outliers

In [None]:
# Combine indices from outliers_dacs_metal and outliers_dacs_cavity
combined_outliers_indices = outliers_dacs_metal_Edft_fe.index.union(outliers_cavity_dacs_Edft_fe.index)

Edft_fe_df_iqr = Edft_fe_df[~Edft_fe_df.index.isin(combined_outliers_indices)]

In [None]:
# Define the path where you want to save the CSV file
save_path_csv = os.path.join(dacs_energies_out_dir, 'Edft_fe_df_iqr.csv')

# Save the dataframe to a CSV file
Edft_fe_df_iqr.to_csv(save_path_csv, index=False, header=True)

# Define the path where you want to save the pickle file
save_path_pkl = os.path.join(dacs_energies_out_dir, 'Edft_fe_df_iqr.pkl')

# Save the dataframe to a pickle file
Edft_fe_df_iqr.to_pickle(save_path_pkl)


| **Feature Name**             | **Description**                                                                                   |
|------------------------------|-------------------------------------------------------------------------------------------------|
| `system_dacs`                | Identifier for the dual-atom catalyst (DAC) system, combining M1 and M2 adsorbed on a cavity.   |
| `tot_energy_dacs`            | Total DFT energy of the DAC system with both M1 and M2 metals adsorbed.                         |
| `M1`                        | Symbol or type of the first metal atom in the DAC.                                              |
| `M2`                        | Symbol or type of the second metal atom in the DAC.                                             |
| `heteroatom_dacs`            | Type of heteroatom(s) present in the DAC support structure (e.g., N, B, O).                     |
| `basic_cavity_dacs`          | Simplified or generalized descriptor of the DAC binding site or cavity.                         |
| `cavity`                    | Original cavity identifier associated with the DAC structure.                                  |
| `cavity_v2`                 | Cleaned or standardized version of `cavity` for consistent analysis and plotting.              |
| `cavity_3`                  | Additional refined or grouped cavity classification.                                            |
| `M1_cavity`                 | Identifier for the site or cavity associated with the first metal (M1) adsorbed singly.         |
| `cavity_4`                  | Another variant of cavity categorization used for filtering or grouping.                        |
| `M2_cavity`                 | Identifier for the site or cavity associated with the second metal (M2) adsorbed singly.        |
| `system_sacs_pristine`      | Identifier for pristine single-atom catalyst (SAC) systems without metals adsorbed.             |
| `tot_energy_sacs_pristine`  | Total DFT energy of the pristine SAC system (without metal adsorption).                         |
| `heteroatom_sacs_pristine`  | Heteroatom type(s) present in the pristine SAC support.                                        |
| `basic_cavity_sacs_pristine`| Simplified descriptor of the pristine SAC cavity site.                                         |
| `metal`                     | Metal type or symbol associated with cohesive and atomic properties.                           |
| `Ecoh`                      | Cohesive energy of the metal: the energy needed to separate the bulk solid into isolated atoms. |
| `atomic_mass`               | Atomic mass of the metal (in atomic mass units).                                               |
| `vdw_radius`                | Van der Waals radius of the metal atom (in angstroms).                                        |
| `r_cov_sb`                  | Covalent radius for single bonds (in angstroms).                                              |
| `r_cov_db`                  | Covalent radius for double bonds (in angstroms).                                              |
| `dipole_polarizability`    | Dipole polarizability of the metal atom, indicating how easily its electron cloud distorts.    |
| `ionic_radii_crystals`     | Ionic radius of the metal in crystal structures (in angstroms).                               |
| `d_center_sp`               | d-band center position relative to the Fermi level, an electronic descriptor relevant to catalysis. |
| `Paul_electroneg`           | Pauling electronegativity of the metal atom.                                                  |
| `MB_electroneg`             | Martynov–Batsanov electronegativity, another measure of electronegativity.                    |
| `electron_affinity`         | Electron affinity of the metal atom (energy released when adding an electron).                 |
| `covalent_radius`           | Covalent radius of the metal atom (in angstroms).                                             |
| `atomic_number`             | Atomic number (number of protons) of the metal element.                                       |
| `Ion_energ_I`               | First ionization energy of the metal atom (energy to remove first electron).                   |
| `Ion_energ_II`              | Second ionization energy of the metal atom.                                                   |
| `Zung_radius`               | Zunger effective ionic radius (a specific empirical radius).                                 |
| `Coh_radius`                | Cohn effective atomic radius.                                                                 |
| `Waber_radius`              | Waber atomic radius, another empirical measure of atomic size.                                |
| `mied_param_h`              | Miedema model parameter h, related to enthalpy calculations in alloys.                        |
| `mied_param_phi`            | Miedema model parameter phi, related to electronegativity differences in alloys.              |
| `HOMO`                      | Highest Occupied Molecular Orbital energy level of the metal atom or system.                   |
| `LUMO`                      | Lowest Unoccupied Molecular Orbital energy level.                                             |
| `mag_moment_bulk_d`         | Magnetic moment of the metal in bulk form, from d-electrons.                                 |
| `E_Fermi`                   | Fermi energy level of the metal or system (in eV).                                           |
| `E_Fermi2`                  | Secondary Fermi energy measurement or corrected value.                                       |
| `Ecoh_m1`                   | Cohesive energy specifically assigned to M1 in the merged dataset.                           |
| `system`                    | Generic system identifier used in merged datasets for M2 or other references.                 |
| `tot_energy_M2`             | Total DFT energy of the system with only M2 adsorbed.                                        |
| `Edft_fe`                   | Calculated stepwise adsorption energy measuring the energy change when adding M1 to M2/cavity.|
| `color`                    | Color code assigned based on categorical mapping (e.g., metal or cavity) for plotting purposes.|


In [None]:
# (number of rows, number of columns) left after removing the flagged outliers
Edft_fe_df_iqr.shape

In [None]:
# Column names of Edft_fe_df_iqr on one comma-separated line
print(", ".join(Edft_fe_df_iqr.columns))

In [None]:
# One randomly drawn row (sample() defaults to n=1); the output changes between runs
Edft_fe_df_iqr.sample()

In [None]:
# describe() summary, transposed (one row per described column) and rounded to 2 decimals
Edft_fe_df_iqr.describe().T.round(2)