# 1.0 Importing the SACs on doped carbon dataset


## Notebook Setup: Imports and Configuration


In [None]:
# ─────────────────────────────
# Standard Library Imports
# ─────────────────────────────
import os
import sys

# ─────────────────────────────
# Adjust sys.path for Local Modules
# ─────────────────────────────
# Absolute path of `../../src`, resolved against the current working directory.
# NOTE(review): presumably this is where the `settings`, `utils` and `vis`
# modules imported below live - confirm against the repository layout.
src_dir = os.path.abspath(os.path.join('..', '..', 'src'))
# Only append when missing so re-running this cell does not add duplicates.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# ─────────────────────────────
# Project-Specific Imports
# ─────────────────────────────
from settings import *
from utils import *
from vis import *

## Define the input and output directories

In [None]:
# Define input and output paths (relative paths, so they depend on the
# directory the notebook is run from).
# Input: directory the per-metal `*_df_logocv_rfr.csv` files are read from.
rfr_ads_in_dir = '../../../data/processed/rfr_logocv_in'
# Output: directory the merged `rfr_mlogocv.csv` file is written to.
rfr_ads_out_dir = '../../../data/external/rfr_logocv_out'


This cell loads the per-metal RFR LOGOCV results from CSV files.

It filters out unwanted entries (rows whose `Hetatom` is P or S, and rows flagged as training samples), merges all filtered data into a single DataFrame, creates a new `sac_cavity` feature, and saves the result to disk.


In [None]:
# Metal symbols for which a per-metal `logocv` result file is expected
_metal_symbols = (
    'Ag', 'Au', 'Cd', 'Co', 'Cu', 'Fe', 'Ir',
    'Ni', 'Os', 'Pd', 'Pt', 'Rh', 'Ru', 'Zn',
)

# Base names (without the `.csv` extension) of the files to read
dataframe_names = [f'{symbol}_df_logocv_rfr' for symbol in _metal_symbols]

# Will hold the filtered counterpart of every table in `dataframes`
filtered_dataframes = {}

# Read every CSV from the input directory, keyed by its base name
dataframes = {
    name: pd.read_csv(os.path.join(rfr_ads_in_dir, name + '.csv'))
    for name in dataframe_names
}

def filter_df(dataframe):
    """Return the rows of ``dataframe`` that pass the dataset filter.

    A row is kept when its ``Hetatom`` value is neither ``'P'`` nor ``'S'``
    and its ``Train_00000`` value compares equal to ``False``.
    """
    excluded_hetatom = dataframe['Hetatom'].isin(['P', 'S'])
    not_in_training = dataframe['Train_00000'].eq(False)
    return dataframe.loc[~excluded_hetatom & not_in_training]

# Apply the row filter to every loaded table, keeping the same keys
for name, df in dataframes.items():
    filtered_dataframes[name] = filter_df(df)

# Stack all filtered tables into one dataframe with a fresh 0..n-1 index
rfr_mlogocv_df = pd.concat(filtered_dataframes.values(), ignore_index=True)

# Combined label: first two characters of `Hetatom`, an underscore, then `Cavity`
rfr_mlogocv_df['sac_cavity'] = rfr_mlogocv_df['Hetatom'].str[:2] + '_' + rfr_mlogocv_df['Cavity']

# Save the merged dataframe to a new CSV file. `to_csv` does not create
# missing parent directories, so make sure the output directory exists first.
os.makedirs(rfr_ads_out_dir, exist_ok=True)
rfr_mlogocv_df.to_csv(os.path.join(rfr_ads_out_dir, 'rfr_mlogocv.csv'), index=False)


#  SACs on N-doped carbon dataset overview and summary statistics

### Dataset Column Descriptions

| Column Name     | Description                                                                 |
|------------------|-----------------------------------------------------------------------------|
| `System`         | Unique identifier for each system (adsorbate–surface combination).          |
| `Metal`          | The central transition metal involved in the catalytic system.              |
| `Hetatom`        | The heteroatom (e.g., N, O, P, S) involved in the adsorbate structure.      |
| `Cavity`         | Specific site or configuration name where the adsorbate binds.              |
| `Base_Cavity`    | Generalized or canonical version of the cavity name.                        |
| `y`              | Target DFT adsorption energy value (in eV).                                 |
| `Pred_00000`     | Predicted adsorption energy from the machine learning model.                |
| `Train_00000`    | Boolean flag indicating whether the sample was used in the training set.    |
| `sac_cavity`     | Combined feature: heteroatom type and cavity (e.g., `N_ontop`).             |


In [None]:
# Dimensions of the merged dataframe as a (rows, columns) tuple
rfr_mlogocv_df.shape

In [None]:
# Print all column names on a single line, separated by commas
print(", ".join(rfr_mlogocv_df.columns))

In [None]:
# Show one randomly chosen row (`sample()` returns a single row by default),
# so the output differs between runs
rfr_mlogocv_df.sample()

In [None]:
# Summary statistics, transposed so each described column becomes a row,
# rounded to two decimals
rfr_mlogocv_df.describe().T.round(2)

In [None]:
# Number of missing values in each column
rfr_mlogocv_df.isnull().sum()

In [None]:
# Select every row whose `System` identifier is 'Fe_N_din4'
is_target_system = rfr_mlogocv_df['System'].eq('Fe_N_din4')
selected_row = rfr_mlogocv_df.loc[is_target_system]
selected_row


## Boxplot for adsorption energy by metal for SACs on N-doped carbon

In [None]:
# Call the project helper with `Metal` as the x column, `y` as the y column
# and `System` as the hover column, coloured via `metal_colors`.
# NOTE(review): no `file_name` is passed here (the cavity plot further down
# passes one), so whatever default the helper uses applies - confirm the
# saved file name under `save_dir` is the intended one.
fig = plot_categorical_energy_boxplot(
    rfr_mlogocv_df,
    x_col='Metal',
    color_map=metal_colors,
    y_col='y',
    hover_col='System',
    save_dir="../../../data/figures/sacs_eda",
    x_title='Metal',
    y_title='E_dft_SAC Energy'
)
# Display the figure returned by the helper
fig.show()


## Boxplot for adsorption energy by cavity type for SACs on N-doped carbon

In [None]:
# Call the project helper with `Cavity` as the x column, `y` as the y column
# and `System` as the hover column, coloured via `cavity_colors_sacs`.
# The y-axis title uses the SAC label: this dataframe holds single-atom
# catalyst energies, so the former 'E_dft_M1M2 Energy' label did not match
# the data being plotted.
fig = plot_categorical_energy_boxplot(
    rfr_mlogocv_df,
    x_col='Cavity',
    color_map=cavity_colors_sacs,
    y_col='y',
    hover_col='System',
    save_dir="../../../data/figures/sacs_eda",
    x_title='Cavity',
    y_title='E_dft_SAC Energy',
    file_name='cavity_energy_boxplot'
)
# Display the figure returned by the helper
fig.show()

