# 3.1 Exploratory Data Analysis for the Edft_balanced dataset

## Notebook Setup: Imports and Configuration

In [None]:
# ─────────────────────────────
# Standard Library and Third-Party Imports
# ─────────────────────────────
import os
import sys
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# ─────────────────────────────
# Adjust sys.path for Local Modules
# ─────────────────────────────
# Resolve '../../src' (relative to the current working directory) to an
# absolute path and register it on sys.path once, so the project-local
# modules imported below can be found.
src_dir = os.path.abspath(os.path.join('..', '..', 'src'))
if sys.path.count(src_dir) == 0:
    sys.path.append(src_dir)

# ─────────────────────────────
# Project-Specific Imports
# ─────────────────────────────
#from plots_details import metal_colors, cavity_colors_sacs
#from utils import *
from vis import * 
from settings import *
# ─────────────────────────────
# Define Paths and Load Data
# ─────────────────────────────

dacs_energies_out_dir = '../../../data/external/dacs_energies_out'

# Build the two CSV paths once; both live directly under dacs_energies_out_dir.
edacs_dft_csv = os.path.join(dacs_energies_out_dir, 'Edacs_dft.csv')
edft_balanced_csv = os.path.join(dacs_energies_out_dir, 'Edft_balanced_df.csv')

Edft_coh_df = pd.read_csv(edacs_dft_csv)
Edft_balanced_df = pd.read_csv(edft_balanced_csv)
# NOTE(review): this frame is read from the same 'Edacs_dft.csv' as
# Edft_coh_df, so the two frames have identical contents. The variable name
# suggests a separate IQR-filtered file may have been intended -- confirm.
Edft_coh_iqr_df = pd.read_csv(edacs_dft_csv)

In [None]:
# Show the (rows, columns) shape of Edft_coh_iqr_df as the cell output.
Edft_coh_iqr_df.shape

In [None]:
# Show the (rows, columns) shape of Edft_balanced_df as the cell output.
Edft_balanced_df.shape

## Export the outliers

In [None]:
# Flag the rows of Edft_coh_iqr_df whose `system_dacs` value also appears in
# Edft_balanced_df, then keep only the rows that are NOT flagged, i.e. the
# systems present in Edft_coh_iqr_df but absent from Edft_balanced_df.
in_balanced = Edft_coh_iqr_df['system_dacs'].isin(Edft_balanced_df['system_dacs'])
unique_system_dacs = Edft_coh_iqr_df[~in_balanced]

# Destination CSV for the rows selected above.
output_path = '../../../data/external/dacs_energies_out/outliers_dacs_balanced_to_iqr.csv'

# Write without the DataFrame index column.
unique_system_dacs.to_csv(output_path, index=False)

## Boxplot Metal Edft_balanced_df

In [None]:
# Call the project's boxplot helper on Edft_balanced_df with 'M1' as the
# x column and 'E_dft_M1M2' as the y column; 'system_dacs' is passed as the
# hover column and metal_colors as the colour map.
fig = plot_categorical_energy_boxplot(
    df=Edft_balanced_df,
    x_col='M1',
    y_col='E_dft_M1M2',
    hover_col='system_dacs',
    color_map=metal_colors,
    x_title='Metal',
    y_title='E_dft_M1M2 Energy',
)
fig.show()

## Summary statistics metal Edft_balanced_df

In [None]:
# describe() statistics of 'E_dft_M1M2', computed separately for each 'M1' group.
energy_by_metal = Edft_balanced_df.groupby('M1')['E_dft_M1M2']
metal_summary = energy_by_metal.describe()
print(metal_summary)

## Boxplot Cavity Edft_balanced_df

In [None]:
# Call the project's boxplot helper on Edft_balanced_df with 'cavity_v2' as
# the x column and 'E_dft_M1M2' as the y column; 'system_dacs' is passed as
# the hover column and cavity_colors as the colour map.
fig = plot_categorical_energy_boxplot(
    df=Edft_balanced_df,
    x_col='cavity_v2',
    y_col='E_dft_M1M2',
    hover_col='system_dacs',
    color_map=cavity_colors,
    x_title='Cavity',
    y_title='E_dft_M1M2 Energy',
)
fig.show()

## Summary statistics cavity Edft_balanced_df

In [None]:
# describe() statistics of 'E_dft_M1M2', computed separately for each 'cavity_3' group.
# NOTE(review): the cavity boxplot in this notebook uses the 'cavity_v2'
# column, while this summary groups by 'cavity_3' -- confirm this is intended.
energy_by_cavity = Edft_balanced_df.groupby('cavity_3')['E_dft_M1M2']
cavity_summary = energy_by_cavity.describe()
print(cavity_summary)

## Count cavity categories

In [None]:
# Show the number of rows in Edft_balanced_df as the cell output.
Edft_balanced_df.shape[0]

In [None]:
# Count the rows whose 'cavity_v2' string contains 'din4_x2'.
has_din4_x2 = Edft_balanced_df['cavity_v2'].str.contains('din4_x2')
din4_x2_count = has_din4_x2.sum()
print(din4_x2_count)

In [None]:
# Count the rows whose 'cavity_v2' string contains 'din6_s'.
# (The pattern 'din6_s' does not occur inside the text 'din6_as'.)
has_din6_s = Edft_balanced_df['cavity_v2'].str.contains('din6_s')
din6_s_count = has_din6_s.sum()
print(din6_s_count)

In [None]:
# Count the rows whose 'cavity_v2' string contains 'din6_as'.
has_din6_as = Edft_balanced_df['cavity_v2'].str.contains('din6_as')
din6_as_count = has_din6_as.sum()
print(din6_as_count)

In [None]:
# One (label, size, colour) triple per donut segment. The values are
# hard-coded here, not computed from the DataFrame in this cell.
cavity_segments = [
    ('M$_{1}$-M$_{2}$-sq-C5Nx-C1Ny', 29.91, '#ff9999'),
    ('M$_{1}$-M$_{2}$-sq-C5Nx', 52.34, '#66b3ff'),
    ('M$_{1}$-M$_{2}$-hex-C5Nx', 17.76, '#99ff99'),
]

# Split the triples back into the three parallel lists the helper receives.
labels, sizes, colors = (list(column) for column in zip(*cavity_segments))

plot_donut_chart(
    title='Cavity Distribution After Sampling',
    labels=labels,
    sizes=sizes,
    colors=colors,
)

## Count number of carbons for each cavity type

Each of the three cells below filters the dataset to the rows whose `cavity_v2` column contains one substring: `'din4_x2'`, `'din6_s'`, or `'din6_as'`, respectively. Each cell then groups the filtered rows by the `'cavity_3'` category, computes descriptive statistics for the `'E_dft_M1M2'` energy values within each group, and prints the shape of the filtered data followed by the transposed summary rounded to two decimals.

In [None]:
# Keep only the rows whose 'cavity_v2' string contains 'din4_x2'.
is_din4_x2 = Edft_balanced_df['cavity_v2'].str.contains('din4_x2')
filtered_df = Edft_balanced_df[is_din4_x2]
print(filtered_df.shape)

# describe() of 'E_dft_M1M2' per 'cavity_3' group, transposed so each group
# is a column, rounded to two decimals.
cavity_summary = (
    filtered_df.groupby('cavity_3')['E_dft_M1M2']
    .describe()
    .T
    .round(2)
)
print(cavity_summary)

In [None]:
# Keep only the rows whose 'cavity_v2' string contains 'din6_s'.
is_din6_s = Edft_balanced_df['cavity_v2'].str.contains('din6_s')
filtered_df = Edft_balanced_df[is_din6_s]
print(filtered_df.shape)

# describe() of 'E_dft_M1M2' per 'cavity_3' group, transposed so each group
# is a column, rounded to two decimals.
cavity_summary = (
    filtered_df.groupby('cavity_3')['E_dft_M1M2']
    .describe()
    .T
    .round(2)
)
print(cavity_summary)

In [None]:
# Keep only the rows whose 'cavity_v2' string contains 'din6_as'.
is_din6_as = Edft_balanced_df['cavity_v2'].str.contains('din6_as')
filtered_df = Edft_balanced_df[is_din6_as]
print(filtered_df.shape)

# describe() of 'E_dft_M1M2' per 'cavity_3' group, transposed so each group
# is a column, rounded to two decimals.
cavity_summary = (
    filtered_df.groupby('cavity_3')['E_dft_M1M2']
    .describe()
    .T
    .round(2)
)
print(cavity_summary)

## Count the number of metals for each cavity type

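Each of the three cells below filters the DataFrame to the rows where the `cavity_v2` column contains one substring: `din4_x2`, `din6_s`, or `din6_as`, respectively. Each cell then groups the filtered data by the metal type in column `M1`, generates descriptive statistics for the `E_dft_M1M2` energy values, and prints the shape of the filtered data followed by the transposed summary rounded to two decimals.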

In [None]:
# Keep only the rows whose 'cavity_v2' string contains 'din4_x2'.
is_din4_x2 = Edft_balanced_df['cavity_v2'].str.contains('din4_x2')
filtered_df = Edft_balanced_df[is_din4_x2]
print(filtered_df.shape)

# describe() of 'E_dft_M1M2' per 'M1' group, transposed so each metal is a
# column, rounded to two decimals. (The variable keeps its original name
# even though the grouping key here is 'M1'.)
cavity_summary = (
    filtered_df.groupby('M1')['E_dft_M1M2']
    .describe()
    .T
    .round(2)
)
print(cavity_summary)

In [None]:
# Keep only the rows whose 'cavity_v2' string contains 'din6_s'.
is_din6_s = Edft_balanced_df['cavity_v2'].str.contains('din6_s')
filtered_df = Edft_balanced_df[is_din6_s]
print(filtered_df.shape)

# describe() of 'E_dft_M1M2' per 'M1' group, transposed so each metal is a
# column, rounded to two decimals. (The variable keeps its original name
# even though the grouping key here is 'M1'.)
cavity_summary = (
    filtered_df.groupby('M1')['E_dft_M1M2']
    .describe()
    .T
    .round(2)
)
print(cavity_summary)

In [None]:
# Keep only the rows whose 'cavity_v2' string contains 'din6_as'.
is_din6_as = Edft_balanced_df['cavity_v2'].str.contains('din6_as')
filtered_df = Edft_balanced_df[is_din6_as]
print(filtered_df.shape)

# describe() of 'E_dft_M1M2' per 'M1' group, transposed so each metal is a
# column, rounded to two decimals. (The variable keeps its original name
# even though the grouping key here is 'M1'.)
cavity_summary = (
    filtered_df.groupby('M1')['E_dft_M1M2']
    .describe()
    .T
    .round(2)
)
print(cavity_summary)

In [None]:
# One (label, size, colour) triple per donut segment. The values are
# hard-coded here, not computed from the DataFrame in this cell.
metal_segments = [
    ('Cu', 7.32, '#ff9999'),
    ('Fe', 7.43, '#66b3ff'),
    ('Os', 6.51, '#99ff99'),
    ('Zn', 7.02, '#ffcc99'),
    ('Pt', 7.02, '#c2c2f0'),
    ('Ru', 7.12, '#ffb3e6'),
    ('Rh', 7.12, '#c2f0c2'),
    ('Pd', 7.12, '#b3e6ff'),
    ('Ag', 7.12, '#ff6666'),
    ('Ir', 7.22, '#b3b3ff'),
    ('Co', 7.22, '#ffb366'),
    ('Cd', 7.22, '#c2f0f0'),
    ('Au', 7.22, '#ffccff'),
    ('Ni', 7.32, '#ff80ff'),
]

# Split the triples back into the three parallel lists the helper receives.
labels, sizes, colors = (list(column) for column in zip(*metal_segments))

plot_donut_chart(
    title='Metal Type Distribution After Sampling',
    labels=labels,
    sizes=sizes,
    colors=colors,
    save_path='../../../data/figures/rfr_results/str_sampling_metals',
)

## Histogram of DFT adsorption energy by metal

In [None]:
# Call the project's histogram helper on Edft_balanced_df, passing 'M1' as
# the metal column, 'E_dft_M1M2' as the energy column and metal_colors as
# the colour map; the output directory and file name are passed through
# save_dir and file_name.
plot_energy_histograms_by_metal(
    df=Edft_balanced_df,
    energy_col='E_dft_M1M2',
    metal_col='M1',
    color_map=metal_colors,
    file_name='hist_metal_Edft_balanced',
    save_dir='../../../data/figures/sacs_eda',
)