### 0. Data load and setup
Load the raw Mamun dataset, import packages.

In [None]:
import pandas as pd
import ast
from pathlib import Path
import re
import numpy as np

# Project layout: the notebook is expected to run one level below the project root.
PROJECT_ROOT = Path("..").resolve()
DATA_RAW = PROJECT_ROOT.joinpath("data_raw")
DATA_PROCESSED = PROJECT_ROOT.joinpath("data_processed")

# Raw input and processed output of this notebook.
INPUT_FILE = DATA_RAW.joinpath("mamun.csv")
OUTPUT_FILE = DATA_PROCESSED.joinpath("mamun_HER_processed.csv")

In [None]:
# Read the raw CSV into the working dataframe used by every later cell.
df = pd.read_csv(INPUT_FILE)
# Report (rows, columns) as a quick sanity check of the load.
print(df.shape)
# Last expression of the cell: shows the first rows in the notebook output.
df.head()

### 1. Surface composition parsing
Parse the surface composition string to extract the elements A and B and their counts n and m. Pure-element surfaces are handled by leaving B missing (NaN) and setting m = 0. The original order of the elements is preserved.

In [None]:
# One or two element symbols, each optionally followed by an integer count.
_SURFACE_COMP_RE = re.compile(r'([A-Z][a-z]*)(\d*)(?:([A-Z][a-z]*)(\d*))?')


def parse_surface_composition(comp: str) -> pd.Series:
    """
    Parse a surfaceComposition string into its elements and their counts.

    Examples:
      - 'Pt3Ni' -> A=Pt, n=3, B=Ni, m=1
      - 'PtNi3' -> A=Pt, n=1, B=Ni, m=3
      - 'Pt'    -> A=Pt, n=1, B=NaN, m=0 (pure element)

    The order of the elements in the string is preserved (A is always the
    first element). A missing count means 1.

    Returns:
        pd.Series with the keys 'surf_A', 'surf_B', 'surf_n', 'surf_m'.

    Raises:
        ValueError: if the string is not one or two element symbols with
            optional counts (e.g. empty, 'nan', or a ternary composition).
    """
    text = str(comp).strip()

    match = _SURFACE_COMP_RE.fullmatch(text)
    if match is None:
        # Fail with the offending value instead of an opaque AttributeError.
        raise ValueError(f"Cannot parse surface composition: {text!r}")

    A, n_str, B, m_str = match.groups()

    n = int(n_str) if n_str else 1

    if B is None:
        # pure element case: no second element, so B is missing and m = 0
        B = np.nan
        m_val = 0
    else:
        m_val = int(m_str) if m_str else 1

    return pd.Series({"surf_A": A, "surf_B": B, "surf_n": n, "surf_m": m_val})

# Parse every composition once, then attach the four parsed fields as new columns.
parsed_composition = df["surfaceComposition"].apply(parse_surface_composition)
df[["surf_A", "surf_B", "surf_n", "surf_m"]] = parsed_composition

### 2. Filter entries with one H per unit cell
Filter the dataset to keep only dilute hydrogen adsorption entries, i.e. rows where products = "{'Hstar': 1}".

In [None]:
# Rows whose products field is exactly the single-H* dictionary string.
mask_H1 = df["products"].eq("{'Hstar': 1}")

# Keep only those rows; copy so later column assignments do not touch a view.
df = df.loc[mask_H1].copy()
print(df.shape)
# Should list exactly one value after filtering.
print(df["products"].unique().tolist())

### 3. Site label processing
Simplify site strings like '{'H': 'bridge|A_A|B'}' into compact labels and drop the rare 4fold sites. Tilt variants are generalized into their main site type, and hollow sites are differentiated into fcc- and hcp-hollow. get_norm_label_from_sites creates a normalized label that preserves the identity of the first-layer atoms for feature calculation.

In [None]:
def simplify_site(site_str: str) -> str:
    """
    Reduce a Mamun site string such as 'hollow|A_A_B|FCC' to a short label.

    - 'hollow' with exactly three fields -> the lowercased third field
      (e.g. 'fcc')
    - 'top' and 'top-tilt' -> 'ontop'
    - anything else -> the lowercased first field, unchanged (so 'bridge',
      'bridge-tilt' and 'hollow-tilt' keep their own names)
    """
    fields = site_str.split("|")
    kind = fields[0].lower()

    if kind == "hollow" and len(fields) == 3:
        return fields[2].lower()
    if kind in ("top", "top-tilt"):
        return "ontop"
    return kind


def extract_and_simplify_sites_field(s: str) -> str:
    """
    Pull the 'H' entry out of a stringified 'sites' dictionary and simplify it.

    A missing 'H' key is treated as an empty site string.
    """
    sites = ast.literal_eval(s)  # parse the string into a dictionary
    return simplify_site(sites.get("H", ""))

def get_norm_label_from_sites(s: str) -> str | None:
    """
    Build a normalized site label for H* that:
    - encodes site type fcc/hcp/bridge/ontop
    - encodes first-layer A/B environment
    - ignores tilt and 2nd-layer atom differences
    """

    try:
        d = ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return None

    h = d.get("H")

    h = h.strip()
    parts = h.split("|")
    head = parts[0].lower()

    # hollow site type
    if head.startswith("hollow") and len(parts) >= 3:
        first_layer = parts[1].replace("_", "").upper()
        kind = parts[2].lower()
        # use counts of A/B (order-insensitive)
        nA = first_layer.count("A")
        nB = first_layer.count("B")
        return f"{kind}{'A'*nA}{'B'*nB}"

    # bridge site type
    if head.startswith("bridge") and len(parts) >= 2:
        first_layer = parts[1].replace("_", "").upper()
        nA = first_layer.count("A")
        nB = first_layer.count("B")
        return f"bridge{'A'*nA}{'B'*nB}"

    # ontop site type
    if head in ("top", "top-tilt", "ontop") and len(parts) >= 2:
        ch = parts[1].strip().upper()
        if ch in ("A", "B"):
            return f"ontop{ch}"

    return None

In [None]:
# final cleaned site label for ML
site_labels = df["sites"].apply(extract_and_simplify_sites_field)
df["site_simple"] = site_labels.astype("category")

# Map tilt variants onto their parent site type.
# NOTE(review): simplify_site returns "hollow-tilt" without the FCC/HCP field, so
# every hollow-tilt entry is labelled "fcc" here - confirm this is intended.
# NOTE(review): simplify_site already maps "top-tilt" to "ontop", so the
# "top-tilt" key is never matched; it is kept for safety.
tilt_map = {"bridge-tilt": "bridge", "hollow-tilt": "fcc", "top-tilt": "ontop"}

# Replace on the plain labels and convert to category afterwards: merging
# categories through Series.replace on a categorical column is deprecated
# since pandas 2.2.
df["site_simple_collapsed"] = site_labels.replace(tilt_map).astype("category")

# drop 4fold entries
df = df[df["site_simple_collapsed"] != "4fold"].copy()

print(df.shape)
print(df["site_simple_collapsed"].value_counts())

In [None]:
# normalized label (site type + first-layer environment) used for grouping
df["norm_label"] = df["sites"].apply(get_norm_label_from_sites)

# Show the collapsed label next to the normalized one for a quick visual check.
preview_cols = ["site_simple_collapsed", "norm_label"]
print(df.loc[:, preview_cols].head(10))

### 4. Duplicate collapsing
Collapse the duplicates that arise from disregarding tilt variants and second-layer atoms. Duplicates are defined as entries with the same canonicalized surface alloy (order-invariant), facet, and norm_label (same first-layer environment). reactionEnergy_eV is averaged across such duplicates and one representative row is kept.

In [None]:
# normalize alloy to make it order-invariant
def canonicalize_alloy(df, Acol="surf_A", Bcol="surf_B", ncol="surf_n", mcol="surf_m", prefix="mam"):
    """
    Return a copy of df with order-invariant alloy columns added.

    When both elements are present and A sorts after B (string comparison),
    the elements and their counts are swapped. The results are written to
    '<prefix>_canon_A', '<prefix>_canon_B', '<prefix>_canon_n' and
    '<prefix>_canon_m'. Missing elements become '' and missing or
    non-numeric counts become 0. The input dataframe is not modified.
    """
    result = df.copy()

    elem_a = result[Acol].fillna("").astype(str)
    elem_b = result[Bcol].fillna("").astype(str)
    count_a = pd.to_numeric(result[ncol], errors="coerce").fillna(0).astype(int)
    count_b = pd.to_numeric(result[mcol], errors="coerce").fillna(0).astype(int)

    # Swap only when there really are two elements and they are out of order.
    both_present = (elem_a != "") & (elem_b != "")
    swap = both_present & (elem_a > elem_b)

    # suffix -> (value when swapped, value otherwise)
    canonical = {
        "A": (elem_b, elem_a),
        "B": (elem_a, elem_b),
        "n": (count_b, count_a),
        "m": (count_a, count_b),
    }
    for suffix, (if_swapped, otherwise) in canonical.items():
        result[f"{prefix}_canon_{suffix}"] = np.where(swap, if_swapped, otherwise)

    return result

# add the order-invariant alloy columns
df = canonicalize_alloy(df)

# normalize facet: as text, with parentheses and whitespace removed
if "facet" in df:
    facet_text = df["facet"].astype(str)
    df["facet"] = facet_text.str.replace(r"[()\s]", "", regex=True)

In [None]:
# Columns that define a duplicate: canonical alloy, facet and normalized site label.
group_cols = [
    "mam_canon_A",
    "mam_canon_B",
    "mam_canon_n",
    "mam_canon_m",
    "facet",
    "norm_label",
]

def collapse_group(g: pd.DataFrame) -> pd.Series:
    """
    Collapse one group of duplicate rows into a single representative row.

    The first row of the group is kept. Its 'reactionEnergy_eV' is replaced
    by the mean over the group (non-numeric values are ignored) and, when an
    'id' column exists, its 'id' becomes the comma-joined unique ids.
    """
    representative = g.iloc[0].copy()

    energies = pd.to_numeric(g["reactionEnergy_eV"], errors="coerce")
    representative["reactionEnergy_eV"] = energies.mean()

    if "id" not in g.columns:
        return representative

    representative["id"] = ",".join(str(i) for i in g["id"].unique())
    return representative

# Count how many groups contain more than one row before collapsing.
sizes = df.groupby(group_cols, dropna=False).size()
n_duplicate_groups = int(sizes.gt(1).sum())
print("Duplicate groups to collapse:", n_duplicate_groups)

# Sort by the grouping columns, then reduce each group to one row.
ordered = df.sort_values(group_cols)
collapsed = ordered.groupby(group_cols, dropna=False, as_index=False).apply(
    collapse_group, include_groups=False
)
df = collapsed.reset_index(drop=True)

# safety check: every group must now hold exactly one row
largest_group = df.groupby(group_cols, dropna=False).size().max()
assert largest_group == 1
print(df.shape)


### 5. Save output
Drop intermediate columns, save final dataframe.

In [None]:
# Intermediate columns that should not be written to the output file.
cols_to_drop = ["sites_orig", "mam_canon_A", "mam_canon_B", "mam_canon_n", "mam_canon_m"]

# errors="ignore" skips any listed column that is not present.
df = df.drop(columns=cols_to_drop, errors="ignore")

# Create the output directory first so to_csv does not fail on a fresh checkout.
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_FILE, index=False)
df.head()