In [16]:
import pandas as pd
import numpy as np

# -----------------------------
# 1. Load cleaned embeddings (only once)
# -----------------------------
# Each CSV is read a single time here; the four frames are bound to the
# names chemberta / mole / molformer / smited in that order.

chemberta, mole, molformer, smited = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Embeddings_cleaned/ChemBERTa_embeddings.csv",
        "Embeddings_cleaned/mole_embeddings_1203.csv",
        "Embeddings_cleaned/metabolite_embeddings_molformer.csv",
        "Embeddings_cleaned/SMI-TED289M_embeddings_new.csv",  # NEW
    )
)

# Report the (rows, columns) shape of every table, one line per model.
for emb_label, emb_frame in (
    ("ChemBERTa:", chemberta),
    ("MOLE:", mole),
    ("Molformer:", molformer),
    ("SMI-TED289M:", smited),  # NEW
):
    print(emb_label, emb_frame.shape)

# -----------------------------
# 2. Load Sup1 (feature → metabolite mapping)
# -----------------------------
# header=1: column names are taken from the second row of the first sheet.

sup1 = pd.read_excel(
    "sample_dataset/41564_2018_306_MOESM3_ESM.xlsx",
    sheet_name=0,
    header=1
)
print("Sup1 columns:", sup1.columns.tolist())

# Keep only the two columns that link a feature to a named metabolite and
# give them the short names ("FeatureID", "Metabolite") used as join keys
# by the merges further down.
feat_to_name = sup1[[
    "Metabolomic Feature",
    "Exact Match to Standard (* = isomer family)"
]].rename(columns={
    "Metabolomic Feature": "FeatureID",
    "Exact Match to Standard (* = isomer family)": "Metabolite"
})

print("feat_to_name:", feat_to_name.shape)
# A bare `feat_to_name.head()` in the middle of a cell is evaluated and then
# discarded (only a cell's last expression is displayed), so the preview is
# printed explicitly, the same way the abundance preview is.
print(feat_to_name.head())

# -----------------------------
# 3. (Optional) Load Metabolite → SMILES mapping
# -----------------------------
# The long Sup1-style column title is shortened to "Metabolite" right after
# reading, so the frame shares its key name with the other tables.

smiles_column_names = {"Exact Match to Standard (* = isomer family)": "Metabolite"}
met_to_smiles = pd.read_csv("metabolites_to_SMILES.csv").rename(
    columns=smiles_column_names
)

print("met_to_smiles:", met_to_smiles.shape)

# -----------------------------
# 4. Load Sup2 (abundance matrix)
# -----------------------------
# Read the first sheet (column names on its second row) and immediately
# rename the leading "# Feature / Sample" column to "FeatureID".

abundance = pd.read_excel(
    "sample_dataset/41564_2018_306_MOESM4_ESM.xlsx", sheet_name=0, header=1
).rename(columns={"# Feature / Sample": "FeatureID"})

print("Sup2 shape:", abundance.shape)
print(abundance.head())

# -----------------------------
# 5. Extract diagnosis labels (per sample)
# -----------------------------

# Every column after the leading "FeatureID" column is a sample column.
sample_ids = abundance.columns[1:]

# Rows whose FeatureID is exactly "Diagnosis".
is_diagnosis_row = abundance["FeatureID"] == "Diagnosis"
diag_row = abundance.loc[is_diagnosis_row]

# First such row, with the leading "FeatureID" entry skipped, as an array
# holding one label per sample column.
diagnosis = diag_row.iloc[0].iloc[1:].values
print("Unique diagnosis labels:", np.unique(diagnosis))

# -----------------------------
# 6. Merge features with ChemBERTa embeddings
# -----------------------------

# pandas matches NaN join keys to each other (unlike SQL), so rows without a
# metabolite name are removed before joining; otherwise a NaN-named row in
# the embedding table would be attached to every unnamed feature. Only this
# local copy is filtered — `feat_to_name` itself is left untouched.
feat_with_emb = feat_to_name.dropna(subset=["Metabolite"]).merge(
    chemberta,
    on="Metabolite",
    how="inner"
)

print("Features with ChemBERTa embeddings:", feat_with_emb.shape)
# A bare `feat_with_emb.head()` mid-cell is evaluated and discarded, so the
# preview is printed explicitly; it is limited to the first five rows and
# columns to keep the output narrow.
print(feat_with_emb.iloc[:5, :5])

# -----------------------------
# 7. Filter abundance to features that have embeddings
# -----------------------------

# True for each abundance row whose FeatureID also appears in feat_with_emb.
has_embedding = abundance["FeatureID"].isin(feat_with_emb["FeatureID"])

# Independent copy of the matching rows, so later edits cannot touch `abundance`.
abundance_filtered = abundance.loc[has_embedding].copy()

print("Filtered Sup2 shape:", abundance_filtered.shape)


ChemBERTa: (301, 770)
MOLE: (301, 770)
Molformer: (301, 770)
SMI-TED289M: (301, 770)
Sup1 columns: ['Metabolomic Feature', 'Retention Time', 'm/z', 'Cluster (if DA)', 'Putative Chemical Class', 'Exact Match to Standard (* = isomer family)', 'Adduct']
feat_to_name: (8848, 2)
met_to_smiles: (466, 3)
Sup2 shape: (8855, 221)
            FeatureID  PRISM|7122 PRISM|7147  PRISM|7150 PRISM|7153  \
0                 Age          38         50          41         51   
1           Diagnosis          CD         CD          CD         CD   
2  Fecal.Calprotectin  207.484429        NaN  218.334517        NaN   
3          antibiotic          No         No          No         No   
4   immunosuppressant         Yes         No         Yes         No   

  PRISM|7184 PRISM|7238 PRISM|7406 PRISM|7408 PRISM|7421  ...  \
0         68         67         59         52         58  ...   
1         CD         CD         CD         CD         CD  ...   
2  20.167951   2.586247        NaN        NaN  79.33101

In [17]:
from pathlib import Path

# -----------------------------------
# 8. Set up models and transformations
# -----------------------------------

# Root folder for all outputs.
base_out = Path("Sample_level_embeddings")

# Names of the abundance transforms to run, in processing order.
transform_methods = ["log1p", "zscore", "clr"]

# Model label → embedding table, in processing order.
embedding_tables = dict(
    zip(
        ("ChemBERTa", "MOLE", "Molformer", "SMI-TED289M"),
        (chemberta, mole, molformer, smited),
    )
)


### Loop over transforms and models


In [18]:

def transform_abundance(A, transform):
    """Apply one named transform to a features × samples abundance matrix.

    Parameters
    ----------
    A : np.ndarray
        Float matrix with one row per feature and one column per sample.
        NaN entries (missing measurements) are tolerated.
    transform : str
        "log1p"  – element-wise ``log(1 + A)``.
        "zscore" – standardise each feature (row) across samples.
        "clr"    – centred log-ratio within each sample (column), using a
                   1e-8 pseudocount so zeros do not produce ``-inf``.

    Returns
    -------
    np.ndarray
        Transformed matrix with the same shape as ``A``. Entries that were
        NaN on input are NaN on output; the caller decides how to fill them.

    Raises
    ------
    ValueError
        If ``transform`` is not one of the three names above.
    """
    if transform == "log1p":
        return np.log1p(A)

    if transform == "zscore":
        # NaN-aware statistics: with plain mean/std a single missing value
        # turns the whole feature row into NaN.
        mu = np.nanmean(A, axis=1, keepdims=True)
        sd = np.nanstd(A, axis=1, keepdims=True) + 1e-8  # avoid division by 0
        return (A - mu) / sd

    if transform == "clr":
        logA = np.log(A + 1e-8)
        # NaN-aware centring: with plain mean a single missing value turns
        # the whole sample column into NaN.
        return logA - np.nanmean(logA, axis=0, keepdims=True)

    raise ValueError(f"Unknown transform: {transform}")


# Indexing by FeatureID does not depend on the transform or the model,
# so it is done once instead of inside the loops.
abundance_by_feature = abundance.set_index("FeatureID")

for transform in transform_methods:
    print(f"\n=== Transform: {transform} ===")

    # create subfolder for this transform
    out_dir = base_out / transform
    out_dir.mkdir(parents=True, exist_ok=True)

    for model_name, emb_table in embedding_tables.items():
        print(f"\n--- Model: {model_name} ---")

        # 1) Merge features with this model's embeddings.
        #    pandas matches NaN join keys to each other, so unnamed features
        #    are dropped first; otherwise a NaN-named embedding row would be
        #    attached to every one of them.
        feat_with_emb = feat_to_name.dropna(subset=["Metabolite"]).merge(
            emb_table,
            on="Metabolite",
            how="inner"
        )
        print("Features with embeddings:", feat_with_emb.shape)

        # 2) Select the abundance rows for these features, in the same order
        #    as feat_with_emb. `.loc` raises KeyError if a FeatureID is
        #    missing from the abundance table.
        abundance_filtered = (
            abundance_by_feature
            .loc[feat_with_emb["FeatureID"]]
            .reset_index()
        )
        print("Filtered abundance shape:", abundance_filtered.shape)

        # Duplicate FeatureIDs in the abundance table would make `.loc`
        # return extra rows; fail with a clear message rather than a shape
        # error in the matrix product below.
        if len(abundance_filtered) != len(feat_with_emb):
            raise ValueError(
                f"{model_name}: {len(abundance_filtered)} abundance rows vs "
                f"{len(feat_with_emb)} embedded features — "
                "duplicate FeatureID in the abundance table?"
            )

        # 3) Build abundance matrix (features × samples)
        sample_cols = abundance_filtered.columns[1:]  # all except FeatureID
        A = abundance_filtered[sample_cols].to_numpy().astype(float)

        # 4) Build embedding matrix (features × d) from the "emb_" columns
        E = feat_with_emb.filter(like="emb_").to_numpy()

        # 5) Apply transformation
        A_t = transform_abundance(A, transform)

        # 6) Make transformation safe: missing / infinite entries get weight 0
        A_t = np.nan_to_num(A_t, nan=0.0, posinf=0.0, neginf=0.0)

        # 7) Convert to per-sample weights (columns sum to ~1)
        #    zscore/clr can be negative, hence abs().
        #    NOTE(review): with abs(), strongly *negative* values receive
        #    large weights too — for clr, zero-abundance features
        #    (log(1e-8)) end up with the largest weights. Confirm that this
        #    is the intended weighting.
        W = np.abs(A_t)
        weights = W / (W.sum(axis=0, keepdims=True) + 1e-8)

        # 8) Compute sample embeddings: (samples × d)
        sample_embeddings = weights.T @ E
        print("Sample embeddings shape:", sample_embeddings.shape)

        # 9) Build dataframe with embeddings + labels.
        #    Rows are labelled with the very columns A was built from, so
        #    the labels cannot drift from the matrix; `diagnosis` must hold
        #    one label per sample column in the same order (insert() raises
        #    on a length mismatch).
        emb_cols = [f"emb_{i}" for i in range(sample_embeddings.shape[1])]
        emb_df = pd.DataFrame(sample_embeddings, index=sample_cols, columns=emb_cols)

        emb_df.insert(0, "Diagnosis", diagnosis)
        emb_df.insert(0, "SampleID", emb_df.index)

        # 10) Save to parquet
        out_path = out_dir / f"{model_name}_sample_embeddings_classification.parquet"
        emb_df.to_parquet(out_path, index=False)
        print("Saved:", out_path)


=== Transform: log1p ===

--- Model: ChemBERTa ---
Features with embeddings: (532, 771)
Filtered abundance shape: (532, 221)
Sample embeddings shape: (220, 768)
Saved: Sample_level_embeddings/log1p/ChemBERTa_sample_embeddings_classification.parquet

--- Model: MOLE ---
Features with embeddings: (532, 771)
Filtered abundance shape: (532, 221)
Sample embeddings shape: (220, 768)
Saved: Sample_level_embeddings/log1p/MOLE_sample_embeddings_classification.parquet

--- Model: Molformer ---
Features with embeddings: (532, 771)
Filtered abundance shape: (532, 221)
Sample embeddings shape: (220, 768)
Saved: Sample_level_embeddings/log1p/Molformer_sample_embeddings_classification.parquet

--- Model: SMI-TED289M ---
Features with embeddings: (532, 771)
Filtered abundance shape: (532, 221)
Sample embeddings shape: (220, 768)
Saved: Sample_level_embeddings/log1p/SMI-TED289M_sample_embeddings_classification.parquet

=== Transform: zscore ===

--- Model: ChemBERTa ---
Features with embeddings: (532,