In [15]:
import pandas as pd
import openpyxl
import numpy as np
from pathlib import Path

In [16]:
# Folder where the embedding CSVs live (one file per embedding model)
EMB_DIR = Path("Embeddings")

# Find all CSV files in the Embeddings folder. glob() yields paths in an
# arbitrary, filesystem-dependent order, so sort them to keep the load order
# (and the printed listing) identical on every run and machine.
embedding_files = sorted(EMB_DIR.glob("*.csv"))

print("Found embedding files:")
for f in embedding_files:
    print(" -", f.name)

# Filled by the loading cell: {file stem (name without ".csv"): dataframe}
raw_embeddings = {}

Found embedding files:
 - SMI-TED289M_embeddings_new.csv
 - mole_embeddings_1203.csv
 - ChemBERTa_embeddings.csv
 - metabolite_embeddings_molformer.csv


In [17]:
# Read every discovered CSV and register it under its file stem
# (the filename without the ".csv" extension).
for csv_path in embedding_files:
    frame = pd.read_csv(csv_path)
    raw_embeddings[csv_path.stem] = frame

    # Report what was loaded; only the leading columns are shown to keep output short
    print(f"\nLoaded {csv_path.name} with shape {frame.shape}")
    leading_columns = frame.columns[:10]
    print(leading_columns)


Loaded SMI-TED289M_embeddings_new.csv with shape (301, 770)
Index(['Metabolite', 'SMILES', 'emb_0', 'emb_1', 'emb_2', 'emb_3', 'emb_4',
       'emb_5', 'emb_6', 'emb_7'],
      dtype='object')

Loaded mole_embeddings_1203.csv with shape (301, 771)
Index(['Unnamed: 0', 'Metabolite', 'SMILES', 'emb_0', 'emb_1', 'emb_2',
       'emb_3', 'emb_4', 'emb_5', 'emb_6'],
      dtype='object')

Loaded ChemBERTa_embeddings.csv with shape (301, 770)
Index(['Metabolite', 'SMILES', 'emb_0', 'emb_1', 'emb_2', 'emb_3', 'emb_4',
       'emb_5', 'emb_6', 'emb_7'],
      dtype='object')

Loaded metabolite_embeddings_molformer.csv with shape (301, 770)
Index(['Exact Match to Standard (* = isomer family)', 'SMILES', 'emb_0',
       'emb_1', 'emb_2', 'emb_3', 'emb_4', 'emb_5', 'emb_6', 'emb_7'],
      dtype='object')


### Clean mole_embeddings_1203.csv

In [18]:
# Bring the MOLE embeddings into the same layout as the other files:
# Metabolite, SMILES, then the emb_* columns.
mole = raw_embeddings["mole_embeddings_1203"]

# 1) Drop the extra leading column if it exists ("Unnamed: 0" is the name pandas
#    gives a header-less column; presumably a saved index). No-op when absent.
mole = mole.drop(columns=["Unnamed: 0"], errors="ignore")

# 2) Keep only the identifier columns followed by the embedding columns.
#    emb_* columns keep the order they have in the file (they are not re-sorted).
emb_cols = [c for c in mole.columns if c.startswith("emb_")]
mole = mole[["Metabolite", "SMILES"] + emb_cols]

# Save the cleaned frame back into the dictionary (replaces the raw version)
raw_embeddings["mole_embeddings_1203"] = mole

print("MOLE cleaned:", mole.shape)
print(mole.columns[:10])

# to_csv() does not create missing folders, so make sure the output folder
# exists; otherwise this cell fails on a fresh checkout.
Path("Embeddings_cleaned").mkdir(parents=True, exist_ok=True)
mole.to_csv("Embeddings_cleaned/mole_embeddings_1203.csv", index=False)


MOLE cleaned: (301, 770)
Index(['Metabolite', 'SMILES', 'emb_0', 'emb_1', 'emb_2', 'emb_3', 'emb_4',
       'emb_5', 'emb_6', 'emb_7'],
      dtype='object')


### Clean metabolite_embeddings_molformer.csv


In [19]:
# Align the Molformer file with the other embedding files: its identifier
# column has a different header, so rename it to "Metabolite".
molformer = raw_embeddings["metabolite_embeddings_molformer"]

# The identifier column is taken by position (first column), whatever its header is
first_col = molformer.columns[0]
molformer = molformer.rename(columns={first_col: "Metabolite"})

# to_csv() does not create missing folders, so make sure the output folder
# exists; otherwise this cell fails on a fresh checkout.
Path("Embeddings_cleaned").mkdir(parents=True, exist_ok=True)
molformer.to_csv("Embeddings_cleaned/metabolite_embeddings_molformer.csv", index=False)

print("Saved cleaned Molformer embeddings to Embeddings_cleaned/metabolite_embeddings_molformer.csv")


Saved cleaned Molformer embeddings to Embeddings_cleaned/metabolite_embeddings_molformer.csv


### Verify that all cleaned files have the same column names and dimensions

In [22]:
CLEAN_DIR = Path("Embeddings_cleaned")

# Load cleaned files.
# NOTE(review): the cells shown earlier in this notebook only write the MOLE and
# Molformer files into CLEAN_DIR; presumably the ChemBERTa and SMI-TED files were
# placed there by a step that is not shown — confirm, or a fresh run fails here.
chem   = pd.read_csv(CLEAN_DIR / "ChemBERTa_embeddings.csv")
mole   = pd.read_csv(CLEAN_DIR / "mole_embeddings_1203.csv")
molfor = pd.read_csv(CLEAN_DIR / "metabolite_embeddings_molformer.csv")
smited = pd.read_csv(CLEAN_DIR / "SMI-TED289M_embeddings_new.csv")

# Display label -> frame, in the order the report is printed
cleaned = {
    "ChemBERTa": chem,
    "MOLE": mole,
    "Molformer": molfor,
    "SMI-TED289M": smited,
}

for label, frame in cleaned.items():
    print(f"{label}:", frame.shape)

print("\nColumn checks:")
for label, frame in cleaned.items():
    print(f"{label} first 10:", frame.columns[:10].tolist())

# Compare full column lists (names AND order, not just set membership)
chem_cols = list(chem.columns)
mole_cols = list(mole.columns)
molfor_cols = list(molfor.columns)
smited_cols = list(smited.columns)

print("\nColumn sets identical?")
print("Chem == MOLE       ?", chem_cols == mole_cols)
print("Chem == Molformer  ?", chem_cols == molfor_cols)
print("Chem == SMI-TED289M?", chem_cols == smited_cols)
print("MOLE == Molformer  ?", mole_cols == molfor_cols)
print("MOLE == SMI-TED289M?", mole_cols == smited_cols)
print("Molformer == SMI-TED289M?", molfor_cols == smited_cols)

# Fail loudly instead of relying on someone reading the True/False lines:
# every cleaned file must match ChemBERTa in shape and in column names/order.
for label, frame in cleaned.items():
    assert frame.shape == chem.shape, (
        f"{label} has shape {frame.shape}, expected {chem.shape} (ChemBERTa)"
    )
    assert list(frame.columns) == chem_cols, (
        f"{label} columns differ from ChemBERTa columns"
    )


ChemBERTa: (301, 770)
MOLE: (301, 770)
Molformer: (301, 770)
SMI-TED289M: (301, 770)

Column checks:
ChemBERTa first 10: ['Metabolite', 'SMILES', 'emb_0', 'emb_1', 'emb_2', 'emb_3', 'emb_4', 'emb_5', 'emb_6', 'emb_7']
MOLE first 10: ['Metabolite', 'SMILES', 'emb_0', 'emb_1', 'emb_2', 'emb_3', 'emb_4', 'emb_5', 'emb_6', 'emb_7']
Molformer first 10: ['Metabolite', 'SMILES', 'emb_0', 'emb_1', 'emb_2', 'emb_3', 'emb_4', 'emb_5', 'emb_6', 'emb_7']
SMI-TED289M first 10: ['Metabolite', 'SMILES', 'emb_0', 'emb_1', 'emb_2', 'emb_3', 'emb_4', 'emb_5', 'emb_6', 'emb_7']

Column sets identical?
Chem == MOLE       ? True
Chem == Molformer  ? True
Chem == SMI-TED289M? True
MOLE == Molformer  ? True
MOLE == SMI-TED289M? True
Molformer == SMI-TED289M? True
