In [None]:
!pip install pandas requests tqdm openpyxl


In [None]:
!pip install fuzzywuzzy

In [None]:
import requests
import pandas as pd
from tqdm import tqdm
from fuzzywuzzy import process

def get_pubchem_info(name):
    """Get PubChem info: CID, Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms."""
    print(f"Get PubChem info for {name}: CID, Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.")
    base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
    try:
        # Get properties
        props = "CID,Title,IUPACName,CanonicalSMILES,InChIKey"
        prop_url = f"{base_url}{name}/property/{props}/JSON"
        prop_res = requests.get(prop_url, timeout=10)
        prop_res.raise_for_status()
        prop_data = prop_res.json()['PropertyTable']['Properties'][0]
        
        # Get synonyms
        syn_url = f"{base_url}{name}/synonyms/JSON"
        syn_res = requests.get(syn_url, timeout=10)
        syn_res.raise_for_status()
        syn_data = syn_res.json()
        synonyms = syn_data['InformationList']['Information'][0]['Synonym']
        
        print( prop_data)
        return {
            "CID": prop_data.get("CID"),
            "CanonicalName": prop_data.get("Title"),
            "IUPACName": prop_data.get("IUPACName"),
            "CanonicalSMILES": prop_data.get("CanonicalSMILES"),
            "InChIKey": prop_data.get("InChIKey"),
            "Synonyms": synonyms
        }
    except:
        return None

def build_synonym_mapping(names):
    """Build mapping from synonyms to full PubChem info."""
    print("Build mapping from synonyms to full PubChem info.")
    mapping = {}
    for name in tqdm(names, desc="Querying PubChem"):
        info = get_pubchem_info(name)
        if info:
            for syn in info['Synonyms']:
                mapping[syn.lower()] = info
    return mapping

def fuzzy_lookup(name, mapping, threshold=85):
    """Find closest synonym match and return PubChem info."""
    print("Find closest synonym match and return PubChem info.")
    if not mapping:
        return None
    match, score = process.extractOne(name.lower(), mapping.keys())
    if score >= threshold:
        return mapping[match]
    return None

def normalize_with_details(df, column_name, mapping):
    """Replace excipient names with PubChem info."""
    print("Replace excipient names with PubChem info.")
    results = []
    for val in df[column_name]:
        val_str = str(val)
        info = mapping.get(val_str.lower())
        if not info:
            info = fuzzy_lookup(val_str, mapping)
        if info:
            results.append({
                column_name: val_str,
                "CID": info["CID"],
                "CanonicalName": info["Canonical_Name"],
                "IUPACName": info["IUPACName"],
                "CanonicalSMILES": info["Canonical_SMILES"],
                "InChIKey": info["InChIKey"]
            })
        else:
            results.append({
                column_name: val_str,
                "CID": None,
                "CanonicalName": None,
                "IUPACName": None,
                "CanonicalSMILES": None,
                "InChIKey": None
            })
    return pd.DataFrame(results)

# === MAIN SCRIPT ===
input_file = "excipients.xlsx"  # Your uploaded file
df = pd.read_excel(input_file)

column_name = df.columns[0]  # Assume first column is excipient names
unique_names = df[column_name].dropna().unique()

# Build mapping
mapping = build_synonym_mapping(unique_names)

# Normalize and get details
df_cleaned = normalize_with_details(df, column_name, mapping)

# Save
df_cleaned.to_excel("excipients_pubchem_info.xlsx", index=False)

print("✅ Done! Saved as 'excipients_pubchem_info.xlsx'")


Querying PubChem:   0%|                                                                        | 0/167 [00:00<?, ?it/s]

Build mapping from synonyms to full PubChem info.
Get PubChem info for Aerosil: CID,Aerosil Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   1%|▍                                                               | 1/167 [00:03<08:33,  3.10s/it]

Get PubChem info for aspartame: CID,aspartame Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   1%|▊                                                               | 2/167 [00:05<07:22,  2.68s/it]

Get PubChem info for Aspartame
: CID,Aspartame
 Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   2%|█▏                                                              | 3/167 [00:06<05:13,  1.91s/it]

Get PubChem info for Avicel: CID,Avicel Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   2%|█▌                                                              | 4/167 [00:07<04:17,  1.58s/it]

Get PubChem info for Avicel : CID,Avicel  Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   3%|█▉                                                              | 5/167 [00:10<05:43,  2.12s/it]

Get PubChem info for Avicel PH 10: CID,Avicel PH 10 Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   4%|██▎                                                             | 6/167 [00:11<04:50,  1.80s/it]

Get PubChem info for Avicel PH102: CID,Avicel PH102 Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   4%|██▋                                                             | 7/167 [00:12<04:09,  1.56s/it]

Get PubChem info for Avicel-102: CID,Avicel-102 Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   5%|███                                                             | 8/167 [00:13<03:41,  1.39s/it]

Get PubChem info for Banana Powder: CID,Banana Powder Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   5%|███▍                                                            | 9/167 [00:14<03:23,  1.29s/it]

Get PubChem info for B-cyclo dextrin : CID,B-cyclo dextrin  Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   6%|███▊                                                           | 10/167 [00:16<03:11,  1.22s/it]

Get PubChem info for Calcium carbonate: CID,Calcium carbonate Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   7%|████▏                                                          | 11/167 [00:17<03:06,  1.20s/it]

Get PubChem info for Calcium complexed tamarind gum: CID,Calcium complexed tamarind gum Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   7%|████▌                                                          | 12/167 [00:18<02:59,  1.16s/it]

Get PubChem info for Camphor: CID,Camphor Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   8%|████▉                                                          | 13/167 [00:19<02:52,  1.12s/it]

Get PubChem info for carboxymethylcellulose calcium: CID,carboxymethylcellulose calcium Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   8%|█████▎                                                         | 14/167 [00:19<02:31,  1.01it/s]

Get PubChem info for Carboxymethylcellulose sodium: CID,Carboxymethylcellulose sodium Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:   9%|█████▋                                                         | 15/167 [00:20<02:01,  1.25it/s]

Get PubChem info for Chia Seed Mucilage: CID,Chia Seed Mucilage Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  10%|██████                                                         | 16/167 [00:25<05:35,  2.22s/it]

Get PubChem info for Chitosan: CID,Chitosan Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  10%|██████▍                                                        | 17/167 [00:26<04:39,  1.87s/it]

Get PubChem info for Citric acid: CID,Citric acid Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  11%|██████▊                                                        | 18/167 [00:27<04:02,  1.63s/it]

Get PubChem info for CM-CP Galactomannan: CID,CM-CP Galactomannan Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  11%|███████▏                                                       | 19/167 [00:31<05:16,  2.14s/it]

Get PubChem info for Colloidal silica dioxide: CID,Colloidal silica dioxide Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  12%|███████▌                                                       | 20/167 [00:32<04:24,  1.80s/it]

Get PubChem info for Colloidal Silicon Dioxide: CID,Colloidal Silicon Dioxide Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  13%|███████▉                                                       | 21/167 [00:34<04:41,  1.93s/it]

Get PubChem info for Colloidal silicon dioxide (Aerosil®): CID,Colloidal silicon dioxide (Aerosil®) Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  13%|████████▎                                                      | 22/167 [00:35<04:02,  1.67s/it]

Get PubChem info for Copovidone: CID,Copovidone Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  14%|████████▋                                                      | 23/167 [00:36<03:33,  1.48s/it]

Get PubChem info for Co-processed: CID,Co-processed Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  14%|█████████                                                      | 24/167 [00:37<03:10,  1.33s/it]

Get PubChem info for Co-processed super disintegrant: CID,Co-processed super disintegrant Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  15%|█████████▍                                                     | 25/167 [00:38<02:56,  1.24s/it]

Get PubChem info for CP Galactomannan: CID,CP Galactomannan Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  16%|█████████▊                                                     | 26/167 [00:39<02:47,  1.19s/it]

Get PubChem info for Croscarmello: CID,Croscarmello Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  16%|██████████▏                                                    | 27/167 [00:40<02:41,  1.15s/it]

Get PubChem info for Croscarmellose: CID,Croscarmellose Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  17%|██████████▌                                                    | 28/167 [00:41<02:37,  1.13s/it]

Get PubChem info for Croscarmellose Sodium: CID,Croscarmellose Sodium Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  17%|██████████▉                                                    | 29/167 [00:42<02:34,  1.12s/it]

Get PubChem info for Croscarmellose sodium : CID,Croscarmellose sodium  Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  18%|███████████▎                                                   | 30/167 [00:43<02:29,  1.09s/it]

Get PubChem info for crospovidone: CID,crospovidone Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  19%|███████████▋                                                   | 31/167 [00:45<02:26,  1.08s/it]

Get PubChem info for Crospovidone Type A: CID,Crospovidone Type A Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  19%|████████████                                                   | 32/167 [00:46<02:24,  1.07s/it]

Get PubChem info for Cross carmalose: CID,Cross carmalose Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  20%|████████████▍                                                  | 33/167 [00:47<02:24,  1.08s/it]

Get PubChem info for Cross Carmellose Sodium: CID,Cross Carmellose Sodium Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  20%|████████████▊                                                  | 34/167 [00:50<03:43,  1.68s/it]

Get PubChem info for Cross povidone: CID,Cross povidone Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  21%|█████████████▏                                                 | 35/167 [00:51<03:20,  1.52s/it]

Get PubChem info for Crosscaramelose Sodium : CID,Crosscaramelose Sodium  Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  22%|█████████████▌                                                 | 36/167 [00:52<03:00,  1.38s/it]

Get PubChem info for crosscarmellose sodium: CID,crosscarmellose sodium Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  22%|█████████████▉                                                 | 37/167 [00:53<02:44,  1.27s/it]

Get PubChem info for Cross-carmellose sodium: CID,Cross-carmellose sodium Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  23%|██████████████▎                                                | 38/167 [00:54<02:34,  1.20s/it]

Get PubChem info for Crosspovidone: CID,Crosspovidone Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  23%|██████████████▋                                                | 39/167 [00:55<02:28,  1.16s/it]

Get PubChem info for Dextrose: CID,Dextrose Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  24%|███████████████                                                | 40/167 [00:56<02:23,  1.13s/it]

Get PubChem info for Dicalcium phosphate: CID,Dicalcium phosphate Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  25%|███████████████▍                                               | 41/167 [00:57<02:20,  1.11s/it]

Get PubChem info for D-Mannitol: CID,D-Mannitol Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  25%|███████████████▊                                               | 42/167 [00:58<02:16,  1.09s/it]

Get PubChem info for Doshion: CID,Doshion Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  26%|████████████████▏                                              | 43/167 [00:59<02:13,  1.08s/it]

Get PubChem info for Ethocel: CID,Ethocel Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  26%|████████████████▌                                              | 44/167 [01:01<02:29,  1.22s/it]

Get PubChem info for Fenugreek: CID,Fenugreek Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  27%|████████████████▉                                              | 45/167 [01:02<02:22,  1.17s/it]

Get PubChem info for Fenugreek Gum: CID,Fenugreek Gum Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  28%|█████████████████▎                                             | 46/167 [01:03<02:17,  1.13s/it]

Get PubChem info for Fenugreek powder: CID,Fenugreek powder Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  28%|█████████████████▋                                             | 47/167 [01:04<02:13,  1.11s/it]

Get PubChem info for Fenugreek seed mucilage: CID,Fenugreek seed mucilage Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  29%|██████████████████                                             | 48/167 [01:05<02:11,  1.10s/it]

Get PubChem info for Flavor: CID,Flavor Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  29%|██████████████████▍                                            | 49/167 [01:06<02:09,  1.09s/it]

Get PubChem info for F-Melt C: CID,F-Melt C Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  30%|██████████████████▊                                            | 50/167 [01:07<02:05,  1.08s/it]

Get PubChem info for F-melt Type C: CID,F-melt Type C Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  31%|███████████████████▏                                           | 51/167 [01:08<02:02,  1.06s/it]

Get PubChem info for F-melt®: CID,F-melt® Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  31%|███████████████████▌                                           | 52/167 [01:09<02:02,  1.06s/it]

Get PubChem info for Guar Gum: CID,Guar Gum Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  32%|███████████████████▉                                           | 53/167 [01:11<02:07,  1.12s/it]

Get PubChem info for Guar gum : CID,Guar gum  Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  32%|████████████████████▎                                          | 54/167 [01:12<02:08,  1.14s/it]

Get PubChem info for Gum: CID,Gum Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  33%|████████████████████▋                                          | 55/167 [01:13<02:05,  1.12s/it]

Get PubChem info for Hibiscus leaf  mucilage: CID,Hibiscus leaf  mucilage Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  34%|█████████████████████▏                                         | 56/167 [01:14<02:01,  1.09s/it]

Get PubChem info for hydroxypropyl methyl cellulose: CID,hydroxypropyl methyl cellulose Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  34%|█████████████████████▌                                         | 57/167 [01:15<01:58,  1.08s/it]

Get PubChem info for Indion415: CID,Indion415 Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  35%|█████████████████████▉                                         | 58/167 [01:16<01:56,  1.07s/it]

Get PubChem info for Isabgol: CID,Isabgol Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  35%|██████████████████████▎                                        | 59/167 [01:17<02:10,  1.21s/it]

Get PubChem info for Isabgol Mucilage : CID,Isabgol Mucilage  Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  36%|██████████████████████▋                                        | 60/167 [01:19<02:04,  1.16s/it]

Get PubChem info for Jackfruit starch : CID,Jackfruit starch  Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  37%|███████████████████████                                        | 61/167 [01:21<02:58,  1.69s/it]

Get PubChem info for Kollidon (CL): CID,Kollidon (CL) Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  37%|███████████████████████▍                                       | 62/167 [01:22<02:37,  1.50s/it]

Get PubChem info for Kollidon (CL-F): CID,Kollidon (CL-F) Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  38%|███████████████████████▊                                       | 63/167 [01:23<02:20,  1.35s/it]

Get PubChem info for Kollidon (CL-SF): CID,Kollidon (CL-SF) Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  38%|████████████████████████▏                                      | 64/167 [01:24<02:08,  1.25s/it]

Get PubChem info for Kollidon CL: CID,Kollidon CL Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  39%|████████████████████████▌                                      | 65/167 [01:26<02:01,  1.19s/it]

Get PubChem info for Kyron T-104: CID,Kyron T-104 Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  40%|████████████████████████▉                                      | 66/167 [01:27<01:54,  1.14s/it]

Get PubChem info for KyronT-314: CID,KyronT-314 Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  40%|█████████████████████████▎                                     | 67/167 [01:28<01:50,  1.11s/it]

Get PubChem info for Lactochem® Microfine: CID,Lactochem® Microfine Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  41%|█████████████████████████▋                                     | 68/167 [01:29<01:46,  1.08s/it]

Get PubChem info for Lactose: CID,Lactose Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  41%|██████████████████████████                                     | 69/167 [01:30<02:01,  1.24s/it]

Get PubChem info for lactose Anhydrous: CID,lactose Anhydrous Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  42%|██████████████████████████▍                                    | 70/167 [01:32<02:15,  1.40s/it]

Get PubChem info for Lactose Monohydrate: CID,Lactose Monohydrate Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  43%|██████████████████████████▊                                    | 71/167 [01:33<02:05,  1.30s/it]

Get PubChem info for Lactosemonohydrate: CID,Lactosemonohydrate Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  43%|███████████████████████████▏                                   | 72/167 [01:34<01:56,  1.23s/it]

Get PubChem info for Lepidium: CID,Lepidium Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  44%|███████████████████████████▌                                   | 73/167 [01:35<01:48,  1.15s/it]

Get PubChem info for Lepidium sativum: CID,Lepidium sativum Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  44%|███████████████████████████▉                                   | 74/167 [01:36<01:43,  1.11s/it]

Get PubChem info for l-HPC LH11: CID,l-HPC LH11 Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  45%|████████████████████████████▎                                  | 75/167 [01:39<02:39,  1.73s/it]

Get PubChem info for l-HPC NBD022: CID,l-HPC NBD022 Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  46%|████████████████████████████▋                                  | 76/167 [01:40<02:22,  1.56s/it]

Get PubChem info for L-Hydroxy propyl cellulose: CID,L-Hydroxy propyl cellulose Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  46%|█████████████████████████████                                  | 77/167 [01:42<02:07,  1.42s/it]

Get PubChem info for Lubripharm® Ssf: CID,Lubripharm® Ssf Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  47%|█████████████████████████████▍                                 | 78/167 [01:43<01:55,  1.30s/it]

Get PubChem info for Ludiflash: CID,Ludiflash Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  47%|█████████████████████████████▊                                 | 79/167 [01:44<01:49,  1.24s/it]

Get PubChem info for Ludiflash®: CID,Ludiflash® Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  48%|██████████████████████████████▏                                | 80/167 [01:45<01:45,  1.21s/it]

Get PubChem info for Ludipress: CID,Ludipress Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  49%|██████████████████████████████▌                                | 81/167 [01:46<01:41,  1.18s/it]

Get PubChem info for Magnesium Oxide: CID,Magnesium Oxide Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  49%|██████████████████████████████▉                                | 82/167 [01:47<01:36,  1.14s/it]

Get PubChem info for Magnesium Stearate: CID,Magnesium Stearate Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  50%|███████████████████████████████▎                               | 83/167 [01:48<01:33,  1.11s/it]

Get PubChem info for Magnesium steric acid: CID,Magnesium steric acid Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  50%|███████████████████████████████▋                               | 84/167 [01:51<02:21,  1.70s/it]

Get PubChem info for Maize starch: CID,Maize starch Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  51%|████████████████████████████████                               | 85/167 [01:56<03:41,  2.70s/it]

Get PubChem info for Mango peel pectin: CID,Mango peel pectin Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  51%|████████████████████████████████▍                              | 86/167 [01:57<03:03,  2.27s/it]

Get PubChem info for Mannitol: CID,Mannitol Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  52%|████████████████████████████████▊                              | 87/167 [01:58<02:31,  1.89s/it]

Get PubChem info for Mannitol : CID,Mannitol  Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  53%|█████████████████████████████████▏                             | 88/167 [01:59<02:08,  1.63s/it]

Get PubChem info for Meglumine: CID,Meglumine Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  53%|█████████████████████████████████▌                             | 89/167 [02:03<02:44,  2.11s/it]

Get PubChem info for Menthol: CID,Menthol Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  54%|█████████████████████████████████▉                             | 90/167 [02:04<02:16,  1.78s/it]

Get PubChem info for Methylcellulose: CID,Methylcellulose Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  54%|██████████████████████████████████▎                            | 91/167 [02:05<01:57,  1.55s/it]

Get PubChem info for Mg-Aluminum Silicate: CID,Mg-Aluminum Silicate Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


Querying PubChem:  55%|██████████████████████████████████▋                            | 92/167 [02:06<01:45,  1.41s/it]

Get PubChem info for Mg-Stearate: CID,Mg-Stearate Title, IUPACName, CanonicalSMILES, InChIKey, Synonyms.


In [2]:
import requests
import pandas as pd
from tqdm import tqdm
from fuzzywuzzy import process

def get_pubchem_data(name):
    """Get PubChem canonical name and synonyms for a given excipient name."""
    base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
    
    try:
        # Get canonical name (Title)
        prop_url = f"{base_url}{name}/property/Title/JSON"
        prop_res = requests.get(prop_url, timeout=10)
        prop_res.raise_for_status()
        prop_data = prop_res.json()
        canonical_name = prop_data['PropertyTable']['Properties'][0]['Title']
        
        # Get synonyms
        syn_url = f"{base_url}{name}/synonyms/JSON"
        syn_res = requests.get(syn_url, timeout=10)
        syn_res.raise_for_status()
        syn_data = syn_res.json()
        synonyms = syn_data['InformationList']['Information'][0]['Synonym']
        
        return canonical_name, synonyms
    except:
        return None, []

def build_synonym_mapping(names):
    """Build a mapping from synonyms to canonical names."""
    mapping = {}
    for name in tqdm(names, desc="Building synonym mapping"):
        canonical, synonyms = get_pubchem_data(name)
        if canonical:
            for syn in synonyms:
                mapping[syn.lower()] = canonical
    return mapping

def fuzzy_lookup(name, mapping, threshold=85):
    """Find closest match in mapping using fuzzy matching."""
    if not mapping:
        return name
    match, score = process.extractOne(name.lower(), mapping.keys())
    if score >= threshold:
        return mapping[match]
    return name

def normalize_names(df, column_name, mapping):
    """Replace excipient names with canonical names, using fuzzy matching."""
    df['CanonicalName'] = df[column_name].apply(
        lambda x: mapping.get(str(x).lower(), fuzzy_lookup(str(x), mapping))
    )
    return df

# === MAIN SCRIPT ===
# Load your file
input_file = "excipients.xlsx"  # Your uploaded file
df = pd.read_excel(input_file)

# Change 'Excipient' below to your actual column name if different
column_name = df.columns[0]  # First column in your file

# Get unique names to query
unique_names = df[column_name].dropna().unique()

# Build synonym mapping from PubChem
mapping = build_synonym_mapping(unique_names)

# Normalize dataset with fuzzy matching
df_cleaned = normalize_names(df, column_name, mapping)

# Save to new Excel
df_cleaned.to_excel("excipients_canonical.xlsx", index=False)

print("✅ Done! Cleaned list saved as 'excipients_canonical.xlsx'")


Building synonym mapping: 100%|██████████████████████████████████████████████████████| 167/167 [04:58<00:00,  1.79s/it]


✅ Done! Cleaned list saved as 'excipients_canonical.xlsx'
