# Search for known compounds

With the database MetaCyc <https://metacyc.org/> the generated "predicted compounds" will be searched.

In [3]:
import requests
from bs4 import BeautifulSoup
import pymongo
import polars as pl
from tqdm import tqdm
from pymongo import MongoClient
import defl as defl
import urllib.parse

## MongoDB

In [1]:
%%time

# MongoDB connection
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["lotus_mines_enzymatic"]
compounds_collection = db["compounds"]

# Function to search MetaCyc
def search_metacyc(compound_name):
    search_url = "https://metacyc.org/compound?type=NIL&object="
    response = requests.get(search_url + compound_name)
    
    if response.status_code == 200:
        return response.text
    else:
        return None

# Function to extract SMILES from MetaCyc HTML content
def extract_smiles(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    smiles_tag = soup.find('td', text='SMILES')
    
    if smiles_tag:
        smiles_value = smiles_tag.find_next_sibling('td').text.strip()
        return smiles_value
    else:
        return None

# Function to process each compound
def process_compound(compound):
    compound_type = compound.get("type")
    
    # Only process if the compound's type is "Predicted"
    if compound_type != "Predicted":
        return None
    
    # Search MetaCyc for the compound
    html_content = search_metacyc(compound["name"])
    
    if html_content:
        smiles = extract_smiles(html_content)
        return smiles
    else:
        return None

# Main script
if __name__ == "__main__":
    results = []

    # Retrieve compounds from MongoDB
    compounds = compounds_collection.find({})
    
    # Use tqdm to display a progress bar
    for compound in tqdm(compounds, desc="Processing Compounds", unit=" compound"):
        smiles = process_compound(compound)
        if smiles:
            results.append({
                "Compound_id": compound["_id"],
                "SMILES": smiles,
                "InChI_key": compound["InChI_key"]
            })

Processing Compounds: 3432312compound [00:15, 226615.09compound/s]

CPU times: user 12.1 s, sys: 661 ms, total: 12.8 s
Wall time: 15.4 s





In [5]:
%%time

# MongoDB connection
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["lotus_mines_enzymatic"]
compounds_collection = db["compounds"]

# Function to search MetaCyc
def search_metacyc(compound_name):
    search_url = "https://metacyc.org/compound?type=NIL&object="
    response = requests.get(search_url + compound_name)
    
    if response.status_code == 200:
        return response.text
    else:
        print(f"Error: Unable to retrieve data (status code {response.status_code}) for {compound_name}")
        return None

# Function to extract SMILES from MetaCyc HTML content
def extract_smiles(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    smiles_tag = soup.find('td', text='SMILES')
    
    if smiles_tag:
        smiles_value = smiles_tag.find_next_sibling('td').text.strip()
        return smiles_value
    else:
        print("SMILES string not found in the HTML content.")
        return None

# Function to process each compound
def process_compound(compound):
    compound_type = compound.get("type")
    
    # Only process if the compound's type is "Predicted"
    if compound_type != "Predicted":
        #print(f"Skipping {compound['InChI_key']} because it is not of type 'Predicted'.")
        return None
    
    # Search MetaCyc for the compound
    html_content = search_metacyc(compound["name"])
    
    if html_content:
        smiles = extract_smiles(html_content)
        return smiles
    else:
        return None

# Main script
if __name__ == "__main__":
    results = []

    # Retrieve compounds from MongoDB
    compounds = compounds_collection.find({})
    
    # Use tqdm to display a progress bar
    for compound in tqdm(compounds, desc="Processing Compounds", unit="compound"):
        smiles = process_compound(compound)
        if smiles:
            results.append({
                "Compound_id": compound["_id"],
                "SMILES": smiles,
                "InChI_key": compound["InChI_key"]
            })

    print(results)

Processing Compounds: 3432312compound [00:15, 225981.74compound/s]

[]
CPU times: user 12.3 s, sys: 276 ms, total: 12.6 s
Wall time: 15.2 s





In [2]:
# Convert results to a Polars DataFrame
df = pl.DataFrame(results)

# Save the DataFrame to a file or further processing
#df.write_csv("metacyc_results.csv")
print(df)

shape: (0, 0)
┌┐
╞╡
└┘


### lookup predicted compounds in pubchem 

In [39]:
%%time

# load Inputfile
df_input_mines = pl.read_csv("../data/MINES/230106_frozen_metadata_inchy_smile.csv")
df_input_mines = df_input_mines.rename({"id":"InChI_key_lotus", "smiles":"smiles_lotus"})

df_input_mines.head(10)

CPU times: user 94.7 ms, sys: 13.3 ms, total: 108 ms
Wall time: 6.53 ms


InChI_key_lotus,smiles_lotus
str,str
"""XJOOMMHNYOJWCZ-UKRRQHHQSA-N""","""CC1=C[C@@H]2c3cccc4[nH]cc(c34)C[C@H]2N(C)C1"""
"""SFCYVTIQMNZUCZ-UHFFFAOYSA-N""","""C=C(C)C#Cc1cc(C=O)ccc1O"""
"""OYZXDVPSGCKVOQ-UQBPGWFLSA-N""","""COC(=O)[C@]12CCCC(C)(C)[C@@H]1CCc1cc(C(C)C)c(OC(C)=O)c(OC(C)=O)c12"""
"""MSSOSOXUURLBHN-UHFFFAOYSA-N""","""CCC(=O)OC1C2C(OC(=O)c3ccccc3)C34OC2(C)COC(=O)c2cccnc2C(C)C(C)C(=O)OC(C(O)C(OC(=O)c2ccccc2)C3(COC(C)=O)C1OC(C)=O)C4(C)O"""
"""FNDJBOATFIWAJR-ONEGZZNKSA-N""","""CC(C)=CCc1cc(/C=C/C=O)ccc1O"""
"""CZUWIMDOXXXJRE-SHPISUKRSA-N""","""CC(=O)OC/C=C(\C)CC[C@@H]1C(C)=CC[C@@H]2C(C)(C)C[C@H](O)C[C@@]12C"""
"""FVNPLROTBAEWRZ-CMDGGOBGSA-N""","""Cc1cnc(C)c(/C=C/c2ccccc2)n1"""
"""CCHUDPANZXHQCS-UKTHLTGXSA-N""","""CC(C)/C=C1/NC(=O)C(C(C)C)n2c1nc1c(c2=O)C=CC=CO1"""
"""VMVVAKUSEGPLKU-LRJCJXCVSA-N""","""CCC[C@H](O)[C@@H](O)CCCCCCCC[C@@H](O)[C@H]1CC[C@H](CCCCCCCCCC[C@@H](O)CC2=C[C@H](C)OC2=O)O1"""
"""RYAHFDNUSMNPRF-UHFFFAOYSA-N""","""C=C1C=Cc2cc(COC(=O)CCCCCCCC=CCCCCCCCC)ccc2OC1"""


In [3]:
%%time

# connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['compounds']

result = client['lotus_mines_enzymatic']['compounds'].find()

documents_list = list(result)

df_mongo = pl.DataFrame(documents_list)
df_mongo = df_mongo.drop(["Generation", "Expand", "Reactant_in", "Product_of", "Type"])
df_mongo = df_mongo.rename({"_id":"_id_mongo", 
                            "ID":"ID_mongo", 
                            "SMILES":"SMILES_mongo", 
                            "InChI_key":"InChI_key_mongo"})

df_mongo

CPU times: user 47.8 s, sys: 14 s, total: 1min 1s
Wall time: 1min 4s


_id_mongo,ID_mongo,SMILES_mongo,InChI_key_mongo
str,str,str,str
"""Xed5e87faf61da02132a9818e3222bb6d558f8258""","""cpd00044""","""Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OS(=O)(=O)O)C(OP(=O)(O)O)C1O""","""GACDQMDRPRGCTN"""
"""X73bc8ef21db580aefe4dbc0af17d4013961d9d17""","""cpd00001""","""O""","""XLYOFNOQVPJJNP"""
"""Xf4a7855630639e93324efbcfa9adb1b6a80e8b62""","""cpd00202""","""CC(C)=CCOP(=O)(O)OP(=O)(O)O""","""CBIDRCWHNCKSTO"""
"""Xc4b3110bfc6e3fde007ae1eefdda16eacf8fe948""","""cpd00004""","""NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c(N)ncnc54)C(O)C3O)C(O)C2O)C=CC1""","""BOPGDPNILDQYTO"""
"""X97b638f57607ff42024826314564b47adb4cf4cb""","""cpd00003""","""NC(=O)c1ccc[n+](C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c(N)ncnc54)C(O)C3O)C(O)C2O)c1""","""BAWFJGJZGIEFAR"""
"""Xca3125292d42b5595b65696d5cd74e007b886297""","""cpd00019""","""Nc1ncnc2c1ncn2C1OC(CSCCC(N)C(=O)O)C(O)C1O""","""ZJUKTBDSGOFHSH"""
"""Xe264bf1bbd482e436995855afdf2170ce7229f1d""","""cpd00024""","""O=C(O)CCC(=O)C(=O)O""","""KPGXRSRHYNQIFN"""
"""Xad024c97bc6d1c83baf5d8902af02c3a3db5358e""","""cpd00011""","""O=C=O""","""CURLTUGMZLYLDI"""
"""X8dc023d8052d83fb6feadf8541387e57c199cad0""","""cpd00007""","""O=O""","""MYMOFIZGZYHOMD"""
"""X9b5ec04e3e13766613e65d29e9d5b275fd8a0317""","""cpd00017""","""C[S+](CCC(N)C(=O)O)CC1OC(n2cnc3c(N)ncnc32)C(O)C1O""","""MEFKEPWMEQBLKI"""


In [40]:
df_starting = df_mongo.join(df_input_mines, left_on="ID_mongo", right_on="InChI_key_lotus", how="inner")

df_starting

_id_mongo,ID_mongo,SMILES_mongo,InChI_key_mongo,smiles_lotus
str,str,str,str,str
"""Ccb4bbce54cc03da8afe0090e2869252ff6cb7aa8""","""XJOOMMHNYOJWCZ-UKRRQHHQSA-N""","""CC1=CC2c3cccc4[nH]cc(c34)CC2N(C)C1""","""XJOOMMHNYOJWCZ-UHFFFAOYSA-N""","""CC1=C[C@@H]2c3cccc4[nH]cc(c34)C[C@H]2N(C)C1"""
"""C64d6d2fef30e6e42986c2a1e36a719d69c2a2030""","""SFCYVTIQMNZUCZ-UHFFFAOYSA-N""","""C=C(C)C#Cc1cc(C=O)ccc1O""","""SFCYVTIQMNZUCZ-UHFFFAOYSA-N""","""C=C(C)C#Cc1cc(C=O)ccc1O"""
"""Cacd119678c4c728e090ae999c931ca0085876ef3""","""MSSOSOXUURLBHN-UHFFFAOYSA-N""","""CCC(=O)OC1C2C(OC(=O)c3ccccc3)C34OC2(C)COC(=O)c2cccnc2C(C)C(C)C(=O)OC(C(O)C(OC(=O)c2ccccc2)C3(COC(C)=O)C1OC(C)=O)C4(C)O""","""MSSOSOXUURLBHN-UHFFFAOYSA-N""","""CCC(=O)OC1C2C(OC(=O)c3ccccc3)C34OC2(C)COC(=O)c2cccnc2C(C)C(C)C(=O)OC(C(O)C(OC(=O)c2ccccc2)C3(COC(C)=O)C1OC(C)=O)C4(C)O"""
"""C86f7988222caa340261bdc7fda8231cbad1598b4""","""CCHUDPANZXHQCS-UKTHLTGXSA-N""","""CC(C)C=C1NC(=O)C(C(C)C)n2c1nc1c(c2=O)C=CC=CO1""","""CCHUDPANZXHQCS-UHFFFAOYSA-N""","""CC(C)/C=C1/NC(=O)C(C(C)C)n2c1nc1c(c2=O)C=CC=CO1"""
"""C9f9c52031affb165344c9cef3116d1681868d8c0""","""OYZXDVPSGCKVOQ-UQBPGWFLSA-N""","""COC(=O)[C@]12CCCC(C)(C)[C@@H]1CCc1cc(C(C)C)c(OC(C)=O)c(OC(C)=O)c12""","""OYZXDVPSGCKVOQ-UQBPGWFLSA-N""","""COC(=O)[C@]12CCCC(C)(C)[C@@H]1CCc1cc(C(C)C)c(OC(C)=O)c(OC(C)=O)c12"""
"""C1de9654ff313d8cd6a485a80ee0b955d5ede1050""","""FNDJBOATFIWAJR-ONEGZZNKSA-N""","""CC(C)=CCc1cc(/C=C/C=O)ccc1O""","""FNDJBOATFIWAJR-ONEGZZNKSA-N""","""CC(C)=CCc1cc(/C=C/C=O)ccc1O"""
"""Cb6369ea4c391aae71c9f31c1a3fb02c41e087a83""","""XPJVFNVOMZCPBQ-OISHEVBHSA-N""","""CC(=O)O[C@H]1[C@H](O[C@H]2C[C@H]3[C@@H]4CC=C5C[C@@H](O[C@@H]6O[C@H](CO)[C@@H](O)[C@H](O)[C@H]6O)CC[C@]5(C)[C@H]4CC[C@]3(C)[C@@]2(O)[C@H](C)C(=O)CCC(C)C)OC[C@H](O)[C@@H]1O[C@@H]1OC[C@@H](O)[C@H](O)[C@H]1OC(=O)/C=C/c1ccccc1""","""XPJVFNVOMZCPBQ-OISHEVBHSA-N""","""CC(=O)O[C@H]1[C@H](O[C@H]2C[C@H]3[C@@H]4CC=C5C[C@@H](O[C@@H]6O[C@H](CO)[C@@H](O)[C@H](O)[C@H]6O)CC[C@]5(C)[C@H]4CC[C@]3(C)[C@@]2(O)[C@H](C)C(=O)CCC(C)C)OC[C@H](O)[C@@H]1O[C@@H]1OC[C@@H](O)[C@H](O)[C@H]1OC(=O)/C=C/c1ccccc1"""
"""C78c041538f3179c724e08a8ee63a77c758d2fc11""","""CZUWIMDOXXXJRE-SHPISUKRSA-N""","""CC(=O)OC/C=C(\C)CC[C@@H]1C(C)=CC[C@@H]2C(C)(C)C[C@H](O)C[C@@]12C""","""CZUWIMDOXXXJRE-SHPISUKRSA-N""","""CC(=O)OC/C=C(\C)CC[C@@H]1C(C)=CC[C@@H]2C(C)(C)C[C@H](O)C[C@@]12C"""
"""Cfa3f715616d4b8844186eab00c25e4ab8ca51226""","""FVNPLROTBAEWRZ-CMDGGOBGSA-N""","""Cc1cnc(C)c(C=Cc2ccccc2)n1""","""FVNPLROTBAEWRZ-UHFFFAOYSA-N""","""Cc1cnc(C)c(/C=C/c2ccccc2)n1"""
"""C9e93be58a278c4c342c50275e1b7483e68601e96""","""RYAHFDNUSMNPRF-UHFFFAOYSA-N""","""C=C1C=Cc2cc(COC(=O)CCCCCCCC=CCCCCCCCC)ccc2OC1""","""RYAHFDNUSMNPRF-UHFFFAOYSA-N""","""C=C1C=Cc2cc(COC(=O)CCCCCCCC=CCCCCCCCC)ccc2OC1"""


In [50]:
%%time

def check_smiles_in_pubchem(smiles):
    # URL encode the SMILES string
    smiles_encoded = urllib.parse.quote(smiles)
    
    # Use the PubChem API to convert SMILES to a CID (PubChem Compound ID)
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{smiles_encoded}/cids/JSON"
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
        if 'IdentifierList' in data and 'CID' in data['IdentifierList']:
            cids = data['IdentifierList']['CID']
            if cids:
                cid = cids[0]
                #print(f"SMILES found in PubChem with CID: {cid}")
                return cid
            else:
                print("SMILES not found in PubChem.")
                return None
        else:
            print("Unexpected response format.")
            return None
    else:
        #print(f"Error in accessing PubChem API. Status code: {response.status_code}")
        return None

def get_organisms_for_cid(cid):
    # Use the PubChem API to retrieve organisms associated with the CID
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON/?heading=Taxonomy"
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
        organisms = []
        if 'Record' in data and 'Section' in data['Record']:
            for section in data['Record']['Section']:
                if section.get("TOCHeading") == "Taxonomy":
                    for subsection in section.get("Section", []):
                        if subsection.get("TOCHeading") == "Organisms":
                            for info in subsection.get("Information", []):
                                organisms.append(info.get("Name", "Unknown organism"))
        return organisms
    else:
        #print(f"Error in retrieving organism data. Status code: {response.status_code}")
        return []


# compare SMILES_mongo or smiles_lotus
smiles_list = df_starting["SMILES_mongo"].unique().to_list()
print(len(smiles_list))
smiles_list = smiles_list[0:10]
cid_list = list()

# Iterate over the DataFrame rows and update CID
for smiles in smiles_list:
    cid = check_smiles_in_pubchem(smiles)

    if cid == None:
        cid_list.append(None)
    else:
        cid_list.append(cid)

    #print(f"Processed SMILES: {smiles}\n---")

# Print the updated DataFrame
print(f'cid_list [{len(cid_list)}], smiles_list [{len(smiles_list)}]')

# Create a new DataFrame with SMILES and CID
df_smiles_cid = pl.DataFrame({
    "SMILES_mongo": smiles_list,
    "CID": cid_list
})

print(df_smiles_cid)
df_joined = df_starting.join(df_smiles_cid, on="SMILES_mongo", how="left")


147861
cid_list [10], smiles_list [10]
shape: (10, 2)
┌──────────────────────────────────────────────────────────────────────────────────────┬───────────┐
│ SMILES_mongo                                                                         ┆ CID       │
│ ---                                                                                  ┆ ---       │
│ str                                                                                  ┆ i64       │
╞══════════════════════════════════════════════════════════════════════════════════════╪═══════════╡
│ C=C1CC[C@H]2[C@](C)(CO)[C@H](O)CC[C@@]2(C)[C@@H]1CCC1=CCOC1=O                        ┆ 15922991  │
│ CC(C)=CCC[C@@]1(C)Oc2c(O)cc3c(c2C[C@H]1O)O[C@H]1c2ccc(O)cc2OC[C@@H]31                ┆ 162852634 │
│ CC(C)[C@H]1CC[C@@H](C)[C@]2(O)CCC(C(=O)O)=C[C@H]12                                   ┆ 145721093 │
│ O=C(O)/C=C\c1ccc(O[C@@H]2O[C@H](COC(=O)c3cc(O)c(O)c(O)c3)[C@@H](O)[C@H](O)[C@H]2O)cc ┆ null      │
│ 1                                  

## *.parquet file

In [2]:
def check_smiles_in_pubchem(smiles):
    # URL encode the SMILES string
    smiles_encoded = urllib.parse.quote(smiles)

    # Use the PubChem API to convert SMILES to a CID (PubChem Compound ID)
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{smiles_encoded}/cids/JSON"
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
        if 'IdentifierList' in data and 'CID' in data['IdentifierList']:
            cids = data['IdentifierList']['CID']
            if cids:
                cid = cids[0]
                #print(f"SMILES found in PubChem with CID: {cid}")
                return cid
            else:
                print("SMILES not found in PubChem.")
                return None
        else:
            print("Unexpected response format.")
            return None
    else:
        #print(f"Error in accessing PubChem API. Status code: {response.status_code}")
        return None

def get_organisms_for_cid(cid):
    # Use the PubChem API to retrieve organisms associated with the CID
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON/?heading=Taxonomy"
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
        organisms = []
        if 'Record' in data and 'Section' in data['Record']:
            for section in data['Record']['Section']:
                if section.get("TOCHeading") == "Taxonomy":
                    for subsection in section.get("Section", []):
                        if subsection.get("TOCHeading") == "Organisms":
                            for info in subsection.get("Information", []):
                                organisms.append(info.get("Name", "Unknown organism"))
        return organisms
    else:
        #print(f"Error in retrieving organism data. Status code: {response.status_code}")
        return []

"""
>>>>>>> c7e1eb7 (update all)
# connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['compounds']

result = client['lotus_mines_enzymatic']['compounds'].find()

documents_list = list(result)

df_mongo = pl.DataFrame(documents_list)
df_mongo = df_mongo.drop(["Generation", "Expand", "Reactant_in", "Product_of", "Type"])
df_mongo = df_mongo.rename({"_id":"_id_mongo", 
                            "ID":"ID_mongo", 
                            "SMILES":"SMILES_mongo", 
                            "InChI_key":"InChI_key_mongo"})
"""



df_mongo = pl.read_parquet("../data/MINES/mongo_predicted_compounds.parquet")

# compare SMILES_mongo or smiles_lotus
smiles_list = df_mongo["SMILES_mongo"].unique().to_list()
cid_list = list()

print(f'df shape: {df_mongo.shape}  | SMILES list: {len(smiles_list)}')


# Iterate over the DataFrame rows and update CID
for smiles in smiles_list:
    cid = check_smiles_in_pubchem(smiles)

    if cid == None:
        cid_list.append(None)
    else:
        cid_list.append(cid)

# Print the updated DataFrame
print(f'cid_list [{len(cid_list)}], smiles_list [{len(smiles_list)}]')

# Create a new DataFrame with SMILES and CID
df_smiles_cid = pl.DataFrame({
    "SMILES_mongo": smiles_list,
    "CID": cid_list
})


df_joined = df_starting.join(df_smiles_cid, on="SMILES_mongo", how="left")
df_joined.write_parquet("../data/MINES/pubchem_all_compounds.parquet")


df shape: (3284418, 4)  | SMILES list: 3284418


KeyboardInterrupt: 

In [9]:
import pubchempy as pcp
import csv, os
import polars as pl



def is_valid_smiles(smile):
    try:
        # Here you can add specific checks or use third-party libraries for SMILES validation
        return bool(smile.strip())
    except Exception:
        return False


df_mongo = pl.read_parquet("../data/MINES/mongo_predicted_compounds.parquet")
file = "../data/taxonomy/smiles_cid.tsv"
smiles = df_mongo["SMILES_mongo"].unique().to_list()

smiles = smiles[0:50]


# Header to add
header = ['smile', 'cid_compound', 'error']

# Check if the file exists
if not os.path.exists(file):
    # File does not exist, create it and add the header
    with open(file, 'w', newline='') as file:
        writer = csv.writer(file, delimiter='\t')
        writer.writerow(header)
else:
    print(f"The file '{file}' already exists.")
    

total_length = len(smiles)
i = 0

for smile in smiles:
    if is_valid_smiles(smile):
        c = pcp.get_compounds(smile, 'smiles')
        
        for compound in c:
            cid_compound = compound.cid
                    
            if cid_compound == None:
                continue;
            else:
                additional_data = [
                    [smile, cid_compound, False],
                ]
                
                # Write data to a TSV file
                with open(file, 'a', newline='') as file:
                    writer = csv.writer(file, delimiter='\t')
                    writer.writerows(additional_data)


    else:
        print("Invalid SMILES string provided.")

        additional_data = [
            [smile, cid_compound, True],
        ]
        
        with open(file, 'a', newline='') as file:
            writer = csv.writer(file, delimiter='\t')
            writer.writerows(additional_data)

    i = i + 1
    print(f"{i}/{total_length} - {cid_compound}")

The file '../data/taxonomy/smiles_cid.tsv' already exists.
1/50 - None
2/50 - None
3/50 - None
4/50 - None
5/50 - None
6/50 - None
7/50 - None
8/50 - None
9/50 - None
10/50 - None
11/50 - None
12/50 - None
13/50 - None
14/50 - None
15/50 - None
16/50 - None
17/50 - None
18/50 - None
19/50 - None
20/50 - None
21/50 - None
22/50 - None
23/50 - None
24/50 - None
25/50 - None
26/50 - None
27/50 - None
28/50 - None
29/50 - None
30/50 - None
31/50 - None
32/50 - None
33/50 - None
34/50 - None
35/50 - None
36/50 - None
37/50 - None
38/50 - None
39/50 - None
40/50 - None
41/50 - None
42/50 - None
43/50 - None
44/50 - None
45/50 - None
46/50 - None
47/50 - None
48/50 - None
49/50 - None
50/50 - None


## test for pubchempy

In [44]:
import pubchempy as pcp

# Replace 'Aspirin' with the name of your compound
c = pcp.get_compounds('Aspirin', 'name', as_dataframe=False)

for co in c:
    print(co.cid)


c[0].cid, dir(c[0]), repr(c[0]), type(c)


2244


(2244,
 ['__class__',
  '__delattr__',
  '__dict__',
  '__dir__',
  '__doc__',
  '__eq__',
  '__format__',
  '__ge__',
  '__getattribute__',
  '__gt__',
  '__hash__',
  '__init__',
  '__init_subclass__',
  '__le__',
  '__lt__',
  '__module__',
  '__ne__',
  '__new__',
  '__reduce__',
  '__reduce_ex__',
  '__repr__',
  '__setattr__',
  '__sizeof__',
  '__str__',
  '__subclasshook__',
  '__weakref__',
  '_atoms',
  '_bonds',
  '_record',
  '_setup_atoms',
  '_setup_bonds',
  'aids',
  'atom_stereo_count',
  'atoms',
  'bond_stereo_count',
  'bonds',
  'cactvs_fingerprint',
  'canonical_smiles',
  'charge',
  'cid',
  'complexity',
  'conformer_id_3d',
  'conformer_rmsd_3d',
  'coordinate_type',
  'covalent_unit_count',
  'defined_atom_stereo_count',
  'defined_bond_stereo_count',
  'effective_rotor_count_3d',
  'elements',
  'exact_mass',
  'feature_selfoverlap_3d',
  'fingerprint',
  'from_cid',
  'h_bond_acceptor_count',
  'h_bond_donor_count',
  'heavy_atom_count',
  'inchi',
  'inchi