# Search for known compounds

With the database MetaCyc <https://metacyc.org/> the generated "predicted compounds" will be searched.

In [1]:
%%time

import requests
from bs4 import BeautifulSoup
import pymongo
import polars as pl
from tqdm import tqdm

# MongoDB connection
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["lotus_mines_enzymatic"]
compounds_collection = db["compounds"]

# Function to search MetaCyc
def search_metacyc(compound_name):
    search_url = "https://metacyc.org/compound?type=NIL&object="
    response = requests.get(search_url + compound_name)
    
    if response.status_code == 200:
        return response.text
    else:
        return None

# Function to extract SMILES from MetaCyc HTML content
def extract_smiles(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    smiles_tag = soup.find('td', text='SMILES')
    
    if smiles_tag:
        smiles_value = smiles_tag.find_next_sibling('td').text.strip()
        return smiles_value
    else:
        return None

# Function to process each compound
def process_compound(compound):
    compound_type = compound.get("type")
    
    # Only process if the compound's type is "Predicted"
    if compound_type != "Predicted":
        return None
    
    # Search MetaCyc for the compound
    html_content = search_metacyc(compound["name"])
    
    if html_content:
        smiles = extract_smiles(html_content)
        return smiles
    else:
        return None

# Main script
if __name__ == "__main__":
    results = []

    # Retrieve compounds from MongoDB
    compounds = compounds_collection.find({})
    
    # Use tqdm to display a progress bar
    for compound in tqdm(compounds, desc="Processing Compounds", unit=" compound"):
        smiles = process_compound(compound)
        if smiles:
            results.append({
                "Compound_id": compound["_id"],
                "SMILES": smiles,
                "InChI_key": compound["InChI_key"]
            })

Processing Compounds: 3432312compound [00:15, 226615.09compound/s]

CPU times: user 12.1 s, sys: 661 ms, total: 12.8 s
Wall time: 15.4 s





In [5]:
%%time

import requests
from bs4 import BeautifulSoup
import pymongo
import polars as pl
from tqdm import tqdm

# MongoDB connection
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["lotus_mines_enzymatic"]
compounds_collection = db["compounds"]

# Function to search MetaCyc
def search_metacyc(compound_name):
    search_url = "https://metacyc.org/compound?type=NIL&object="
    response = requests.get(search_url + compound_name)
    
    if response.status_code == 200:
        return response.text
    else:
        print(f"Error: Unable to retrieve data (status code {response.status_code}) for {compound_name}")
        return None

# Function to extract SMILES from MetaCyc HTML content
def extract_smiles(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    smiles_tag = soup.find('td', text='SMILES')
    
    if smiles_tag:
        smiles_value = smiles_tag.find_next_sibling('td').text.strip()
        return smiles_value
    else:
        print("SMILES string not found in the HTML content.")
        return None

# Function to process each compound
def process_compound(compound):
    compound_type = compound.get("type")
    
    # Only process if the compound's type is "Predicted"
    if compound_type != "Predicted":
        #print(f"Skipping {compound['InChI_key']} because it is not of type 'Predicted'.")
        return None
    
    # Search MetaCyc for the compound
    html_content = search_metacyc(compound["name"])
    
    if html_content:
        smiles = extract_smiles(html_content)
        return smiles
    else:
        return None

# Main script
if __name__ == "__main__":
    results = []

    # Retrieve compounds from MongoDB
    compounds = compounds_collection.find({})
    
    # Use tqdm to display a progress bar
    for compound in tqdm(compounds, desc="Processing Compounds", unit="compound"):
        smiles = process_compound(compound)
        if smiles:
            results.append({
                "Compound_id": compound["_id"],
                "SMILES": smiles,
                "InChI_key": compound["InChI_key"]
            })

    print(results)

Processing Compounds: 3432312compound [00:15, 225981.74compound/s]

[]
CPU times: user 12.3 s, sys: 276 ms, total: 12.6 s
Wall time: 15.2 s





In [2]:
# Convert results to a Polars DataFrame
df = pl.DataFrame(results)

# Save the DataFrame to a file or further processing
#df.write_csv("metacyc_results.csv")
print(df)

shape: (0, 0)
┌┐
╞╡
└┘


In [None]:
pubchem (look overlap)