# Search for known compounds

The generated "predicted compounds" are searched against the MetaCyc database <https://metacyc.org/>.

In [None]:
import pymongo
import requests
from bs4 import BeautifulSoup
import polars as pl

# Function to search MetaCyc for a compound based on InChIKey or SMILES
def search_metacyc(query, timeout=30):
    """Fetch the MetaCyc compound page for an InChIKey or SMILES query.

    Args:
        query: Identifier string sent as the ``object`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text on HTTP 200; ``None`` when the query is
        empty, the request fails, or any other status code is returned.
    """
    # Without an identifier there is nothing to look up; skip the request.
    if not query:
        return None

    search_url = "https://metacyc.org/compound"
    # Passing the query via ``params`` lets requests percent-encode it.
    # SMILES routinely contain '#' and '+', which would otherwise be read
    # as a URL fragment / an encoded space and corrupt the search.
    params = {"type": "NIL", "object": query}

    try:
        response = requests.get(search_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the "report and return None" contract for network failures
        # so one bad request does not abort a whole batch of lookups.
        print(f"Error: Unable to retrieve data for {query} ({exc})")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Error: Unable to retrieve data for {query} (status code {response.status_code})")
    return None

# Function to extract the relevant MetaCyc link from the HTML
def extract_metacyc_link(html_content):
    """Return an absolute MetaCyc URL for the first link in an HTML page.

    Args:
        html_content: HTML text of a MetaCyc response, or ``None``/empty.

    Returns:
        The ``href`` of the first ``<a>`` tag that has one, resolved against
        ``https://metacyc.org``; ``None`` if there is no content or no such
        tag.
    """
    from urllib.parse import urljoin

    # Nothing to parse: behave like a page without links.
    if not html_content:
        return None

    soup = BeautifulSoup(html_content, 'html.parser')
    # NOTE(review): this picks the first anchor in document order; confirm
    # that on MetaCyc result pages this is the compound link and not a
    # navigation element.
    link_tag = soup.find('a', href=True)
    if link_tag is None:
        return None

    # urljoin leaves absolute hrefs untouched and inserts the missing '/'
    # for relative ones, where plain concatenation produced a broken URL.
    return urljoin("https://metacyc.org", link_tag['href'])

# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["lotus_mines_enzymatic"]
compounds_collection = db["compounds"]

# Parallel result lists: index i of each list refers to the same compound.
# (Presumably consumed later to build a Polars DataFrame - not shown here.)
compound_ids = []
smiles_links = []
inchi_links = []

# Look up every compound in the MongoDB collection on MetaCyc
for compound in compounds_collection.find():
    compound_id = compound.get('_id')
    inchi_key = compound.get('InChI_key')
    smiles = compound.get('SMILES')

    # A document may lack either identifier; only query MetaCyc for the
    # ones that are present so a missing field yields None, not a crash.
    inchi_html = search_metacyc(inchi_key) if inchi_key else None
    inchi_link = extract_metacyc_link(inchi_html) if inchi_html else None

    smiles_html = search_metacyc(smiles) if smiles else None
    smiles_link = extract_metacyc_link(smiles_html) if smiles_html else None

    # Always append to all three lists to keep them aligned.
    compound_ids.append(compound_id)
    smiles_links.append(smiles_link)
    inchi_links.append(inchi_link)

# The three lists always have the same length, so report the number of
# actual hits: total compounds | SMILES matches | InChIKey matches.
smiles_hits = sum(link is not None for link in smiles_links)
inchi_hits = sum(link is not None for link in inchi_links)
print(f"annotated compounds in metacyc: {len(compound_ids)} | {smiles_hits} | {inchi_hits}")