# Set Up Environment



In [None]:
!pip install rdkit python-dotenv pandas numpy matplotlib loguru py3dmol

In [2]:
import os
import pickle
from typing import List
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
import requests

from dotenv import load_dotenv

from rdkit import Chem
from rdkit.Chem.QED import qed as rdkit_qed
from rdkit.Chem.QED import qed
from rdkit.Chem import AllChem
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.Chem import Descriptors

from IPython.display import display, clear_output

## API Key and URL

In [None]:
# Read variables from a .env file; the API key is expected there under API_KEY.
load_dotenv()

# Look the key up in the environment
API_KEY = os.getenv("API_KEY")

# Stop right away when the key is missing or empty; otherwise confirm it was found.
if not API_KEY:
    print("NVIDIA API Key not found. Make sure it's set in your .env file or as a system environment variable.")
    raise ValueError("API_KEY is not set. Please create a .env file with your API_KEY.")
print("NVIDIA API Key loaded successfully.")

# --- API Setup ---
# Endpoint that the generation requests are posted to
invoke_url = "https://health.api.nvidia.com/v1/biology/nvidia/molmim/generate"
# Bearer-token authorization; JSON is the accepted response format
headers = dict(
    Authorization=f"Bearer {API_KEY}",
    Accept="application/json",
)
# A single session object shared by the requests made later in the notebook
session = requests.Session()

# Starting Molecule

In [None]:
# Input SMILES string of the starting molecule
smis = "C1=CC2=NON=C2C(=C1NC3=CC(=CC(=C3)Cl)F)[N+](=O)[O-]"

# MolFromSmiles returns None (instead of raising) when the string cannot be parsed,
# so fail here with a clear message rather than inside the QED calculation.
mol = Chem.MolFromSmiles(smis)
if mol is None:
    raise ValueError(f"Could not parse starting SMILES: {smis!r}")

qed_score = rdkit_qed(mol)
print(f"Original QED: {qed_score}")
# Keep the molecule as the cell's last expression so the notebook displays it.
mol

## CMA-ES Optimization

In [6]:
# Collects one summary entry per minimum-similarity setting
results = dict()

# Evenly spaced minimum-similarity thresholds to sweep (both endpoints included)
num_min_sims = 3
min_sims = np.linspace(start=0.1, stop=0.7, num=num_min_sims)

In [7]:
def tanimoto_similarity(smiles, reference: str, radius: int = 2, n_bits: int = 2048) -> float:
    """Return the Tanimoto similarity between two molecules given as SMILES.

    Both molecules are converted to Morgan bit-vector fingerprints using the same
    radius and bit length, and the two fingerprints are compared.

    Args:
        smiles: SMILES string of the molecule to score.
        reference: SMILES string of the reference molecule.
        radius: Morgan fingerprint radius (defaults to 2).
        n_bits: Length of the fingerprint bit vector (defaults to 2048).

    Returns:
        The Tanimoto similarity of the two fingerprints, or 0.0 when ``smiles``
        cannot be parsed.

    Raises:
        ValueError: If ``reference`` cannot be parsed.
    """
    # Handle the reference molecule. MolFromSmiles yields None on a parse failure;
    # report that explicitly instead of passing None to the fingerprint call.
    reference_mol = Chem.MolFromSmiles(reference)
    if reference_mol is None:
        raise ValueError(f"Invalid reference SMILES: {reference!r}")
    reference_fingerprint = GetMorganFingerprintAsBitVect(
        reference_mol, radius=radius, nBits=n_bits
    )

    # An unparsable candidate scores as "no similarity" (float, like the success path)
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 0.0

    fingerprint = GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)

    # Calculate and return the Tanimoto similarity
    return TanimotoSimilarity(fingerprint, reference_fingerprint)

In [None]:
# Loop through each minimum similarity value, request optimized molecules from the
# service, and store a per-threshold summary in `results`.
for min_sim in min_sims:
    # Create the request payload
    payload = {
        "smi": smis,
        "algorithm": "CMA-ES",
        "num_molecules": 10,
        "property_name": "QED",
        "minimize": False,
        # Cast the NumPy scalar to a built-in float before JSON serialization
        "min_similarity": float(min_sim),
        "particles": 20,
        "iterations": 2,
        "scaled_radius": 1,
    }

    # Send the request and get the response. The (connect, read) timeout keeps a
    # stalled connection from hanging the notebook; raise it if requests time out.
    response = session.post(invoke_url, headers=headers, json=payload, timeout=(10, 300))
    response.raise_for_status()
    response_json = response.json()
    print(f"*************** min_sim: {min_sim} ********************")
    print(f"response_json: \n"
          f"{response_json}")

    # Extract the generated SMILES ('molecules' is a string holding a list literal)
    gen_smiles_list = [i['sample'] for i in ast.literal_eval(response_json['molecules'])]
    print(f"gen_smiles_list: \n"
          f"{gen_smiles_list}")

    # Keep only SMILES that parse (MolFromSmiles returns None otherwise)
    valid_mol_list = [mol for smiles in gen_smiles_list if (mol := Chem.MolFromSmiles(smiles)) is not None]
    # Canonicalize and deduplicate. dict.fromkeys keeps first-seen order, so the
    # list is the same from run to run (iteration order of a set is not guaranteed).
    canonical_smiles_list = list(
        dict.fromkeys(Chem.MolToSmiles(mol, canonical=True) for mol in valid_mol_list)
    )
    print(f"canonical_smiles_list: \n"
          f"{canonical_smiles_list}")

    # Calculate Tanimoto similarity and QED score for each unique valid SMILES
    tanimoto_scores = [tanimoto_similarity(smiles, smis) for smiles in canonical_smiles_list]
    qed_scores = [qed(Chem.MolFromSmiles(smiles)) for smiles in canonical_smiles_list]

    # Average the scores; with no valid molecules the mean is undefined, so record
    # NaN explicitly instead of triggering NumPy's empty-mean warning.
    if canonical_smiles_list:
        mean_tanimoto = np.mean(tanimoto_scores)
        mean_qed = np.mean(qed_scores)
    else:
        print(f"No valid molecules returned for min_sim={min_sim}.")
        mean_tanimoto = mean_qed = float("nan")

    # Summary for this min_sim: generated SMILES, their count, and the mean scores
    min_sim_results = {
        'smiles': canonical_smiles_list,
        'num_smiles': len(canonical_smiles_list),
        'tanimoto_similarity': mean_tanimoto,
        'qed_score': mean_qed,
    }

    # Store the results for this min_sim
    results[min_sim] = min_sim_results

In [13]:
def lipinski_properties(smiles):
    """Compute four Lipinski-style descriptors for a SMILES string.

    Returns a tuple ``(mw, logp, hba, hbd)`` taken from RDKit's ``MolWt``,
    ``MolLogP``, ``NumHAcceptors`` and ``NumHDonors`` descriptors, or ``None``
    when the SMILES string cannot be parsed.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        # Invalid SMILES: nothing to compute
        return None

    # Descriptor functions, in the order of the returned tuple
    descriptor_fns = (
        Descriptors.MolWt,
        Descriptors.MolLogP,
        Descriptors.NumHAcceptors,
        Descriptors.NumHDonors,
    )
    return tuple(compute(parsed) for compute in descriptor_fns)

In [None]:

# Create a list to store the data for the report
report_data = []

# Column order of the report; passed to the DataFrame so an empty report still
# has a header row.
report_columns = ['smiles', 'qed_score', 'tanimoto_similarity', 'min_sim', 'mw', 'logp', 'hba', 'hbd']

# Walk every entry stored in `results` so the report covers all min_sim values,
# not just the variables left over from the last optimization-loop iteration.
for sim_threshold, sim_results in results.items():
    for smiles in sim_results['smiles']:
        mol = Chem.MolFromSmiles(smiles)
        qed_score = qed(mol)
        tanimoto = tanimoto_similarity(smiles, smis)

        # Calculate Lipinski properties (None is returned for an unparsable SMILES)
        lipinski = lipinski_properties(smiles)
        if lipinski is not None:
            mw, logp, hba, hbd = lipinski
        else:
            mw, logp, hba, hbd = None, None, None, None

        report_data.append({
            'smiles': smiles,
            'qed_score': qed_score,
            'tanimoto_similarity': tanimoto,
            'min_sim': sim_threshold,
            'mw': mw,
            'logp': logp,
            'hba': hba,
            'hbd': hbd,
        })

# Create a pandas DataFrame from the report data
report_df = pd.DataFrame(report_data, columns=report_columns)

# Display the DataFrame
display(report_df)

# Save the DataFrame as a CSV file
csv_filename = 'min_sim_results_report.csv'
report_df.to_csv(csv_filename, index=False)
print(f"CSV report saved as '{csv_filename}'.")


