In [1]:
from langchain import OpenAI
from langchain.agents import Tool, tool
from langchain import agents
import langchain
import os
import warnings
import numpy as np
warnings.filterwarnings('ignore')
import sys
import dziner

langchain.debug = False
# from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI

from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
Embedding_model = 'text-embedding-3-large' 

from langchain.text_splitter import CharacterTextSplitter

In [2]:
import dziner.sascorer
from rdkit import Chem
from rdkit.Chem import QED
from rdkit.Chem import Descriptors

@tool
def check_validity(smiles: str):
    '''
    This tool inputs a SMILES and outputs chemical feasibility, SA and QED scores, molecular weight and the smiles
    '''
    # NOTE: `@tool` exposes the docstring above to the agent as the tool
    # description, so it is kept verbatim; maintainer notes live in comments.
    #
    # Returns a 5-tuple (status, sa_score, qed_score, molecular_weight, smiles):
    #   * status is "Valid SMILES" or "Invalid SMILES";
    #   * the three numeric fields are 0 when the SMILES is invalid;
    #   * smiles is the cleaned-up input string that was actually evaluated.

    # Agent-produced inputs may carry newlines, quotes or padding around the
    # SMILES; none of these characters are part of the SMILES itself.
    smiles = smiles.replace("\n", "").replace("'", "").replace('"', "").strip()

    # RDKit parses an empty string into a molecule with zero atoms instead of
    # returning None, so reject empty input and empty molecules explicitly
    # before they reach the scoring functions.
    mol = Chem.MolFromSmiles(smiles) if smiles else None
    if mol is None or mol.GetNumAtoms() == 0:
        return "Invalid SMILES", 0, 0, 0, smiles

    # Synthetic accessibility score from the sascorer module bundled with dziner
    sa_score = dziner.sascorer.calculateScore(mol)

    # Drug-likeness (QED) score and molecular weight from RDKit
    qed_score = QED.qed(mol)
    molecular_weight = Descriptors.MolWt(mol)

    return "Valid SMILES", sa_score, qed_score, molecular_weight, smiles

# Example usage: aspirin, which is expected to be reported as a valid SMILES
# together with its SA score, QED score and molecular weight.
test_smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"
check_validity(test_smiles)

('Valid SMILES',
 1.580039750008826,
 0.5501217966938848,
 180.15899999999996,
 'CC(=O)OC1=CC=CC=C1C(=O)O')

In [3]:
## This model is based on this paper: https://pubs.acs.org/doi/10.1021/acs.jcim.1c01334
## and comes from this repository:
## https://github.com/dockstring/dockstring/tree/main
## Also see https://github.com/dockstring/dockstring/blob/main/dockstring/target.py#L160

from dockstring import load_target

@tool
def predict_docking_score(smiles):
    '''
    This tool predicts the docking score to WDR5 for a SMILES molecule. Lower docking score means a larger binding affinity.
    This model based on Autodock Vina from this paper: https://pubs.acs.org/doi/10.1021/acs.jcim.1c01334 
    '''
    # NOTE: `@tool` exposes the docstring above to the agent as the tool
    # description, so it is kept verbatim; maintainer notes live in comments.
    #
    # Returns the docking score reported by dockstring for the WDR5 target, or
    # the string 'Invalid molecule' when docking fails.

    # Drop newlines and stray single quotes that may surround the SMILES.
    smiles = smiles.replace("\n", "").replace("'", "")
    target = load_target("WDR5")
    try:
        # dock() returns (score, affinities); only the score is used here.
        score, _ = target.dock(smiles)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long docking run instead
        # of being reported to the agent as an invalid molecule.
        score = 'Invalid molecule'
    return score

# import concurrent.futures
# from typing import List

# @tool
# def predict_docking_score(smiles_list: str):
#     '''
#     This tool predicts the docking score to DRD2 for SMILES molecules. Lower docking score means a larger binding affinity.
#     This model based on Autodock Vina from this paper: https://pubs.acs.org/doi/10.1021/acs.jcim.1c01334 
#     '''
#     smiles_list = smiles_list.split(', ')
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         future_to_smiles = {executor.submit(_get_docking_score, smiles): smiles for smiles in smiles_list}
#         docking_scores = []
#         for future in concurrent.futures.as_completed(future_to_smiles):
#             smiles = future_to_smiles[future]
#             try:
#                 score = future.result()
#             except Exception as e:
#                 score = 'Error occurred'
#             docking_scores.append(f'SMILES: {smiles}, Docking Score: {score}')
#     return docking_scores

# predict_docking_score("CC(=O)OC1=CC=CC=C1C(=O)O, CC(=O)OC1=CC=CC=C1C(=O)O")
# _get_docking_score("CC(=O)OC1=CC=CC=C1C(=O)O")

In [4]:
# Dock the starting molecule (the same SMILES is assigned to `initial_molecule`
# further down) to get a baseline docking score.
predict_docking_score("CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C([N+]([O-])=O)C=C2)CC1")

-6.7

In [5]:
from langchain.memory import ConversationBufferMemory

# Fixed question that lookup_papers() sends to the retrieval-QA helper for each
# paper; it is also passed to the agent under the "prompt" key of `input_data`.
# The text is part of the prompt sent to the model, so it is left untouched.
RetrievalQA_prompt = """What are the design guidelines for scaffold hopping for making a molecule have a larger binding afinity against WDR5 target? 
    This can be based on making changes to the functional groups or other changes to the molecule. Summarize your answer and cite paper
    by the title, DOI and the journal and year the document was published on."""

@tool
def lookup_papers(prompt):
    '''Useful for getting chemical intuition for the problem.
    This tool looks up design guidelines for molecules with higher binding afinity against WDR5 by looking through research papers.
    It also includes information on the paper citation or DOI.
    '''
    # NOTE: `@tool` exposes the docstring above to the agent as the tool
    # description, so it is kept verbatim. The `prompt` argument is not used:
    # every paper is queried with the module-level `RetrievalQA_prompt`.

    def _guidelines_from_paper(paper_index):
        # Split one PDF into ~500-character chunks, index them in FAISS and
        # ask the retrieval-QA helper for the design guidelines it contains.
        splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=20)
        pdf_path = f'../data/papers/Binding/WDR5/{paper_index}.pdf'
        chunks = splitter.split_documents(PyPDFLoader(pdf_path).load_and_split())
        vectorstore = FAISS.from_documents(
            chunks, OpenAIEmbeddings(model=Embedding_model))
        chat_model = ChatOpenAI(model_name='gpt-4o', temperature=0.1)
        return dziner.RetrievalQABypassTokenLimit(
            vectorstore, RetrievalQA_prompt, chat_model)

    # Only paper 0 is read at the moment (range(1)); answers are space-joined.
    return " ".join([_guidelines_from_paper(index) for index in range(1)])


# guidel_lines = lookup_papers("")
# guidel_lines

## TODO PAINS filters

In [6]:
import sys
from rdkit import Chem
from rdkit.Chem import FilterCatalog
import pandas as pd

# Structural-alert catalog, built on first use and then reused by every call.
_FILTER_CATALOG = None


def _get_filter_catalog():
    """Return the shared RDKit filter catalog, building it on the first call."""
    global _FILTER_CATALOG
    if _FILTER_CATALOG is None:
        catalog_ids = FilterCatalog.FilterCatalogParams.FilterCatalogs
        params = FilterCatalog.FilterCatalogParams()
        # Same catalogs, in the same order, as the original implementation.
        # NOTE(review): PAINS presumably already covers PAINS_B and PAINS_C;
        # confirm against the RDKit docs before dropping the extra entries.
        for catalog_id in (catalog_ids.PAINS, catalog_ids.PAINS_B,
                           catalog_ids.PAINS_C, catalog_ids.BRENK,
                           catalog_ids.NIH, catalog_ids.ZINC):
            params.AddCatalog(catalog_id)
        _FILTER_CATALOG = FilterCatalog.FilterCatalog(params)
    return _FILTER_CATALOG


def check_filters(smiles, all_matches=False):
    """Screen a SMILES string against the PAINS/BRENK/NIH/ZINC filter catalogs.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to screen.
    all_matches : bool, optional
        When False (default) only the first matching filter is reported, as
        before. When True every distinct matching filter is reported.

    Returns
    -------
    str or list of dict
        The string "Invalid SMILES" when RDKit cannot parse the input;
        otherwise a list (empty when nothing matched) of dicts with the keys
        "description" and "Scope" taken from each matching catalog entry.

    A warning line is written to stderr for every reported match.
    """
    # Convert SMILES to molecule
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return "Invalid SMILES"

    catalog = _get_filter_catalog()

    results = []
    if not catalog.HasMatch(mol):
        return results

    if all_matches:
        entries = list(catalog.GetMatches(mol))
    else:
        first_entry = catalog.GetFirstMatch(mol)
        entries = [first_entry] if first_entry else []

    # The same alert can be listed more than once if catalogs overlap, so each
    # (description, scope) pair is reported only once.
    seen = set()
    for entry in entries:
        key = (entry.GetDescription(), entry.GetProp('Scope'))
        if key in seen:
            continue
        seen.add(key)
        results.append({
            "description": key[0],
            "Scope": key[1],
        })
        print(f"Warning: molecule failed filter: reason {entry.GetDescription()}", file=sys.stderr)

    return results

# Example usage
smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"  # Example SMILES string for aspirin
results = check_filters(smiles)
results

# # Convert results to a DataFrame for easier viewing
# df = pd.DataFrame(results)
# if not df.empty:
#     print(df)
# else:
#     print("No filters matched the given SMILES.")

# df

# # If you want to display the results in a more user-friendly format
# # import ace_tools as tools
# # tools.display_dataframe_to_user(name="Filter Matches", dataframe=df)


<rdkit.Chem.rdfiltercatalog.FilterCatalogEntryList object at 0x7efc727270c0>




[{'description': 'phenol_ester',
  'Scope': 'unwanted functionality due to potential tox reasons or unfavourable pharmacokinetic properties'}]

In [7]:
# from rdkit.Chem import FilterCatalog
# params = FilterCatalog.FilterCatalogParams()
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS_A)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS_B)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS_C)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.BRENK)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.NIH)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.ZINC)
# catalog = FilterCatalog.FilterCatalog(params)

# matches = []
# clean = []
# for s in list(df['SMILES']):
#     molecule = Chem.MolFromSmiles(s)
#     entry = catalog.GetMatches(molecule) #Get all the matches/PAINs
#     if entry:
#         groups = []
#         pains = []
#         for i in entry:
#             print(i)
#             pains.append(i.GetDescription().capitalize())
#             groups.append(i.GetProp('Scope'))
#         matches.append({
#             "smiles": s,
#             "pains": pains,
#             "group": groups
#             })
#     else:
#         # collect indices of molecules without PAINS
#         clean.append(index)

# matches = pd.DataFrame(matches)


# def filter_func(row):
#     return len(row['group']) == 1

# filtered_df = matches[matches.apply(filter_func, axis=1)]
# filtered_df

In [8]:
# smiles = "O=C(C)Oc1ccccc1C(=O)O"
# mol = Chem.MolFromSmiles(smiles)

# params = FilterCatalog.FilterCatalogParams()
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS_A)
# # params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS_B)
# # params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS_C)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.BRENK)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.NIH)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.ZINC)
# catalog = FilterCatalog.FilterCatalog(params)
# entry = catalog.GetFirstMatch(mol)

In [9]:
# Tools the agent is allowed to call. check_filters is not a @tool and is not
# part of this list.
tools = [lookup_papers, check_validity, predict_docking_score]

# Names and descriptions of the tools, passed along with the agent input below.
tool_names = [tool.name for tool in tools]  
tool_desc = [tool.description for tool in tools]

# initial_molecule = "c1ccc2c(cc(s2)C(=O)N)c1" # "CC(=O)OC1=CC=CC=C1C(=O)O" # this is Aspirin
# Starting molecule for the optimisation and the task statement given to the agent.
initial_molecule = "CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C([N+]([O-])=O)C=C2)CC1"
Human_prompt = f"Make changes to {initial_molecule} so it will have a larger binding affinity against WDR5 target. Suggest 20 new candidates."

# Payload handed to agent.invoke(); the SUFFIX template consumes the "input" key.
# NOTE(review): whether the agent reads the remaining keys is not visible here - confirm.
input_data = {
            "input": Human_prompt,
            "prompt": RetrievalQA_prompt,
            "tools": tools,
            "tool_names": tool_names,
            "tool_desc": tool_desc
        }

In [15]:
from dziner.agents import dZiner
from dziner.prompts import PREFIX



# Format instructions passed to dZiner as `format_instructions`. The text is
# part of the prompt sent to the model, so it is left untouched here.
# `{tool_names}` is a template placeholder (presumably filled in by the agent
# framework - confirm).
# NOTE(review): the example final answer below is not strict JSON (no enclosing
# braces, unquoted "Modification" value), while a later cell parses the answer
# with json.loads. If the example is tightened, literal braces may need escaping
# because the string is used as a template - confirm before changing.
FORMAT_INSTRUCTIONS = """
Use the following format:

    Thought: you should always think about what to do and never leave this empty
    Action: the action to take, should be one of the tools [{tool_names}]
    Action Input: the input to the action
    Observation: the result of the action
    ... (this Thought/Action/Action Input/Observation can repeat N times)
    You don't always need to use a tool if you know the answer.
    Thought: I know the final answer (This is after you have completed 20 iterations)
    Final Answer: the final answer to the input question.
    The Final Answer MUST come in JSON format with NO extra text.
    
    Example final answer format:

     "Iteration": 0,
      "SMILES": "CC(=O)OC1=CC=C(C=C1)C(=O)N2CCN(CC2)CC3=CC=CC=C3I",
      "Modification": The change made to the SMILES in the previous iteration and the detailed reasoning behind it.
      "Docking Score": -9.8,
      "SA Score": 2.01,
      "QED Score": 0.40,
      "Molecular Weight": 150,
      "Validity": "Valid"

    Report all iterations, even if the SMILES is invalid. An iteration includes suggesting a new SMILES, evaluating its docking score, and determining its validity.
    
    Use the exact SMILES without any newlines or extra parentheses. Do not repeat SMILES that already exist in the iteration history.
    
    You have a final answer once 20 new SMILES candidates are generated and evaluated. 
       

"""


# Prompt suffix passed to dZiner as `suffix`: the step-by-step workflow the
# agent is asked to follow. `{input}` and `{agent_scratchpad}` are template
# placeholders. The text is part of the prompt sent to the model, so it is
# left untouched here.
SUFFIX = """Start by:

1. Researching Design Guidelines: Summarize the guidelines in a few bullet points, with citations (e.g., paper DOI) included for any design guidelines used.

2. Evaluating Initial SMILES:

Binding Affinity: Assess the binding affinity of the initial SMILES.
Validity: Check the validity of the initial SMILES structure.
3. Modifying SMILES:

Manual Adjustments: Based on the guidelines from Step 1, make manual changes to the SMILES. These changes may involve adding, removing, or replacing elements, functional groups, or rings in the molecule’s core.
Example: For a functionalized indole, try replacing the indole with structures like isoindole, indolizine, methylindoles, indolone, or methylenedioxybenzene.
Diverse Approaches: Apply different modifications that could enhance affinity without significantly increasing molecular size/weight.
Alignment with Guidelines: Justify in great details how each new SMILES aligns with the design guidelines, such as adding aromatic rings if suggested.
4. Evaluating Modified SMILES:

Binding Affinity: Assess the binding affinity of each new SMILES.
Validity: Ensure each new SMILES is valid.
5. Selecting the Best Design:

Choose the SMILES with the highest binding affinity.
If no improvement is seen, revert to the previous valid SMILES with the best binding affinity and retry Steps 3–5.
Iteration Details:

Iteration 0: Use the initial input SMILES.
Iteration Limit: MUST only stop after generating 20 new SMILES candidates (20 SMILES MUST be in your final answer).
Molecular Weight: Prioritize changes that keep the molecular weight below 600 g/mol. Do not use the same SMILEs you have evaluated before in a new iteration.
Final Steps:

Document All Changes: Even if reverted, all changes must be documented and reported.
Avoid Redundancy: Do not repeat SMILES in the iteration history.
Begin with Problem Description:
Question: {input}
Thought: {agent_scratchpad}
"""

# Build the dZiner wrapper around the tools and prompts defined above and keep
# its `.agent` attribute, which is what gets invoked later.
# NOTE(review): n_design_iterations=25 here, while FORMAT_INSTRUCTIONS and
# SUFFIX both ask for 20 candidates - confirm which number is intended.
agent = dZiner(tools, property="Binding affinity",
               model='gpt-4o', verbose=True, temp=0.4,
              suffix=SUFFIX, format_instructions=FORMAT_INSTRUCTIONS,
              max_tokens=8192 , max_iterations=120, n_design_iterations=25).agent


# No call parentheses: the cell displays the bound method's repr (which embeds
# the executor and its prompt template) instead of calling get_prompts().
agent.get_prompts


<bound method Runnable.get_prompts of AgentExecutor(memory=ConversationBufferMemory(output_key='output', input_key='input', memory_key='chat_history'), verbose=True, tags=['zero-shot-react-description'], agent=ZeroShotAgent(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['agent_scratchpad', 'input'], template='You are a helpful Chemist AI assistant called dZiner. You are tasked to make changes to a molecule based on some design guidelines\n    and optimize the its Binding affinity.\n    Always use the tools to get chemical intuition learn about the design guidelines. If you don\'t know, do not make up answers.\n    Explain changes you make to the molecule in details at each step but do not put redundant information and make it short.\n    \n\nlookup_papers(prompt) - Useful for getting chemical intuition for the problem.\n    This tool looks up design guidelines for molecules with higher binding afinity against WDR5 by looking through research papers.\n    It also includes inf

In [16]:
from langchain_community.callbacks import get_openai_callback

import contextlib
import sys

langchain.debug = False
# Run index embedded in the output file names (log file here, CSV in a later
# cell); presumably bumped by hand between runs.
test = 4
# Log file that receives a copy of everything printed during the agent run.
output_file_path = f"results/1.binding-WDR5/new_candidate/WDR5-20-it({test})_log.txt"

class Tee:
    """Minimal write-only stream that duplicates output to several streams."""

    def __init__(self, *streams):
        # Positional arguments are kept as a tuple, in the order given.
        self.streams = streams

    def write(self, data):
        """Write `data` to each stream in turn, flushing it right after its write."""
        for sink in self.streams:
            sink.write(data)
            sink.flush()

    def flush(self):
        """Flush every underlying stream, in order."""
        for sink in self.streams:
            sink.flush()

# Run the agent while mirroring everything it prints to both the notebook and
# the log file. The `with` block guarantees the log file is closed even if the
# agent run raises part-way through (the original open()/close() pair leaked
# the handle in that case).
with open(output_file_path, 'w') as f:
    tee = Tee(sys.stdout, f)
    with contextlib.redirect_stdout(tee):
        # `cb` collects the OpenAI usage statistics for the run.
        with get_openai_callback() as cb:
            response = agent.invoke(input_data)

# Printed after stdout is restored, so the usage summary is not written to the log.
print(cb)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To begin, I need to research design guidelines for optimizing binding affinity against WDR5. This will provide me with the necessary chemical intuition to make informed modifications to the initial SMILES.

Action: lookup_papers(prompt)
Action Input: "design guidelines for molecules with higher binding affinity against WDR5"[0m
Observation: lookup_papers(prompt) is not a valid tool, try one of [lookup_papers, check_validity, predict_docking_score].
Thought:[32;1m[1;3mThought: I need to use the correct tool to look up design guidelines for molecules with higher binding affinity against WDR5.

Action: lookup_papers
Action Input: "design guidelines for molecules with higher binding affinity against WDR5"[0m
Observation: [36;1m[1;3mTo design a molecule with a larger binding affinity against the WDR5 target through scaffold hopping, the following guidelines can be considered:

1. **Hydrophobic Interactions**: Introd

[12:38:07] UFFTYPER: Unrecognized atom type: S_5+4 (35)



Observation: [38;5;200m[1;3m-8.9[0m
Thought:[32;1m[1;3mThe modified SMILES in Iteration 7 has a docking score of -8.9, which is worse than the previous iteration's docking score of -9.3. I will revert to the previous valid SMILES with the best docking score and try a different modification.

### Iteration 8:
**Modification**: Introduce another hydrophobic group to enhance interactions.
**Change**: Replace the chlorine atom with a trifluoromethyl group on the benzene ring.

Action: check_validity
Action Input: "CN1CCN(C2=C(NC(C3=CC=CC=C3C(F)(F)F)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CC=CC=C5C=C4"[0m
Observation: [33;1m[1;3m('Valid SMILES', 3.148081608740302, 0.2000323069418505, 589.6620000000004, 'CN1CCN(C2=C(NC(C3=CC=CC=C3C(F)(F)F)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CC=CC=C5C=C4')[0m
Thought:[32;1m[1;3mThe modified SMILES in Iteration 8 is valid with a synthetic accessibility (SA) score of 3.15, a quantitative estimate of drug-likeness (QED) score of 0.20, and a molecular weight of

[12:39:41] Explicit valence for atom # 37 O, 3, is greater than permitted


[32;1m[1;3mThe SMILES in Iteration 10 is invalid. I will revert to the previous valid SMILES with the best docking score and try a different modification.

### Iteration 11:
**Modification**: Introduce another hydrophobic group to enhance interactions.
**Change**: Replace the phenyl group with a benzothiazole ring.

Action: check_validity
Action Input: "CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CC=CS5C=C4"[0m
Observation: [33;1m[1;3m('Valid SMILES', 4.269055595727094, 0.2468619809982658, 564.1550000000003, 'CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CC=CS5C=C4')[0m
Thought:[32;1m[1;3mThe modified SMILES in Iteration 11 is valid with a synthetic accessibility (SA) score of 4.27, a quantitative estimate of drug-likeness (QED) score of 0.25, and a molecular weight of 564.16 g/mol. Next, I will evaluate its binding affinity using the docking score.

Action: predict_docking_score
Action Input: "CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C

[12:39:47] UFFTYPER: Unrecognized atom type: S_5+4 (35)



Observation: [38;5;200m[1;3m-8.9[0m
Thought:[32;1m[1;3m### Iteration 11:
The modified SMILES in Iteration 11 has a docking score of -8.9, which is worse than the previous iteration's docking score of -9.3. I will revert to the previous valid SMILES with the best docking score and try a different modification.

### Iteration 12:
**Modification**: Introduce another hydrophobic group to enhance interactions.
**Change**: Replace the phenyl group with a benzofuran ring.

Action: check_validity
Action Input: "CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CC=CO5C=C4"[0m
Observation: [33;1m[1;3m('Invalid SMILES', 0, 0, 0, 'CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CC=CO5C=C4')[0m
Thought:

[12:40:16] Explicit valence for atom # 36 O, 3, is greater than permitted


[32;1m[1;3mThe SMILES in Iteration 12 is invalid. I will revert to the previous valid SMILES with the best docking score and try a different modification.

### Iteration 13:
**Modification**: Introduce another hydrophobic group to enhance interactions.
**Change**: Replace the phenyl group with a benzoxazole ring.

Action: check_validity
Action Input: "CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CC=NO5C=C4"[0m
Observation: [33;1m[1;3m('Invalid SMILES', 0, 0, 0, 'CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CC=NO5C=C4')[0m
Thought:

[12:40:19] Explicit valence for atom # 36 O, 3, is greater than permitted


[32;1m[1;3mThe SMILES in Iteration 13 is invalid. I will revert to the previous valid SMILES with the best docking score and try a different modification.

### Iteration 14:
**Modification**: Introduce another hydrophobic group to enhance interactions.
**Change**: Replace the phenyl group with a benzoxazole ring.

Action: check_validity
Action Input: "CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CC=NO5C=C4"[0m
Observation: [33;1m[1;3m('Invalid SMILES', 0, 0, 0, 'CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CC=NO5C=C4')[0m
Thought:

[12:40:21] Explicit valence for atom # 36 O, 3, is greater than permitted


[32;1m[1;3mThe SMILES in Iteration 14 is invalid. I will revert to the previous valid SMILES with the best docking score and try a different modification.

### Iteration 15:
**Modification**: Introduce another hydrophobic group to enhance interactions.
**Change**: Replace the phenyl group with a benzotriazole ring.

Action: check_validity
Action Input: "CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=NNN=C5C=C4"[0m
Observation: [33;1m[1;3m('Valid SMILES', 3.214020445825822, 0.23289867751137214, 547.0630000000003, 'CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=NNN=C5C=C4')[0m
Thought:[32;1m[1;3mThe modified SMILES in Iteration 15 is valid with a synthetic accessibility (SA) score of 3.21, a quantitative estimate of drug-likeness (QED) score of 0.23, and a molecular weight of 547.06 g/mol. Next, I will evaluate its binding affinity using the docking score.

Action: predict_docking_score
Action Input: "CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=

[12:40:56] UFFTYPER: Unrecognized atom type: S_5+4 (35)



Observation: [38;5;200m[1;3m-8.9[0m
Thought:[32;1m[1;3mThe modified SMILES in Iteration 16 has a docking score of -8.9, which is worse than the previous iteration's docking score of -9.3. I will revert to the previous valid SMILES with the best docking score and try a different modification.

### Iteration 17:
**Modification**: Introduce another hydrophobic group to enhance interactions.
**Change**: Replace the phenyl group with a benzothiophene ring.

Action: check_validity
Action Input: "CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CC=CC=C5S4"[0m
Observation: [33;1m[1;3m('Valid SMILES', 3.1388914623093367, 0.21876033234480305, 562.1390000000004, 'CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CC=CC=C5S4')[0m
Thought:[32;1m[1;3mThe modified SMILES in Iteration 17 is valid with a synthetic accessibility (SA) score of 3.14, a quantitative estimate of drug-likeness (QED) score of 0.22, and a molecular weight of 562.14 g/mol. Next, I will evalua

[12:42:08] Explicit valence for atom # 37 O, 3, is greater than permitted


[32;1m[1;3m### Iteration 18:
The SMILES in Iteration 18 is invalid. I will revert to the previous valid SMILES with the best docking score and try a different modification.

### Iteration 19:
**Modification**: Introduce another hydrophobic group to enhance interactions.
**Change**: Replace the phenyl group with a benzodioxane ring.

Action: check_validity
Action Input: "CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CCOCC5C=C4"[0m
Observation: [33;1m[1;3m('Valid SMILES', 3.9355834038199182, 0.39096641274251714, 562.1140000000004, 'CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O)C=C2)CC1C4=CC5=CCOCC5C=C4')[0m
Thought:[32;1m[1;3m### Iteration 19:
The modified SMILES in Iteration 19 is valid with a synthetic accessibility (SA) score of 3.94, a quantitative estimate of drug-likeness (QED) score of 0.39, and a molecular weight of 562.11 g/mol. Next, I will evaluate its binding affinity using the docking score.

Action: predict_docking_score
Action Input: "CN1CCN(C2=C(N

In [18]:
import json
import pandas as pd

# The agent's final answer is expected to be JSON, usually wrapped in a
# Markdown code fence (```json ... ```). Extract the fenced payload when a
# fence is present instead of blindly chopping the last three characters,
# which corrupted the JSON whenever the answer had no closing fence or ended
# with trailing whitespace.
output_str = response['output'].strip()
fence_start = output_str.find("```")
if fence_start != -1:
    fenced = output_str[fence_start + 3:]
    # Skip the optional language tag that follows the opening fence.
    if fenced.startswith("json"):
        fenced = fenced[len("json"):]
    fence_end = fenced.rfind("```")
    if fence_end != -1:
        fenced = fenced[:fence_end]
    output_str = fenced.strip()

# Parse the answer and show it as a table (one row per iteration).
output_dict = json.loads(output_str)
df = pd.DataFrame(output_dict)
df

Unnamed: 0,Iteration,SMILES,Modification,Docking Score,SA Score,QED Score,Molecular Weight,Validity
0,0,CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C([N+]([O-])...,Initial SMILES,-6.7,1.97,0.66,374.83,Valid
1,1,CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C([N+]([O-])...,Added a phenyl group to the nitrogen of the pi...,-7.9,2.67,0.44,450.93,Valid
2,2,CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O...,Replaced the nitro group with a 4-aminobutanam...,-8.4,2.83,0.36,506.05,Valid
3,3,CN(CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O...,Replaced the piperazine ring with an ethylened...,-7.4,2.47,0.28,508.07,Valid
4,4,CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O...,Replaced the phenyl group with a naphthalene g...,-9.3,2.98,0.22,556.11,Valid
5,5,CN1CCN(C2=C(NC(C3=CC=CC=C3F)=O)C=C(NCCCC(N)=O)...,Replaced the chlorine atom with a fluorine ato...,-8.7,2.99,0.25,539.66,Valid
6,6,CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O...,Replaced the ethylenediamine linker with a pro...,-9.0,3.25,0.28,572.15,Valid
7,7,CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O...,Replaced the phenyl group with a thiophene gro...,-8.9,4.27,0.25,564.16,Valid
8,8,CN1CCN(C2=C(NC(C3=CC=CC=C3C(F)(F)F)=O)C=C(NCCC...,Replaced the chlorine atom with a trifluoromet...,-8.8,3.15,0.2,589.66,Valid
9,9,CN1CCN(C2=C(NC(C3=CC=CC=C3Cl)=O)C=C(NCCCC(N)=O...,Replaced the phenyl group with a pyridine ring...,-9.2,3.31,0.26,545.09,Valid


In [20]:
# Save the parsed iteration table as CSV; `test` is the same run index that is
# used in the log file name.
df.to_csv(f'results/1.binding-WDR5/new_candidate/WDR5-20-it({test}).csv', index=False)

In [None]:
# from rdkit import Chem
# from rdkit.Chem import AllChem
# import logging

# @tool
# def check_rdkit_sanitization(SMILES: str) -> str:
#     '''
#     This tool checks if the SMILES string can be sanitized by RDKit, indicating it is chemically feasible.
#     '''
#     logging.info(f"Starting RDKit sanitization check for SMILES: {SMILES}")
#     try:
#         mol = Chem.MolFromSmiles(SMILES)
#         if mol is None:
#             result = "Invalid", SMILES
#         else:
#             # Attempt sanitization
#             Chem.SanitizeMol(mol)
#             result = "Valid", SMILES
#     except Exception as e:
#         result = "Invalid", SMILES
#     logging.info(f"Finished RDKit sanitization check for SMILES: {SMILES}")
#     return result

# check_rdkit_sanitization("CC(=O)OC1=CC=CC=C1C(=O)N2CCNCC2")

In [16]:
# import matplotlib.pyplot as plt

# iterations = [item['iteration'] for item in iteration_data]
# mean_scores = [item['data']['Docking Score'] for item in iteration_data]
# # std_band_gaps = [item['data']['STD CMC'] for item in iteration_data]
# smiles_labels = [item['data']['SMILES'] for item in iteration_data]
# chemical_feasibilities = [item['data']['Chemical Feasibility'] for item in iteration_data]

# plt.figure(dpi=400)

# colors = ['C2' if feas == 'Valid' else 'C0' for feas in chemical_feasibilities]

# # Plot with error bars and color-coded points
# plt.figure(dpi=400)
# for i, (x, y, yerr, color) in enumerate(zip(iterations, mean_scores, np.zeros(len(mean_scores)), colors)):
#     plt.errorbar(x, y, yerr=yerr, fmt='o', color=color, ecolor='k', capsize=3,
#                  linestyle='None', alpha=0.6, markersize=10, label=chemical_feasibilities[i])



# vertical_offsets = [(-10 + 5 * (i % 2)) for i in range(len(iterations))]  # Alternating vertical offsets

# for x, y, label, offset in zip(iterations, mean_scores, smiles_labels, vertical_offsets):
#     plt.text(x-0.2, y + offset/100, label, verticalalignment='bottom', horizontalalignment='right', fontsize=4, color='C0', rotation=90)

# ax=plt.gca()
# handles, labels = ax.get_legend_handles_labels()
# by_label = dict(zip(labels, handles))

# plt.legend(by_label.values(), by_label.keys(), loc="lower left")

# plt.xlabel('Agent Iterations')
# plt.xlim([-0.5,7])
# plt.ylim([-11, -5])
# plt.ylabel('Docking Score')

# # plt.savefig(f"../results/{iteration_data[0]['data']['SMILES']}_guided.png")

In [None]:
## Plotting sequentially for presentation purposes

# from rdkit import Chem
# from rdkit.Chem import Draw
# import matplotlib.pyplot as plt



# from rdkit.Chem import Draw
# from PIL import Image

# def visualize_smiles(smiles, filename):
#     # Convert SMILES to molecule
#     mol = Chem.MolFromSmiles(smiles)
    
#     # Generate image of the molecule with RDKit and a transparent background
#     img = Draw.MolToImage(mol, size=(1000, 1000), kekulize=True, wedgeBonds=True, options=Draw.DrawingOptions())
    
#     # Convert the image to have a transparent background
#     img = img.convert("RGBA")
#     datas = img.getdata()
#     newData = []
#     for item in datas:
#         # Change all white (also shades of whites)
#         # to transparent
#         if item[0] > 200 and item[1] > 200 and item[2] > 200:
#             newData.append((255, 255, 255, 0))
#         else:
#             newData.append(item)
#     img.putdata(newData)
    
#     # Save the image
#     img.save(filename, "PNG")


# import json


# with open(f"../results/CMC/{initial_surfactant}_guided.json", "r") as json_file:
#     iteration_data = json.load(json_file)

# iterations = [item['iteration'] for item in iteration_data]
# mean_cmcs = [item['data']['CMC'] for item in iteration_data]
# std_cmcs = np.zeros(len(mean_cmcs))
# smiles_labels = [item['data']['SMILES'] for item in iteration_data]
# chemical_feasibilities = [item['data']['Chemical Feasibility'] for item in iteration_data]

# plt.figure(dpi=400)

# colors = ['C2' if feas == 'Valid' else 'C0' for feas in chemical_feasibilities]

# # Plot with error bars and color-coded points

# iter = 0

# plt.figure(dpi=400)
# for i, (x, y, yerr, color) in enumerate(zip(iterations, mean_cmcs, np.zeros(len(mean_cmcs)), colors)):
#     plt.errorbar(x, y, yerr=yerr, fmt='o', color=color, ecolor='k', capsize=3,
#                  linestyle='None', alpha=0.6, markersize=10, label=chemical_feasibilities[i])

#     filename = f'../results/CMC/iter_{i}_{smiles_labels[i]}.png'
#     visualize_smiles(smiles_labels[i], filename)
#     if i == iter:
#         break


# vertical_offsets = [(-10 + 5 * (k % 2)) for k in range(len(iterations))]  # Alternating vertical offsets

# for i, (x, y, label, offset) in enumerate(zip(iterations, mean_cmcs, smiles_labels, vertical_offsets)):
#     plt.text(x-0.2, y + offset/100, label, verticalalignment='bottom', horizontalalignment='right', fontsize=7, color='C0', rotation=90)
#     if i == iter:
#         break

# ax=plt.gca()
# handles, labels = ax.get_legend_handles_labels()
# by_label = dict(zip(labels, handles))

# plt.legend(by_label.values(), by_label.keys(), loc="lower left")

# plt.xlabel('Agent Iterations')
# plt.xlim([-0.5,11])
# plt.ylim([-1, 5])
# plt.ylabel('logCMC (μM)')

# plt.savefig(f"../results/CMC/{iter}_{smiles_labels[iter]}_guided.png")

In [None]:
# from rdkit import Chem
# from rdkit.Chem import Draw
# import matplotlib.pyplot as plt



# from rdkit.Chem import Draw
# from PIL import Image

# def visualize_smiles(smiles, filename):
#     # Convert SMILES to molecule
#     mol = Chem.MolFromSmiles(smiles)
    
#     # Generate image of the molecule with RDKit and a transparent background
#     img = Draw.MolToImage(mol, size=(1000, 1000), kekulize=True, wedgeBonds=True, options=Draw.DrawingOptions())
    
#     # Convert the image to have a transparent background
#     img = img.convert("RGBA")
#     datas = img.getdata()
#     newData = []
#     for item in datas:
#         # Change all white (also shades of whites)
#         # to transparent
#         if item[0] > 200 and item[1] > 200 and item[2] > 200:
#             newData.append((255, 255, 255, 0))
#         else:
#             newData.append(item)
#     img.putdata(newData)
    
#     # Save the image
#     img.save(filename, "PNG")

# # Example usage
# filename = f'{initial_surfactant}.png'
# visualize_smiles(initial_surfactant, filename)

In [None]:
# results = agent.invoke(input=f"Look up design guidelines and make changes to the MOF [Co].[O-]C(=O)c1ccncc1 so it will have a higher band gap:\
#             After each change you should first check validity of new SMILES, if valid evaluate the band gap, otherwise revert change and try something else. Do 5 iterations.\n\
#             ")