In [1]:
from langchain import OpenAI
from langchain.agents import Tool, tool
from langchain import agents
import langchain
import os
import warnings
import numpy as np
warnings.filterwarnings('ignore')
import sys
import dziner

langchain.debug = False
# from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI

from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
Embedding_model = 'text-embedding-3-large' 

from langchain.text_splitter import CharacterTextSplitter

In [2]:
import dziner.sascorer
from rdkit import Chem
from rdkit.Chem import QED
from rdkit.Chem import Descriptors

@tool
def check_validity(smiles: str):
    '''
    This tool inputs a SMILES and outputs chemical feasibility, SA and QED scores, molecular weight and the smiles
    '''
    # The docstring above is published verbatim as the tool description that the
    # agent reads, so keep it short and user-facing; implementation notes go here.
    #
    # Returns a 5-tuple: (validity label, SA score, QED score, molecular weight,
    # cleaned SMILES). Invalid input yields zeros for the three numeric fields.

    # Normalise the input the same way predict_docking_score does (newlines and
    # single quotes removed) so both tools judge exactly the same string.
    smiles = smiles.replace("\n", "").replace("'", "")
    mol = Chem.MolFromSmiles(smiles)
    # An empty/atom-less molecule has nothing to score; treat it as invalid
    # instead of letting the scorers fail on it.
    if mol is None or mol.GetNumAtoms() == 0:
        return "Invalid SMILES", 0, 0, 0, smiles

    # Calculate SA score
    sa_score = dziner.sascorer.calculateScore(mol)

    # Calculate QED score
    qed_score = QED.qed(mol)
    molecular_weight = Descriptors.MolWt(mol)

    return "Valid SMILES", sa_score, qed_score, molecular_weight, smiles

# Example usage
test_smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"
check_validity(test_smiles)

('Valid SMILES',
 1.580039750008826,
 0.5501217966938848,
 180.15899999999996,
 'CC(=O)OC1=CC=CC=C1C(=O)O')

In [3]:
## This model is based on this paper: https://pubs.acs.org/doi/10.1021/acs.jcim.1c01334
## and comes from this repository:
## https://github.com/dockstring/dockstring/tree/main
## Also see https://github.com/dockstring/dockstring/blob/main/dockstring/target.py#L160

from dockstring import load_target

@tool
def predict_docking_score(smiles):
    '''
    This tool predicts the docking score to WDR5 for a SMILES molecule. Lower docking score means a larger binding affinity.
    This model based on Autodock Vina from this paper: https://pubs.acs.org/doi/10.1021/acs.jcim.1c01334 
    '''
    # The docstring above is published as the tool description, so it is left
    # untouched; implementation notes live in comments.
    #
    # Returns the docking score from `target.dock`, or the string
    # 'Invalid molecule' when docking raises.

    # Agent-supplied input may carry newlines or quote characters.
    smiles = smiles.replace("\n", "").replace("'", "")
    target = load_target("WDR5")
    try:
        score, _affinities = target.dock(smiles)
    except Exception:
        # Docking failures are reported back to the agent as a sentinel string.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working
        # during a long docking run.
        score = 'Invalid molecule'
    return score

# import concurrent.futures
# from typing import List

# @tool
# def predict_docking_score(smiles_list: str):
#     '''
#     This tool predicts the docking score to DRD2 for SMILES molecules. Lower docking score means a larger binding affinity.
#     This model based on Autodock Vina from this paper: https://pubs.acs.org/doi/10.1021/acs.jcim.1c01334 
#     '''
#     smiles_list = smiles_list.split(', ')
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         future_to_smiles = {executor.submit(_get_docking_score, smiles): smiles for smiles in smiles_list}
#         docking_scores = []
#         for future in concurrent.futures.as_completed(future_to_smiles):
#             smiles = future_to_smiles[future]
#             try:
#                 score = future.result()
#             except Exception as e:
#                 score = 'Error occurred'
#             docking_scores.append(f'SMILES: {smiles}, Docking Score: {score}')
#     return docking_scores

# predict_docking_score("CC(=O)OC1=CC=CC=C1C(=O)O, CC(=O)OC1=CC=CC=C1C(=O)O")
# _get_docking_score("CC(=O)OC1=CC=CC=C1C(=O)O")

In [4]:
predict_docking_score("CC(=O)OC1=CC=CC=C1C(=O)O")

-4.8

In [5]:
from langchain.memory import ConversationBufferMemory

# Fixed retrieval question sent to the paper-QA chain by `lookup_papers`; it is
# also forwarded to the agent run under the "prompt" key of `input_data`.
# This is runtime prompt text: editing the wording changes what the LLM sees.
RetrievalQA_prompt = """What are the design guidelines for scaffold hopping for making a molecule have a larger binding afinity against WDR5 target? 
    This can be based on making changes to the functional groups or other changes to the molecule. Summarize your answer and cite paper
    by the title, DOI and the journal and year the document was published on."""

@tool
def lookup_papers(prompt):
    '''Useful for getting chemical intuition for the problem.
    This tool looks up design guidelines for molecules with higher binding afinity against WDR5 by looking through research papers.
    It also includes information on the paper citation or DOI.
    '''
    # `prompt` is accepted so the agent can call the tool, but it is not used:
    # the question asked of every paper is the module-level RetrievalQA_prompt.
    summaries = []
    # Papers are stored as 0.pdf, 1.pdf, ...; only the first one is read here.
    for paper_index in range(1):
        pdf_path = f'../data/papers/Binding/WDR5/{paper_index}.pdf'
        splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=20)
        chunks = splitter.split_documents(PyPDFLoader(pdf_path).load_and_split())

        # One in-memory FAISS index per paper, embedded with Embedding_model.
        vectorstore = FAISS.from_documents(
            chunks, OpenAIEmbeddings(model=Embedding_model))
        chat_model = ChatOpenAI(model_name='gpt-4o', temperature=0.1)

        summaries.append(
            dziner.RetrievalQABypassTokenLimit(vectorstore, RetrievalQA_prompt, chat_model))
    return " ".join(summaries)


# guidel_lines = lookup_papers("")
# guidel_lines

In [6]:
# @tool
# def check_validity(SMILES):
#     '''This tool inputs SMILES string representations checks if it is a chemically feasible molecule.
#     '''
#     from rdkit import Chem
#     from rdkit.Chem import Draw
#     try:
#         mol = Chem.MolFromSmiles(SMILES)
#         if mol is None:
#             return "Invalid", SMILES
#         return "Valid", SMILES
#     except Exception as e:
#         # return f"An error occurred: {str(e)}"
#         return "Invalid", SMILES

# @tool
# def check_validity(smiles_list: str):
#     '''
#     This tool predicts the validity for SMILES molecules and checks if it is a chemically feasible molecule.
#     '''
#     smiles_list = smiles_list.split(', ')
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         future_to_smiles = {executor.submit(_get_validity, smiles): smiles for smiles in smiles_list}
#         validity = []
#         for future in concurrent.futures.as_completed(future_to_smiles):
#             smiles = future_to_smiles[future]
#             try:
#                 val = future.result()
#             except Exception as e:
#                 score = 'Error occurred'
#             validity.append(f'SMILES: {smiles}, validity: {val}')
#     return validity

## TODO PAINS filters

In [7]:
import sys
from rdkit import Chem
from rdkit.Chem import FilterCatalog
import pandas as pd

def check_filters(smiles):
    """Screen a SMILES string against RDKit's structural-alert catalogs.

    The catalogs consulted are PAINS, BRENK, NIH and ZINC.

    Args:
        smiles: SMILES string of the molecule to screen.

    Returns:
        The string "Invalid SMILES" when RDKit cannot parse the input.
        Otherwise a list that is empty when nothing matched, or holds a single
        dict with the keys "description" and "Scope" for the FIRST matching
        filter entry. Only the first match is reported.

    Side effects:
        Writes a one-line warning to stderr when a filter matches.
    """
    # Initialize filter catalogs. The aggregate PAINS set already covers the
    # PAINS_B and PAINS_C families, so they are not registered a second time.
    catalog_ids = FilterCatalog.FilterCatalogParams.FilterCatalogs
    params = FilterCatalog.FilterCatalogParams()
    for catalog_id in (catalog_ids.PAINS, catalog_ids.BRENK, catalog_ids.NIH, catalog_ids.ZINC):
        params.AddCatalog(catalog_id)
    catalog = FilterCatalog.FilterCatalog(params)

    # Convert SMILES to molecule
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return "Invalid SMILES"

    # Report the first matching filter entry, if any.
    results = []
    if catalog.HasMatch(mol):
        entry = catalog.GetFirstMatch(mol)
        if entry:
            results.append({
                "description": entry.GetDescription(),
                "Scope": entry.GetProp('Scope'),
            })
            print(f"Warning: molecule failed filter: reason {entry.GetDescription()}", file=sys.stderr)

    return results

# Example usage
smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"  # Example SMILES string for aspirin
results = check_filters(smiles)
results

# # Convert results to a DataFrame for easier viewing
# df = pd.DataFrame(results)
# if not df.empty:
#     print(df)
# else:
#     print("No filters matched the given SMILES.")

# df

# # If you want to display the results in a more user-friendly format
# # import ace_tools as tools
# # tools.display_dataframe_to_user(name="Filter Matches", dataframe=df)


<rdkit.Chem.rdfiltercatalog.FilterCatalogEntryList object at 0x7f92dd52afc0>




[{'description': 'phenol_ester',
  'Scope': 'unwanted functionality due to potential tox reasons or unfavourable pharmacokinetic properties'}]

In [8]:
# from rdkit.Chem import FilterCatalog
# params = FilterCatalog.FilterCatalogParams()
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS_A)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS_B)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS_C)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.BRENK)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.NIH)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.ZINC)
# catalog = FilterCatalog.FilterCatalog(params)

# matches = []
# clean = []
# for s in list(df['SMILES']):
#     molecule = Chem.MolFromSmiles(s)
#     entry = catalog.GetMatches(molecule) #Get all the matches/PAINs
#     if entry:
#         groups = []
#         pains = []
#         for i in entry:
#             print(i)
#             pains.append(i.GetDescription().capitalize())
#             groups.append(i.GetProp('Scope'))
#         matches.append({
#             "smiles": s,
#             "pains": pains,
#             "group": groups
#             })
#     else:
#         # collect indices of molecules without PAINS
#         clean.append(index)

# matches = pd.DataFrame(matches)


# def filter_func(row):
#     return len(row['group']) == 1

# filtered_df = matches[matches.apply(filter_func, axis=1)]
# filtered_df

In [9]:
# smiles = "O=C(C)Oc1ccccc1C(=O)O"
# mol = Chem.MolFromSmiles(smiles)

# params = FilterCatalog.FilterCatalogParams()
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS_A)
# # params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS_B)
# # params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS_C)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.BRENK)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.NIH)
# params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.ZINC)
# catalog = FilterCatalog.FilterCatalog(params)
# entry = catalog.GetFirstMatch(mol)

In [10]:
lookup_papers

StructuredTool(name='lookup_papers', description='Useful for getting chemical intuition for the problem.\n    This tool looks up design guidelines for molecules with higher binding afinity against WDR5 by looking through research papers.\n    It also includes information on the paper citation or DOI.', args_schema=<class 'pydantic.v1.main.lookup_papersSchema'>, func=<function lookup_papers at 0x7f92e214b920>)

In [11]:
from langchain_experimental.tools import PythonREPLTool

# Configure a Python REPL tool. The instance must be bound to a name: setting
# attributes on a throwaway `PythonREPLTool()` configures nothing.
# It is not part of `tools` below, so the agent cannot call it.
python_repl_tool = PythonREPLTool()
python_repl_tool.description = 'A Python shell. Use this to execute python commands. Input should be a valid python command. \
    If you want to see the output of a value, you should print it out with `print(...)`.'
python_repl_tool.name = 'PythonREPL'

# Tools exposed to the agent, plus their names/descriptions for the prompt.
tools = [lookup_papers, check_validity, predict_docking_score]

tool_names = [tool.name for tool in tools]
tool_desc = [tool.description for tool in tools]

# Starting scaffold for the optimisation.
initial_molecule = "c1ccc2c(cc(s2)C(=O)N)c1" # "CC(=O)OC1=CC=CC=C1C(=O)O" # this is aspirin
Human_prompt = f"Make changes to {initial_molecule} so it will have a larger binding affinity against WDR5 target. Suggest 20 new candidates."

# Payload handed to `agent.invoke(...)` later in the notebook.
input_data = {
            "input": Human_prompt,
            "prompt": RetrievalQA_prompt,
            "tools": tools,
            "tool_names": tool_names,
            "tool_desc": tool_desc
        }

In [12]:
from dziner.agents import dZiner

# Define agent message and prompts
# SUFFIX = """Start by Looking up and summarizing design guidelines in a few bullet points.
#         Make sure to add citations (paper DOI) for design guidelines if you use them. 
#     For making changes to the SMILES, you should always follow these steps:
#     1. Evaluate the binding affinity of the initial SMILES.
#     2. Evaluate the validity of the initial SMILES.
#     3. Start making changes to the molecule based on guidelines you found in step 1. Try to use different changes at each iteration.
#     4. Start generating a pool of 10 synthesizable SMILES given the changes you made with new SMILES you generated in step 3.
#     5. From the pool only choose candidates that aligns with the design guidelines modifications compared to the latest design.
#         As an example, if you the guideline say optimizing the property requires adding aromatic rings, choose new SMILEs with 
#         extra aromatic rings compared to the latest design.
#         Explain how the new candidate align with the design guidelines.
#     6. Evaluate the binding affinity for the new SMILES.
#     7. Evaluate validity of the SMILES for the new SMILES.
    
#     If the binding affinity does not change properly, still evaluate validity and if the
#     molecule is invalid, revert change to the previous valid SMILES with best binding affinity and try something else by
#     redoing steps 3-7. Iterate until there are 25 new SMILES candidates with higher binding affinity and then stop.

#     Your final response should also contain the source for the tools used from their summary in description in {tool_desc}.

#     Start by describing the problem: \n\nBegin!\n \n\nQuestion: {input}
#     Thought:{agent_scratchpad}\n"""




In [13]:
# response = agent_executor.invoke({"input": f"Look up design guidelines and make changes to the MOF [Co].[O-]C(=O)c1ccncc1 so it will have a higher band gap:\
#             After each change you should first check validity of new SMILES, if valid evaluate the band gap, otherwise revert change and try something else.\n\
#             "})

In [14]:
tools

[StructuredTool(name='lookup_papers', description='Useful for getting chemical intuition for the problem.\n    This tool looks up design guidelines for molecules with higher binding afinity against WDR5 by looking through research papers.\n    It also includes information on the paper citation or DOI.', args_schema=<class 'pydantic.v1.main.lookup_papersSchema'>, func=<function lookup_papers at 0x7f92e214b920>),
 StructuredTool(name='check_validity', description='This tool inputs a SMILES and outputs chemical feasibility, SA and QED scores, molecular weight and the smiles', args_schema=<class 'pydantic.v1.main.check_validitySchema'>, func=<function check_validity at 0x7f92e96cae80>),
 StructuredTool(name='predict_docking_score', description='This tool predicts the docking score to WDR5 for a SMILES molecule. Lower docking score means a larger binding affinity.\nThis model based on Autodock Vina from this paper: https://pubs.acs.org/doi/10.1021/acs.jcim.1c01334', args_schema=<class 'pyda

In [15]:
from langchain_core.messages import AIMessage
from langchain_core.runnables import (
    Runnable,
    RunnableLambda,
    RunnableMap,
    RunnablePassthrough,
)

# Chat model with the two evaluation tools bound for native tool calling.
# Note: this rebinds the notebook-level `tools` without lookup_papers.
llm = ChatOpenAI(model="gpt-4o")

tools = [check_validity, predict_docking_score]
llm_with_tools = llm.bind_tools(tools)
# Name -> tool lookup used to dispatch tool calls.
tool_map = dict((registered.name, registered) for registered in tools)

def call_tools(msg: AIMessage) -> list:
    """Run every tool call requested in `msg`, one after another.

    Args:
        msg: Model message whose `tool_calls` entries carry a "name" and "args".

    Returns:
        The list of tool-call dicts, each extended with an "output" key holding
        the result of invoking the named tool with its arguments.

    Raises:
        KeyError: If a call names a tool that is not in the module-level `tools`.
    """
    tools_by_name = {tool.name: tool for tool in tools}
    # `.copy()` is shallow: the list is new, but the dicts are the ones held by
    # `msg`, so "output" is also visible through `msg.tool_calls`.
    tool_calls = msg.tool_calls.copy()
    for tool_call in tool_calls:
        tool_call["output"] = tools_by_name[tool_call["name"]].invoke(tool_call["args"])
    return tool_calls

# agent = dZiner(tools, property="Binding affinity",
#                model='gpt-4o', verbose=True, n_design_iterations=10, temp=0.1, suffix=SUFFIX).agent | call_tools


In [16]:
# iteration_data = []
# iteration = 0
# cost = 0
# from langchain_community.callbacks import get_openai_callback


# for step in agent.iter(input_data):
#     with get_openai_callback() as cb:
#         if output := step.get("intermediate_step"):
#             action, value = output[0]
#             if action.tool == "predict_docking_score":
#                 docking_score = value
#             if action.tool == "check_validity":
#                 chemical_feasibility, SMILES = value
#                 iteration_data.append({
#                                         'iteration': iteration,
#                                         'data': {
#                                             'SMILES': SMILES,
#                                             'Chemical Feasibility': chemical_feasibility,
#                                             'Docking Score': docking_score,
#                                         }
#                                     })
#                 iteration += 1
#                 # break 
#                 # assert is_prime(int(value))
#             # Ask user if they want to continue
#             # _continue = input("Should the agent continue (Y/n)?:\n") or "Y"
#             # if _continue.lower() != "y":
#             #     break
#     cost += cb.total_cost

# print(f'Cost: ${cost}')

In [17]:
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field
from typing import List, Dict

class Candidate(BaseModel):
    iteration_number: int = Field(description="Iteration number")
    smiles: str = Field(description="SMILES string")
    modification: List[str] = Field(description="The change made to the SMILES in the previous iteration and the detailed reasoning behind it.")
    docking_score: float = Field(description="SMILES docking score")
    QED_score: float = Field(description="SMILES QED score")
    SA_score: float = Field(description="SMILES SA score")
    Validity: str = Field(description="Validity of the SMILES")


class CandidatesList(BaseModel):
    """Pydantic data model for a list of Candidates."""
    CandidatesList: List[Candidate]
    
parser = PydanticOutputParser(pydantic_object=CandidatesList)
# parser = JsonOutputParser(pydantic_object=CandidatesList)

In [18]:

from langchain.tools import tool
from langchain_core.runnables import RunnableParallel, RunnableMap
from langchain_core.messages import AIMessage
from langchain_openai import ChatOpenAI
from langchain.agents import initialize_agent, AgentType
from langchain.memory import ConversationBufferMemory
from langchain.callbacks import get_openai_callback
from langchain_core.prompts import MessagesPlaceholder
from concurrent.futures import ThreadPoolExecutor
from dziner.prompts import PREFIX, FORMAT_INSTRUCTIONS

#    4. Start generating a pool of 10 synthesizable SMILES given the changes you made with new SMILES you generated in step 3.
# 

# SUFFIX = """Start by Looking up and summarizing design guidelines in a few bullet points.
#         Make sure to add citations (paper DOI) for design guidelines if you use them. 
#     For making changes to the SMILES, you should always follow these steps:
#     1. Evaluate the binding affinity of the initial SMILES.
#     2. Evaluate the validity of the initial SMILES.
#     3. Start making manual changes to the SMILES based on guidelines you found in step 1 and check validity. 
#      These changes can be by removing, adding, replacing elements, functional groups or rings, etc to the core parts of the molecule.
#      For example, if we had a molecule that was a functionalized indole. We would try replacing the indole
#      with an isoindole, indolizine, methylindoles, indolone, methylenedioxybenzene etc. 
#      Those functional groups would replace the indole, not be added to it.
#      Try to use different changes each time that could improve the affinity without adding too much in terms of size/ molecular weight.
#      As an example, if you see the guideline say optimizing the property requires adding aromatic rings, choose new SMILEs with 
#         extra aromatic rings compared to the latest design.  Explain how the new candidate align with the design guidelines.      
#     5. Evaluate the binding affinity for the new SMILES.
#     6. Evaluate validity of the SMILES for the new SMILES.
#     7. Pick the design with the highest binding affinity and go back to step 3.
    
#     If the binding affinity does not change properly, still evaluate validity and if the
#     molecule is invalid, revert change to the previous valid SMILES with best binding affinity and try something else by
#     redoing steps 3-7, but still report in your final answer. The best changes are the ones without adding too much
#     but rather replacing the core so the molecular weight of the molecule is lower than 600 g/mol.

#     Iteration 0 is the initial input SMILES.  Do not repeat the SMILES in the iterations history. Iterate until there are 25 new SMILES candidates generated and then stop.

#     Start by describing the problem: \n\nBegin!\n \n\nQuestion: {input}
#     Thought:{agent_scratchpad}\n"""

# Agent prompt suffix: the step-by-step design protocol the agent must follow.
# `{input}` and `{agent_scratchpad}` are template placeholders filled in at run
# time; any other literal braces added to this text would be treated as
# placeholders too. This is runtime prompt text, not documentation.
SUFFIX = """Start by:

1. Researching Design Guidelines: Summarize the guidelines in a few bullet points, with citations (e.g., paper DOI) included for any design guidelines used.

2. Evaluating Initial SMILES:

Binding Affinity: Assess the binding affinity of the initial SMILES.
Validity: Check the validity of the initial SMILES structure.
3. Modifying SMILES:

Manual Adjustments: Based on the guidelines from Step 1, make manual changes to the SMILES. These changes may involve adding, removing, or replacing elements, functional groups, or rings in the molecule’s core.
Example: For a functionalized indole, try replacing the indole with structures like isoindole, indolizine, methylindoles, indolone, or methylenedioxybenzene.
Diverse Approaches: Apply different modifications that could enhance affinity without significantly increasing molecular size/weight.
Alignment with Guidelines: Justify how each new SMILES aligns with the design guidelines, such as adding aromatic rings if suggested.
4. Evaluating Modified SMILES:

Binding Affinity: Assess the binding affinity of each new SMILES.
Validity: Ensure each new SMILES is valid.
5. Selecting the Best Design:

Choose the SMILES with the highest binding affinity.
If no improvement is seen, revert to the previous valid SMILES with the best binding affinity and retry Steps 3–5.
Iteration Details:

Iteration 0: Use the initial input SMILES.
Iteration Limit: MUST only stop after generating 20 new SMILES candidates (20 SMILES MUST be in your final answer).
Molecular Weight: Prioritize changes that keep the molecular weight below 600 g/mol.
Final Steps:

Document All Changes: Even if reverted, all changes must be documented and reported.
Avoid Redundancy: Do not repeat SMILES in the iteration history.
Begin with Problem Description:
Question: {input}
Thought: {agent_scratchpad}
"""

# FORMAT_INSTRUCTIONS = """Use the following format:

#     Question: the input question you must answer
#     Thought: you should always think about what to do and don't leave this empty
#     Action: the action to take, should be one of [{tool_names}]
#     Action Input: the input to the action
#     Observation: the result of the action
#     ... this Thought/Action/Action Input/Observation can repeat N times/Final Answer:
#     You may not need to use a tool to make a decision.
#     Keep generating new SMILES and evalutating until there are 25 new SMILES candidates and then stop.
#     Your final response to say to the Human MUST use this format:
#     '''
#     Thought: Here's your final answer:
#     Final Answer: make sure your response here is in a valid json format like:
#      [
#      "Iteration": 0,
#       "SMILES": "CC(=O)OC1=CC=C(C=C1)C(=O)N2CCN(CC2)CC3=CC=CC=C3I",
#       "Modification": The change you made to the SMILES in the previous iteration and the detailed reasoning why.
#       "Docking Score": -9.8,
#       "SA Score": 2.01,
#       "QED Score": 0.40,
#       "Molecular weight": 150,
#       "Validity": "Valid"
#      ]
#      Report all iterations even if the SMILES is invalid. Iteration is the entire process of suggesting a new SMILES, 
#      evaluating its docking score and validity. 
     
#     '''

#     Use the exact SMILES without any "\n" or any extra "()". Do not repeat the SMILES that already exists iterations history.
#     """

# ReAct-style output format given to the agent. This assignment replaces the
# FORMAT_INSTRUCTIONS imported from dziner.prompts earlier in this cell.
# `{tool_names}` is a template placeholder; the example answer deliberately
# contains no literal braces, since those would be read as placeholders.
# This is runtime prompt text, not documentation.
FORMAT_INSTRUCTIONS = """
Use the following format:

    Thought: you should always think about what to do and never leave this empty
    Action: the action to take, should be one of the tools [{tool_names}]
    Action Input: the input to the action
    Observation: the result of the action
    ... (this Thought/Action/Action Input/Observation can repeat N times)
    Thought: I know the final answer
    Final Answer: the final answer to the input question.
    The Final Answer MUST come in JSON format with NO extra text.
    
    Example final answer format:

     "Iteration": 0,
      "SMILES": "CC(=O)OC1=CC=C(C=C1)C(=O)N2CCN(CC2)CC3=CC=CC=C3I",
      "Modification": The change made to the SMILES in the previous iteration and the detailed reasoning behind it.
      "Docking Score": -9.8,
      "SA Score": 2.01,
      "QED Score": 0.40,
      "Molecular Weight": 150,
      "Validity": "Valid"

    Report all iterations, even if the SMILES is invalid. An iteration includes suggesting a new SMILES, evaluating its docking score, and determining its validity.
    
    Use the exact SMILES without any newlines or extra parentheses. Do not repeat SMILES that already exist in the iteration history.
    
    You have a final answer once 20 new SMILES candidates are generated and evaluated. 
       

"""

# Full tool set for the design agent (restores lookup_papers after the
# two-tool list used in the direct tool-calling cell).
tools = [lookup_papers, check_validity, predict_docking_score]

tool_names = [registered.name for registered in tools]
tool_desc = [registered.description for registered in tools]

# Initialize the LangChain agent
class dZiner:
    """LangChain agent wrapper that iteratively redesigns a molecule for a target property.

    Args:
        tools: Tools the agent may call; also used by `iter`.
        property: Name of the property being optimised (inserted into the prompts).
        model: Model name. Names starting with "gpt-3.5-turbo" or "gpt-4" are
            wrapped in `ChatOpenAI`; any other value is passed through unchanged.
        temp: Sampling temperature for the chat model.
        get_cost: When true, `__call__` prints the OpenAI usage callback.
        max_iterations: Upper bound on agent steps.
        agent_type: Requested agent type (see NOTE(review) in `__init__`).
        n_design_iterations: Stored on the instance and offered to the default
            suffix template.
        **kwargs: Optional `suffix`, `prefix`, `format_instructions`, `verbose`,
            and `temperature` (accepted as an alias for `temp`). Other keys are
            ignored.

    Attributes:
        agent: The executor returned by `initialize_agent`.
    """

    def __init__(self, tools, property, model="text-davinci-003", temp=0.1, get_cost=False, max_iterations=40,
                 agent_type=AgentType.CONVERSATIONAL_REACT_DESCRIPTION, n_design_iterations=1, **kwargs):
        self.tools = tools
        self.property = property
        self.n_design_iterations = n_design_iterations

        # Callers naturally write `temperature=`; honour it instead of silently
        # dropping it into **kwargs.
        temp = kwargs.get('temperature', temp)

        # Format the default templates only when no override is supplied, so an
        # unused default can never fail on a placeholder the caller does not need.
        if 'suffix' in kwargs:
            self.suffix = kwargs['suffix']
        else:
            self.suffix = SUFFIX.format(property=self.property, tool_desc="{tool_desc}",
                                        input="{input}", agent_scratchpad="{agent_scratchpad}",
                                        n_design_iterations=self.n_design_iterations)
        if 'prefix' in kwargs:
            self.prefix = kwargs['prefix']
        else:
            self.prefix = PREFIX.format(property=self.property)
        if 'format_instructions' in kwargs:
            self.format_instructions = kwargs['format_instructions']
        else:
            self.format_instructions = FORMAT_INSTRUCTIONS.format(tool_names="{tool_names}")

        self.model = model
        if model.startswith("gpt-3.5-turbo") or model.startswith("gpt-4"):
            self.model = ChatOpenAI(temperature=temp, model_name=model,
                                    request_timeout=1000, max_tokens=4096)
        self.get_cost = get_cost
        self.max_iterations = max_iterations

        memory = ConversationBufferMemory(memory_key="chat_history", input_key='input', output_key="output",
                                          return_messages=True)

        self.verbose = kwargs.get('verbose', False)
        chat_history = MessagesPlaceholder(variable_name="chat_history")
        # NOTE(review): the requested type is forwarded as `agent_type=`, yet the
        # executor repr shown in this notebook is a zero-shot ReAct agent, so this
        # keyword does not appear to select the agent. Left unchanged because the
        # prompts in this notebook are written for that zero-shot format; confirm
        # before switching to `agent=`.
        self.agent = initialize_agent(
            tools=tools, llm=self.model, agent_type=agent_type, verbose=self.verbose, memory=memory,
            early_stopping_method='generate', handle_parsing_errors=True,
            agent_kwargs={
                "prefix": self.prefix, "suffix": self.suffix, "format_instructions": self.format_instructions,
                "memory_prompts": [chat_history], "input_variables": ["input", "agent_scratchpad", "chat_history"],
            },
            return_intermediate_steps=False, max_iterations=max_iterations, max_execution_time=3000)

    def __call__(self, prompt):
        """Run the agent on `prompt` and return its result; print usage if `get_cost` is set."""
        with get_openai_callback() as cb:
            result = self.agent.invoke({"input": prompt})
        if self.get_cost:
            print(cb)
        return result

    def iter(self, input_data):
        """Evaluate every tool on each record's SMILES, yielding results in input order.

        Args:
            input_data: Iterable of mappings, each with a "SMILES" key.

        Yields:
            For each record, the combined output of running all of this
            instance's tools on that record's SMILES string (keyed by tool name).
        """
        # Imported here so the method does not depend on another cell's imports.
        from langchain_core.runnables import RunnableLambda

        def make_runner(bound_tool):
            # A factory gives each runnable its own tool; the tools are invoked
            # with the plain SMILES string, the same way the notebook calls them.
            def run(smiles):
                return bound_tool.invoke(smiles)
            return RunnableLambda(run)

        parallel_tools = RunnableParallel(**{t.name: make_runner(t) for t in self.tools})

        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(parallel_tools.invoke, record["SMILES"]) for record in input_data]
            for future in futures:
                yield future.result()

# Example usage
# The constructor's temperature parameter is `temp`; `temperature=`/`max_tokens=`
# are not parameters of dZiner, so the ignored `max_tokens` argument is dropped.
# `.agent` extracts the underlying executor, so `get_cost` (only consulted by
# dZiner.__call__) has no effect on runs made through `agent.invoke`.
agent = dZiner(tools, property="Binding affinity", model='gpt-4o',
               verbose=True, n_design_iterations=25, suffix=SUFFIX, format_instructions=FORMAT_INSTRUCTIONS,
               temp=0.2, max_iterations=120, get_cost=True).agent

# iteration_data = []
# iteration = 0
# cost = 0

# smiles_results = []
# validity_results = []
# docking_score_results = []
# QED_score_results = []
# SA_score_results = []
# for step in agent.iter(input_data):
#     with get_openai_callback() as cb:
#         smiles_results = []
#         validity_results = []
#         docking_score_results = []
#         for output in step.get("intermediate_steps", []):
#             action, value = output
#             if action.tool == "predict_docking_score":
#                 docking_score = value
#                 docking_score_results.append(docking_score)
#             if action.tool == "check_validity":
#                 chemical_feasibility, SA_score, QED_score, SMILES = value
#                 smiles_results.append(SMILES)
#                 validity_results.append(chemical_feasibility)
#                 SA_score_results.append(SA_score)
#                 QED_score_results.append(QED_score)
                
            
#             # iteration += 1
#         cost += cb.total_cost

# iteration_data = {
#                 'iteration': iteration,
#                 'data': {
#                     'SMILES': smiles_results,
#                     'Chemical Feasibility': validity_results,
#                     'Docking Score': docking_score_results,
#                     'SA Score': SA_score_results,
#                     'QED Score':QED_score_results
#                 }
#             }


# print(f"Total cost: {cost}")

# print(iteration_data)


In [19]:
input_data

{'input': 'Make changes to c1ccc2c(cc(s2)C(=O)N)c1 so it will have a larger binding affinity against WDR5 target. Suggest 20 new candidates.',
 'prompt': 'What are the design guidelines for scaffold hopping for making a molecule have a larger binding afinity against WDR5 target? \n    This can be based on making changes to the functional groups or other changes to the molecule. Summarize your answer and cite paper\n    by the title, DOI and the journal and year the document was published on.',
 'tools': [StructuredTool(name='lookup_papers', description='Useful for getting chemical intuition for the problem.\n    This tool looks up design guidelines for molecules with higher binding afinity against WDR5 by looking through research papers.\n    It also includes information on the paper citation or DOI.', args_schema=<class 'pydantic.v1.main.lookup_papersSchema'>, func=<function lookup_papers at 0x7f92e214b920>),
  StructuredTool(name='check_validity', description='This tool inputs a SMIL

In [20]:
# NOTE(review): get_prompts is referenced but not called, so this cell displays the
# bound method's repr (which embeds the AgentExecutor and its prompt template)
# rather than a return value — confirm whether agent.get_prompts() was intended.
agent.get_prompts

<bound method Runnable.get_prompts of AgentExecutor(memory=ConversationBufferMemory(output_key='output', input_key='input', return_messages=True, memory_key='chat_history'), verbose=True, tags=['zero-shot-react-description'], agent=ZeroShotAgent(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['agent_scratchpad', 'input'], template='You are a helpful Chemist AI assistant called dZiner. You are tasked to make changes to a molecule based on some design guidelines\n    and optimize the its Binding affinity.\n    Always use the tools to get chemical intuition learn about the design guidelines. If you don\'t know, do not make up answers.\n    Explain changes you make to the molecule in details at each step but do not put redundant information and make it short.\n    \n\nlookup_papers(prompt) - Useful for getting chemical intuition for the problem.\n    This tool looks up design guidelines for molecules with higher binding afinity against WDR5 by looking through research papers.\n  

In [21]:
# Make sure langchain's global debug flag is off before launching the run.
langchain.debug = False
# Invoke the agent inside the OpenAI callback context; the callback object is kept
# in `cb` and the agent's result in `response`, which later cells read.
with get_openai_callback() as cb:
    response = agent.invoke(input_data)

# Print whatever the callback recorded for this run.
print(cb)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To begin, I need to research design guidelines for molecules with higher binding affinity against WDR5. This will help me understand the structural features that enhance binding affinity.

Action: lookup_papers
Action Input: Design guidelines for molecules with higher binding affinity against WDR5
[0m
Observation: [36;1m[1;3mThe design guidelines for scaffold hopping to enhance binding affinity against the WDR5 target involve several strategies:

1. **Introduction of Aromatic Rings**: Adding aromatic rings can occupy the hydrophobic groove of WDR5, forming π-π stacking interactions with residues like Tyr191. This was demonstrated with compound 30, where an aromatic ring A was introduced to enhance binding affinity.

2. **Substituent Modifications**: Modifying substituents on the aromatic rings can further improve binding affinity. For example, adding a 4-aminobutanamido group to compound 30 resulted in compound 42

[22:28:31] Can't kekulize mol.  Unkekulized atoms: 17 18 19 20 21 22 23 24 25 26 27


[32;1m[1;3mThe SMILES for the eighteenth iteration is invalid. I will revert to the previous valid SMILES with the best docking score and try a different modification.

### Iteration 18:
- **SMILES**: c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)c4cccnc5ccccc45)c1
- **Modification**: Added a quinoline ring to the aromatic ring to potentially enhance binding affinity.
- **Docking Score**: N/A (Invalid SMILES)
- **SA Score**: N/A
- **QED Score**: N/A
- **Molecular Weight**: N/A
- **Validity**: Invalid

Next, I will try adding a different functional group to the aromatic ring to further enhance binding affinity.

### Iteration 19:
Thought: I will add a pyrazole ring to the aromatic ring to potentially enhance binding affinity.

Action: check_validity
Action Input: c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)c4ccn[nH]4)c1
[0m
Observation: [33;1m[1;3m('Valid SMILES', 2.096497012792838, 0.5820738191863392, 319.389, 'c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)c4ccn[nH]4)c1')[0m
Thought:[32;1m[1;3mThe modified SMILES is valid.

In [22]:
# iteration_data = []
# iteration = 0
# cost = 0
# from langchain_community.callbacks import get_openai_callback


# for step in agent.iter(input_data):
#     # with get_openai_callback() as cb:
#         # if output := step.get("intermediate_step"):
#         #     action, value = output[0]
#         #     if action.tool == "predict_docking_score":
#         #         docking_score = value
#         #     if action.tool == "check_validity":
#         #         chemical_feasibility, SMILES = value
#         #         iteration_data.append({
#         #                                 'iteration': iteration,
#         #                                 'data': {
#         #                                     'SMILES': SMILES,
#         #                                     'Chemical Feasibility': chemical_feasibility,
#         #                                     'Docking Score': docking_score,
#         #                                 }
#         #                             })
#         #         iteration += 1
#         #         # break 
#         #         # assert is_prime(int(value))
#         #     # Ask user if they want to continue
#         #     # _continue = input("Should the agent continue (Y/n)?:\n") or "Y"
#         #     # if _continue.lower() != "y":
#         #     #     break
#     if step == 20:
#         break

#     print(f'\n {step} \n')
#     # cost += cb.total_cost

# # print(f'Cost: ${cost}')

In [23]:
# Show the 'output' entry of the result returned by agent.invoke().
response['output']

'```json\n[\n  {\n    "Iteration": 0,\n    "SMILES": "c1ccc2c(cc(s2)C(=O)N)c1",\n    "Modification": "Initial SMILES",\n    "Docking Score": -5.3,\n    "SA Score": 1.91,\n    "QED Score": 0.71,\n    "Molecular Weight": 177.23,\n    "Validity": "Valid"\n  },\n  {\n    "Iteration": 1,\n    "SMILES": "c1ccc2c(cc(s2)C(=O)Nc3ccccc3)c1",\n    "Modification": "Added an aromatic ring to the initial SMILES to enhance π-π stacking interactions.",\n    "Docking Score": -7.1,\n    "SA Score": 1.60,\n    "QED Score": 0.73,\n    "Molecular Weight": 253.33,\n    "Validity": "Valid"\n  },\n  {\n    "Iteration": 2,\n    "SMILES": "c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)C(=O)NCCCC)c1",\n    "Modification": "Added a 4-aminobutanamido group to the aromatic ring to potentially enhance binding affinity.",\n    "Docking Score": -8.1,\n    "SA Score": 1.83,\n    "QED Score": 0.63,\n    "Molecular Weight": 352.46,\n    "Validity": "Valid"\n  },\n  {\n    "Iteration": 3,\n    "SMILES": "c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)C(=

In [25]:
import json
import re
import pandas as pd

# The agent is expected to answer with a JSON array wrapped in a Markdown
# ```json ... ``` fence. Extract whatever sits between the fences instead of
# chopping a fixed number of trailing characters, which breaks as soon as the
# answer carries trailing whitespace, extra text around the fence, or no fence.
raw_output = response['output']
fenced_payload = re.search(
    r"```(?:json)?\s*(.*?)\s*```", raw_output, flags=re.DOTALL | re.IGNORECASE
)
# Fall back to the whole (stripped) answer when no fenced block is present.
output_str = fenced_payload.group(1) if fenced_payload else raw_output.strip()

# Each JSON record becomes one DataFrame row.
output_dict = json.loads(output_str)
df = pd.DataFrame(output_dict)
df

Unnamed: 0,Iteration,SMILES,Modification,Docking Score,SA Score,QED Score,Molecular Weight,Validity
0,0,c1ccc2c(cc(s2)C(=O)N)c1,Initial SMILES,-5.3,1.91,0.71,177.23,Valid
1,1,c1ccc2c(cc(s2)C(=O)Nc3ccccc3)c1,Added an aromatic ring to the initial SMILES t...,-7.1,1.6,0.73,253.33,Valid
2,2,c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)C(=O)NCCCC)c1,Added a 4-aminobutanamido group to the aromati...,-8.1,1.83,0.63,352.46,Valid
3,3,c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)C(=O)NCCOCC)c1,Introduced a linker to the molecule to potenti...,-6.9,1.95,0.62,368.46,Valid
4,4,c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)OC)c1,Added a methoxy group to the aromatic ring to ...,-7.3,1.66,0.78,283.35,Valid
5,5,c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)C(F)(F)F)c1,Added a trifluoromethyl group to the aromatic ...,-7.4,1.87,0.69,321.32,Valid
6,6,c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)[N+](=O)[O-])c1,Added a nitro group to the aromatic ring to po...,-6.7,1.84,0.59,298.32,Valid
7,7,c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)O)c1,Added a hydroxyl group to the aromatic ring to...,-7.5,1.76,0.69,269.33,Valid
8,8,c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)C#N)c1,Added a cyano group to the aromatic ring to po...,-7.0,1.86,0.77,278.34,Valid
9,9,c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)C(=O)O)c1,Added a carboxyl group to the aromatic ring to...,-6.9,1.74,0.77,297.34,Valid


In [26]:
# Write the parsed table to CSV; index=False leaves the row index out of the file.
df.to_csv('WDR5-20-it(4).csv', index=False)

In [30]:
# Re-run the validity tool on the second-to-last SMILES of the table.
check_validity(df['SMILES'].iloc[-2])

('Valid SMILES',
 1.9159757458245128,
 0.4973258831004755,
 335.45300000000003,
 'c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)c4cccs4)c1')

In [27]:
# Spot-check the docking tool on a single literal SMILES string.
predict_docking_score('c1ccc2c(cc(s2)C(=O)Nc3ccc(cc3)C(=O)NCCCCNCCc4ccc(C(F)(F)F)cc4)c1')

-8.2

In [None]:
from io import StringIO

# Wrap the agent's answer in a file-like object so pandas can read it as text.
data = StringIO(response['output'])

# Parse the '|'-separated text, then tidy it in a single chain.
df = (
    pd.read_csv(data, sep='|', skipinitialspace=True)
    .rename(columns=str.strip)  # trim whitespace around the header names
    .dropna(how='all')          # discard rows in which every cell is missing
)
df

In [None]:
# Print the number of entries stored under each key of iteration_data['data'].
# NOTE(review): iteration_data is indexed as a dict here, while the JSON-export and
# plotting cells further down index it as a list of records — confirm which shape
# is current.
for k,v in iteration_data['data'].items():
    print(len(v))

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import logging

# The docstring is intentionally left as-is: for @tool-decorated functions in this
# notebook the docstring is what shows up as the tool description.
@tool
def check_rdkit_sanitization(SMILES: str) -> tuple:
    '''
    This tool checks if the SMILES string can be sanitized by RDKit, indicating it is chemically feasible.
    '''
    # Returns a 2-tuple (status, SMILES) where status is "Valid" or "Invalid" and
    # SMILES is the input string echoed back unchanged.
    logging.info("Starting RDKit sanitization check for SMILES: %s", SMILES)
    try:
        mol = Chem.MolFromSmiles(SMILES)
        if mol is None:
            # An unparsable SMILES is reported through a None result, not an exception.
            status = "Invalid"
        else:
            # Attempt sanitization
            Chem.SanitizeMol(mol)
            status = "Valid"
    except Exception as exc:
        # Still report the molecule as invalid, but keep the reason in the log
        # instead of discarding it silently.
        logging.warning("RDKit sanitization failed for SMILES %s: %s", SMILES, exc)
        status = "Invalid"
    logging.info("Finished RDKit sanitization check for SMILES: %s", SMILES)
    return status, SMILES

# Example usage
check_rdkit_sanitization("CC(=O)OC1=CC=CC=C1C(=O)N2CCNCC2")

In [None]:
import json
import os

json_data = json.dumps(iteration_data, indent=4)

# The file is named after the SMILES of the first record. SMILES may contain '/'
# and '\' (double-bond stereo markers), which the file system would read as path
# separators, so those two characters are encoded. Names without them are unchanged.
initial_smiles = iteration_data[0]['data']['SMILES']
safe_name = initial_smiles.replace("/", "%2F").replace("\\", "%5C")

# Make sure the target directory exists before writing into it.
results_dir = "../results/Binding"
os.makedirs(results_dir, exist_ok=True)

# Save the JSON data to a file
with open(f"{results_dir}/{safe_name}_guided.json", "w") as json_file:
    json_file.write(json_data)

In [None]:
import matplotlib.pyplot as plt

# Unpack the per-iteration records into parallel lists for plotting.
iterations = [item['iteration'] for item in iteration_data]
mean_scores = [item['data']['Docking Score'] for item in iteration_data]
# std_band_gaps = [item['data']['STD CMC'] for item in iteration_data]
smiles_labels = [item['data']['SMILES'] for item in iteration_data]
chemical_feasibilities = [item['data']['Chemical Feasibility'] for item in iteration_data]

# 'Valid' candidates get color C2, everything else C0.
colors = ['C2' if feas == 'Valid' else 'C0' for feas in chemical_feasibilities]

# Plot with error bars and color-coded points.
# The figure is created exactly once; opening a second one would leave an empty
# canvas behind in the cell output.
plt.figure(dpi=400)
# No spread is available for the scores, so the error bars are all zero.
for i, (x, y, yerr, color) in enumerate(zip(iterations, mean_scores, np.zeros(len(mean_scores)), colors)):
    plt.errorbar(x, y, yerr=yerr, fmt='o', color=color, ecolor='k', capsize=3,
                 linestyle='None', alpha=0.6, markersize=10, label=chemical_feasibilities[i])



vertical_offsets = [(-10 + 5 * (i % 2)) for i in range(len(iterations))]  # Alternating vertical offsets

# Annotate every point with its SMILES, rotated and nudged so neighbours overlap less.
for x, y, label, offset in zip(iterations, mean_scores, smiles_labels, vertical_offsets):
    plt.text(x-0.2, y + offset/100, label, verticalalignment='bottom', horizontalalignment='right', fontsize=4, color='C0', rotation=90)

# Every point was plotted with a label, so collapse repeated labels to get one
# legend entry per distinct feasibility value.
ax=plt.gca()
handles, labels = ax.get_legend_handles_labels()
by_label = dict(zip(labels, handles))

plt.legend(by_label.values(), by_label.keys(), loc="lower left")

plt.xlabel('Agent Iterations')
# NOTE(review): the fixed axis limits hide iterations above 7 and scores outside
# [-11, -5] — confirm they match the run being plotted.
plt.xlim([-0.5,7])
plt.ylim([-11, -5])
plt.ylabel('Docking Score')

# plt.savefig(f"../results/{iteration_data[0]['data']['SMILES']}_guided.png")

In [None]:
## Plotting sequentially for presentation purposes

# from rdkit import Chem
# from rdkit.Chem import Draw
# import matplotlib.pyplot as plt



# from rdkit.Chem import Draw
# from PIL import Image

# def visualize_smiles(smiles, filename):
#     # Convert SMILES to molecule
#     mol = Chem.MolFromSmiles(smiles)
    
#     # Generate image of the molecule with RDKit and a transparent background
#     img = Draw.MolToImage(mol, size=(1000, 1000), kekulize=True, wedgeBonds=True, options=Draw.DrawingOptions())
    
#     # Convert the image to have a transparent background
#     img = img.convert("RGBA")
#     datas = img.getdata()
#     newData = []
#     for item in datas:
#         # Change all white (also shades of whites)
#         # to transparent
#         if item[0] > 200 and item[1] > 200 and item[2] > 200:
#             newData.append((255, 255, 255, 0))
#         else:
#             newData.append(item)
#     img.putdata(newData)
    
#     # Save the image
#     img.save(filename, "PNG")


# import json


# with open(f"../results/CMC/{initial_surfactant}_guided.json", "r") as json_file:
#     iteration_data = json.load(json_file)

# iterations = [item['iteration'] for item in iteration_data]
# mean_cmcs = [item['data']['CMC'] for item in iteration_data]
# std_cmcs = np.zeros(len(mean_cmcs))
# smiles_labels = [item['data']['SMILES'] for item in iteration_data]
# chemical_feasibilities = [item['data']['Chemical Feasibility'] for item in iteration_data]

# plt.figure(dpi=400)

# colors = ['C2' if feas == 'Valid' else 'C0' for feas in chemical_feasibilities]

# # Plot with error bars and color-coded points

# iter = 0

# plt.figure(dpi=400)
# for i, (x, y, yerr, color) in enumerate(zip(iterations, mean_cmcs, np.zeros(len(mean_cmcs)), colors)):
#     plt.errorbar(x, y, yerr=yerr, fmt='o', color=color, ecolor='k', capsize=3,
#                  linestyle='None', alpha=0.6, markersize=10, label=chemical_feasibilities[i])

#     filename = f'../results/CMC/iter_{i}_{smiles_labels[i]}.png'
#     visualize_smiles(smiles_labels[i], filename)
#     if i == iter:
#         break


# vertical_offsets = [(-10 + 5 * (k % 2)) for k in range(len(iterations))]  # Alternating vertical offsets

# for i, (x, y, label, offset) in enumerate(zip(iterations, mean_cmcs, smiles_labels, vertical_offsets)):
#     plt.text(x-0.2, y + offset/100, label, verticalalignment='bottom', horizontalalignment='right', fontsize=7, color='C0', rotation=90)
#     if i == iter:
#         break

# ax=plt.gca()
# handles, labels = ax.get_legend_handles_labels()
# by_label = dict(zip(labels, handles))

# plt.legend(by_label.values(), by_label.keys(), loc="lower left")

# plt.xlabel('Agent Iterations')
# plt.xlim([-0.5,11])
# plt.ylim([-1, 5])
# plt.ylabel('logCMC (μM)')

# plt.savefig(f"../results/CMC/{iter}_{smiles_labels[iter]}_guided.png")

In [None]:
# from rdkit import Chem
# from rdkit.Chem import Draw
# import matplotlib.pyplot as plt



# from rdkit.Chem import Draw
# from PIL import Image

# def visualize_smiles(smiles, filename):
#     # Convert SMILES to molecule
#     mol = Chem.MolFromSmiles(smiles)
    
#     # Generate image of the molecule with RDKit and a transparent background
#     img = Draw.MolToImage(mol, size=(1000, 1000), kekulize=True, wedgeBonds=True, options=Draw.DrawingOptions())
    
#     # Convert the image to have a transparent background
#     img = img.convert("RGBA")
#     datas = img.getdata()
#     newData = []
#     for item in datas:
#         # Change all white (also shades of whites)
#         # to transparent
#         if item[0] > 200 and item[1] > 200 and item[2] > 200:
#             newData.append((255, 255, 255, 0))
#         else:
#             newData.append(item)
#     img.putdata(newData)
    
#     # Save the image
#     img.save(filename, "PNG")

# # Example usage
# filename = f'{initial_surfactant}.png'
# visualize_smiles(initial_surfactant, filename)

In [None]:
# results = agent.invoke(input=f"Look up design guidelines and make changes to the MOF [Co].[O-]C(=O)c1ccncc1 so it will have a higher band gap:\
#             After each change you should first check validity of new SMILES, if valid evaluate the band gap, otherwise revert change and try something else. Do 5 iterations.\n\
#             ")