In [1]:
from langchain import OpenAI
from langchain.agents import Tool, tool
from langchain import agents
import langchain
import os
import warnings
import numpy as np
warnings.filterwarnings('ignore')
import sys
import dziner

langchain.debug = False
# from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI

from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
Embedding_model = 'text-embedding-3-large' 

from langchain.text_splitter import CharacterTextSplitter

In [2]:
import dziner.sascorer
from rdkit import Chem
from rdkit.Chem import QED
from rdkit.Chem import Descriptors

@tool
def check_validity(smiles: str):
    '''
    This tool inputs a SMILES and outputs chemical feasibility, SA scores, molecular weight and the smiles
    '''
    smiles = smiles.replace("\n", "")
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return "Invalid SMILES", 0, 0, smiles

    # Calculate SA score
    sa_score = dziner.sascorer.calculateScore(mol)

    # Calculate QED score
    qed_score = QED.qed(mol)
    molecular_weight = Descriptors.MolWt(mol)

    return "Valid SMILES", sa_score, molecular_weight, smiles

# Example usage
test_smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"
check_validity(test_smiles)

('Valid SMILES',
 1.580039750008826,
 180.15899999999996,
 'CC(=O)OC1=CC=CC=C1C(=O)O')

In [3]:
## This model is based on this paper: https://pubs.acs.org/doi/10.1021/jacs.2c11420 fine-tuned on the hmof dataset

from dziner.surrogates.MOFormer.CO2_adsorption import predict_CO2_adsorption_with_uncertainty
import torch

@tool
def predict_CO2_adsoprtion(smiles):
    '''
    This tool predicts the mean CO2 Adsorption and the standard deviation in prediction for an input SMILES.
    This model is based on this paper: https://pubs.acs.org/doi/10.1021/jacs.2c11420 fine-tuned on the hmof dataset.
    '''
    smiles = smiles.replace("\n", "")
    mean_prediction, std_prediction = predict_CO2_adsorption_with_uncertainty(smiles)    
    return np.round(mean_prediction, 3), np.round(std_prediction, 3)
initial_smiles = 'COc1cc(C(=O)[O-])c(c(c1c1ccc(cc1)c1ccc(cc1)C(=O)[O-])OC)[O]'
predict_CO2_adsoprtion(initial_smiles)

(1.443, 0.135)

In [4]:
# from dziner.surrogates.CO_2.model import predict_single_point

# def predict_CO2_adsoprtion(smiles):
#     '''
#     This tool predicts the mean CO2 Adsorption and it standard deviation for a SMILES.
#     This model is based on this paper: https://pubs.acs.org/doi/10.1021/jacs.2c11420 fine-tuned on the hmof dataset
#     '''
#     base_model_path = "../dziner/surrogates/CO_2/hmof_finetuned_models/hmof_finetuned_{}.pth"
    
#     device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#     predictions = []

#     # Looping through the ensemble of models with 5-fold cross validation.
#     for cvid in range(5):  
#         model_path = base_model_path.format(cvid)
#         print(model_path)
#         inference_model = torch.load(model_path)
# #         try:
#         # prediction = predict_single_point(smiles, model_path, device)
#         prediction = SMILES_to_property(smiles, model=inference_model)
#         predictions.append(prediction)
# #         except:
# #             predictions.append(None)
    
#     valid_predictions = [p for p in predictions if p is not None]
    
#     if not valid_predictions:
#         return 'Invalid molecule', None
    
#     mean_prediction = np.mean(valid_predictions)
#     std_prediction = np.std(valid_predictions)
    
#     return np.round(mean_prediction, 3), np.round(std_prediction, 3)

In [5]:
# initial_smiles = '[O-]C(=O)c1ccc(cc1)c1nnc(nn1)c1ccc(cc1)C(=O)[O-]'
# predict_CO2_adsoprtion(initial_smiles)

In [6]:
from langchain.memory import ConversationBufferMemory

RetrievalQA_prompt = """What are the design guidelines for the making the organic linker adsorb CO2 better? 
    This can be based on making changes to the elements or the functional groups or other changes to the molecule. Summarize your answer and cite paper
    by the title, DOI and the journal and year the document was published on."""

@tool
def lookup_papers(prompt):
    '''Useful for getting chemical intuition for the problem.
    This tool looks up design guidelines for organic linkers with higher CO2 adsorption by looking through research papers.
    It also includes information on the paper citation or DOI.
    '''
    guide_lines = []
    for m in range(4):
        text_splitter = CharacterTextSplitter(
            chunk_size=500, chunk_overlap=20)
        paper_file =f'../data/papers/CO2_adsorption/{m}.pdf'
        pages = PyPDFLoader(paper_file).load_and_split()
        sliced_pages = text_splitter.split_documents(pages)
        faiss_vectorstore = FAISS.from_documents(sliced_pages, OpenAIEmbeddings(model=Embedding_model))
        
        llm=ChatOpenAI(
                        model_name='gpt-4o',
                        temperature=0.3,
                        )
        g = dziner.RetrievalQABypassTokenLimit(faiss_vectorstore, RetrievalQA_prompt, llm)
        guide_lines.append(g)
        # break
    return " ".join(guide_lines)


# guidel_lines = lookup_papers("")
# guidel_lines

In [7]:
tools = [lookup_papers, check_validity, predict_CO2_adsoprtion]

tool_names = [tool.name for tool in tools]  
tool_desc = [tool.description for tool in tools]
initial_organic_linker =  "Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)N)C(=O)[O-])N(C)C#N" #'Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)N)C(=O)[O-])NS(=O)(=O)O'
Human_prompt = f"Make changes to the organic linker {initial_organic_linker} to increase its CO2 adsroption. Suggest 20 new candidates."

input_data = {
            "input": Human_prompt,
            "prompt": RetrievalQA_prompt,
            "tools": tools,
            "tool_names": tool_names,
            "tool_desc": tool_desc
        }

In [8]:
from dziner.agents import dZiner

FORMAT_INSTRUCTIONS = """
Use the following format:

    Thought: you should always think about what to do and never leave this empty
    Action: the action to take, should be one of the tools [{tool_names}]
    Action Input: the input to the action
    Observation: the result of the action
    ... (this Thought/Action/Action Input/Observation can repeat N times)
    Thought: I know the final answer (when you have 20 new SMILES candidates)
    Final Answer: the final answer to the input question.
    The Final Answer MUST come in JSON format with NO extra text.
    
    Example final answer format:

     "Iteration": 0,
      "SMILES": "CC(=O)OC1=CC=C(C=C1)C(=O)N2CCN(CC2)CC3=CC=CC=C3I",
      "Modification": The change made to the SMILES in the previous iteration and the detailed reasoning behind it.
      "CO2 Adsorption": 1.2,
      "Uncertainty": 0.14
      "SA Score": 2.01,
      "Molecular Weight": 150,
      "Validity": "Valid"

    Report all iterations, even if the SMILES is invalid. An iteration includes, suggesting a new SMILES, determining its validity, computing its CO2 Adsorption.
    
    Use the exact SMILES without any newlines or extra parentheses. Do not repeat SMILES that already exist in the iteration history.
    
    You have a final answer once 20 new SMILES candidates are generated and evaluated. 
       

"""

SUFFIX = """Start by:

1. Researching Design Guidelines: Summarize the guidelines in a few bullet points, with citations (e.g., paper DOI) included for any design guidelines used.

2. Evaluating Initial SMILES:

CO2 Adsorption: Assess the CO2 Adsorption of the initial SMILES  the uncertainty.
Validity: Check the validity of the initial SMILES structure.
3. Modifying SMILES:

Manual Adjustments: Strictly based on the guidelines from Step 1, make manual changes to the SMILES with best CO2 adsorption. These changes may involve adding, removing, or replacing elements, functional groups, or rings in the molecule’s core.
Apply different modifications to the SMILES that could enhance CO2 adsorption without significantly increasing molecular size/weight.
If an adjustment improves CO2 adsorption compared to the original initial molecule, use the new SMILES structure as the starting point for further changes.
Forbidden adjustments: DO NOT add anthing like Nitrosylated amines, chloroamines, fluoroamines and Trifluoromethyl amines (R-NH-CF3)
Alignment with Guidelines: Justify in great details how each new SMILES aligns with the design guidelines, such as adding aromatic rings if suggested.
4. Evaluating Modified SMILES:

CO2 Adsorption: Assess the CO2 Adsorption of each new SMILES and the uncertainty.
Validity: Ensure each new SMILES is valid.

If no improvement is seen, revert to the previous valid SMILES with the best CO2 Adsorption and retry Steps 3–4.
Iteration Details:

Iteration 0: Use the initial input SMILES.
Iteration Limit: MUST only stop after generating 20 new SMILES candidates (20 new SMILES MUST be in your final answer).
Molecular Weight: Prioritize changes that keep the molecular weight below 600 g/mol.
Final Steps:

Document All Changes: Even if reverted, all changes must be documented and reported in the end given the format instructions.
Avoid Redundancy: Do not repeat SMILES in the iteration history.
Begin with Problem Description:
Question: {input}
Thought: {agent_scratchpad}
"""


In [14]:
from dziner.agents import dZiner
from dziner.prompts import PREFIX

from langchain_anthropic import ChatAnthropic
# model = ChatAnthropic(model="claude-3-5-sonnet-20240620", api_key=key, temperature=0.3)

from langchain_openai import ChatOpenAI
# model = ChatOpenAI(
#                     temperature=0.1,
#                     model_name='chatgpt-4o-latest',
#                     request_timeout=1000,
#                     max_tokens=8192,
#                 )

agent = dZiner(tools, property="CO2 Adsorption",
               model='gpt-4o-2024-08-06', verbose=True, temp=0.3,
              suffix=SUFFIX, format_instructions=FORMAT_INSTRUCTIONS,
              max_tokens=8192 , max_iterations=120, n_design_iterations=25).agent


agent.get_prompts

<bound method Runnable.get_prompts of AgentExecutor(memory=ConversationBufferMemory(output_key='output', input_key='input', memory_key='chat_history'), verbose=True, tags=['zero-shot-react-description'], agent=ZeroShotAgent(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['agent_scratchpad', 'input'], template='You are a helpful Chemist AI assistant called dZiner. You are tasked to make changes to a molecule based on some design guidelines\n    and optimize the its CO2 Adsorption.\n    Always use the tools to get chemical intuition learn about the design guidelines. If you don\'t know, do not make up answers.\n    Explain changes you make to the molecule in details at each step but do not put redundant information and make it short.\n    \n\nlookup_papers(prompt) - Useful for getting chemical intuition for the problem.\n    This tool looks up design guidelines for organic linkers with higher CO2 adsorption by looking through research papers.\n    It also includes information o

In [19]:

from langchain.callbacks import get_openai_callback
import contextlib
import sys

langchain.debug = False
test = 4
output_file_path = f"results/2.MOF-CO2-adsorption/CO2-20-it({test}a)_log.txt"

class Tee:
    def __init__(self, *streams):
        self.streams = streams

    def write(self, data):
        for stream in self.streams:
            stream.write(data)
            stream.flush()

    def flush(self):
        for stream in self.streams:
            stream.flush()

f = open(output_file_path,'w')

tee = Tee(sys.stdout,f)

with contextlib.redirect_stdout(tee):
    with get_openai_callback() as cb:
        response = agent.invoke(input_data)

    print(cb)

f.close()



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo begin, I need to research design guidelines for organic linkers with higher CO2 adsorption. This will help me make informed modifications to the initial SMILES structure.

Action: lookup_papers
Action Input: "design guidelines for organic linkers with higher CO2 adsorption"[0m
Observation: [36;1m[1;3mTo design organic linkers that adsorb CO2 better, several strategies can be employed:

1. **Functionalization with Polar Groups**: Introducing functional groups such as OSO3H, NO2, OH, NH2, and N=C(R) can significantly enhance CO2 adsorption. These groups improve the selective adsorption of CO2 over other gases and increase the stability of the framework.

2. **Pore Size Optimization**: Engineering the pore size and shape to match the kinetic diameter of CO2 molecules can enhance selective adsorption. This involves designing MOFs with precise pore engineering to separate gas mixtures effectively.

3. **Acidity-Basicity Bala

[16:52:33] SMILES Parse Error: syntax error while parsing: Li.Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)N)C(=O)[O-])Nc2ncc(cc2)N(=O)=O
[16:52:33] SMILES Parse Error: Failed parsing SMILES 'Li.Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)N)C(=O)[O-])Nc2ncc(cc2)N(=O)=O' for input: 'Li.Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)N)C(=O)[O-])Nc2ncc(cc2)N(=O)=O'


[32;1m[1;3mThe attempt to incorporate a lithium atom resulted in an invalid SMILES. I will revert to the previous valid SMILES with the best CO2 adsorption and continue making modifications based on the guidelines.

Action: Modify the SMILES by optimizing the pore size through structural changes to enhance selective CO2 adsorption.
Action Input: Modify "Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)N)C(=O)[O-])Nc2ccc(cc2)N" to optimize pore size.[0m
Observation: Modify the SMILES by optimizing the pore size through structural changes to enhance selective CO2 adsorption. is not a valid tool, try one of [lookup_papers, check_validity, predict_CO2_adsoprtion].
Thought:[32;1m[1;3mTo optimize the pore size, I will manually modify the SMILES by adjusting the structure to potentially enhance selective CO2 adsorption. After making the modification, I will check the validity of the new SMILES.

Action: check_validity
Action Input: "Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)N)C(=O)[O-])

In [20]:
import json
import pandas as pd

output_str = response['output'].replace("```json", "")[:-3]#.replace("\\n", "").replace("\n", "").replace(" ", "").replace("''", "")
output_str
output_str = output_str.replace("''", "")
output_dict = json.loads(output_str)
df = pd.DataFrame(output_dict)
df

Unnamed: 0,Iteration,SMILES,Modification,CO2 Adsorption,Uncertainty,SA Score,Molecular Weight,Validity
0,0,Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)...,Initial SMILES,0.885,0.033,3.69,430.42,Valid
1,1,Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)...,Added an aromatic amine group to enhance CO2 i...,1.428,0.174,3.3,482.5,Valid
2,2,Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)...,Introduced an OH group to enhance CO2 adsorption.,1.358,0.12,3.29,483.48,Valid
3,3,Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)...,Introduced a nitrogen heteroatom for additiona...,1.217,0.071,3.46,484.47,Valid
4,4,Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)...,Introduced a NO2 group to enhance CO2 adsorption.,1.062,0.057,3.46,513.47,Valid
5,5,Li.Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c...,Attempted to incorporate a lithium atom for me...,0.0,0.0,0.0,0.0,Invalid
6,6,Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)...,Optimized pore size through structural changes.,1.449,0.136,3.2,467.49,Valid
7,7,Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)...,Introduced a sulfonic acid group (OSO3H) to en...,1.101,0.105,3.4,547.55,Valid
8,8,Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)...,Introduced a hydrophobic group to improve CO2 ...,1.398,0.066,3.34,509.57,Valid
9,9,Nc1cc(C(=O)[O-])c(cc1c1ccc(cc1)c1c(N)cc(c(c1N)...,Introduced another nitrogen heteroatom for add...,1.253,0.051,3.49,510.55,Valid


In [21]:
csv_file_path = f"results/2.MOF-CO2-adsorption/CO2-20-it({test}a).csv"

df.to_csv(csv_file_path, index=False)

In [None]:
# import matplotlib.pyplot as plt

# iterations = [item['iteration'] for item in iteration_data]
# mean_scores = [item['data']['Docking Score'] for item in iteration_data]
# # std_band_gaps = [item['data']['STD CMC'] for item in iteration_data]
# smiles_labels = [item['data']['SMILES'] for item in iteration_data]
# chemical_feasibilities = [item['data']['Chemical Feasibility'] for item in iteration_data]

# plt.figure(dpi=400)

# colors = ['C2' if feas == 'Valid' else 'C0' for feas in chemical_feasibilities]

# # Plot with error bars and color-coded points
# plt.figure(dpi=400)
# for i, (x, y, yerr, color) in enumerate(zip(iterations, mean_scores, np.zeros(len(mean_scores)), colors)):
#     plt.errorbar(x, y, yerr=yerr, fmt='o', color=color, ecolor='k', capsize=3,
#                  linestyle='None', alpha=0.6, markersize=10, label=chemical_feasibilities[i])



# vertical_offsets = [(-10 + 5 * (i % 2)) for i in range(len(iterations))]  # Alternating vertical offsets

# for x, y, label, offset in zip(iterations, mean_scores, smiles_labels, vertical_offsets):
#     plt.text(x-0.2, y + offset/100, label, verticalalignment='bottom', horizontalalignment='right', fontsize=4, color='C0', rotation=90)

# ax=plt.gca()
# handles, labels = ax.get_legend_handles_labels()
# by_label = dict(zip(labels, handles))

# plt.legend(by_label.values(), by_label.keys(), loc="lower left")

# plt.xlabel('Agent Iterations')
# plt.xlim([-0.5,7])
# plt.ylim([-11, -5])
# plt.ylabel('Docking Score')

# # plt.savefig(f"../results/{iteration_data[0]['data']['SMILES']}_guided.png")

In [None]:
## Plotting sequntially for presentation purposes

# from rdkit import Chem
# from rdkit.Chem import Draw
# import matplotlib.pyplot as plt



# from rdkit.Chem import Draw
# from PIL import Image

# def visualize_smiles(smiles, filename):
#     # Convert SMILES to molecule
#     mol = Chem.MolFromSmiles(smiles)
    
#     # Generate image of the molecule with RDKit and a transparent background
#     img = Draw.MolToImage(mol, size=(1000, 1000), kekulize=True, wedgeBonds=True, options=Draw.DrawingOptions())
    
#     # Convert the image to have a transparent background
#     img = img.convert("RGBA")
#     datas = img.getdata()
#     newData = []
#     for item in datas:
#         # Change all white (also shades of whites)
#         # to transparent
#         if item[0] > 200 and item[1] > 200 and item[2] > 200:
#             newData.append((255, 255, 255, 0))
#         else:
#             newData.append(item)
#     img.putdata(newData)
    
#     # Save the image
#     img.save(filename, "PNG")


# import json


# with open(f"../results/CMC/{initial_surfactant}_guided.json", "r") as json_file:
#     iteration_data = json.load(json_file)

# iterations = [item['iteration'] for item in iteration_data]
# mean_cmcs = [item['data']['CMC'] for item in iteration_data]
# std_cmcs = np.zeros(len(mean_cmcs))
# smiles_labels = [item['data']['SMILES'] for item in iteration_data]
# chemical_feasibilities = [item['data']['Chemical Feasibility'] for item in iteration_data]

# plt.figure(dpi=400)

# colors = ['C2' if feas == 'Valid' else 'C0' for feas in chemical_feasibilities]

# # Plot with error bars and color-coded points

# iter = 0

# plt.figure(dpi=400)
# for i, (x, y, yerr, color) in enumerate(zip(iterations, mean_cmcs, np.zeros(len(mean_cmcs)), colors)):
#     plt.errorbar(x, y, yerr=yerr, fmt='o', color=color, ecolor='k', capsize=3,
#                  linestyle='None', alpha=0.6, markersize=10, label=chemical_feasibilities[i])

#     filename = f'../results/CMC/iter_{i}_{smiles_labels[i]}.png'
#     visualize_smiles(smiles_labels[i], filename)
#     if i == iter:
#         break


# vertical_offsets = [(-10 + 5 * (k % 2)) for k in range(len(iterations))]  # Alternating vertical offsets

# for i, (x, y, label, offset) in enumerate(zip(iterations, mean_cmcs, smiles_labels, vertical_offsets)):
#     plt.text(x-0.2, y + offset/100, label, verticalalignment='bottom', horizontalalignment='right', fontsize=7, color='C0', rotation=90)
#     if i == iter:
#         break

# ax=plt.gca()
# handles, labels = ax.get_legend_handles_labels()
# by_label = dict(zip(labels, handles))

# plt.legend(by_label.values(), by_label.keys(), loc="lower left")

# plt.xlabel('Agent Iterations')
# plt.xlim([-0.5,11])
# plt.ylim([-1, 5])
# plt.ylabel('logCMC (μM)')

# plt.savefig(f"../results/CMC/{iter}_{smiles_labels[iter]}_guided.png")

In [None]:
# from rdkit import Chem
# from rdkit.Chem import Draw
# import matplotlib.pyplot as plt



# from rdkit.Chem import Draw
# from PIL import Image

# def visualize_smiles(smiles, filename):
#     # Convert SMILES to molecule
#     mol = Chem.MolFromSmiles(smiles)
    
#     # Generate image of the molecule with RDKit and a transparent background
#     img = Draw.MolToImage(mol, size=(1000, 1000), kekulize=True, wedgeBonds=True, options=Draw.DrawingOptions())
    
#     # Convert the image to have a transparent background
#     img = img.convert("RGBA")
#     datas = img.getdata()
#     newData = []
#     for item in datas:
#         # Change all white (also shades of whites)
#         # to transparent
#         if item[0] > 200 and item[1] > 200 and item[2] > 200:
#             newData.append((255, 255, 255, 0))
#         else:
#             newData.append(item)
#     img.putdata(newData)
    
#     # Save the image
#     img.save(filename, "PNG")

# # Example usage
# filename = f'{initial_surfactant}.png'
# visualize_smiles(initial_surfactant, filename)