In [62]:
from langchain import OpenAI
from langchain.agents import Tool, tool
from langchain import agents
import langchain
import os
import warnings
import numpy as np
warnings.filterwarnings('ignore')
import sys
import dziner

langchain.debug = False
# from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI

from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
Embedding_model = 'text-embedding-3-large' 

from langchain.text_splitter import CharacterTextSplitter

In [63]:
import dziner.sascorer
from rdkit import Chem
from rdkit.Chem import Descriptors

@tool
def check_validity(smiles: str):
    '''
    This tool inputs a SMILES and outputs chemical feasibility, SA score, molecular weight and the smiles
    '''
    smiles = smiles.replace("\n", "")
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return "Invalid SMILES", 0, 0, smiles

    # Calculate SA score
    sa_score = dziner.sascorer.calculateScore(mol)

    # Calculate QED score
    molecular_weight = Descriptors.MolWt(mol)

    return "Valid SMILES", sa_score, molecular_weight, smiles

# Example usage
test_smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"
check_validity(test_smiles)

('Valid SMILES',
 1.580039750008826,
 180.15899999999996,
 'CC(=O)OC1=CC=CC=C1C(=O)O')

In [64]:
## This model based on this paper: https://pubs.acs.org/doi/abs/10.1021/acs.jpcb.1c05264
## And is coming from this URL 
## https://github.com/zavalab/ML/blob/master/CMC_GCN/saved_models/gnn_logs_save_202_hu256_lr0.005_best_trainalles_seed4592/ep200bs5lr0.005hu256es.pth.tar

import torch
from dziner.surrogates.CMC.model import predict_single_point

@tool
def predict_cmc(smiles):
    '''
    This tool predicts the mean log Critical Micelle Concentration (LogCMC) and it standard deviation for a SMILES.
    The model is based on a GCN from this paper:
    https://pubs.acs.org/doi/abs/10.1021/acs.jpcb.1c05264
    '''
    base_model_path = "../dziner/surrogates/CMC/training/11-folds/ep200bs5lr0.005kf11hu256cvid{}.pth.tar"
    
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    predictions = []

    # Looping through the ensemble of models with 11-fold cross validation.
    for cvid in range(11):  
        model_path = base_model_path.format(cvid)
        try:
            prediction = predict_single_point(smiles, model_path, device)
            predictions.append(prediction)
        except:
            predictions.append(None)
    
    valid_predictions = [p for p in predictions if p is not None]
    
    if not valid_predictions:
        return 'Invalid molecule', None
    
    mean_prediction = np.mean(valid_predictions)
    std_prediction = np.std(valid_predictions)
    
    return np.round(mean_prediction, 3), np.round(std_prediction, 3)

# Example usage
smiles = 'CCCCCCCCCCC(C)(C)CC(=O)NC1CCOC1=O'
mean_cmc, std_cmc = predict_cmc(smiles)
print(f"Mean CMC: {mean_cmc}, Std CMC: {std_cmc}")

Mean CMC: 2.687, Std CMC: 0.623


In [65]:
from langchain.memory import ConversationBufferMemory

RetrievalQA_prompt = """What are the design guidelines for making the surfactant with a Critical Micelle Concentration? 
    This can be based on making changes to the functional groups or other changes to the molecule. Summarize your answer and cite paper
    by the title, DOI and the journal and year the document was published on."""

@tool
def lookup_papers(prompt):
    '''Useful for getting chemical intuition for the problem.
     This tool looks up design guidelines for molecules with lower Critical Micelle Concentration by looking through research papers.
    It also includes information on the paper citation or DOI.
    '''
    guide_lines = []
    for m in range(6):
        text_splitter = CharacterTextSplitter(
            chunk_size=500, chunk_overlap=20)
        paper_file =f'../data/papers/CMC/{m}.pdf'
        pages = PyPDFLoader(paper_file).load_and_split()
        sliced_pages = text_splitter.split_documents(pages)
        faiss_vectorstore = FAISS.from_documents(sliced_pages, OpenAIEmbeddings(model=Embedding_model))
        
        llm=ChatOpenAI(
                        model_name='gpt-4o',
                        temperature=0.1,
                        )
        g = dziner.RetrievalQABypassTokenLimit(faiss_vectorstore, RetrievalQA_prompt, llm)
        guide_lines.append(g)
        # break
    return " ".join(guide_lines)


# guidel_lines = lookup_papers("")
# guidel_lines

In [24]:
tools = [lookup_papers, check_validity, predict_cmc]

tool_names = [tool.name for tool in tools]  
tool_desc = [tool.description for tool in tools]


initial_surfactant = "CCCCCCCCCC(=O)CC(=O)NC1CCOC1=O" 
Human_prompt = f"Make changes to {initial_surfactant} so it will have a lower Critical Micelle Concentration (CMC). Suggest 20 new candidates."

input_data = {
            "input": Human_prompt,
            "prompt": RetrievalQA_prompt,
            "tools": tools,
            "tool_names": tool_names,
            "tool_desc": tool_desc
        }

In [66]:
FORMAT_INSTRUCTIONS = """
Use the following format:

    Thought: you should always think about what to do and never leave this empty
    Action: the action to take, should be one of the tools [{tool_names}]
    Action Input: the input to the action
    Observation: the result of the action
    ... (this Thought/Action/Action Input/Observation can repeat N times)
    Thought: I know the final answer
    Final Answer: the final answer to the input question.
    The Final Answer MUST come in JSON format with NO extra text.
    
    Example final answer format:

     "Iteration": 0,
      "SMILES": "CC(=O)OC1=CC=C(C=C1)C(=O)N2CCN(CC2)CC3=CC=CC=C3I",
      "Modification": The change made to the SMILES in the previous iteration and the detailed reasoning behind it.
      "logCMC": -9.8,
      "SA Score": 2.01,
      "Molecular Weight": 150,
      "Validity": "Valid"

    Report all iterations, even if the SMILES is invalid. An iteration includes, suggesting a new SMILES, determining its validity, computing its logCMC.
    
    Use the exact SMILES without any newlines or extra parentheses. Do not repeat SMILES that already exist in the iteration history.
    
    You have a final answer once 20 new SMILES candidates are generated and evaluated. 
       

"""

SUFFIX = """Start by:

1. Researching Design Guidelines: Summarize the guidelines in a few bullet points, with citations (e.g., paper DOI) included for any design guidelines used.

2. Evaluating Initial SMILES:

logCMC: Assess the logCMC of the initial SMILES.
Validity: Check the validity of the initial SMILES structure.
3. Modifying SMILES:

Manual Adjustments: Based on the guidelines from Step 1, make manual changes to the SMILES. These changes may involve adding, removing, or replacing elements, functional groups, or rings in the molecule’s core.
Apply different modifications that could enhance affinity without significantly increasing molecular size/weight.
Alignment with Guidelines: Justify how each new SMILES aligns with the design guidelines, such as adding aromatic rings if suggested.
4. Evaluating Modified SMILES:

logCMC: Assess the logCMC of each new SMILES.
Validity: Ensure each new SMILES is valid.

Choose the SMILES with the lowest logCMC.
If no improvement is seen, revert to the previous valid SMILES with the best logCMC and retry Steps 3–4.
Iteration Details:

Iteration 0: Use the initial input SMILES.
Iteration Limit: MUST only stop after generating 20 new SMILES candidates (20 new SMILES MUST be in your final answer).
Molecular Weight: Prioritize changes that keep the molecular weight below 600 g/mol.
Final Steps:

Document All Changes: Even if reverted, all changes must be documented and reported in the end given the format instructions.
Avoid Redundancy: Do not repeat SMILES in the iteration history.
Begin with Problem Description:
Question: {input}
Thought: {agent_scratchpad}
"""

In [67]:
from dziner.agents import dZiner
from dziner.prompts import PREFIX

agent = dZiner(tools, property="Critical Micelle Concentration (CMC)",
               model='gpt-4o', verbose=True, temp=0.3,
              suffix=SUFFIX, format_instructions=FORMAT_INSTRUCTIONS,
              max_tokens=8192 , max_iterations=120, n_design_iterations=25).agent


agent.get_prompts

<bound method Runnable.get_prompts of AgentExecutor(memory=ConversationBufferMemory(output_key='output', input_key='input', return_messages=True, memory_key='chat_history'), verbose=True, tags=['zero-shot-react-description'], agent=ZeroShotAgent(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['agent_scratchpad', 'input'], template='You are a helpful Chemist AI assistant called dZiner. You are tasked to make changes to a molecule based on some design guidelines\n    and optimize the its Critical Micelle Concentration (CMC).\n    Always use the tools to get chemical intuition learn about the design guidelines. If you don\'t know, do not make up answers.\n    Explain changes you make to the molecule in details at each step but do not put redundant information and make it short.\n    \n\nlookup_papers: lookup_papers(prompt) - Useful for getting chemical intuition for the problem.\n     This tool looks up design guidelines for molecules with lower Critical Micelle Concentration by

In [72]:
from langchain.callbacks import get_openai_callback
import contextlib
import sys

langchain.debug = False
output_file_path = "results/3.Surfactant-CMC/CMC-20-it(5a)_log.txt"

class Tee:
    def __init__(self, *streams):
        self.streams = streams

    def write(self, data):
        for stream in self.streams:
            stream.write(data)
            stream.flush()

    def flush(self):
        for stream in self.streams:
            stream.flush()

f = open(output_file_path,'w')

tee = Tee(sys.stdout,f)

with contextlib.redirect_stdout(tee):
    with get_openai_callback() as cb:
        response = agent.invoke(input_data)

    print(cb)

f.close()



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to start by researching design guidelines for lowering the Critical Micelle Concentration (CMC) of molecules. This will provide a foundation for making informed modifications to the initial SMILES.

Action: lookup_papers(prompt)
Action Input: "design guidelines for molecules with lower Critical Micelle Concentration (CMC)"[0m
Observation: lookup_papers(prompt) is not a valid tool, try one of [lookup_papers, check_validity, predict_cmc].
Thought:[32;1m[1;3mI need to start by researching design guidelines for lowering the Critical Micelle Concentration (CMC) of molecules. This will provide a foundation for making informed modifications to the initial SMILES.

Action: lookup_papers
Action Input: "design guidelines for molecules with lower Critical Micelle Concentration (CMC)"[0m
Observation: [36;1m[1;3mThe design guidelines for making surfactants with a low Critical Micelle Concentration (cmc) involve sever

In [73]:
import json
import pandas as pd

output_str = response['output'].replace("```json", "")[:-3]#.replace("\\n", "").replace("\n", "").replace(" ", "").replace("''", "")

# output_str = output_str.replace("''", "")
output_dict = json.loads(output_str)
df = pd.DataFrame(output_dict)
df

Unnamed: 0,Iteration,SMILES,Modification,logCMC,SA Score,Molecular Weight,Validity
0,0,CCCCCCCCCC(=O)CC(=O)NC1CCOC1=O,Initial SMILES,1.785,2.804,297.395,Valid
1,1,CCCCCCCCCCCC(=O)CC(=O)NC1CCOC1=O,Added two carbon atoms to the hydrophobic tail...,1.054,2.805,325.449,Valid
2,2,CCCCCCCCCCC(C)C(=O)CC(=O)NC1CCOC1=O,Introduced a methyl branch at the end of the h...,1.01,3.211,339.476,Valid
3,3,CCCCCCCCCC(CF)C(=O)CC(=O)NC1CCOC1=O,Introduced a fluorine atom in the hydrophobic ...,1.033,3.474,343.439,Valid
4,4,CCCCCCCCCCCCCC(CF)C(=O)CC(=O)NC1CCOC1=O,Increased the hydrophobic tail length by addin...,-0.166,3.449,399.547,Valid
5,5,CCCCCCCCCCCCCC(F)C(F)C(=O)CC(=O)NC1CCOC1=O,Introduced fluorine atoms at different positio...,-0.021,3.75,417.537,Valid
6,6,CCCCCCCCCCCCCC(F)C(F)C(F)C(=O)CC(=O)NC1CCOC1=O,Introduced additional fluorine atoms at differ...,-0.011,4.055,449.554,Valid
7,7,CCCCCCCCCCCCCC(F)C(F)C(F)C(F)C(=O)CC(=O)NC1CCO...,Introduced additional fluorine atoms at differ...,0.013,4.338,481.571,Valid
8,8,CCCCCCCCCCCCCCC(F)C(F)C(F)C(F)C(=O)CC(=O)NC1CC...,Added an additional carbon atom to the hydroph...,-0.172,4.334,495.598,Valid
9,9,CCCCCCCCCCCCCCC(F)C(F)C(F)C(F)C(F)C(=O)CC(=O)N...,Introduced additional fluorine atoms at differ...,-0.133,4.588,527.615,Valid


In [74]:
df.to_csv('results/3.Surfactant-CMC/CMC-20-it(5a).csv', index=False)

In [11]:
# # Plotting sequntially for presentation purposes

# from rdkit import Chem
# from rdkit.Chem import Draw
# import matplotlib.pyplot as plt



# from rdkit.Chem import Draw
# from PIL import Image

# def visualize_smiles(smiles, filename):
#     # Convert SMILES to molecule
#     mol = Chem.MolFromSmiles(smiles)
    
#     # Generate image of the molecule with RDKit and a transparent background
#     img = Draw.MolToImage(mol, size=(1000, 1000), kekulize=True, wedgeBonds=True, options=Draw.DrawingOptions())
    
#     # Convert the image to have a transparent background
#     img = img.convert("RGBA")
#     datas = img.getdata()
#     newData = []
#     for item in datas:
#         # Change all white (also shades of whites)
#         # to transparent
#         if item[0] > 200 and item[1] > 200 and item[2] > 200:
#             newData.append((255, 255, 255, 0))
#         else:
#             newData.append(item)
#     img.putdata(newData)
    
#     # Save the image
#     img.save(filename, "PNG")


# import json


# with open(f"../results/CMC/{initial_surfactant}_guided.json", "r") as json_file:
#     iteration_data = json.load(json_file)

# iterations = [item['iteration'] for item in iteration_data]
# mean_cmcs = [item['data']['CMC'] for item in iteration_data]
# std_cmcs = np.zeros(len(mean_cmcs))
# smiles_labels = [item['data']['SMILES'] for item in iteration_data]
# chemical_feasibilities = [item['data']['Chemical Feasibility'] for item in iteration_data]

# plt.figure(dpi=400)

# colors = ['C2' if feas == 'Valid' else 'C0' for feas in chemical_feasibilities]

# # Plot with error bars and color-coded points

# iter = 0

# plt.figure(dpi=400)
# for i, (x, y, yerr, color) in enumerate(zip(iterations, mean_cmcs, np.zeros(len(mean_cmcs)), colors)):
#     plt.errorbar(x, y, yerr=yerr, fmt='o', color=color, ecolor='k', capsize=3,
#                  linestyle='None', alpha=0.6, markersize=10, label=chemical_feasibilities[i])

#     filename = f'../results/CMC/iter_{i}_{smiles_labels[i]}.png'
#     visualize_smiles(smiles_labels[i], filename)
#     if i == iter:
#         break


# vertical_offsets = [(-10 + 5 * (k % 2)) for k in range(len(iterations))]  # Alternating vertical offsets

# for i, (x, y, label, offset) in enumerate(zip(iterations, mean_cmcs, smiles_labels, vertical_offsets)):
#     plt.text(x-0.2, y + offset/100, label, verticalalignment='bottom', horizontalalignment='right', fontsize=7, color='C0', rotation=90)
#     if i == iter:
#         break

# ax=plt.gca()
# handles, labels = ax.get_legend_handles_labels()
# by_label = dict(zip(labels, handles))

# plt.legend(by_label.values(), by_label.keys(), loc="lower left")

# plt.xlabel('Agent Iterations')
# plt.xlim([-0.5,11])
# plt.ylim([-1, 5])
# plt.ylabel('logCMC (μM)')

# plt.savefig(f"../results/CMC/{iter}_{smiles_labels[iter]}_guided.png")

In [12]:
# from rdkit import Chem
# from rdkit.Chem import Draw
# import matplotlib.pyplot as plt



# from rdkit.Chem import Draw
# from PIL import Image

# def visualize_smiles(smiles, filename):
#     # Convert SMILES to molecule
#     mol = Chem.MolFromSmiles(smiles)
    
#     # Generate image of the molecule with RDKit and a transparent background
#     img = Draw.MolToImage(mol, size=(1000, 1000), kekulize=True, wedgeBonds=True, options=Draw.DrawingOptions())
    
#     # Convert the image to have a transparent background
#     img = img.convert("RGBA")
#     datas = img.getdata()
#     newData = []
#     for item in datas:
#         # Change all white (also shades of whites)
#         # to transparent
#         if item[0] > 200 and item[1] > 200 and item[2] > 200:
#             newData.append((255, 255, 255, 0))
#         else:
#             newData.append(item)
#     img.putdata(newData)
    
#     # Save the image
#     img.save(filename, "PNG")

# # Example usage
# filename = f'{initial_surfactant}.png'
# visualize_smiles(initial_surfactant, filename)