In [1]:
from langchain import OpenAI
from langchain.agents import AgentType, initialize_agent
from langchain.agents import Tool, tool
from langchain import agents
import langchain
import os
import requests
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings('ignore')
import sys
# sys.path.append('transformer/')


from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI

from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
Embedding_model = 'text-embedding-ada-002' 

from langchain.text_splitter import CharacterTextSplitter


In [2]:
import urllib.request
import tensorflow as tf

# urllib.request.urlretrieve(
#     "https://github.com/ur-whitelab/peptide-dashboard/tree/master/models/hemo-rnn/keras_model/model.json",
#     "../dziner/surrogates/peptides/hemo_model.json",
# )

# urllib.request.urlretrieve(
#     "https://github.com/ur-whitelab/peptide-dashboard/tree/master/models/hemo-rnn/keras_model/model_weights.h5",
#     "../dziner/surrogates/peptides/hemo_model_weights.h5",
# )

# json_file = open('../dziner/surrogates/peptides/hemo_model.json', 'r')

# loaded_model_json = json_file.read()
# json_file.close()
# loaded_model = tf.keras.models.model_from_json(loaded_model_json)
# # load weights into new model
# loaded_model.load_weights('../dziner/surrogates/peptides/hemo_model_weights.h5') 
# print("Loaded model from disk")

def encoder(seq, max_length=190):
    """Integer-encode a peptide sequence into a zero-padded row vector.

    Each residue is replaced by its 1-based position in the fixed 20-letter
    amino-acid alphabet below; positions past the end of the sequence stay 0.

    Args:
        seq: Peptide sequence as a string (or any iterable) of one-letter
            residue codes. Leading/trailing whitespace on a string is ignored.
        max_length: Length of the padded vector.

    Returns:
        A float numpy array of shape (1, max_length).

    Raises:
        ValueError: If the sequence is longer than ``max_length`` or contains
            a symbol that is not in the alphabet.
    """
    alphabet = ['A','R','N','D','C','Q','E','G','H','I', 'L','K','M','F','P','S','T','W','Y','V']
    # Dict lookup instead of a linear alphabet.index() scan per residue.
    code_of = {residue: code for code, residue in enumerate(alphabet, start=1)}

    # Strings coming from an LLM often carry stray spaces/newlines; case is left
    # untouched on purpose so lower-case input is still rejected.
    residues = list(seq.strip()) if isinstance(seq, str) else list(seq)

    if len(residues) > max_length:
        raise ValueError(
            f"Sequence length {len(residues)} exceeds max_length={max_length}."
        )

    vec = np.zeros(max_length)
    for i, l in enumerate(residues):
        if l not in code_of:
            raise ValueError(
                f"Unknown residue {l!r} at position {i}; expected one of {''.join(alphabet)}."
            )
        vec[i] = code_of[l]
    # Add a leading axis so the result is a one-row batch.
    return vec[np.newaxis, ...]

def counts_aa(vec):
    """Return the relative frequency of each non-zero bin of an encoded vector.

    The values are histogrammed over the range [0, 20] into 21 bins; the first
    bin is discarded (presumably the 0 padding written by the encoder) and the
    remaining 20 counts are divided by their sum.
    """
    histogram = tf.histogram_fixed_width(vec, [0, 20], nbins=21)
    residue_counts = histogram[1:]
    total = tf.reduce_sum(residue_counts)
    return residue_counts / total

# Short example sequence; only the commented-out manual check below refers to it.
seq = 'ARNDC'
# encoded_seq = encoder(seq)

# yhat_test =  loaded_model.predict([encoded_seq, np.array([counts_aa(xi) for xi in encoded_seq])])

# Loaded Keras models keyed by (architecture path, weights path), so the network
# is deserialized from disk once instead of on every tool call made by the agent.
_HEMOLYSIS_MODEL_CACHE = {}


def _load_hemolysis_model(json_path='../dziner/surrogates/peptides/hemo_model.json',
                          weights_path='../dziner/surrogates/peptides/hemo.weights.h5'):
    """Return the hemolysis Keras model, reading it from disk on first use only."""
    key = (json_path, weights_path)
    if key not in _HEMOLYSIS_MODEL_CACHE:
        # `with` guarantees the file handle is closed even if read() fails.
        with open(json_path, 'r') as json_file:
            loaded_model_json = json_file.read()
        loaded_model = tf.keras.models.model_from_json(loaded_model_json)
        # load weights into new model
        loaded_model.load_weights(weights_path)
        _HEMOLYSIS_MODEL_CACHE[key] = loaded_model
    return _HEMOLYSIS_MODEL_CACHE[key]


@tool
def predict_hemolysis(sequence):
    '''
    This tool predicts the probability of the hemolytic activity of a peptide sequence.
    '''
    # NOTE: the docstring above is used by @tool as the tool description shown to
    # the LLM, so it is kept verbatim; implementation notes live in comments.
    loaded_model = _load_hemolysis_model()
    encoded_seq = encoder(sequence)

    # The model takes two inputs: the padded integer encoding and, for each row
    # of that encoding, the normalized residue counts from counts_aa.
    composition = np.array([counts_aa(xi) for xi in encoded_seq])
    # predict() returns one row per input sequence; take the single value of the
    # first (and only) row.
    hemolytic_probability = loaded_model.predict([encoded_seq, composition])[0][0]
    return hemolytic_probability

# Smoke test: run the tool once on the sequence that is later used as the
# agent's starting point; the cell displays the returned value.
predict_hemolysis('FLHSIGKAIGRLLR')

2024-07-31 11:51:42.427980: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-31 11:51:42.461048: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-31 11:51:43.866883: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-31 11:51:43.999005: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libr

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 498ms/step


2024-07-31 11:51:44.636774: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


0.6116909

In [3]:
from langchain.memory import ConversationBufferMemory

# Fixed question passed to the retrieval QA chain for each paper in lookup_papers
# (also forwarded to the agent under the "prompt" key of input_data).
RetrievalQA_prompt = """What are the design guidelines for making a peptide with a lower hemolytic activity? 
    This can be based on making changes to the amino acid residues within the sequence. Summarize answer"""
def RetrievalQABypassTokenLimit(vector_store, RetrievalQA_prompt, llm, k=10, fetch_k=50, min_k=4, chain_type="stuff"):
    """Run a RetrievalQA query, shrinking the number of retrieved chunks on failure.

    Starting from ``k`` retrieved documents (MMR search over ``fetch_k``
    candidates), the query is attempted and, whenever the attempt raises, ``k``
    is decreased by one and the query retried until ``k`` drops below ``min_k``.

    Args:
        vector_store: Vector store exposing ``as_retriever``.
        RetrievalQA_prompt: Question passed to the chain under the "query" key.
        llm: Language model handed to ``RetrievalQA.from_chain_type``.
        k: Initial number of documents to retrieve.
        fetch_k: Number of candidates considered by the MMR search.
        min_k: Smallest ``k`` that will be tried.
        chain_type: Chain type forwarded to ``RetrievalQA.from_chain_type``.

    Returns:
        The chain's answer, or ``None`` if every attempt failed.
    """
    while k >= min_k:
        try:
            retriever = vector_store.as_retriever(
                search_type="mmr",
                search_kwargs={"k": k, "fetch_k": fetch_k},
            )
            qa_chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type=chain_type,
                retriever=retriever,
                memory=ConversationBufferMemory(),
            )
            # If this succeeds we are within the token limit; return immediately.
            return qa_chain.run({"query": RetrievalQA_prompt})
        except Exception as e:
            # Any failure is treated as "too much context" (best effort), but the
            # real error is surfaced so that e.g. authentication or network
            # problems are not mistaken for a token-limit issue.
            print(
                f"\nRetrieval QA failed with k={k} ({type(e).__name__}: {e}). "
                "This is usually the token limit for the chosen LLM. Reducing k and retrying...\n"
            )
            k -= 1

    # Every k down to min_k failed: say so instead of silently returning None.
    print(f"\nRetrieval QA gave up: no attempt succeeded down to min_k={min_k}.\n")
    return None

@tool
def lookup_papers(prompt):
    '''This tool looks up design guidelines for a peptides with lower hemolytic activity by looking through research papers.
    '''
    # NOTE: the docstring above is used by @tool as the tool description shown to
    # the LLM, so it is kept verbatim.
    # NOTE(review): `prompt` is accepted (the agent supplies it) but is not used;
    # the query is always the module-level RetrievalQA_prompt. Confirm intent.

    # Loop-invariant helpers are built once rather than once per paper.
    text_splitter = CharacterTextSplitter(
        chunk_size=500, chunk_overlap=20)
    embeddings = OpenAIEmbeddings(model=Embedding_model)
    llm = ChatOpenAI(
        model_name='gpt-4o',
        temperature=0.1,
    )

    guide_lines = []
    # Papers are expected at ../data/papers/Peptides/0.pdf and 1.pdf.
    for m in range(2):
        paper_file = f'../data/papers/Peptides/{m}.pdf'
        pages = PyPDFLoader(paper_file).load_and_split()
        sliced_pages = text_splitter.split_documents(pages)
        # One FAISS index per paper, so each paper yields its own summary.
        faiss_vectorstore = FAISS.from_documents(sliced_pages, embeddings)
        g = RetrievalQABypassTokenLimit(faiss_vectorstore, RetrievalQA_prompt, llm)
        guide_lines.append(g)
    return guide_lines

# guidel_lines = lookup_papers("")
# guidel_lines

In [4]:
# Tools exposed to the agent: literature lookup and the surrogate predictor.
tools = [lookup_papers, predict_hemolysis]

# Names and descriptions, in the same order as `tools`.
tool_names = [registered.name for registered in tools]
tool_desc = [registered.description for registered in tools]

# Starting peptide and the task statement given to the agent.
initial_sequence = "FLHSIGKAIGRLLR"
Human_prompt = f"Make changes to {initial_sequence} so it will have a lower hemolytic activity"

# Payload passed to agent.invoke.
input_data = dict(
    input=Human_prompt,
    prompt=RetrievalQA_prompt,
    tools=tools,
    tool_names=tool_names,
    tool_desc=tool_desc,
)

In [20]:
# Format instructions for the agent; {tool_names} is a template placeholder.
# The backslash in "\\n" is escaped so the prompt shows a literal \n rather than
# an actual line break between the quotes.
FORMAT_INSTRUCTIONS = """Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format:
'''
Thought: Here's your final answer:
Final Answer: [your response here]
'''

Use the exact sequence without any "\\n" or any extra "()".
"""

# Prompt suffix handed to dZiner: the step-by-step workflow the agent must follow.
# {tool_desc}, {input} and {agent_scratchpad} are template placeholders.
suffix = """You should always follow these steps:
    1. Lookup and summarize design guidelines. Make sure to add citations (paper DOI) for design guidelines if you use them, in your final response.
    Your final response should also contain the source for the tools used from their summary in description in {tool_desc}.
    2. Evaluate the hemolytic activity of the initial sequence.
    3. Start making changes to the sequence based on guidelines you found in step 1. Try to use different changes at each iteration.
    You are only allowed to make changes based on the 20 essential AA in this list: ['A','R','N','D','C','Q','E','G','H','I', 'L','K','M','F','P','S','T','W','Y','V']
    4. Evaluate the hemolytic activity for the new sequence
    5. If the hemolytic activity does not change properly, revert change to the best previous sequence and try something else by
    redoing steps 3-5. Iterate until there are 10 new sequence candidates and then stop.

    Start by describing the problem: \n\nBegin!\n \n\nQuestion: {input}
Thought:{agent_scratchpad}\n"""

In [21]:
from dziner.agents import dZiner
# Build the dZiner wrapper with the two tools and the custom prompt pieces, then
# keep its `.agent` attribute, which is what gets invoked later.
agent = dZiner(tools, property="hemolytic activity",
               model='gpt-4o', verbose=True, n_design_iterations=10, temp=0.1,
              suffix=suffix, format_instructions=FORMAT_INSTRUCTIONS).agent

# NOTE(review): this references the method without calling it, so the cell
# displays the bound-method repr (which embeds the executor and its prompt
# template) rather than the return value of get_prompts().
agent.get_prompts

<bound method Runnable.get_prompts of AgentExecutor(memory=ConversationBufferMemory(output_key='output', input_key='input', return_messages=True, memory_key='chat_history'), verbose=True, tags=['zero-shot-react-description'], agent=ZeroShotAgent(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['agent_scratchpad', 'input', 'tool_desc'], template='You are a helpful Chemist AI assistant called dZiner. You are tasked to make changes to a molecule based on some design guidelines\n    and optimize the its hemolytic activity.\n    Always use the tools to get chemical intuition learn about the design guidelines. If you don\'t know, do not make up answers.\n    Explain changes you make to the molecule in details at each step but do not put redundant information and make it short.\n    \n\nlookup_papers(prompt) - This tool looks up design guidelines for a peptides with lower hemolytic activity by looking through research papers.\npredict_hemolysis(sequence) - This tool predicts the prob

In [22]:
# from langchain.agents.structured_chat.base import StructuredChatAgent
# from langchain.agents.agent import AgentExecutor
# from langchain.agents import AgentType
# from langchain.schema import HumanMessage, SystemMessage
# from langchain.agents import AgentExecutor, create_openai_functions_agent

# message = """You are a helpful Chemist AI assistant. You are tasked to predict the hemolysis of a peptide sequence. 
#     Use the tools to get chemical intuition and if you don't know, do not make up answers. 
#     """

# system_message = SystemMessage(
#   content=message
# )

# agent_llm = ChatOpenAI(
#                         model_name='gpt-4o',
#                         temperature=0.1,
#                         max_tokens=4096
#                         )

# from langchain.agents import initialize_agent
# from langchain.memory import ConversationBufferMemory
# agent = initialize_agent(
#             tools=tools, llm=agent_llm, agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
#             verbose=True, memory=ConversationBufferMemory(memory_key="chat_history", input_key='input', output_key="output", return_messages=True),
#             early_stopping_method='generate', handle_parsing_errors=True, system_message=system_message, prefix=message,
#             agent_kwargs = {
#                 "prefix":message,
#         "system_message": system_message,
#     }, return_intermediate_steps=True, max_iterations=40
#             )



In [23]:
# Run the agent on the payload assembled in input_data; the result is kept in
# `response` and the verbose trace is printed as the agent works.
response = agent.invoke(input_data)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To begin, I need to look up design guidelines for peptides with lower hemolytic activity. This will help me understand the modifications that can reduce hemolytic activity.

Action: lookup_papers
Action Input: design guidelines for peptides with lower hemolytic activity[0m
Observation: [36;1m[1;3m['To design a peptide with lower hemolytic activity, consider the following guidelines based on amino acid sequence modifications:\n\n1. **Use of Non-Natural Amino Acids**: Incorporate non-natural amino acids such as azaproline (AzPro) to disrupt typical hydrogen bonding patterns and reduce membrane disruption.\n\n2. **Alternating Chirality**: Design peptides with alternating L and D amino acids to form heterochiral helical structures, which can reduce hemolytic activity by altering the interaction with cell membranes.\n\n3. **Hydrophobicity and Charge**: Adjust the hydrophobicity and charge distribution along the peptide

In [39]:
# initial_peptide = "AWFF" 
# question = f"Make changes from the 20 essential amino acids to peptide {initial_peptide} so it will have a lower hemolytic activity:\
#             Do not change the metal node.\
#              After each change you should evaluate the hemolytic activity, \
#              Iterate for 10 new peptide sequences candidates and then stop.\n\
#              "
# iteration_data = []
# iteration = 0
# cost = 0
# from langchain_community.callbacks import get_openai_callback


# for step in agent.iter({"input": question}):
#     with get_openai_callback() as cb:
#         if output := step.get("intermediate_step"):
#             action, value = output[0]
#             if action.tool == "predict_hemolysis":
#                 hem_prob = value
#             if action.tool == "check_validity":
#                 # chemical_feasibility, SMILES = value
#                 iteration_data.append({
#                                         'iteration': iteration,
#                                         'data': {
#                                             # 'SMILES': SMILES,
#                                             'Hemolysis prob': hem_prob,
#                                         }
#                                     })
#                 iteration += 1
#                 # break 
#                 # assert is_prime(int(value))
#             # Ask user if they want to continue
#             # _continue = input("Should the agent continue (Y/n)?:\n") or "Y"
#             # if _continue.lower() != "y":
#             #     break
#     cost += cb.total_cost

# print(f'Cost: ${cost}')



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? Yes
Action: predict_hemolysis
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 810ms/step

Observation: [36;1m[1;3m0.010782334[0m
Thought:[32;1m[1;3mTo reduce the hemolytic activity of the peptide sequence "AWFF," I will make changes to the sequence using the 20 essential amino acids and evaluate the hemolytic activity after each change. I will iterate this process for 10 new peptide sequence candidates.

Let's start with the first candidate by changing one amino acid at a time.

### Candidate 1: Change W to A
Sequence: "AAFF"

Thought: Do I need to use a tool? Yes
Action: predict_hemolysis
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 752ms/step

Observation: [36;1m[1;3m0.007202628[0m
Thought:[32;1m[1;3mThe hemolytic activity of the first candidate sequence "AAFF" is 0.007202628068625927, which is lower than the original sequence "AWFF" (0.0107823340222

In [24]:
# iteration_data

In [20]:
import json
# json_data = json.dumps(iteration_data, indent=4)
# Save the JSON data to a file
# with open(f"../results/{iteration_data[0]['data']['SMILES']}_guided.json", "w") as json_file:
#     json_file.write(json_data)

In [25]:
# import matplotlib.pyplot as plt

# iterations = [item['iteration'] for item in iteration_data]
# mean_band_gaps = [item['data']['Mean Band Gap'] for item in iteration_data]
# std_band_gaps = [item['data']['STD Band Gap'] for item in iteration_data]
# smiles_labels = [item['data']['SMILES'] for item in iteration_data]
# chemical_feasibilities = [item['data']['Chemical Feasibility'] for item in iteration_data]

# plt.figure(dpi=400)

# colors = ['C2' if feas == 'Valid' else 'C0' for feas in chemical_feasibilities]

# # Plot with error bars and color-coded points
# plt.figure(dpi=400)
# for i, (x, y, yerr, color) in enumerate(zip(iterations, mean_band_gaps, std_band_gaps, colors)):
#     plt.errorbar(x, y, yerr=yerr, fmt='o', color=color, ecolor='k', capsize=3,
#                  linestyle='None', alpha=0.6, markersize=10, label=chemical_feasibilities[i])



# vertical_offsets = [(-10 + 5 * (i % 2)) for i in range(len(iterations))]  # Alternating vertical offsets

# for x, y, label, offset in zip(iterations, mean_band_gaps, smiles_labels, vertical_offsets):
#     plt.text(x-0.2, y + offset/100, label, verticalalignment='bottom', horizontalalignment='right', fontsize=7, color='C0', rotation=90)

# ax=plt.gca()
# handles, labels = ax.get_legend_handles_labels()
# by_label = dict(zip(labels, handles))

# plt.legend(by_label.values(), by_label.keys(), loc="upper left")

# plt.xlabel('Agent Iterations')
# plt.xlim([-0.5,10.5])
# plt.ylim([0.1, 2])
# plt.ylabel('Band Gap (eV)')

# # plt.savefig(f"../results/{iteration_data[0]['data']['SMILES']}_guided.png")